## Classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## Data preparation

In [None]:
# Read the lead-scoring dataset from the working directory and preview the
# first five rows.
df = pd.read_csv('course_lead_scoring.csv')
df.head(n=5)

In [None]:
# Show the dtype of every column; later cells split the features into
# categorical vs. numerical by testing for the 'object' dtype.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Columns whose missing values are replaced, grouped by the kind of fill value.
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['annual_income']

# Map each column to its replacement: the 'NA' label for categorical columns,
# 0.0 for numerical ones. Categorical columns are processed first.
missing_fill = {
    **{col: 'NA' for col in categorical_features},
    **{col: 0.0 for col in numerical_features},
}

for col, fill in missing_fill.items():
    df[col] = df[col].fillna(fill)

In [None]:
# Frequency of each industry label (including the 'NA' placeholder, if any).
df['industry'].value_counts()

In [None]:
# Most frequent industry value(s).
df['industry'].mode()

In [None]:
# Every column of the full frame whose dtype is not 'object'.
numerical_features = [
    column for column, dtype in df.dtypes.items() if dtype != 'object'
]
numerical_features

In [None]:
# Pairwise correlation between all numerical columns: one list of
# coefficients per column, in the order of `numerical_features`.
correlations = {
    col: [df[col].corr(df[other]) for other in numerical_features]
    for col in numerical_features
}

# Rows and columns are both labelled with the feature names.
correlation_matrix = pd.DataFrame(correlations)
correlation_matrix.index = numerical_features

correlation_matrix

## Split the data

In [None]:
# Hold out 20% of the rows for testing, then split the remainder 75/25 into
# train and validation (0.25 of the remaining 80% = 20% of the full data).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

target = 'converted'

# Pull the target out of every split first ...
y_full_train, y_test, y_train, y_val = (
    part[target] for part in (df_full_train, df_test, df_train, df_val)
)

# ... then remove it from the feature frames so it cannot be used as a feature.
df_full_train, df_test, df_train, df_val = (
    part.drop(columns=[target])
    for part in (df_full_train, df_test, df_train, df_val)
)

In [None]:
# Training-frame columns stored with the 'object' dtype.
categorical_features = [
    column for column, dtype in df_train.dtypes.items() if dtype == 'object'
]
categorical_features

In [None]:
def mutual_info_with_target(series):
    """Mutual information between one training column and the training target."""
    return mutual_info_score(series, y_train)


# One score per categorical column, shown from most to least informative.
mi = df_train[categorical_features].apply(mutual_info_with_target)
mi.sort_values(ascending=False)

In [None]:
# Numerical feature columns of the training frame.
# The mask is built from df_train's own dtypes rather than df's: df still
# contains the 'converted' target, so a mask taken from df only lines up with
# df_train.dtypes through implicit label alignment. Using one frame for both
# keeps the indexer and the indexed Series on the same labels and matches how
# categorical_features is derived.
train_dtypes = df_train.dtypes
numerical_features = list(train_dtypes[train_dtypes != 'object'].keys())
numerical_features + categorical_features

In [None]:
# Encode the training rows: each row becomes a {feature: value} dict, and the
# DictVectorizer turns those dicts into a dense numeric matrix.
feature_columns = categorical_features + numerical_features
train_dicts = df_train[feature_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

# Fit the baseline logistic regression; `fit` returns the estimator itself,
# so `model` is the fitted model and is displayed as the cell output.
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)
model

In [None]:
# Accuracy of the fitted model on the validation split, rounded to 2 decimals.
dicts = df_val[categorical_features + numerical_features].to_dict(orient='records')

# Only *apply* the vectorizer here. Calling fit_transform would refit it on the
# validation records and rebuild the feature-to-column mapping from them, which
# need not match the columns the model was trained on.
X_val = dv.transform(dicts)

y_pred = model.predict(X_val)
round((y_val == y_pred).mean(), 2)

In [None]:
def validation_accuracy(features):
    """Train on `features` and return the accuracy on the validation split.

    A fresh DictVectorizer and LogisticRegression (same hyper-parameters as the
    baseline model) are created on every call, so the notebook-level `dv` and
    `model` are not left fitted on a reduced feature set. The vectorizer is
    fitted on the training rows only and merely applied to the validation rows,
    so both matrices share the same column layout.

    Parameters
    ----------
    features : list of str
        Column names of df_train / df_val to use as model inputs.

    Returns
    -------
    float
        Fraction of validation rows whose predicted label equals y_val.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_tr = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_tr, y_train)

    # transform, not fit_transform: the encoding must stay the one learned on train.
    X_v = vectorizer.transform(df_val[features].to_dict(orient='records'))
    return (y_val == clf.predict(X_v)).mean()


all_features = categorical_features + numerical_features

# Baseline: accuracy with every feature included.
original_accuracy = validation_accuracy(all_features)
print('Original:', original_accuracy)

# Leave-one-feature-out: a small difference means the feature contributes little.
for f in all_features:
    smaller_features = [feature for feature in all_features if feature != f]
    print(f'Without {f} difference:', original_accuracy - validation_accuracy(smaller_features))

In [None]:
# Compare validation accuracy for several values of the regularisation
# parameter C.
all_features = categorical_features + numerical_features

# The encoded matrices do not depend on C, so build them once outside the loop.
# The vectorizer is fitted on the training rows only; the validation rows are
# encoded with transform so they use exactly the same columns as X_train
# (fit_transform here would refit the mapping on validation data).
dicts = df_train[all_features].to_dict(orient='records')
X_train = dv.fit_transform(dicts)

dicts = df_val[all_features].to_dict(orient='records')
X_val = dv.transform(dicts)

for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy = round((y_val == y_pred).mean(), 3)
    print(f"{C}:", accuracy)