In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

print('hello')
# Read data
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')


hello


In [25]:
# Data Preparation

numerical = ['number_of_courses_viewed','annual_income','interaction_count','lead_score','converted']
categorical = list(set(df.columns)-set(numerical))

df[numerical] = df[numerical].fillna(0)
df[categorical] = df[categorical].fillna('NA')

df[categorical] = df[categorical].astype('category')

df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [26]:
# Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution. Use train_test_split function for that with random_state=1

df_all_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_all_train, test_size=0.25, random_state=1)

df_all_train.reset_index(inplace=True)
df_train.reset_index(inplace=True)
df_val.reset_index(inplace=True)
df_test.reset_index(inplace=True)

y_all_train = df_all_train['converted'].values
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values


In [27]:
# Question 1: ROC AUC feature importance

# Which numerical variable (among the following 4) has the highest AUC?
# lead_score
# number_of_courses_viewed
# interaction_count
# annual_income

target_columns = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

for c in target_columns:
    y_predicted = df_train[c].values
    print('%s: %.3f' % (c, round(roc_auc_score(y_train, y_predicted),2)))


# Answers: number_of_courses_viewed
#lead_score: 0.610
#number_of_courses_viewed: 0.760
#interaction_count: 0.740
#annual_income: 0.550


lead_score: 0.610
number_of_courses_viewed: 0.760
interaction_count: 0.740
annual_income: 0.550


In [28]:
# Question 2: Training the model

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
#converted_decision = (y_pred > 0.5)

print("score: %.3f" % (roc_auc_score(y_val, y_pred)))


score: 0.883


In [38]:
# Question 3: Precision and Recall

thresholds = np.arange(0,1.01,0.01)

actual_positive = (y_val==1)
actual_negative = (y_val==0)

precisions = []
recalls = []
tps = []
fps = []
fns = []
tns = []

for th in thresholds:
    predicted_positive = (y_pred >= th)
    predicted_negative = (y_pred < th)

    tps.append(actual_positive & predicted_positive).sum())
    fps.append(actual_negative & predicted_positive).sum())
    fns.append(actual_positive & predicted_negative).sum())
    tns.append(actual_negative & predicted_negative).sum())

    #precisions.append(tp / (tp + fp))
    #recalls.append(tp / (tp + fn))

pred_results = pd.Dataframe(cols=['threshold','tp','fp','fn','tn'], data=[thresholds, tps, fps, fns, tns])
pred_results['precision'] = pred_results['tp']/(pred_results['tp']+pred_results['fp'])
pred_results['recall'] = pred_results['tp']/(pred_results['tp']+pred_results['fn'])

plt.plot(pred_results['threshold'], pred_results['precision'], label='precision')
plt.plot(pred_results['threshold'], pred_results['recall'], label='recall')
plt.legend()
plt.show()

SyntaxError: unmatched ')' (2526106128.py, line 19)

In [30]:
# Question 4: F1 Score


In [31]:
# Question 5: 5-Fold CV


In [32]:
# Question 6: Hyperparameter Tuning