In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression

In [18]:
# Pull the course lead-scoring dataset directly from its public GitHub location.
data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(data_url)

In [19]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [20]:
# Clean-up of the data: missing categorical values become the literal string 'NA',
# missing numeric values (the 'converted' column included) become 0.0.
categorical_columns = ['lead_source', 'industry', 'employment_status', 'location']
numeric_columns = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

df[categorical_columns] = df[categorical_columns].fillna('NA')
df[numeric_columns] = df[numeric_columns].fillna(0.0)


In [21]:
# Most frequent value of the 'industry' column.
df.loc[:, 'industry'].mode()

0    retail
Name: industry, dtype: object

In [22]:
# Pairwise correlation matrix of the numeric features (the target is left out).
numeric_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
df[numeric_features].corr()


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [49]:
# 60/20/20 train/validation/test split.
#
# Features and target are passed to the SAME train_test_split call so that both
# are partitioned with one shared shuffle. Splitting them in separate calls only
# stays row-aligned as long as both inputs have equal length and the seeds match,
# which is easy to break silently.
df_full_train, df_test, y_full_train, y_test = train_test_split(
    df.drop(columns=['converted']), df['converted'], test_size=0.2, random_state=42
)
# 0.25 of the remaining 80% gives 20% of the full dataset for validation.
df_train, df_val, y_train, y_val = train_test_split(
    df_full_train, y_full_train, test_size=0.25, random_state=42
)

# Guard against any future feature/target misalignment.
assert df_train.index.equals(y_train.index)
assert df_val.index.equals(y_val.index)
assert df_test.index.equals(y_test.index)

# Printed in the order: train, test, validation.
print(len(df_train), len(df_test), len(df_val))
print(len(y_train), len(y_test), len(y_val))

876 293 293
876 293 293


In [50]:
# Drop the shuffled original row labels so every split is indexed 0..n-1.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

# Same for the targets, keeping them positionally aligned with their features.
y_train, y_val, y_test = (
    target.reset_index(drop=True) for target in (y_train, y_val, y_test)
)

In [52]:
# Mutual information between each categorical feature and the target,
# computed on the training split only.
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
for feature in categorical_features:
    mi_score = mutual_info_score(df_train[feature], y_train)
    print(feature, mi_score)

lead_source 0.03539624379726594
industry 0.011574521435657112
employment_status 0.012937677269442782
location 0.004464157884038034


In [54]:
for col in ['lead_source', 'industry', 'employment_status', 'location']:
    df_train[col].replace({j: i for i, j in enumerate(df_train[col].unique())}, inplace=True)
    df_val[col].replace({j: i for i, j in enumerate(df_val[col].unique())}, inplace=True)
    df_test[col].replace({j: i for i, j in enumerate(df_test[col].unique())}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].replace({j: i for i, j in enumerate(df_train[col].unique())}, inplace=True)
  df_train[col].replace({j: i for i, j in enumerate(df_train[col].unique())}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_val[col].replace({j: i for i, j in enumerate(df

In [56]:
# Baseline logistic regression trained on all features of the training split.
logreg_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)
model.fit(X=df_train, y=y_train)

In [57]:
# Keep the second predict_proba column (index 1) for every validation row.
val_probabilities = model.predict_proba(df_val)
y_pred = val_probabilities[:, 1]

In [64]:
# Validation accuracy at a 0.5 decision threshold, rounded to two decimals.
predicted_positive = y_pred > 0.5
n_correct = (predicted_positive == y_val).sum()
np.round(n_correct / len(y_val), 2)

0.71

In [65]:
# Leave-one-feature-out comparison: retrain without each listed column and print
# how much validation accuracy the full model (y_pred) gains over the reduced one.
baseline_accuracy = ((y_pred > 0.5) == y_val).sum() / len(y_val)

for col in ['lead_source', 'industry', 'employment_status', 'location']:
    reduced_train = df_train.drop(columns=[col])
    reduced_val = df_val.drop(columns=[col])

    model_xfeature = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_xfeature.fit(reduced_train, y_train)
    y_pred_xfeature = model_xfeature.predict_proba(reduced_val)[:, 1]

    accuracy_xfeature = ((y_pred_xfeature > 0.5) == y_val).sum() / len(y_val)
    print(col, baseline_accuracy - accuracy_xfeature)
    

lead_source -0.0034129692832763903
industry 0.03754266211604096
employment_status -0.023890784982935176
location 0.010238907849829393


In [69]:
# Sweep the C parameter of LogisticRegression and print the validation accuracy
# (0.5 threshold) for each value.
#
# Loop-local names (model_c, y_pred_c) are used on purpose: rebinding the
# notebook-level `model` / `y_pred` here would leave them pointing at the last
# C value, silently changing the result of any earlier cell that is re-run.
for c in [0.01, 0.1, 1, 10, 100]:
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(df_train, y_train)
    y_pred_c = model_c.predict_proba(df_val)[:, 1]
    print(c, np.round(((y_pred_c > 0.5) == y_val).sum() / len(y_val), 10))

0.01 0.7133105802
0.1 0.7133105802
1 0.7098976109
10 0.7098976109
100 0.7098976109
