In [1]:
# Load libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Read the lead-scoring CSV, then separate the `converted` label from the features.
df = pd.read_csv(
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
)
target = df['converted']
data = df.drop(columns='converted')
# Display (rows, columns) of the feature frame.
data.shape

(1462, 8)

In [3]:
# Preview the first rows of the feature frame (the `converted` column was dropped above).
data.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94
1,social_media,retail,1,46992.0,employed,south_america,1,0.8
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69
3,paid_ads,retail,2,83843.0,,australia,1,0.87
4,referral,education,3,85012.0,self_employed,europe,3,0.62


In [4]:
# Names of the object-dtype (string) columns; these are treated as the
# categorical features for the rest of the notebook.
categorical_columns = data.select_dtypes(include='object').columns
categorical_columns

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [5]:
# Number of distinct values per categorical column.
data[categorical_columns].nunique()

lead_source          5
industry             7
employment_status    4
location             7
dtype: int64

In [6]:
# Class balance of the target: how many leads converted (1) versus not (0).
target.value_counts()

converted
1    905
0    557
Name: count, dtype: int64

In [7]:
# Frequency of each industry value; rows with a missing industry are not counted here.
data.industry.value_counts()

industry
retail           203
finance          200
other            198
education        187
healthcare       187
technology       179
manufacturing    174
Name: count, dtype: int64

In [8]:
# Impute missing values per column: 'NA' for object (categorical) columns,
# 0.0 for every other (numeric) column.
data = data.fillna({
    **dict.fromkeys(data.select_dtypes(include='object').columns, 'NA'),
    **dict.fromkeys(data.select_dtypes(exclude='object').columns, 0.0),
})

In [9]:
# Sanity check after imputation: every column should report zero missing values.
data.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
dtype: int64

In [10]:
# Pairwise correlation matrix of the numeric features only
# (numeric_only=True leaves the string columns out).
data.corr(numeric_only = True)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [11]:
# Target proportions for the train / validation / test partitions.
train_size = 0.6
val_size = 0.2
test_size = 0.2
random_state = 42

# Stage 1: hold out the test partition from the full data set.
data_full_train, data_test, y_full_train, y_test = train_test_split(
    data,
    target,
    test_size=test_size,
    random_state=random_state,
)

# Stage 2: carve the validation partition out of what is left. The fraction is
# rescaled because it now applies to the remaining (1 - test_size) share.
data_train, data_val, y_train, y_val = train_test_split(
    data_full_train,
    y_full_train,
    test_size=val_size / (1 - test_size),
    random_state=random_state,
)

In [12]:
def compute_mutual_info_score(series):
    """Return the mutual information between `series` and the notebook-level `y_train`."""
    return mutual_info_score(labels_true=series, labels_pred=y_train)

# Score every object-dtype column of the training split against the target,
# then show the columns from most to least informative.
mi = (
    data_train
    .select_dtypes(include='object')
    .apply(compute_mutual_info_score)
)
mi.sort_values(ascending=False)


lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

In [13]:
def train_a_model_and_return_accuracy(data_train, data_val, 
                                      categorical_columns,
                                      column_to_remove = None,
                                      c = 0.1,
                                      train_labels = None,
                                      val_labels = None) : 
    """Fit a one-hot + logistic-regression pipeline and return its validation accuracy.

    Parameters
    ----------
    data_train, data_val : pandas.DataFrame
        Feature frames used for fitting and for scoring. They are not modified.
    categorical_columns : iterable of str
        Columns to one-hot encode; every other column is passed through as-is.
    column_to_remove : str or iterable of str, optional
        Feature(s) dropped from both frames before fitting. May name either a
        categorical column or a passed-through (numeric) column.
    c : float, default 0.1
        Value forwarded to ``LogisticRegression(C=...)``.
    train_labels, val_labels : array-like, optional
        Targets aligned row-for-row with ``data_train`` / ``data_val``. When
        omitted, the notebook-level ``y_train`` / ``y_val`` are used, so
        existing calls keep their behaviour.

    Returns
    -------
    numpy.float64
        Fraction of validation rows whose predicted class equals the label.
    """

    # Prefer explicitly supplied labels; fall back to the notebook globals so
    # older call sites (which never passed labels) still work.
    if train_labels is None : 
        train_labels = y_train
    if val_labels is None : 
        val_labels = y_val

    # Normalise `column_to_remove` to a list. None and '' both mean "drop nothing".
    if column_to_remove is None or isinstance(column_to_remove, str) : 
        removed = [column_to_remove] if column_to_remove else []
    else : 
        removed = list(column_to_remove)

    if removed : 
        # DataFrame.drop returns new frames, so the caller's data stays untouched.
        data_train = data_train.drop(columns = removed)
        data_val = data_val.drop(columns = removed)

    # Filter rather than Index.drop: this accepts any iterable of names and does
    # not raise when the removed feature is not categorical (e.g. a numeric column).
    categorical_columns = [col for col in categorical_columns if col not in removed]

    # handle_unknown='ignore' encodes a category that never appeared in the
    # training split as all zeros instead of raising at predict time.
    categorical_preprocessor = OneHotEncoder(handle_unknown = 'ignore')

    preprocessor = ColumnTransformer(
        [("one-hot-encoder", categorical_preprocessor, categorical_columns)],
        remainder = 'passthrough'
    )

    # NOTE(review): passed-through numeric columns are not scaled before the
    # linear model; consider whether a scaler is wanted here — confirm intent.
    model = make_pipeline(preprocessor, 
                          LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42))

    model.fit(data_train, train_labels)

    accuracy = (model.predict(data_val) == val_labels).mean()

    return accuracy

In [14]:
# Baseline: validation accuracy with every feature kept and the function's default C.
original_accuracy = train_a_model_and_return_accuracy(data_train, data_val, categorical_columns)
original_accuracy

np.float64(0.6996587030716723)

In [15]:
# Leave-one-feature-out: retrain without each categorical column in turn and
# report the new accuracy plus its absolute distance from the baseline.
for col in categorical_columns:
    accuracy = train_a_model_and_return_accuracy(
        data_train,
        data_val,
        categorical_columns,
        column_to_remove=col,
    )
    rounded_accuracy = accuracy.round(5)
    rounded_gap = abs(original_accuracy - accuracy).round(5)

    print('%s : %s' %(col, rounded_accuracy))
    print('%s diff : %s' %(col, rounded_gap))

lead_source : 0.69966
lead_source diff : 0.0
industry : 0.69966
industry diff : 0.0
employment_status : 0.69625
employment_status diff : 0.00341
location : 0.70648
location diff : 0.00683


In [16]:
# Sweep the regularisation parameter C over a coarse logarithmic grid and
# print the validation accuracy obtained for each value.
for c in [0.01, 0.1, 1, 10, 100]:
    accuracy = train_a_model_and_return_accuracy(
        data_train,
        data_val,
        categorical_columns,
        c=c,
    )
    rounded_accuracy = accuracy.round(5)

    print('%s : %s' %(c, rounded_accuracy))

0.01 : 0.69966
0.1 : 0.69966
1 : 0.69966
10 : 0.69966
100 : 0.69966
