In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [3]:
data.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
data["industry"].value_counts()

industry
retail           203
finance          200
other            198
education        187
healthcare       187
technology       179
manufacturing    174
Name: count, dtype: int64

In [5]:
data.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
data.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [9]:
numerical = ["number_of_courses_viewed","annual_income","lead_score","converted"]

In [10]:
categorical_fill = ['lead_source', 'industry', 'employment_status', 'location']
numerical_fill = ['annual_income']

data_fill = data.copy()
data_fill[categorical_fill] = data_fill[categorical_fill].fillna('NA')
data_fill[numerical_fill] = data_fill[numerical_fill].fillna(0)

In [16]:
data_fill[numerical].corrwith(data_fill["converted"])

number_of_courses_viewed    0.435914
annual_income               0.053131
lead_score                  0.193673
converted                   1.000000
dtype: float64

In [15]:
data_fill[["interaction_count"]].corrwith(data_fill["lead_score"])

interaction_count    0.009888
dtype: float64

In [14]:
data_fill[["number_of_courses_viewed"]].corrwith(data_fill["lead_score"])

number_of_courses_viewed   -0.004879
dtype: float64

In [17]:
data_fill[["number_of_courses_viewed"]].corrwith(data_fill["interaction_count"])

number_of_courses_viewed   -0.023565
dtype: float64

In [18]:
data_fill[["annual_income"]].corrwith(data_fill["interaction_count"])

annual_income    0.027036
dtype: float64

In [19]:
df_full_train, df_test = train_test_split(data_fill, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [20]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [21]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [22]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
mutual_info_score(df_train.industry, y_train)

0.011574521435657112

In [25]:
mutual_info_score(df_train.location, y_train)

0.004464157884038034

In [26]:
mutual_info_score(df_train.lead_source, y_train)

0.03539624379726594

In [27]:
mutual_info_score(df_train.employment_status, y_train)

0.012937677269442782

In [28]:
from sklearn.feature_extraction import DictVectorizer

In [29]:
dicts = df_train[categorical_fill].iloc[:10].to_dict(orient='records')

In [33]:
dv = DictVectorizer(sparse=False)

dv.fit(dicts)
dv.transform(dicts)

array([[0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        1., 0., 0.,

In [35]:
train_dict = df_train[categorical_fill+numerical_fill].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_fill+numerical_fill].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [36]:
from sklearn.linear_model import LogisticRegression

In [75]:
model = LogisticRegression(solver='liblinear', C=10.0, max_iter=1000, random_state=42)

In [76]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [77]:
model.coef_

array([[ 8.30763518e-06, -3.90620012e-07,  1.89241371e-06,
         1.16535943e-06,  2.10575663e-06, -2.68834880e-06,
        -5.91807581e-07,  2.58684470e-06, -3.37602441e-07,
         3.38507369e-07,  5.13862931e-07,  2.60582264e-07,
        -4.28362405e-07, -2.57463872e-07,  1.42944528e-06,
         7.67924071e-08,  6.23065009e-07, -3.75210304e-06,
         4.15405721e-06, -4.46695904e-07,  3.63902389e-07,
         1.86711796e-07, -6.86955468e-09,  5.04745501e-07,
         1.35901063e-06,  1.00353221e-06, -4.40631109e-07,
        -8.85840900e-07]])

In [78]:
model.intercept_[0]

np.float64(2.0845609678287462e-06)

In [79]:
# y_pred = model.predict(X_val)

In [80]:
# df_pred = pd.DataFrame()
# df_pred['correct'] =

In [81]:
y_pred_prob = model.predict_proba(X_val)[:,1]

In [82]:
y_pred_prob

array([0.60678597, 0.62142501, 0.61648419, 0.50000051, 0.61051327,
       0.67775533, 0.50000105, 0.61349645, 0.62161736, 0.62715266,
       0.6039411 , 0.61308276, 0.63717167, 0.62451985, 0.66757899,
       0.60903823, 0.63323719, 0.63137709, 0.60986626, 0.57193871,
       0.58123555, 0.6394019 , 0.62715263, 0.54680385, 0.65226838,
       0.57728601, 0.62929935, 0.66113914, 0.63340143, 0.50000098,
       0.50000201, 0.65681978, 0.6059327 , 0.50000281, 0.64887889,
       0.60485345, 0.6646669 , 0.5462966 , 0.61327766, 0.59302015,
       0.49999959, 0.64845404, 0.57509893, 0.50000108, 0.61503673,
       0.63157534, 0.65042556, 0.55703208, 0.63164319, 0.62631847,
       0.64635962, 0.66533247, 0.59792669, 0.60003823, 0.50000202,
       0.67148286, 0.64305556, 0.65372434, 0.61970726, 0.65369191,
       0.62481091, 0.65416416, 0.6998578 , 0.60134161, 0.62206847,
       0.60382744, 0.60371099, 0.63823517, 0.59249579, 0.50000137,
       0.64887193, 0.64131108, 0.50000007, 0.64502177, 0.62688

In [83]:
converted_preds = (y_pred_prob > 0.5).astype(int)

In [84]:
converted_preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [85]:
df_pred = pd.DataFrame()
df_pred["converted"] = converted_preds

In [86]:
df_pred['converted'].mean()

np.float64(0.9795221843003413)

In [87]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'lead_source=NA',
       'lead_source=events', 'lead_source=organic_search',
       'lead_source=paid_ads', 'lead_source=referral',
       'lead_source=social_media', 'location=NA', 'location=africa',
       'location=asia', 'location=australia', 'location=europe',
       'location=middle_east', 'location=north_america',
       'location=south_america'], dtype=object)

In [62]:
len(model.coef_[0])

28

In [68]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(8)))

{'annual_income': np.float64(8.31e-06),
 'employment_status=NA': np.float64(-3.9e-07),
 'employment_status=employed': np.float64(1.89e-06),
 'employment_status=self_employed': np.float64(1.17e-06),
 'employment_status=student': np.float64(2.11e-06),
 'employment_status=unemployed': np.float64(-2.69e-06),
 'industry=NA': np.float64(-5.9e-07),
 'industry=education': np.float64(2.59e-06),
 'industry=finance': np.float64(-3.4e-07),
 'industry=healthcare': np.float64(3.4e-07),
 'industry=manufacturing': np.float64(5.1e-07),
 'industry=other': np.float64(2.6e-07),
 'industry=retail': np.float64(-4.3e-07),
 'industry=technology': np.float64(-2.6e-07),
 'lead_source=NA': np.float64(1.43e-06),
 'lead_source=events': np.float64(8e-08),
 'lead_source=organic_search': np.float64(6.2e-07),
 'lead_source=paid_ads': np.float64(-3.75e-06),
 'lead_source=referral': np.float64(4.15e-06),
 'lead_source=social_media': np.float64(-4.5e-07),
 'location=NA': np.float64(3.6e-07),
 'location=africa': np.float6

In [69]:
c =[0.01, 0.1, 1, 10, 100]

In [73]:
for c_val in c:
    model = LogisticRegression(solver='liblinear', C=c_val, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    df_pred = pd.DataFrame()
    y_pred_prob = model.predict_proba(X_val)[:,1]
    converted_preds = (y_pred_prob > 0.5).astype(int)
    df_pred["converted"] = converted_preds
    acc = df_pred['converted'].mean()

    print(f"C : {c_val} : {acc}")

C : 0.01 : 0.9795221843003413
C : 0.1 : 0.9795221843003413
C : 1 : 0.9795221843003413
C : 10 : 0.9795221843003413
C : 100 : 0.9795221843003413


In [74]:
df_pred

Unnamed: 0,converted
0,1
1,1
2,1
3,1
4,1
...,...
288,1
289,1
290,1
291,1
