In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from IPython.display import display

%matplotlib inline

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 04:12:24--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-13 04:12:25 (5.27 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [45]:
# Load the lead-scoring dataset from the CSV file in the working directory.
df = pd.read_csv('course_lead_scoring.csv')

In [46]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [47]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [48]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [49]:
# Feature column groups used by the rest of the notebook.
# `categorical` holds the object-typed columns, `numerical` the int/float ones;
# the target column `converted` is deliberately in neither list.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [50]:
df[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [51]:
df[numerical].isnull().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
dtype: int64

In [52]:
# Replace missing categorical values with the literal string 'NA' so that
# "missing" becomes an explicit label instead of NaN.
df[categorical] = df[categorical].fillna('NA')

In [53]:
# Replace missing numerical values with 0.0 (per the null counts above, only
# annual_income has any).
df[numerical] = df[numerical].fillna(0.0)

In [54]:
# Most frequent value of `industry`. This runs after the 'NA' fill, so the
# placeholder 'NA' competes with the real categories here.
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [55]:
# Correlation of each numerical feature with the target on the full dataset.
# Missing annual_income values have already been replaced by 0.0 at this point,
# which feeds into the annual_income figure.
df[numerical].corrwith(df.converted)

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
dtype: float64

In [56]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole dataset) for validation. Both splits use the same seed.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [57]:
len(df_full_train), len(df_train), len(df_val), len(df_test)

(1169, 876, 293, 293)

In [58]:
# Drop the shuffled original row labels so every split is indexed from 0.
df_full_train, df_train, df_val, df_test = (
    split.reset_index(drop=True)
    for split in (df_full_train, df_train, df_val, df_test)
)

In [59]:
# Pull the target column out of each split as a plain NumPy array.
y_full_train, y_train, y_val, y_test = (
    split.converted.values
    for split in (df_full_train, df_train, df_val, df_test)
)

In [60]:
# Remove the target from every feature frame so it cannot be used as an input.
del (
    df_full_train['converted'],
    df_train['converted'],
    df_val['converted'],
    df_test['converted'],
)

In [61]:
def mutual_info_score_clculator(series):
    """Return the mutual information between ``series`` and the training target.

    ``series`` is compared against the notebook-level ``y_train``, so it must be
    a column of the training split (same row order as ``y_train``).
    """
    score = mutual_info_score(series, y_train)
    return score

In [62]:
# Mutual information of every categorical feature with the training target.
# NOTE: values are rounded to 2 decimals here, before any ranking, so features
# with close scores end up tied.
mi = df_train[categorical].apply(mutual_info_score_clculator).round(2)

In [63]:
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [71]:
# Turn each training row into a {column: value} dict and let DictVectorizer
# build the dense feature matrix; the fitted `dv` is reused for validation.
dv = DictVectorizer(sparse=False)
train_dicts = (
    df_train[categorical + numerical]
    .to_dict(orient='records')
)
X_train = dv.fit_transform(train_dicts)

In [72]:
# Baseline logistic regression on all features (fixed seed for reproducibility).
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [73]:
# Encode the validation rows with the vectorizer fitted on the training split
# (transform only, no refit).
val_dicts = (
    df_val[categorical + numerical]
    .to_dict(orient='records')
)
X_val = dv.transform(val_dicts)

In [74]:
# Keep only the second column of predict_proba: the predicted probability of
# the positive class (converted == 1) for every validation row.
y_pred = model.predict_proba(X_val)[:,1]

In [75]:
# Boolean decision per validation row at a 0.5 probability cut-off.
# NOTE(review): the name says "churn", but the target in this notebook is
# `converted`; it is kept because other cells refer to this name.
churn_decision = (y_pred >= 0.5)

In [76]:
# Side-by-side view of predicted probability, hard prediction and true label,
# plus a flag telling whether the prediction matched.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': churn_decision.astype(int),
    'actual': y_val,
})
df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

In [77]:
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.611922,1,0,False
1,0.799826,1,1,True
2,0.530213,1,0,False
3,0.471315,0,0,True
4,0.570661,1,0,False
...,...,...,...,...
288,0.419342,0,0,True
289,0.710539,1,1,True
290,0.418185,0,0,True
291,0.744835,1,1,True


In [78]:
df_pred.correct.astype(int).mean()

np.float64(0.6996587030716723)

<h3>Finding the most useful feature</h3>

In [82]:
# Features examined by the leave-one-out comparison below, in reporting order.
test_features = [
    'lead_source',
    'industry',
    'location',
    'employment_status',
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

def _validation_accuracy(features):
    """Train on ``features`` only and return the accuracy on the validation split.

    Uses the notebook-level ``df_train`` / ``df_val`` / ``y_train`` / ``y_val``.
    A fresh ``DictVectorizer`` and ``LogisticRegression`` are fitted on every
    call, and predictions are thresholded at a probability of 0.5.
    """
    train_dicts = df_train[features].to_dict(orient='records')
    val_dicts = df_val[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    return np.mean((y_pred >= 0.5) == y_val)


def test_feature_effect(features):
    """Print the change in validation accuracy caused by leaving out each feature.

    A baseline model is trained on all of ``features``; then, for every feature
    in turn, a model is trained without it and the difference
    ``baseline - reduced`` is printed. A positive value means accuracy fell
    when the feature was removed; a negative value means accuracy went up.

    Parameters
    ----------
    features : list of str
        Column names present in ``df_train`` and ``df_val``.

    Returns
    -------
    None
        Results are printed, one line per feature.
    """
    baseline_accuracy = _validation_accuracy(features)

    for feature in features:
        # Same pipeline as the baseline, minus the feature under test.
        reduced = [f for f in features if f != feature]
        accuracy = _validation_accuracy(reduced)

        print(f"Without '{feature}': accuracy drop = {baseline_accuracy - accuracy:.3f}")

test_feature_effect(test_features)


Without 'lead_source': accuracy drop = -0.003
Without 'industry': accuracy drop = 0.000
Without 'location': accuracy drop = -0.010
Without 'employment_status': accuracy drop = 0.003
Without 'number_of_courses_viewed': accuracy drop = 0.143
Without 'annual_income': accuracy drop = -0.154
Without 'interaction_count': accuracy drop = 0.143
Without 'lead_score': accuracy drop = -0.007


In [81]:
    # Compare validation accuracy across regularisation strengths C.
    # The encoded matrices do not depend on C, so the vectorizer is fitted and
    # both splits are encoded once, outside the loop.
    train_dicts = df_train[categorical + numerical].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)

    val_dicts = df_val[categorical + numerical].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    # NOTE(review): the recorded run printed the same accuracy for every C;
    # presumably the unscaled annual_income column dominates the fit -- confirm.
    for c in [0.01, 0.1, 1, 10, 100]:
        model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_val)[:,1]
        churn_decision = (y_pred >= 0.5)

        # Same per-row comparison table as the single-model cell above.
        df_pred = pd.DataFrame()
        df_pred['probability'] = y_pred
        df_pred['prediction'] = churn_decision.astype(int)
        df_pred['actual'] = y_val
        df_pred['correct'] = df_pred.prediction == df_pred.actual

        # One line per C value, in the order of the list above.
        print(df_pred.correct.astype(int).mean())

0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
