# Imports

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.metrics import roc_auc_score, accuracy_score

# Load, display, first columns

In [34]:
leads = pd.read_csv("Leads.csv")
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [35]:
leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

# Data preparation and data cleaning

In [36]:
# Handle 'Select' values by replacing them with NaN
leads.replace('Select', np.nan, inplace=True)

# Drop columns with a high percentage of missing values (more than 30%)
missing_percentage = leads.isnull().sum() / len(leads) * 100
cols_to_drop = missing_percentage[missing_percentage > 30].index
leads.drop(cols_to_drop, axis=1, inplace=True)

# Impute missing values in numerical columns with the median
numerical_cols = leads.select_dtypes(include=np.number).columns
for col in numerical_cols:
    leads[col] = leads[col].fillna(leads[col].median())

# Impute missing values in categorical columns with the mode
categorical_cols = leads.select_dtypes(include=object).columns
for col in categorical_cols:
    leads[col] = leads[col].fillna(leads[col].mode()[0])

# Split the data into training, validation, and testing sets

In [37]:
leads_full_train, leads_test = train_test_split(leads, test_size=0.2, random_state=1)
leads_train, leads_val = train_test_split(leads_full_train, test_size=0.25, random_state=1)

In [38]:
# Reset indices
leads_train = leads_train.reset_index(drop=True)
leads_val = leads_val.reset_index(drop=True)
leads_test = leads_test.reset_index(drop=True)

In [39]:
# Separate the target variable
y_train = leads_train.Converted.values
y_val = leads_val.Converted.values
y_test = leads_test.Converted.values

In [40]:
# Remove the target
del leads_train['Converted']
del leads_val['Converted']
del leads_test['Converted']

# EDA

In [41]:
# Check the distribution of the target variable
global_conversion_rate = leads_full_train.Converted.mean()
print("Global Conversion Rate:", global_conversion_rate)

Global Conversion Rate: 0.38311688311688313


# Feature importance: Conversion rate and risk ratio, Mutual information

In [42]:
for c in categorical_cols:
    print(c)
    df_group = leads_full_train.groupby(c).Converted.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_conversion_rate
    df_group['risk'] = df_group['mean'] / global_conversion_rate
    display(df_group)
    print()
    print()

Prospect ID


Unnamed: 0_level_0,mean,count,diff,risk
Prospect ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000104b9-23e4-4ddc-8caa-8629fe8ad7f4,1.0,1,0.616883,2.610169
0006d10a-eb01-4ba9-92e2-ad78588b2a40,0.0,1,-0.383117,0.000000
0011be30-fa97-465b-8e44-0ae83dff7eed,0.0,1,-0.383117,0.000000
0011f23e-9fd9-4256-b316-efc2e2639b0d,0.0,1,-0.383117,0.000000
001e6e14-2183-47ab-a405-108e44bc2e66,1.0,1,0.616883,2.610169
...,...,...,...,...
ffd99338-2e6b-4c3f-8650-68b94ea5e07f,0.0,1,-0.383117,0.000000
ffec8e24-0c99-4345-89f1-e3ad6689764f,1.0,1,0.616883,2.610169
fff076a3-fe95-4c79-9401-e15846be8086,0.0,1,-0.383117,0.000000
fff49ad0-6015-448c-a7cc-f454c39ffdda,0.0,1,-0.383117,0.000000




Lead Origin


Unnamed: 0_level_0,mean,count,diff,risk
Lead Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
API,0.308414,2876,-0.074702,0.805014
Landing Page Submission,0.360677,3901,-0.02244,0.941427
Lead Add Form,0.919861,574,0.536744,2.400992
Lead Import,0.225,40,-0.158117,0.587288
Quick Add Form,1.0,1,0.616883,2.610169




Lead Source


Unnamed: 0_level_0,mean,count,diff,risk
Lead Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Click2call,0.75,4,0.366883,1.957627
Direct Traffic,0.317624,2037,-0.065493,0.829052
Facebook,0.225,40,-0.158117,0.587288
Google,0.405091,2318,0.021974,1.057355
Live Chat,1.0,2,0.616883,2.610169
NC_EDM,1.0,1,0.616883,2.610169
Olark Chat,0.251768,1414,-0.131349,0.657157
Organic Search,0.37432,919,-0.008797,0.977038
Pay per Click Ads,0.0,1,-0.383117,0.0
Press_Release,0.0,2,-0.383117,0.0




Do Not Email


Unnamed: 0_level_0,mean,count,diff,risk
Do Not Email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.401265,6801,0.018148,1.047368
Yes,0.174281,591,-0.208836,0.454903




Do Not Call


Unnamed: 0_level_0,mean,count,diff,risk
Do Not Call,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383033,7391,-8.3e-05,0.999782
Yes,1.0,1,0.616883,2.610169




Last Activity


Unnamed: 0_level_0,mean,count,diff,risk
Last Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Approached upfront,1.0,8,0.616883,2.610169
Converted to Lead,0.126437,348,-0.25668,0.330021
Email Bounced,0.092308,260,-0.290809,0.240939
Email Link Clicked,0.239437,213,-0.14368,0.62497
Email Marked Spam,1.0,2,0.616883,2.610169
Email Opened,0.375311,2811,-0.007806,0.979626
Email Received,1.0,2,0.616883,2.610169
Form Submitted on Website,0.255102,98,-0.128015,0.66586
Had a Phone Conversation,0.727273,22,0.344156,1.898305
Olark Chat Conversation,0.086185,789,-0.296932,0.224958




Country


Unnamed: 0_level_0,mean,count,diff,risk
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia/Pacific Region,0.5,2,0.116883,1.305085
Australia,0.222222,9,-0.160895,0.580038
Bahrain,0.666667,6,0.28355,1.740113
Bangladesh,0.5,2,0.116883,1.305085
Belgium,0.0,2,-0.383117,0.0
Canada,0.0,4,-0.383117,0.0
China,0.0,1,-0.383117,0.0
Denmark,1.0,1,0.616883,2.610169
France,0.6,5,0.216883,1.566102
Germany,0.25,4,-0.133117,0.652542




What is your current occupation


Unnamed: 0_level_0,mean,count,diff,risk
What is your current occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Businessman,0.5,6,0.116883,1.305085
Housewife,1.0,8,0.616883,2.610169
Other,0.615385,13,0.232268,1.606258
Student,0.359281,167,-0.023835,0.937785
Unemployed,0.337451,6638,-0.045666,0.880804
Working Professional,0.916071,560,0.532955,2.391102




What matters most to you in choosing a course


Unnamed: 0_level_0,mean,count,diff,risk
What matters most to you in choosing a course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Better Career Prospects,0.383085,7390,-3.2e-05,0.999917
Flexibility & Convenience,0.5,2,0.116883,1.305085




Search


Unnamed: 0_level_0,mean,count,diff,risk
Search,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383146,7381,2.9e-05,1.000076
Yes,0.363636,11,-0.019481,0.949153




Magazine


Unnamed: 0_level_0,mean,count,diff,risk
Magazine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




Newspaper Article


Unnamed: 0_level_0,mean,count,diff,risk
Newspaper Article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383033,7391,-8.3e-05,0.999782
Yes,1.0,1,0.616883,2.610169




X Education Forums


Unnamed: 0_level_0,mean,count,diff,risk
X Education Forums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




Newspaper


Unnamed: 0_level_0,mean,count,diff,risk
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383169,7391,5.2e-05,1.000135
Yes,0.0,1,-0.383117,0.0




Digital Advertisement


Unnamed: 0_level_0,mean,count,diff,risk
Digital Advertisement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383221,7390,0.000104,1.000271
Yes,0.0,2,-0.383117,0.0




Through Recommendations


Unnamed: 0_level_0,mean,count,diff,risk
Through Recommendations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.382918,7388,-0.000199,0.999482
Yes,0.75,4,0.366883,1.957627




Receive More Updates About Our Courses


Unnamed: 0_level_0,mean,count,diff,risk
Receive More Updates About Our Courses,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




Update me on Supply Chain Content


Unnamed: 0_level_0,mean,count,diff,risk
Update me on Supply Chain Content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




Get updates on DM Content


Unnamed: 0_level_0,mean,count,diff,risk
Get updates on DM Content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




I agree to pay the amount through cheque


Unnamed: 0_level_0,mean,count,diff,risk
I agree to pay the amount through cheque,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.383117,7392,0.0,1.0




A free copy of Mastering The Interview


Unnamed: 0_level_0,mean,count,diff,risk
A free copy of Mastering The Interview,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.39578,5071,0.012663,1.033053
Yes,0.35545,2321,-0.027667,0.927785




Last Notable Activity


Unnamed: 0_level_0,mean,count,diff,risk
Last Notable Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Approached upfront,1.0,1,0.616883,2.610169
Email Bounced,0.16,50,-0.223117,0.417627
Email Link Clicked,0.222222,135,-0.160895,0.580038
Email Marked Spam,1.0,2,0.616883,2.610169
Email Opened,0.364362,2256,-0.018755,0.951046
Email Received,1.0,1,0.616883,2.610169
Form Submitted on Website,0.0,1,-0.383117,0.0
Had a Phone Conversation,0.9,10,0.516883,2.349153
Modified,0.23043,2721,-0.152687,0.601461
Olark Chat Conversation,0.156863,153,-0.226254,0.409438






In [43]:
def mutual_info_conversion_score(series):
    return mutual_info_score(series, leads_full_train.Converted)

mi = leads_full_train[categorical_cols].apply(mutual_info_conversion_score)
mi.sort_values(ascending=False)

Prospect ID                                      0.665569
Last Activity                                    0.084629
Last Notable Activity                            0.072517
Lead Source                                      0.061448
Lead Origin                                      0.055599
What is your current occupation                  0.053127
Do Not Email                                     0.008911
Country                                          0.003266
A free copy of Mastering The Interview           0.000745
Through Recommendations                          0.000151
Digital Advertisement                            0.000131
Do Not Call                                      0.000130
Newspaper Article                                0.000130
Newspaper                                        0.000065
What matters most to you in choosing a course    0.000008
Search                                           0.000001
Magazine                                         0.000000
Receive More U

# Train the model

In [44]:
def train(X_train, y_train, C=1.0):
    # Convert DataFrame to a list of dictionaries
    train_dicts = X_train.to_dict(orient='records')  # Include all columns

    # Create a DictVectorizer object
    dv = DictVectorizer(sparse=False)

    # Fit the DictVectorizer object on the training data
    X_train_encoded = dv.fit_transform(train_dicts)

    # Create a LogisticRegression object
    model = LogisticRegression(C=C, max_iter=1000)

    # Fit the LogisticRegression object on the training data
    model.fit(X_train_encoded, y_train)

    return dv, model

In [45]:
dv, model = train(leads_full_train, leads_full_train.Converted.values, C=1.0)

# Evaluate the model

In [46]:
# Use the trained model to predict on the test set
y_pred = model.predict_proba(dv.transform(leads_test.to_dict(orient='records')))[:, 1]  # Include all columns

In [47]:
# Calculate the AUC score
auc = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc:.4f}")

AUC Score: 0.8664
