## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline

In [2]:
df = pd.read_csv('Leads.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
Prospect ID,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,3256f628-e534-4826-9d63-4a8b88782852
Lead Number,660737,660728,660727,660719,660681
Lead Origin,API,API,Landing Page Submission,Landing Page Submission,Landing Page Submission
Lead Source,Olark Chat,Organic Search,Direct Traffic,Direct Traffic,Google
Do Not Email,No,No,No,No,No
Do Not Call,No,No,No,No,No
Converted,0,0,1,0,1
TotalVisits,0.0,5.0,2.0,1.0,2.0
Total Time Spent on Website,0,674,1532,305,1428
Page Views Per Visit,0.0,2.5,2.0,1.0,1.0


In [5]:
df.columns = df.columns.str.lower().str.replace(' ','_')
df.columns

Index(['prospect_id', 'lead_number', 'lead_origin', 'lead_source',
       'do_not_email', 'do_not_call', 'converted', 'totalvisits',
       'total_time_spent_on_website', 'page_views_per_visit', 'last_activity',
       'country', 'specialization', 'how_did_you_hear_about_x_education',
       'what_is_your_current_occupation',
       'what_matters_most_to_you_in_choosing_a_course', 'search', 'magazine',
       'newspaper_article', 'x_education_forums', 'newspaper',
       'digital_advertisement', 'through_recommendations',
       'receive_more_updates_about_our_courses', 'tags', 'lead_quality',
       'update_me_on_supply_chain_content', 'get_updates_on_dm_content',
       'lead_profile', 'city', 'asymmetrique_activity_index',
       'asymmetrique_profile_index', 'asymmetrique_activity_score',
       'asymmetrique_profile_score',
       'i_agree_to_pay_the_amount_through_cheque',
       'a_free_copy_of_mastering_the_interview', 'last_notable_activity'],
      dtype='object')

In [6]:
# Categorical features

categorical = df.dtypes[df.dtypes == 'object'].index
categorical

Index(['prospect_id', 'lead_origin', 'lead_source', 'do_not_email',
       'do_not_call', 'last_activity', 'country', 'specialization',
       'how_did_you_hear_about_x_education', 'what_is_your_current_occupation',
       'what_matters_most_to_you_in_choosing_a_course', 'search', 'magazine',
       'newspaper_article', 'x_education_forums', 'newspaper',
       'digital_advertisement', 'through_recommendations',
       'receive_more_updates_about_our_courses', 'tags', 'lead_quality',
       'update_me_on_supply_chain_content', 'get_updates_on_dm_content',
       'lead_profile', 'city', 'asymmetrique_activity_index',
       'asymmetrique_profile_index',
       'i_agree_to_pay_the_amount_through_cheque',
       'a_free_copy_of_mastering_the_interview', 'last_notable_activity'],
      dtype='object')

In [7]:
# set the contents of the dataframe .str.lower().str.replace(' ','_')

for c in df[categorical]:
    df[c] = df[c].str.lower().str.replace(' ','_')

df.head(10)

Unnamed: 0,prospect_id,lead_number,lead_origin,lead_source,do_not_email,do_not_call,converted,totalvisits,total_time_spent_on_website,page_views_per_visit,...,get_updates_on_dm_content,lead_profile,city,asymmetrique_activity_index,asymmetrique_profile_index,asymmetrique_activity_score,asymmetrique_profile_score,i_agree_to_pay_the_amount_through_cheque,a_free_copy_of_mastering_the_interview,last_notable_activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,api,olark_chat,no,no,0,0.0,0,0.0,...,no,select,select,02.medium,02.medium,15.0,15.0,no,no,modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,api,organic_search,no,no,0,5.0,674,2.5,...,no,select,select,02.medium,02.medium,15.0,15.0,no,no,email_opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,landing_page_submission,direct_traffic,no,no,1,2.0,1532,2.0,...,no,potential_lead,mumbai,02.medium,01.high,14.0,20.0,no,yes,email_opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,landing_page_submission,direct_traffic,no,no,0,1.0,305,1.0,...,no,select,mumbai,02.medium,01.high,13.0,17.0,no,no,modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,landing_page_submission,google,no,no,1,2.0,1428,1.0,...,no,select,mumbai,02.medium,01.high,15.0,18.0,no,no,modified
5,2058ef08-2858-443e-a01f-a9237db2f5ce,660680,api,olark_chat,no,no,0,0.0,0,0.0,...,no,,,01.high,02.medium,17.0,15.0,no,no,modified
6,9fae7df4-169d-489b-afe4-0f3d752542ed,660673,landing_page_submission,google,no,no,1,2.0,1640,2.0,...,no,potential_lead,mumbai,02.medium,01.high,14.0,20.0,no,no,modified
7,20ef72a2-fb3b-45e0-924e-551c5fa59095,660664,api,olark_chat,no,no,0,0.0,0,0.0,...,no,,,02.medium,02.medium,15.0,15.0,no,no,modified
8,cfa0128c-a0da-4656-9d47-0aa4e67bf690,660624,landing_page_submission,direct_traffic,no,no,0,2.0,71,2.0,...,no,,thane_&_outskirts,02.medium,02.medium,14.0,14.0,no,yes,email_opened
9,af465dfc-7204-4130-9e05-33231863c4b5,660616,api,google,no,no,0,4.0,58,4.0,...,no,,mumbai,02.medium,02.medium,13.0,16.0,no,no,email_opened


In [8]:
df.dtypes

prospect_id                                       object
lead_number                                        int64
lead_origin                                       object
lead_source                                       object
do_not_email                                      object
do_not_call                                       object
converted                                          int64
totalvisits                                      float64
total_time_spent_on_website                        int64
page_views_per_visit                             float64
last_activity                                     object
country                                           object
specialization                                    object
how_did_you_hear_about_x_education                object
what_is_your_current_occupation                   object
what_matters_most_to_you_in_choosing_a_course     object
search                                            object
magazine                       

In [9]:
df.head(3).T

Unnamed: 0,0,1,2
prospect_id,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a
lead_number,660737,660728,660727
lead_origin,api,api,landing_page_submission
lead_source,olark_chat,organic_search,direct_traffic
do_not_email,no,no,no
do_not_call,no,no,no
converted,0,0,1
totalvisits,0.0,5.0,2.0
total_time_spent_on_website,0,674,1532
page_views_per_visit,0.0,2.5,2.0


In [10]:
df.isna().sum()

prospect_id                                         0
lead_number                                         0
lead_origin                                         0
lead_source                                        36
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                       137
total_time_spent_on_website                         0
page_views_per_visit                              137
last_activity                                     103
country                                          2461
specialization                                   1438
how_did_you_hear_about_x_education               2207
what_is_your_current_occupation                  2690
what_matters_most_to_you_in_choosing_a_course    2709
search                                              0
magazine                                            0
newspaper_article           

In [11]:
df.lead_quality = df.lead_quality.fillna('unknown')

In [12]:
df.tags.value_counts()

tags
will_revert_after_reading_the_email                  2072
ringing                                              1203
interested_in_other_courses                           513
already_a_student                                     465
closed_by_horizzon                                    358
switched_off                                          240
busy                                                  186
lost_to_eins                                          175
not_doing_further_education                           145
interested__in_full_time_mba                          117
graduation_in_progress                                111
invalid_number                                         83
diploma_holder_(not_eligible)                          63
wrong_number_given                                     47
opp_hangup                                             33
number_not_provided                                    27
in_touch_with_eins                                     12
lost_to_o

In [13]:
df['totalvisits'] = pd.to_numeric(df['totalvisits']).fillna(0)

df['page_views_per_visit'] = pd.to_numeric(df['page_views_per_visit']).fillna(0)

In [14]:
df.isna().sum()

prospect_id                                         0
lead_number                                         0
lead_origin                                         0
lead_source                                        36
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                         0
total_time_spent_on_website                         0
page_views_per_visit                                0
last_activity                                     103
country                                          2461
specialization                                   1438
how_did_you_hear_about_x_education               2207
what_is_your_current_occupation                  2690
what_matters_most_to_you_in_choosing_a_course    2709
search                                              0
magazine                                            0
newspaper_article           

In [15]:
df = df.fillna('unknown')

In [16]:
df

Unnamed: 0,prospect_id,lead_number,lead_origin,lead_source,do_not_email,do_not_call,converted,totalvisits,total_time_spent_on_website,page_views_per_visit,...,get_updates_on_dm_content,lead_profile,city,asymmetrique_activity_index,asymmetrique_profile_index,asymmetrique_activity_score,asymmetrique_profile_score,i_agree_to_pay_the_amount_through_cheque,a_free_copy_of_mastering_the_interview,last_notable_activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,api,olark_chat,no,no,0,0.0,0,0.00,...,no,select,select,02.medium,02.medium,15.0,15.0,no,no,modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,api,organic_search,no,no,0,5.0,674,2.50,...,no,select,select,02.medium,02.medium,15.0,15.0,no,no,email_opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,landing_page_submission,direct_traffic,no,no,1,2.0,1532,2.00,...,no,potential_lead,mumbai,02.medium,01.high,14.0,20.0,no,yes,email_opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,landing_page_submission,direct_traffic,no,no,0,1.0,305,1.00,...,no,select,mumbai,02.medium,01.high,13.0,17.0,no,no,modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,landing_page_submission,google,no,no,1,2.0,1428,1.00,...,no,select,mumbai,02.medium,01.high,15.0,18.0,no,no,modified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,19d6451e-fcd6-407c-b83b-48e1af805ea9,579564,landing_page_submission,direct_traffic,yes,no,1,8.0,1845,2.67,...,no,potential_lead,mumbai,02.medium,01.high,15.0,17.0,no,no,email_marked_spam
9236,82a7005b-7196-4d56-95ce-a79f937a158d,579546,landing_page_submission,direct_traffic,no,no,0,2.0,238,2.00,...,no,potential_lead,mumbai,02.medium,01.high,14.0,19.0,no,yes,sms_sent
9237,aac550fe-a586-452d-8d3c-f1b62c94e02c,579545,landing_page_submission,direct_traffic,yes,no,0,2.0,199,2.00,...,no,potential_lead,mumbai,02.medium,01.high,13.0,20.0,no,yes,sms_sent
9238,5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9,579538,landing_page_submission,google,no,no,1,3.0,499,3.00,...,no,unknown,other_metro_cities,02.medium,02.medium,15.0,16.0,no,no,sms_sent


## Validation Framework

In [17]:
categorical

Index(['prospect_id', 'lead_origin', 'lead_source', 'do_not_email',
       'do_not_call', 'last_activity', 'country', 'specialization',
       'how_did_you_hear_about_x_education', 'what_is_your_current_occupation',
       'what_matters_most_to_you_in_choosing_a_course', 'search', 'magazine',
       'newspaper_article', 'x_education_forums', 'newspaper',
       'digital_advertisement', 'through_recommendations',
       'receive_more_updates_about_our_courses', 'tags', 'lead_quality',
       'update_me_on_supply_chain_content', 'get_updates_on_dm_content',
       'lead_profile', 'city', 'asymmetrique_activity_index',
       'asymmetrique_profile_index',
       'i_agree_to_pay_the_amount_through_cheque',
       'a_free_copy_of_mastering_the_interview', 'last_notable_activity'],
      dtype='object')

Splitting Training dataset

In [18]:
from sklearn.model_selection import train_test_split 

In [19]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=1)
df_train,df_val = train_test_split(df,test_size=0.25,random_state=1)

In [20]:
df_full_train.shape

(7392, 37)

In [21]:
df_train.shape

(6930, 37)

In [22]:
df_val.shape

(2310, 37)

## EDA

In [23]:
df.converted.value_counts(normalize=True)

converted
0    0.61461
1    0.38539
Name: proportion, dtype: float64

In [24]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

df_train.drop('converted',axis=1,inplace=True)
df_val.drop('converted',axis=1,inplace=True)
df_test.drop('converted',axis=1,inplace=True)

In [25]:
df_train.columns

Index(['prospect_id', 'lead_number', 'lead_origin', 'lead_source',
       'do_not_email', 'do_not_call', 'totalvisits',
       'total_time_spent_on_website', 'page_views_per_visit', 'last_activity',
       'country', 'specialization', 'how_did_you_hear_about_x_education',
       'what_is_your_current_occupation',
       'what_matters_most_to_you_in_choosing_a_course', 'search', 'magazine',
       'newspaper_article', 'x_education_forums', 'newspaper',
       'digital_advertisement', 'through_recommendations',
       'receive_more_updates_about_our_courses', 'tags', 'lead_quality',
       'update_me_on_supply_chain_content', 'get_updates_on_dm_content',
       'lead_profile', 'city', 'asymmetrique_activity_index',
       'asymmetrique_profile_index', 'asymmetrique_activity_score',
       'asymmetrique_profile_score',
       'i_agree_to_pay_the_amount_through_cheque',
       'a_free_copy_of_mastering_the_interview', 'last_notable_activity'],
      dtype='object')

In [26]:
numerical = list(df_train.dtypes[df_train.dtypes == 'int'].index)
numerical

['lead_number', 'total_time_spent_on_website']

## Feature Importance

In [32]:
from IPython.display import display

In [33]:
for col in categorical:
    df_group = df_full_train.groupby(by=col).converted.agg(['mean'])
    df_group['diff'] = df_group['mean'] - df_full_train.converted.mean()
    df_group['risk'] = df_group['mean'] / df_full_train.converted.mean()
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
prospect_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000104b9-23e4-4ddc-8caa-8629fe8ad7f4,1.0,0.616883,2.610169
0006d10a-eb01-4ba9-92e2-ad78588b2a40,0.0,-0.383117,0.000000
0011be30-fa97-465b-8e44-0ae83dff7eed,0.0,-0.383117,0.000000
0011f23e-9fd9-4256-b316-efc2e2639b0d,0.0,-0.383117,0.000000
001e6e14-2183-47ab-a405-108e44bc2e66,1.0,0.616883,2.610169
...,...,...,...
ffd99338-2e6b-4c3f-8650-68b94ea5e07f,0.0,-0.383117,0.000000
ffec8e24-0c99-4345-89f1-e3ad6689764f,1.0,0.616883,2.610169
fff076a3-fe95-4c79-9401-e15846be8086,0.0,-0.383117,0.000000
fff49ad0-6015-448c-a7cc-f454c39ffdda,0.0,-0.383117,0.000000


Unnamed: 0_level_0,mean,diff,risk
lead_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
api,0.308414,-0.074702,0.805014
landing_page_submission,0.360677,-0.02244,0.941427
lead_add_form,0.919861,0.536744,2.400992
lead_import,0.225,-0.158117,0.587288
quick_add_form,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,risk
lead_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bing,0.166667,-0.21645,0.435028
blog,0.0,-0.383117,0.0
click2call,0.75,0.366883,1.957627
direct_traffic,0.317624,-0.065493,0.829052
facebook,0.225,-0.158117,0.587288
google,0.4,0.016883,1.044068
live_chat,1.0,0.616883,2.610169
nc_edm,1.0,0.616883,2.610169
olark_chat,0.251768,-0.131349,0.657157
organic_search,0.37432,-0.008797,0.977038


Unnamed: 0_level_0,mean,diff,risk
do_not_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.401265,0.018148,1.047368
yes,0.174281,-0.208836,0.454903


Unnamed: 0_level_0,mean,diff,risk
do_not_call,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383033,-8.3e-05,0.999782
yes,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,risk
last_activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
approached_upfront,1.0,0.616883,2.610169
converted_to_lead,0.126437,-0.25668,0.330021
email_bounced,0.092308,-0.290809,0.240939
email_link_clicked,0.239437,-0.14368,0.62497
email_marked_spam,1.0,0.616883,2.610169
email_opened,0.363137,-0.01998,0.947848
email_received,1.0,0.616883,2.610169
form_submitted_on_website,0.255102,-0.128015,0.66586
had_a_phone_conversation,0.727273,0.344156,1.898305
olark_chat_conversation,0.086185,-0.296932,0.224958


Unnamed: 0_level_0,mean,diff,risk
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asia/pacific_region,0.5,0.116883,1.305085
australia,0.222222,-0.160895,0.580038
bahrain,0.666667,0.28355,1.740113
bangladesh,0.5,0.116883,1.305085
belgium,0.0,-0.383117,0.0
canada,0.0,-0.383117,0.0
china,0.0,-0.383117,0.0
denmark,1.0,0.616883,2.610169
france,0.6,0.216883,1.566102
germany,0.25,-0.133117,0.652542


Unnamed: 0_level_0,mean,diff,risk
specialization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"banking,_investment_and_insurance",0.458333,0.075216,1.196328
business_administration,0.457478,0.074361,1.194095
e-business,0.408163,0.025046,1.065375
e-commerce,0.356322,-0.026795,0.93006
finance_management,0.453865,0.070748,1.184665
healthcare_management,0.504065,0.120948,1.315695
hospitality_management,0.363636,-0.019481,0.949153
human_resource_management,0.455474,0.072358,1.188866
international_business,0.361702,-0.021415,0.944104
it_projects_management,0.375427,-0.00769,0.979927


Unnamed: 0_level_0,mean,diff,risk
how_did_you_hear_about_x_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
advertisements,0.45098,0.067864,1.177135
email,0.421053,0.037936,1.099019
multiple_sources,0.349593,-0.033523,0.912498
online_search,0.425868,0.042751,1.111586
other,0.412903,0.029786,1.077747
select,0.474021,0.090904,1.237276
sms,0.2,-0.183117,0.522034
social_media,0.428571,0.045455,1.118644
student_of_someschool,0.47541,0.092293,1.2409
unknown,0.132151,-0.250966,0.344937


Unnamed: 0_level_0,mean,diff,risk
what_is_your_current_occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
businessman,0.5,0.116883,1.305085
housewife,1.0,0.616883,2.610169
other,0.615385,0.232268,1.606258
student,0.359281,-0.023835,0.937785
unemployed,0.429491,0.046375,1.121045
unknown,0.143326,-0.239791,0.374104
working_professional,0.916071,0.532955,2.391102


Unnamed: 0_level_0,mean,diff,risk
what_matters_most_to_you_in_choosing_a_course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
better_career_prospects,0.482146,0.099029,1.258483
flexibility_&_convenience,0.5,0.116883,1.305085
unknown,0.142127,-0.24099,0.370976


Unnamed: 0_level_0,mean,diff,risk
search,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383146,2.9e-05,1.000076
yes,0.363636,-0.019481,0.949153


Unnamed: 0_level_0,mean,diff,risk
magazine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
newspaper_article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383033,-8.3e-05,0.999782
yes,1.0,0.616883,2.610169


Unnamed: 0_level_0,mean,diff,risk
x_education_forums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383169,5.2e-05,1.000135
yes,0.0,-0.383117,0.0


Unnamed: 0_level_0,mean,diff,risk
digital_advertisement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383221,0.000104,1.000271
yes,0.0,-0.383117,0.0


Unnamed: 0_level_0,mean,diff,risk
through_recommendations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.382918,-0.000199,0.999482
yes,0.75,0.366883,1.957627


Unnamed: 0_level_0,mean,diff,risk
receive_more_updates_about_our_courses,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
already_a_student,0.007752,-0.375365,0.020234
busy,0.567742,0.184625,1.481903
closed_by_horizzon,0.996454,0.613337,2.600914
diploma_holder_(not_eligible),0.020833,-0.362284,0.054379
graduation_in_progress,0.06383,-0.319287,0.166607
in_confusion_whether_part_time_or_dlp,0.25,-0.133117,0.652542
in_touch_with_eins,0.272727,-0.11039,0.711864
interested__in_full_time_mba,0.010989,-0.372128,0.028683
interested_in_next_batch,1.0,0.616883,2.610169
interested_in_other_courses,0.02381,-0.359307,0.062147


Unnamed: 0_level_0,mean,diff,risk
lead_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high_in_relevance,0.948718,0.565601,2.476315
low_in_relevance,0.803063,0.419947,2.096132
might_be,0.753392,0.370275,1.96648
not_sure,0.242597,-0.14052,0.633219
unknown,0.214605,-0.168512,0.560155
worst,0.020408,-0.362709,0.053269


Unnamed: 0_level_0,mean,diff,risk
update_me_on_supply_chain_content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
get_updates_on_dm_content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
lead_profile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dual_specialization_student,1.0,0.616883,2.610169
lateral_student,0.952381,0.569264,2.485876
other_leads,0.351621,-0.031496,0.91779
potential_lead,0.781591,0.398474,2.040086
select,0.404403,0.021286,1.05556
student_of_someschool,0.024752,-0.358364,0.064608
unknown,0.142127,-0.24099,0.370976


Unnamed: 0_level_0,mean,diff,risk
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mumbai,0.401318,0.018201,1.047509
other_cities,0.408015,0.024898,1.064987
other_cities_of_maharashtra,0.437326,0.054209,1.141495
other_metro_cities,0.401961,0.018844,1.049186
select,0.485479,0.102363,1.267184
thane_&_outskirts,0.447412,0.064295,1.167822
tier_ii_cities,0.339286,-0.043831,0.885593
unknown,0.107239,-0.275878,0.279911


Unnamed: 0_level_0,mean,diff,risk
asymmetrique_activity_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.high,0.313522,-0.069595,0.818344
02.medium,0.428008,0.044891,1.117173
03.low,0.092715,-0.290402,0.242002
unknown,0.382519,-0.000598,0.998438


Unnamed: 0_level_0,mean,diff,risk
asymmetrique_profile_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.high,0.482525,0.099408,1.259473
02.medium,0.304191,-0.078926,0.79399
03.low,0.416667,0.03355,1.087571
unknown,0.382519,-0.000598,0.998438


Unnamed: 0_level_0,mean,diff,risk
i_agree_to_pay_the_amount_through_cheque,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.383117,0.0,1.0


Unnamed: 0_level_0,mean,diff,risk
a_free_copy_of_mastering_the_interview,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.39578,0.012663,1.033053
yes,0.35545,-0.027667,0.927785


Unnamed: 0_level_0,mean,diff,risk
last_notable_activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
approached_upfront,1.0,0.616883,2.610169
email_bounced,0.16,-0.223117,0.417627
email_link_clicked,0.222222,-0.160895,0.580038
email_marked_spam,1.0,0.616883,2.610169
email_opened,0.364362,-0.018755,0.951046
email_received,1.0,0.616883,2.610169
form_submitted_on_website,0.0,-0.383117,0.0
had_a_phone_conversation,0.9,0.516883,2.349153
modified,0.23043,-0.152687,0.601461
olark_chat_conversation,0.156863,-0.226254,0.409438


## One hot encoding

In [34]:
from sklearn.feature_extraction import DictVectorizer

In [38]:
categorical = list(categorical)

In [40]:
train_dict = df_train[categorical+numerical].to_dict(orient='records')
train_dict[0]

{'prospect_id': 'ac3fb028-26b5-42b8-991e-e380733f05ce',
 'lead_origin': 'landing_page_submission',
 'lead_source': 'direct_traffic',
 'do_not_email': 'no',
 'do_not_call': 'no',
 'last_activity': 'sms_sent',
 'country': 'india',
 'specialization': 'business_administration',
 'how_did_you_hear_about_x_education': 'select',
 'what_is_your_current_occupation': 'unemployed',
 'what_matters_most_to_you_in_choosing_a_course': 'better_career_prospects',
 'search': 'no',
 'magazine': 'no',
 'newspaper_article': 'no',
 'x_education_forums': 'no',
 'newspaper': 'no',
 'digital_advertisement': 'no',
 'through_recommendations': 'no',
 'receive_more_updates_about_our_courses': 'no',
 'tags': 'will_revert_after_reading_the_email',
 'lead_quality': 'high_in_relevance',
 'update_me_on_supply_chain_content': 'no',
 'get_updates_on_dm_content': 'no',
 'lead_profile': 'potential_lead',
 'city': 'mumbai',
 'asymmetrique_activity_index': 'unknown',
 'asymmetrique_profile_index': 'unknown',
 'i_agree_to_pay

In [None]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)