# Data Preparation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the lead-scoring CSV into the working frame `data`.
data = pd.read_csv(filepath_or_buffer='lead_scoring.csv')

In [5]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Jakarta,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Jakarta,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Jakarta,02.Medium,01.High,15.0,18.0,No,No,Modified


In [6]:
# Preview the last five rows.
data.tail(n=5)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
9235,19d6451e-fcd6-407c-b83b-48e1af805ea9,579564,Landing Page Submission,Direct Traffic,Yes,No,1,8.0,1845,2.67,...,No,Potential Lead,Jakarta,02.Medium,01.High,15.0,17.0,No,No,Email Marked Spam
9236,82a7005b-7196-4d56-95ce-a79f937a158d,579546,Landing Page Submission,Direct Traffic,No,No,0,2.0,238,2.0,...,No,Potential Lead,Jakarta,02.Medium,01.High,14.0,19.0,No,Yes,SMS Sent
9237,aac550fe-a586-452d-8d3c-f1b62c94e02c,579545,Landing Page Submission,Direct Traffic,Yes,No,0,2.0,199,2.0,...,No,Potential Lead,Jakarta,02.Medium,01.High,13.0,20.0,No,Yes,SMS Sent
9238,5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9,579538,Landing Page Submission,Google,No,No,1,3.0,499,3.0,...,No,,Other Metro Cities,02.Medium,02.Medium,15.0,16.0,No,No,SMS Sent
9239,571b5c8e-a5b2-4d57-8574-f2ffb06fdeff,579533,Landing Page Submission,Direct Traffic,No,No,1,6.0,1279,3.0,...,No,Potential Lead,Other Cities,02.Medium,01.High,15.0,18.0,No,Yes,Modified


In [7]:
# Print the column list with non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9204 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9103 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

In [8]:
# Distinct values of the `Converted` column (used as the model target later on).
pd.unique(data['Converted'])

array([0, 1])

In [9]:
# Number of missing values in each column.
data.isna().sum()

Prospect ID                                          0
Lead Number                                          0
Lead Origin                                          0
Lead Source                                         36
Do Not Email                                         0
Do Not Call                                          0
Converted                                            0
TotalVisits                                        137
Total Time Spent on Website                          0
Page Views Per Visit                               137
Last Activity                                      103
Country                                           2461
Specialization                                    1438
How did you hear about Madugital                  2207
What is your current occupation                   2690
What matters most to you in choosing a product    2709
Search                                               0
Magazine                                             0
Newspaper 

In [10]:
# Drop every row that has at least one missing value, then confirm that no
# missing values remain in the reduced frame.
data_dropped = data.dropna(axis=0, how='any')
data_dropped.isna().sum()

Prospect ID                                       0
Lead Number                                       0
Lead Origin                                       0
Lead Source                                       0
Do Not Email                                      0
Do Not Call                                       0
Converted                                         0
TotalVisits                                       0
Total Time Spent on Website                       0
Page Views Per Visit                              0
Last Activity                                     0
Country                                           0
Specialization                                    0
How did you hear about Madugital                  0
What is your current occupation                   0
What matters most to you in choosing a product    0
Search                                            0
Magazine                                          0
Newspaper Article                                 0
Madugital Te

In [11]:
# (rows, columns) of `data_dropped`, i.e. what is left after the dropna() above.
data_dropped.shape

(1943, 37)

In [12]:
# (rows, columns) of the original frame, for comparison with `data_dropped`.
data.shape

(9240, 37)

In [13]:
# Inspect the single record with index label 1.
# The former `(ignore_index:=True)` walrus expression only bound an unrelated
# global and wrapped the displayed Series in a tuple, so it is removed.
data.loc[1]

(Prospect ID                                       2a272436-5132-4136-86fa-dcc88c88f482
 Lead Number                                                                     660728
 Lead Origin                                                                        API
 Lead Source                                                             Organic Search
 Do Not Email                                                                        No
 Do Not Call                                                                         No
 Converted                                                                            0
 TotalVisits                                                                        5.0
 Total Time Spent on Website                                                        674
 Page Views Per Visit                                                               2.5
 Last Activity                                                             Email Opened
 Country                        

In [14]:
# Show the rows that have no exact duplicate anywhere in the frame
# (keep=False excludes every member of a duplicate group).
# The result is only displayed, not assigned, so `data` itself is unchanged.
data[~data.duplicated(keep=False)]

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.00,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.50,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.00,...,No,Potential Lead,Jakarta,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.00,...,No,Select,Jakarta,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.00,...,No,Select,Jakarta,02.Medium,01.High,15.0,18.0,No,No,Modified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,19d6451e-fcd6-407c-b83b-48e1af805ea9,579564,Landing Page Submission,Direct Traffic,Yes,No,1,8.0,1845,2.67,...,No,Potential Lead,Jakarta,02.Medium,01.High,15.0,17.0,No,No,Email Marked Spam
9236,82a7005b-7196-4d56-95ce-a79f937a158d,579546,Landing Page Submission,Direct Traffic,No,No,0,2.0,238,2.00,...,No,Potential Lead,Jakarta,02.Medium,01.High,14.0,19.0,No,Yes,SMS Sent
9237,aac550fe-a586-452d-8d3c-f1b62c94e02c,579545,Landing Page Submission,Direct Traffic,Yes,No,0,2.0,199,2.00,...,No,Potential Lead,Jakarta,02.Medium,01.High,13.0,20.0,No,Yes,SMS Sent
9238,5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9,579538,Landing Page Submission,Google,No,No,1,3.0,499,3.00,...,No,,Other Metro Cities,02.Medium,02.Medium,15.0,16.0,No,No,SMS Sent


In [15]:
# Average of the 'Total Time Spent on Website' column over all leads.
data.loc[:, 'Total Time Spent on Website'].mean()

np.float64(487.6982683982684)

In [16]:
# Mean of the 0/1 `Converted` column, i.e. the overall share of converted leads.
data.loc[:, 'Converted'].mean()

np.float64(0.3853896103896104)

In [17]:
# Percentage of rows with a missing value, per column.
data.isna().mean().mul(100)

Prospect ID                                        0.000000
Lead Number                                        0.000000
Lead Origin                                        0.000000
Lead Source                                        0.389610
Do Not Email                                       0.000000
Do Not Call                                        0.000000
Converted                                          0.000000
TotalVisits                                        1.482684
Total Time Spent on Website                        0.000000
Page Views Per Visit                               1.482684
Last Activity                                      1.114719
Country                                           26.634199
Specialization                                    15.562771
How did you hear about Madugital                  23.885281
What is your current occupation                   29.112554
What matters most to you in choosing a product    29.318182
Search                                  

In [18]:
# Distinct values of the Country column (missing values included).
pd.unique(data["Country"])

array([nan, 'Indonesia', 'Russia', 'Kuwait', 'Oman',
       'United Arab Emirates', 'United States', 'Australia',
       'United Kingdom', 'Bahrain', 'Ghana', 'Singapore', 'Qatar',
       'Saudi Arabia', 'Belgium', 'France', 'Sri Lanka', 'China',
       'Canada', 'Netherlands', 'Sweden', 'Nigeria', 'Hong Kong',
       'Germany', 'Asia/Pacific Region', 'Uganda', 'Kenya', 'Italy',
       'South Africa', 'Tanzania', 'unknown', 'Malaysia', 'Liberia',
       'Switzerland', 'Denmark', 'Philippines', 'Bangladesh', 'Vietnam',
       'India'], dtype=object)

In [19]:
# Fill the missing records in the Country column.
# A new frame is built from `data`, so `data` stays untouched; note that this
# rebinds `data_dropped`, replacing the earlier dropna() result.
data_dropped = data.assign(Country=data['Country'].fillna('unknown'))

In [20]:
# Check which values occur in the "What is your current occupation" column.
data.loc[:, "What is your current occupation"].unique()

array(['Unemployed', 'Student', nan, 'Working Professional',
       'Businessman', 'Other', 'Housewife'], dtype=object)

In [21]:
# Fill the missing records in the "What is your current occupation" column.
occupation_col = 'What is your current occupation'
data_dropped[occupation_col] = data_dropped[occupation_col].fillna('Other')

In [22]:
# Check which values occur in the "How did you hear about Madugital" column.
pd.unique(data["How did you hear about Madugital"])

array(['Select', 'Word Of Mouth', 'Other', nan, 'Online Search',
       'Multiple Sources', 'Advertisements', 'Student of SomeSchool',
       'Email', 'Social Media', 'SMS'], dtype=object)

In [23]:
# Fill the missing records in the "How did you hear about Madugital" column.
heard_about_col = 'How did you hear about Madugital'
data_dropped[heard_about_col] = data_dropped[heard_about_col].fillna('Select')

In [24]:
# Percentage of rows with a missing value, per column, after the fills above.
data_dropped.isna().mean().mul(100)

Prospect ID                                        0.000000
Lead Number                                        0.000000
Lead Origin                                        0.000000
Lead Source                                        0.389610
Do Not Email                                       0.000000
Do Not Call                                        0.000000
Converted                                          0.000000
TotalVisits                                        1.482684
Total Time Spent on Website                        0.000000
Page Views Per Visit                               1.482684
Last Activity                                      1.114719
Country                                            0.000000
Specialization                                    15.562771
How did you hear about Madugital                   0.000000
What is your current occupation                    0.000000
What matters most to you in choosing a product    29.318182
Search                                  

In [25]:
# Cross-tabulate two variables to look at their relationship.
# normalize=True scales every cell by the grand total of the table.
pd.crosstab(
    index=data_dropped['Country'],
    columns=data_dropped['Converted'],
    normalize=True,
)

Converted,0,1
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia/Pacific Region,0.000108,0.000108
Australia,0.001082,0.000325
Bahrain,0.000325,0.000433
Bangladesh,0.000108,0.000108
Belgium,0.000216,0.0
Canada,0.000433,0.0
China,0.000216,0.0
Denmark,0.0,0.000108
France,0.000325,0.000325
Germany,0.000325,0.000108


In [26]:
# Country versus occupation, each cell expressed as a share of all rows.
pd.crosstab(
    index=data_dropped['Country'],
    columns=data_dropped['What is your current occupation'],
    normalize=True,
)

What is your current occupation,Businessman,Housewife,Other,Student,Unemployed,Working Professional
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Asia/Pacific Region,0.0,0.0,0.000216,0.0,0.0,0.0
Australia,0.0,0.0,0.000541,0.0,0.000758,0.000108
Bahrain,0.0,0.0,0.000216,0.0,0.000216,0.000325
Bangladesh,0.0,0.0,0.000108,0.0,0.000108,0.0
Belgium,0.0,0.0,0.000216,0.0,0.0,0.0
Canada,0.0,0.0,0.000216,0.0,0.000216,0.0
China,0.0,0.0,0.000108,0.0,0.000108,0.0
Denmark,0.0,0.0,0.0,0.0,0.0,0.000108
France,0.0,0.0,0.000108,0.000108,0.000433,0.0
Germany,0.0,0.0,0.000216,0.0,0.000216,0.0


In [27]:
# One-hot encode the three categorical columns with pandas; drop_first=True
# omits the first level of each column. The indicator columns are then
# appended to the filled frame.
categorical_cols = [
    'Country',
    'What is your current occupation',
    'How did you hear about Madugital',
]
dummies = pd.get_dummies(data_dropped[categorical_cols], drop_first=True)
final_data = data_dropped.join(dummies)
dummies.head(n=5)

Unnamed: 0,Country_Australia,Country_Bahrain,Country_Bangladesh,Country_Belgium,Country_Canada,Country_China,Country_Denmark,Country_France,Country_Germany,Country_Ghana,...,What is your current occupation_Working Professional,How did you hear about Madugital_Email,How did you hear about Madugital_Multiple Sources,How did you hear about Madugital_Online Search,How did you hear about Madugital_Other,How did you hear about Madugital_SMS,How did you hear about Madugital_Select,How did you hear about Madugital_Social Media,How did you hear about Madugital_Student of SomeSchool,How did you hear about Madugital_Word Of Mouth
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [28]:
# List all columns after joining the dummy columns onto the original ones.
final_data.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'How did you hear about Madugital',
       'What is your current occupation',
       'What matters most to you in choosing a product', 'Search', 'Magazine',
       'Newspaper Article', 'Madugital Telegram', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Products', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity',
       'Country_Australia', 'Count

In [30]:
# Keep only the target column and the one-hot encoded features.
# Selecting the wanted columns (instead of dropping a hard-coded list of the
# original ones) keeps this cell idempotent: re-running it no longer raises a
# KeyError for columns that were already removed. Column order is unchanged:
# 'Converted' first, then the dummy columns in the order of `dummies`.
final_data = final_data.loc[:, ['Converted', *dummies.columns]]

In [31]:
# Hold out 30% of the rows as a test set; random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified on `Converted` — confirm that is intended.
train, test = train_test_split(final_data, test_size=0.3, random_state=0)

In [32]:
# (rows, columns) of the training and test splits.
train.shape, test.shape

((6468, 52), (2772, 52))

In [33]:
# Columns available in the training split.
train.columns

Index(['Converted', 'Country_Australia', 'Country_Bahrain',
       'Country_Bangladesh', 'Country_Belgium', 'Country_Canada',
       'Country_China', 'Country_Denmark', 'Country_France', 'Country_Germany',
       'Country_Ghana', 'Country_Hong Kong', 'Country_India',
       'Country_Indonesia', 'Country_Italy', 'Country_Kenya', 'Country_Kuwait',
       'Country_Liberia', 'Country_Malaysia', 'Country_Netherlands',
       'Country_Nigeria', 'Country_Oman', 'Country_Philippines',
       'Country_Qatar', 'Country_Russia', 'Country_Saudi Arabia',
       'Country_Singapore', 'Country_South Africa', 'Country_Sri Lanka',
       'Country_Sweden', 'Country_Switzerland', 'Country_Tanzania',
       'Country_Uganda', 'Country_United Arab Emirates',
       'Country_United Kingdom', 'Country_United States', 'Country_Vietnam',
       'Country_unknown', 'What is your current occupation_Housewife',
       'What is your current occupation_Other',
       'What is your current occupation_Student',
       '

In [34]:
# Columns available in the test split (expected to match the training split).
test.columns

Index(['Converted', 'Country_Australia', 'Country_Bahrain',
       'Country_Bangladesh', 'Country_Belgium', 'Country_Canada',
       'Country_China', 'Country_Denmark', 'Country_France', 'Country_Germany',
       'Country_Ghana', 'Country_Hong Kong', 'Country_India',
       'Country_Indonesia', 'Country_Italy', 'Country_Kenya', 'Country_Kuwait',
       'Country_Liberia', 'Country_Malaysia', 'Country_Netherlands',
       'Country_Nigeria', 'Country_Oman', 'Country_Philippines',
       'Country_Qatar', 'Country_Russia', 'Country_Saudi Arabia',
       'Country_Singapore', 'Country_South Africa', 'Country_Sri Lanka',
       'Country_Sweden', 'Country_Switzerland', 'Country_Tanzania',
       'Country_Uganda', 'Country_United Arab Emirates',
       'Country_United Kingdom', 'Country_United States', 'Country_Vietnam',
       'Country_unknown', 'What is your current occupation_Housewife',
       'What is your current occupation_Other',
       'What is your current occupation_Student',
       '

In [36]:
# Columns excluded from the feature matrix: the target itself plus the
# "How did you hear about Madugital" dummies. The list is defined once so the
# train and test feature sets cannot drift apart.
non_feature_cols = [
    'Converted',
    'How did you hear about Madugital_Email',
    'How did you hear about Madugital_Multiple Sources',
    'How did you hear about Madugital_Online Search',
    'How did you hear about Madugital_Other',
    'How did you hear about Madugital_SMS',
    'How did you hear about Madugital_Select',
    'How did you hear about Madugital_Social Media',
    'How did you hear about Madugital_Student of SomeSchool',
    'How did you hear about Madugital_Word Of Mouth',
]

X_train = train.drop(columns=non_feature_cols)
y_train = train['Converted']
X_test = test.drop(columns=non_feature_cols)
y_test = test['Converted']

# A fixed random_state makes the fitted tree reproducible: scikit-learn permutes
# the features at every split, so equally good splits can otherwise be chosen
# differently from run to run.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [37]:
# Predict the `Converted` label for every row of the test features and display the array.
predictions = dt.predict(X_test)
predictions

array([0, 1, 1, ..., 0, 0, 1])

In [38]:
# Ground-truth labels of the test split, for comparison with the predictions.
y_test

2212    0
3034    1
4492    0
5063    0
7645    0
       ..
7966    1
9071    0
6208    0
6589    0
818     1
Name: Converted, Length: 2772, dtype: int64

In [39]:
# Score the fitted tree on the held-out test split.
# The metric functions are imported in the notebook's top-level import cell
# rather than mid-notebook, so all dependencies are visible in one place.
y_pred = dt.predict(X_test)
acc = accuracy_score(y_test, y_pred)     # share of correct predictions
prec = precision_score(y_test, y_pred)   # of the predicted conversions, share that were real
recall = recall_score(y_test, y_pred)    # of the real conversions, share that were found
print(acc, prec, recall)

0.7027417027417028 0.7159090909090909 0.40458715596330275
