In [202]:
# Supress unnecessary warnings

import warnings
warnings.filterwarnings('ignore')

In [203]:
# Import the NumPy and Pandas packages

import numpy as np
import pandas as pd

In [204]:
# Read the dataset

# Save it into a dataframe called "Leads"
leads = pd.read_csv("Leads.csv")

In [205]:
# Look at the first few entries

# head
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [206]:
# Inspect the shape of the dataset

leads.shape

(9240, 37)

In [207]:
# Inspect the different column in the dataset

leads.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'How did you hear about X Education',
       'What is your current occupation',
       'What matters most to you in choosing a course', 'Search', 'Magazine',
       'Newspaper Article', 'X Education Forums', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Courses', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [208]:
# Check the summary of the dataset

leads.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [209]:
# Check the info to see the types of the feature variables and the null values present

leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

Looks like there are quite a few categorical variables present in this dataset for which we will 
need to create dummy variables. Also, there are a lot of null values present as well, 
so we will need to treat them accordingly.

## Data Cleaning and Preparation

In [210]:
# Check the number of missing values in each column

leads.isnull().sum().sort_values(ascending=False)

Lead Quality                                     4767
Asymmetrique Activity Index                      4218
Asymmetrique Profile Score                       4218
Asymmetrique Activity Score                      4218
Asymmetrique Profile Index                       4218
Tags                                             3353
Lead Profile                                     2709
What matters most to you in choosing a course    2709
What is your current occupation                  2690
Country                                          2461
How did you hear about X Education               2207
Specialization                                   1438
City                                             1420
Page Views Per Visit                              137
TotalVisits                                       137
Last Activity                                     103
Lead Source                                        36
Receive More Updates About Our Courses              0
I agree to pay the amount th

As you can see there are a lot of columns which have high number of missing values. Clearly, 
these columns are not useful. Since, there are 9000 datapoints in our dataframe, let's 
eliminate the columns having greater than 3000 missing values as they are of no use to us.

In [211]:
# Drop all the columns in which greater than 3000 missing values are present

for col in leads.columns:
    if leads[col].isnull().sum() > 3000:
        leads.drop(col, axis=1, inplace=True)

In [212]:
# Check the number of null values again

leads.isnull().sum().sort_values(ascending=False)

What matters most to you in choosing a course    2709
Lead Profile                                     2709
What is your current occupation                  2690
Country                                          2461
How did you hear about X Education               2207
Specialization                                   1438
City                                             1420
Page Views Per Visit                              137
TotalVisits                                       137
Last Activity                                     103
Lead Source                                        36
Get updates on DM Content                           0
Newspaper                                           0
I agree to pay the amount through cheque            0
A free copy of Mastering The Interview              0
Update me on Supply Chain Content                   0
Receive More Updates About Our Courses              0
Through Recommendations                             0
Digital Advertisement       

As we might be able to interpret, the variable City won't be of any use in our analysis. 
So it's best that we drop it.

In [213]:
# drop City
leads.drop('City', axis=1, inplace=True)

In [214]:
# Same goes for the variable 'Country'

# drop Country
leads.drop('Country', axis=1, inplace=True)

In [215]:
# Let's now check the percentage of missing values in each column
# (60/200) * 100 = 30.0

round(((leads.isnull().sum()/len(leads.index))*100), 2).sort_values(ascending=False)

What matters most to you in choosing a course    29.32
Lead Profile                                     29.32
What is your current occupation                  29.11
How did you hear about X Education               23.89
Specialization                                   15.56
TotalVisits                                       1.48
Page Views Per Visit                              1.48
Last Activity                                     1.11
Lead Source                                       0.39
Get updates on DM Content                         0.00
Update me on Supply Chain Content                 0.00
X Education Forums                                0.00
I agree to pay the amount through cheque          0.00
A free copy of Mastering The Interview            0.00
Receive More Updates About Our Courses            0.00
Through Recommendations                           0.00
Digital Advertisement                             0.00
Newspaper                                         0.00
Prospect I

In [216]:
# Check the number of null values again

leads.isnull().sum().sort_values(ascending=False)

What matters most to you in choosing a course    2709
Lead Profile                                     2709
What is your current occupation                  2690
How did you hear about X Education               2207
Specialization                                   1438
TotalVisits                                       137
Page Views Per Visit                              137
Last Activity                                     103
Lead Source                                        36
Get updates on DM Content                           0
Update me on Supply Chain Content                   0
X Education Forums                                  0
I agree to pay the amount through cheque            0
A free copy of Mastering The Interview              0
Receive More Updates About Our Courses              0
Through Recommendations                             0
Digital Advertisement                               0
Newspaper                                           0
Prospect ID                 

Now recall that there are a few columns in which there is a level called 'Select' which basically 
means that the student had not selected the option for that particular column which is why it 
shows 'Select'. These values are as good as missing values and hence we need to identify the value 
counts of the level 'Select' in all the columns that 
it is present.

In [217]:
# Get the value counts of all the columns

for col in leads.columns:
    print(leads[col].astype('category').value_counts())
    print('-'*50)

Prospect ID
000104b9-23e4-4ddc-8caa-8629fe8ad7f4    1
a7a319ea-b6ae-4c6b-afc5-183b933d10b5    1
aa27a0af-eeab-4007-a770-fa8a93fa53c8    1
aa30ebb2-8476-41ce-9258-37cc025110d3    1
aa405742-17ac-4c65-b19e-ab91c241cc53    1
                                       ..
539eb309-df36-4a89-ac58-6d3651393910    1
539ffa32-1be7-4fe1-b04c-faf1bab763cf    1
53aabd84-5dcc-4299-bbe3-62f3764b07b1    1
53ac14bd-2bb2-4315-a21c-94562d1b6b2d    1
fffb0e5e-9f92-4017-9f42-781a69da4154    1
Name: count, Length: 9240, dtype: int64
--------------------------------------------------
Lead Number
579533    1
629593    1
630390    1
630403    1
630405    1
         ..
602534    1
602540    1
602557    1
602561    1
660737    1
Name: count, Length: 9240, dtype: int64
--------------------------------------------------
Lead Origin
Landing Page Submission    4886
API                        3580
Lead Add Form               718
Lead Import                  55
Quick Add Form                1
Name: count, dtype: int64
--

The following three columns now have the level 'Select'. Let's check them once again.

In [218]:
leads.value_counts("Lead Profile")

Lead Profile
Select                         4146
Potential Lead                 1613
Other Leads                     487
Student of SomeSchool           241
Lateral Student                  24
Dual Specialization Student      20
Name: count, dtype: int64

In [219]:
leads.value_counts("How did you hear about X Education")

How did you hear about X Education
Select                   5043
Online Search             808
Word Of Mouth             348
Student of SomeSchool     310
Other                     186
Multiple Sources          152
Advertisements             70
Social Media               67
Email                      26
SMS                        23
Name: count, dtype: int64

In [220]:
leads.value_counts("Specialization")

Specialization
Select                               1942
Finance Management                    976
Human Resource Management             848
Marketing Management                  838
Operations Management                 503
Business Administration               403
IT Projects Management                366
Supply Chain Management               349
Banking, Investment And Insurance     338
Media and Advertising                 203
Travel and Tourism                    203
International Business                178
Healthcare Management                 159
Hospitality Management                114
E-COMMERCE                            112
Retail Management                     100
Rural and Agribusiness                 73
E-Business                             57
Services Excellence                    40
Name: count, dtype: int64

Clearly the levels Lead Profile and How did you hear about X Education have a lot of rows which have the value Select which is of no use to the analysis so it's best that we drop them.

In [221]:
# drop Lead Profile and How did you hear about X Education cols
leads.drop(['Lead Profile', 'How did you hear about X Education'], axis=1, inplace=True)

Also notice that when we got the value counts of all the columns, there were a few columns in which only one value was majorly present for all the data points. These include Do Not Call, Search, Magazine, Newspaper Article, X Education Forums, Newspaper, Digital Advertisement, Through Recommendations, Receive More Updates About Our Courses, Update me on Supply Chain Content, Get updates on DM Content, I agree to pay the amount through cheque. Since practically all of the values for these variables are No, it's best that we drop these columns as they won't help with our analysis.

In [222]:
# drop all of the above mentioned columns

columns_to_drop = [
    'Do Not Call', 'Search', 'Magazine', 'Newspaper Article', 
    'X Education Forums', 'Newspaper', 'Digital Advertisement', 
    'Through Recommendations', 'Receive More Updates About Our Courses', 
    'Update me on Supply Chain Content', 'Get updates on DM Content', 
    'I agree to pay the amount through cheque'
]

# Drop the columns if they exist in the DataFrame
leads.drop(columns=[col for col in columns_to_drop if col in leads.columns], inplace=True)


Also, the variable What matters most to you in choosing a course has the level Better Career Prospects 6528 times while the other two levels appear once twice and once respectively. So we should drop this column as well.

In [223]:
leads.value_counts("What matters most to you in choosing a course")

What matters most to you in choosing a course
Better Career Prospects      6528
Flexibility & Convenience       2
Other                           1
Name: count, dtype: int64

In [224]:
# Drop the null value rows present in the variable 'What matters most to you in choosing a course'
leads = leads.dropna(subset=['What matters most to you in choosing a course'])

## Display the updated datafram
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,What matters most to you in choosing a course,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,0,0.0,0,0.0,Page Visited on Website,Select,Unemployed,Better Career Prospects,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,0,5.0,674,2.5,Email Opened,Select,Unemployed,Better Career Prospects,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,1,2.0,1532,2.0,Email Opened,Business Administration,Student,Better Career Prospects,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,0,1.0,305,1.0,Unreachable,Media and Advertising,Unemployed,Better Career Prospects,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,1,2.0,1428,1.0,Converted to Lead,Select,Unemployed,Better Career Prospects,No,Modified


In [225]:
# Check the number of null values again

leads.isnull().sum().sort_values(ascending=False)

TotalVisits                                      130
Page Views Per Visit                             130
Last Activity                                    103
Lead Source                                       36
Prospect ID                                        0
Lead Number                                        0
Lead Origin                                        0
Do Not Email                                       0
Converted                                          0
Total Time Spent on Website                        0
Specialization                                     0
What is your current occupation                    0
What matters most to you in choosing a course      0
A free copy of Mastering The Interview             0
Last Notable Activity                              0
dtype: int64

Now, there's the column What is your current occupation which has a lot of null values. Now you can drop the entire row but since we have already lost so many feature variables, we choose not to drop it as it might turn out to be significant in the analysis. So let's just drop the null rows for the column What is you current occupation.

In [226]:
leads = leads[~pd.isnull(leads['What matters most to you in choosing a course'])]

In [227]:
# Check the null values again

leads.isnull().sum().sort_values(ascending=False)

TotalVisits                                      130
Page Views Per Visit                             130
Last Activity                                    103
Lead Source                                       36
Prospect ID                                        0
Lead Number                                        0
Lead Origin                                        0
Do Not Email                                       0
Converted                                          0
Total Time Spent on Website                        0
Specialization                                     0
What is your current occupation                    0
What matters most to you in choosing a course      0
A free copy of Mastering The Interview             0
Last Notable Activity                              0
dtype: int64

Since now the number of null values present in the columns are quite small we can simply drop the rows in which these null values are present.

In [228]:
# Drop the null value rows in the column 'TotalVisits'

leads = leads[~pd.isnull(leads['TotalVisits'])]

In [229]:
# Check the null values again

leads.isnull().sum().sort_values(ascending=False)

Lead Source                                      29
Prospect ID                                       0
Lead Number                                       0
Lead Origin                                       0
Do Not Email                                      0
Converted                                         0
TotalVisits                                       0
Total Time Spent on Website                       0
Page Views Per Visit                              0
Last Activity                                     0
Specialization                                    0
What is your current occupation                   0
What matters most to you in choosing a course     0
A free copy of Mastering The Interview            0
Last Notable Activity                             0
dtype: int64

In [230]:
# Drop the null values rows in the column 'Lead Source'

leads = leads[~pd.isnull(leads['Lead Source'])]

In [231]:
# Check the number of null values again

leads.isnull().sum().sort_values(ascending=False)

Prospect ID                                      0
Lead Number                                      0
Lead Origin                                      0
Lead Source                                      0
Do Not Email                                     0
Converted                                        0
TotalVisits                                      0
Total Time Spent on Website                      0
Page Views Per Visit                             0
Last Activity                                    0
Specialization                                   0
What is your current occupation                  0
What matters most to you in choosing a course    0
A free copy of Mastering The Interview           0
Last Notable Activity                            0
dtype: int64

In [232]:
# Drop the null values rows in the column 'Specialization'

leads = leads[~pd.isnull(leads['Specialization'])]

In [233]:
# Check the number of null values again

leads.isnull().sum().sort_values(ascending=False)

Prospect ID                                      0
Lead Number                                      0
Lead Origin                                      0
Lead Source                                      0
Do Not Email                                     0
Converted                                        0
TotalVisits                                      0
Total Time Spent on Website                      0
Page Views Per Visit                             0
Last Activity                                    0
Specialization                                   0
What is your current occupation                  0
What matters most to you in choosing a course    0
A free copy of Mastering The Interview           0
Last Notable Activity                            0
dtype: int64

Now our data doesn't have any null values. Let's now check the percentage of rows that we have retained.

In [234]:
print(len(leads.index))
print(len(leads.index)/9240)

6372
0.6896103896103896


We still have around 69% of the rows which seems good enough.

In [235]:
# Let's look at the dataset again

leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,What matters most to you in choosing a course,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,0,0.0,0,0.0,Page Visited on Website,Select,Unemployed,Better Career Prospects,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,0,5.0,674,2.5,Email Opened,Select,Unemployed,Better Career Prospects,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,1,2.0,1532,2.0,Email Opened,Business Administration,Student,Better Career Prospects,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,0,1.0,305,1.0,Unreachable,Media and Advertising,Unemployed,Better Career Prospects,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,1,2.0,1428,1.0,Converted to Lead,Select,Unemployed,Better Career Prospects,No,Modified


Now, clearly the variables Prospect ID and Lead Number won't be of any use in the analysis, so it's best that we drop these two variables.

In [236]:
# Now, clearly the variables `Prospect ID` and `Lead Number` won't be of any use in the analysis, 
# so it's best that we drop these two variables.
leads.drop(['Prospect ID', 'Lead Number'], axis=1, inplace=True)

In [237]:
leads.head()

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,What matters most to you in choosing a course,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,No,0,0.0,0,0.0,Page Visited on Website,Select,Unemployed,Better Career Prospects,No,Modified
1,API,Organic Search,No,0,5.0,674,2.5,Email Opened,Select,Unemployed,Better Career Prospects,No,Email Opened
2,Landing Page Submission,Direct Traffic,No,1,2.0,1532,2.0,Email Opened,Business Administration,Student,Better Career Prospects,Yes,Email Opened
3,Landing Page Submission,Direct Traffic,No,0,1.0,305,1.0,Unreachable,Media and Advertising,Unemployed,Better Career Prospects,No,Modified
4,Landing Page Submission,Google,No,1,2.0,1428,1.0,Converted to Lead,Select,Unemployed,Better Career Prospects,No,Modified


In [238]:
# Specialization => {Select, MBA, BE}
# Specialization_Select, Specialization_MBA, Specialization_BE
# Drop Specialization_Select

### Dummy Variable Creation

The next step is to deal with the categorical variables present in the dataset. So first take a look at which variables are actually categorical variables.

In [239]:
# Check the columns which are of type 'object'

temp = leads.select_dtypes(include=['object'])
temp.columns

Index(['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
       'Specialization', 'What is your current occupation',
       'What matters most to you in choosing a course',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [240]:
# Create dummy variables using the 'get_dummies' command
dummy = pd.get_dummies(leads[['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                              'What is your current occupation','A free copy of Mastering The Interview', 
                              'Last Notable Activity']], drop_first=True)

# Add the results to the master dataframe
leads = pd.concat([leads, dummy], axis=1)

In [241]:
# Drop the variables for which the dummy variables have been created

leads = leads.drop(['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                   'Specialization', 'What is your current occupation',
                   'A free copy of Mastering The Interview', 'Last Notable Activity'], axis=1)

In [242]:
# Let's take a look at the dataset again

leads.head()

Unnamed: 0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,What matters most to you in choosing a course,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,...,Last Notable Activity_Email Opened,Last Notable Activity_Email Received,Last Notable Activity_Had a Phone Conversation,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked
0,0,0.0,0,0.0,Better Career Prospects,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,0,5.0,674,2.5,Better Career Prospects,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,1,2.0,1532,2.0,Better Career Prospects,True,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
3,0,1.0,305,1.0,Better Career Prospects,True,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,1,2.0,1428,1.0,Better Career Prospects,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


### Test-Train Split
The next step is to split the dataset into training an testing sets.

In [243]:
# Import the required library

from sklearn.model_selection import train_test_split

In [244]:
# Put all the feature variables in X

X = leads.drop(['Converted'], axis=1)
X.head()

Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,What matters most to you in choosing a course,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,...,Last Notable Activity_Email Opened,Last Notable Activity_Email Received,Last Notable Activity_Had a Phone Conversation,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked
0,0.0,0,0.0,Better Career Prospects,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,5.0,674,2.5,Better Career Prospects,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,2.0,1532,2.0,Better Career Prospects,True,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
3,1.0,305,1.0,Better Career Prospects,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
4,2.0,1428,1.0,Better Career Prospects,True,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [245]:
# Put the target variable in y

y = leads['Converted']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Converted, dtype: int64

In [246]:
# Split the dataset into 70% train and 30% test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

### Scaling

Now there are a few numeric variables present in the dataset which have different scales. So let's go ahead and scale these variables.

In [247]:
# Import MinMax scaler

from sklearn.preprocessing import MinMaxScaler

In [248]:
# Scale the three numeric features present in the dataset - 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit'


scaler = MinMaxScaler()
X_train[['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']] = scaler.fit_transform(X_train[['TotalVisits', 
                                            'Total Time Spent on Website', 'Page Views Per Visit']])
X_train.head()


Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,What matters most to you in choosing a course,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,...,Last Notable Activity_Email Opened,Last Notable Activity_Email Received,Last Notable Activity_Had a Phone Conversation,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked
4170,0.015936,0.408891,0.25,Better Career Prospects,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
6812,0.063745,0.539613,0.125,Better Career Prospects,True,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
7717,0.0,0.0,0.0,Better Career Prospects,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
207,0.01992,0.15757,0.3125,Better Career Prospects,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2044,0.007968,0.163292,0.125,Better Career Prospects,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


### Looking at the correlations

Let's now look at the correlations. Since the number of variables are pretty high, it's better that we look at the table instead of plotting a heatmap

In [249]:
# Select only the numeric columns
numeric_cols = leads.select_dtypes(include=['float64', 'int64']).columns

# Calculate the correlation matrix for numeric columns only
correlation_matrix = leads[numeric_cols].corr()

# Display the correlation matrix
print(correlation_matrix)

                             Converted  TotalVisits  \
Converted                     1.000000     0.009349   
TotalVisits                   0.009349     1.000000   
Total Time Spent on Website   0.313277     0.212987   
Page Views Per Visit         -0.063002     0.499610   

                             Total Time Spent on Website  Page Views Per Visit  
Converted                                       0.313277             -0.063002  
TotalVisits                                     0.212987              0.499610  
Total Time Spent on Website                     1.000000              0.304302  
Page Views Per Visit                            0.304302              1.000000  


In [253]:
# Looking at the correlation table

leads.corr()

ValueError: could not convert string to float: 'Better Career Prospects'

## Step 2: Model Building

Let's now move to model building. As you can see that there are a lot of variables present in the dataset which we cannot deal with. So the best way to approach this is to select a small set of features from this pool of variables using RFE.

In [190]:
# Import 'LogisticRegression' and create a LogisticRegression object

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [192]:
# Drop non-numeric columns
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])

# Now fit RFE
rfe = RFE(estimator=logreg, n_features_to_select=15)
rfe.fit(X_train_numeric, y_train)

In [193]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoder = LabelEncoder()
X_train['What matters most to you in choosing a course'] = label_encoder.fit_transform(X_train['What matters most to you in choosing a course'])

# Now fit RFE
rfe = RFE(estimator=logreg, n_features_to_select=15)
rfe.fit(X_train, y_train)

In [194]:
# Let's take a look at which features have been selected by RFE

list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('TotalVisits', True, 1),
 ('Total Time Spent on Website', True, 1),
 ('Page Views Per Visit', False, 6),
 ('What matters most to you in choosing a course', False, 14),
 ('Lead Origin_Landing Page Submission', False, 33),
 ('Lead Origin_Lead Add Form', True, 1),
 ('Lead Origin_Lead Import', False, 24),
 ('Lead Source_Direct Traffic', False, 8),
 ('Lead Source_Facebook', False, 25),
 ('Lead Source_Google', False, 17),
 ('Lead Source_Live Chat', False, 32),
 ('Lead Source_Olark Chat', True, 1),
 ('Lead Source_Organic Search', False, 16),
 ('Lead Source_Pay per Click Ads', False, 27),
 ('Lead Source_Press_Release', False, 39),
 ('Lead Source_Reference', True, 1),
 ('Lead Source_Referral Sites', False, 18),
 ('Lead Source_Social Media', False, 35),
 ('Lead Source_WeLearn', False, 30),
 ('Lead Source_Welingak Website', True, 1),
 ('Lead Source_bing', False, 34),
 ('Lead Source_testone', False, 20),
 ('Do Not Email_Yes', True, 1),
 ('Last Activity_Converted to Lead', False, 7),
 ('Last Acti

In [195]:
# Put all the columns selected by RFE in the variable 'col'

col = X_train.columns[rfe.support_]

Now we have all the variables selected by RFE and since we care about the statistics part, i.e. the p-values and the VIFs, let's use these variables to create a logistic regression model using statsmodels.

In [196]:
# Select only the columns selected by RFE

X_train = X_train[col]

In [260]:
# Import statsmodels

import statsmodels.api as sm

In [261]:
# Fit a logistic Regression model on X_train after adding a constant and output the summary

X_train_sm = sm.add_constant(X_train)
logm2 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [199]:
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_train_sm = sm.add_constant(X_train_numeric)
logm2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:              Converted   No. Observations:                 4460
Model:                            GLM   Df Residuals:                     4457
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2857.2
Date:                Sat, 18 Jan 2025   Deviance:                       5714.5
Time:                        11:56:32   Pearson chi2:                 4.48e+04
No. Iterations:                     4   Pseudo R-squ. (CS):            0.09826
Covariance Type:            nonrobust                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [200]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoder = LabelEncoder()
X_train['What matters most to you in choosing a course'] = label_encoder.fit_transform(X_train['What matters most to you in choosing a course'])

# Add constant
X_train_sm = sm.add_constant(X_train)

# Fit the GLM model
logm2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
print(res.summary())

KeyError: 'What matters most to you in choosing a course'

In [262]:
# Import 'variance_inflation_factor'

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [263]:
# Make a VIF dataframe for all the variables present

vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [265]:
# VIFs seem to be in a decent range except for three variables. 

# Let's first drop the variable `Lead Source_Reference` since it has a high p-value as well as a high VIF.
X_train.drop('Lead Source_Reference', axis=1, inplace=True)

In [266]:
# Refit the model with the new set of features

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [267]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Add a constant to the features to account for the intercept
X_train_vif = sm.add_constant(X_train)

# Create a DataFrame to hold VIF values
vif = pd.DataFrame()
vif["Feature"] = X_train_vif.columns
vif["VIF"] = [variance_inflation_factor(X_train_vif.values, i) for i in range(X_train_vif.shape[1])]

# Display the VIF DataFrame
print(vif)

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [268]:
X_train.drop('Last Notable Activity_Had a Phone Conversation', axis = 1, inplace = True)

In [269]:
# Add a constant to the updated X_train
X_train_sm = sm.add_constant(X_train)

# Fit the logistic regression model
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
result = logm1.fit()

# Display the summary of the model
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

Drop What is your current occupation_Working Professional.

In [271]:
X_train.drop('What is your current occupation_Housewife', axis = 1, inplace = True)

In [272]:
# Add a constant to the updated X_train
X_train_sm = sm.add_constant(X_train)

# Fit the logistic regression model
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
result = logm1.fit()

# Display the summary of the model
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

Drop What is your current occupation_Working Professional

In [274]:
X_train.drop('What is your current occupation_Working Professional', axis = 1, inplace = True)

In [275]:
# Add a constant to the updated X_train
X_train_sm = sm.add_constant(X_train)

# Fit the logistic regression model
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
result = logm1.fit()

# Display the summary of the model
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [276]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Add a constant for calculating VIF
X_train_vif = sm.add_constant(X_train)

# Create a DataFrame for VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_vif.values, i+1) for i in range(len(X_train.columns))]

print(vif_data)


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

## Step 3: Model Evaluation

Now, both the p-values and VIFs seem decent enough for all the variables. So let's go ahead and make predictions using this final set of features.

In [277]:
# Use 'predict' to predict the probabilities on the train set

y_train_pred = res.predict(sm.add_constant(X_train))
y_train_pred[:10]

ValueError: shapes (4460,55) and (3,) not aligned: 55 (dim 1) != 3 (dim 0)

#### Creating a dataframe with the actual conversion flag and the predicted probabilities

In [278]:
# Create a new dataframe containing the actual conversion flag and the probabilities predicted by the model

y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final.head()

NameError: name 'y_train_pred' is not defined