## Importing Necessary Libraries

In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

## Load "leads" Data

In [127]:
# Read the raw leads export; the relative path presumably resolves against the
# notebook's working directory — confirm before running elsewhere.
leads_df = pd.read_csv('Leads.csv')

## Inspect Data Quality

In [129]:
# Dimensions of the raw frame as (rows, columns).
leads_df.shape

(9240, 37)

In [130]:
# Per-column non-null counts and dtypes of the raw frame.
leads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [131]:
# Preview the first rows of the raw frame.
leads_df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [132]:
# Percentage of missing values in each column of the raw frame.
leads_df.isna().mean().mul(100)

Prospect ID                                       0.000000
Lead Number                                       0.000000
Lead Origin                                       0.000000
Lead Source                                       0.389610
Do Not Email                                      0.000000
Do Not Call                                       0.000000
Converted                                         0.000000
TotalVisits                                       1.482684
Total Time Spent on Website                       0.000000
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   15.562771
How did you hear about X Education               23.885281
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Search                                            0.0000

## Drop cols where more than 45% of the values are NULL

In [134]:
# Columns whose share of missing values (in percent) exceeds the cut-off are
# collected for removal.
leads_cut_off = 45
null_pct = leads_df.isna().mean() * 100
leads_cols_to_drop = null_pct[null_pct > leads_cut_off].index.tolist()
print(leads_cols_to_drop)

['Lead Quality', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score', 'Asymmetrique Profile Score']


In [135]:
# Working copy of the raw frame without the high-null columns; leads_df itself is untouched.
leads_df1 = leads_df.drop(columns=leads_cols_to_drop)

In [136]:
# Confirm the column count after removing the high-null columns.
leads_df1.shape

(9240, 32)

## Drop "Prospect ID" and "Lead Number" as these are the ID cols

In [138]:
# Both identifier columns are removed in a single call.
id_cols = ["Prospect ID", "Lead Number"]
leads_df1 = leads_df1.drop(columns=id_cols)

In [139]:
# Confirm the column count after removing the two ID columns.
leads_df1.shape

(9240, 30)

## Categorical variables that contain the placeholder level 'Select'

In [141]:
# Share (%) of each Specialization level among non-null rows; 'Select' is about 24.89%.
leads_df1["Specialization"].value_counts(normalize=True).mul(100)

Specialization
Select                               24.891054
Finance Management                   12.509613
Human Resource Management            10.869008
Marketing Management                 10.740836
Operations Management                 6.447065
Business Administration               5.165342
IT Projects Management                4.691105
Supply Chain Management               4.473212
Banking, Investment And Insurance     4.332223
Travel and Tourism                    2.601897
Media and Advertising                 2.601897
International Business                2.281466
Healthcare Management                 2.037939
Hospitality Management                1.461164
E-COMMERCE                            1.435529
Retail Management                     1.281723
Rural and Agribusiness                0.935658
E-Business                            0.730582
Services Excellence                   0.512689
Name: proportion, dtype: float64

In [142]:
# Share (%) of each level among non-null rows; 'Select' is about 71.70%.
leads_df1["How did you hear about X Education"].value_counts(normalize=True).mul(100)

How did you hear about X Education
Select                   71.704820
Online Search            11.488696
Word Of Mouth             4.948102
Student of SomeSchool     4.407792
Other                     2.644675
Multiple Sources          2.161240
Advertisements            0.995308
Social Media              0.952652
Email                     0.369686
SMS                       0.327030
Name: proportion, dtype: float64

In [143]:
# Share (%) of each Lead Profile level among non-null rows; 'Select' is about 63.48%.
leads_df1["Lead Profile"].value_counts(normalize=True).mul(100)

Lead Profile
Select                         63.481856
Potential Lead                 24.697596
Other Leads                     7.456745
Student of SomeSchool           3.690093
Lateral Student                 0.367478
Dual Specialization Student     0.306232
Name: proportion, dtype: float64

In [144]:
# Share (%) of each City level among non-null rows; 'Select' is about 28.76%.
leads_df1["City"].value_counts(normalize=True).mul(100)

City
Mumbai                         41.202046
Select                         28.759591
Thane & Outskirts               9.616368
Other Cities                    8.772379
Other Cities of Maharashtra     5.843990
Other Metro Cities              4.859335
Tier II Cities                  0.946292
Name: proportion, dtype: float64

## Drop cols "How did you hear about X Education" and "Lead Profile", since most of their values are 'Select' (treated as NULL)

In [146]:
# Remove the two columns dominated by the 'Select' level in a single call.
select_heavy_cols = ["How did you hear about X Education", "Lead Profile"]
leads_df1 = leads_df1.drop(columns=select_heavy_cols)

In [147]:
# Confirm the column count after removing the two 'Select'-dominated columns.
leads_df1.shape

(9240, 28)

## Replace 'Select' with NaN in cols "Specialization" and "City"

In [149]:
# The notebook treats the literal level 'Select' as a missing value, so map it to NaN.
# A single frame-wide pass covers every remaining column (including "Specialization"
# and "City"); repeating the call would be a no-op.
leads_df1 = leads_df1.replace("Select", np.nan)

### TODO: what are the recommended sensitivity and specificity (%) targets?

In [151]:
# Number of distinct values per column; used below to split categorical from numerical columns.
leads_df1.nunique()

Lead Origin                                         5
Lead Source                                        21
Do Not Email                                        2
Do Not Call                                         2
Converted                                           2
TotalVisits                                        41
Total Time Spent on Website                      1731
Page Views Per Visit                              114
Last Activity                                      17
Country                                            38
Specialization                                     18
What is your current occupation                     6
What matters most to you in choosing a course       3
Search                                              2
Magazine                                            1
Newspaper Article                                   2
X Education Forums                                  2
Newspaper                                           2
Digital Advertisement       

## Identify the categorical and numerical cols
- Any col with fewer than 50 unique values is treated as a categorical col.
- Any col with 50 or more unique values is treated as a numerical col.

In [153]:
# Split columns by cardinality: 50 or more distinct values -> numerical,
# fewer than 50 -> categorical.
# NOTE(review): the rule looks only at the distinct-value count, so a numeric column
# with fewer than 50 distinct values ends up in the categorical list — confirm that
# this is intended before relying on the split for imputation.
distinct_counts = leads_df1.nunique()
leads_num_cols = [name for name in leads_df1.columns if distinct_counts[name] >= 50]
leads_cat_cols = [name for name in leads_df1.columns if distinct_counts[name] < 50]

In [154]:
# Number of columns classified as categorical.
len(leads_cat_cols)

26

In [155]:
# Number of columns classified as numerical.
len(leads_num_cols)

2

#### - fill in the missing values with median for numerical cols
#### - fill in the missing values with mode for categorical cols

In [157]:
# Impute each numerical column with its own median, applied in one fillna pass.
median_fill = {name: leads_df1[name].median() for name in leads_num_cols}
leads_df1 = leads_df1.fillna(value=median_fill)

In [158]:
# Impute each categorical column with its most frequent level (the first entry
# returned by mode() when several levels tie), applied in one fillna pass.
mode_fill = {name: leads_df1[name].mode()[0] for name in leads_cat_cols}
leads_df1 = leads_df1.fillna(value=mode_fill)

In [159]:
# Percentage of missing values per column after imputation.
leads_df1.isna().mean().mul(100)

Lead Origin                                      0.0
Lead Source                                      0.0
Do Not Email                                     0.0
Do Not Call                                      0.0
Converted                                        0.0
TotalVisits                                      0.0
Total Time Spent on Website                      0.0
Page Views Per Visit                             0.0
Last Activity                                    0.0
Country                                          0.0
Specialization                                   0.0
What is your current occupation                  0.0
What matters most to you in choosing a course    0.0
Search                                           0.0
Magazine                                         0.0
Newspaper Article                                0.0
X Education Forums                               0.0
Newspaper                                        0.0
Digital Advertisement                         