In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [17]:
# Raw string literal: in a plain string '\L' is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The resulting path is unchanged.
# NOTE(review): absolute local path - consider a configurable data directory.
df = pd.read_csv(r'E:\Leads.csv')
df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [18]:
# (number of rows, number of columns) of the loaded frame
df.shape

(9240, 37)

In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


### Initial data preparation

In [20]:
# Normalise the column names: lower-case, with spaces replaced by underscores
lowercased_names = df.columns.str.lower()
df.columns = lowercased_names.str.replace(' ', '_')

# Collect the names of the columns whose dtype is object (the string-valued ones)
is_object_dtype = df.dtypes == object
string_columns = df.dtypes[is_object_dtype].index.tolist()

# Apply the same normalisation (lower-case, spaces -> underscores) to the
# values of every string column
for col in string_columns:
    lowercased_values = df[col].str.lower()
    df[col] = lowercased_values.str.replace(' ', '_')

* Checking for missing values

In [21]:
# Number of missing (NaN) entries in each column
df.isnull().sum()

prospect_id                                         0
lead_number                                         0
lead_origin                                         0
lead_source                                        36
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                       137
total_time_spent_on_website                         0
page_views_per_visit                              137
last_activity                                     103
country                                          2461
specialization                                   1438
how_did_you_hear_about_x_education               2207
what_is_your_current_occupation                  2690
what_matters_most_to_you_in_choosing_a_course    2709
search                                              0
magazine                                            0
newspaper_article           

### Handling missing values
#### Mode for categorical variables
#### Mean for numerical variables

In [22]:
# Categorical feature columns (names as they appear after column normalisation)
categorical = [
    'lead_origin',
    'lead_source',
    'do_not_email',
    'do_not_call',
    'last_activity',
    'country',
    'specialization',
    'how_did_you_hear_about_x_education',
    'what_is_your_current_occupation',
    'what_matters_most_to_you_in_choosing_a_course',
    'search',
    'magazine',
    'newspaper_article',
    'digital_advertisement',
    'through_recommendations',
    'receive_more_updates_about_our_courses',
    'tags',
    'lead_quality',
    'update_me_on_supply_chain_content',
    'lead_profile',
    'city',
    'asymmetrique_activity_index',
    'asymmetrique_profile_index',
    'i_agree_to_pay_the_amount_through_cheque',
    'a_free_copy_of_mastering_the_interview',
    'last_notable_activity',
]

# Numerical feature columns
numerical = [
    'totalvisits',
    'total_time_spent_on_website',
    'page_views_per_visit',
    'asymmetrique_activity_score',
    'asymmetrique_profile_score',
]

In [23]:

# NOTE(review): the fill values are computed on the full frame, before the
# train/validation/test split, so the held-out rows influence the imputation.

# Categorical columns: replace missing entries with the most frequent value
for cat in categorical:
    most_frequent = df[cat].mode()[0]
    df[cat] = df[cat].fillna(most_frequent)

# Numerical columns: replace missing entries with the column mean
for num in numerical:
    column_mean = df[num].mean()
    df[num] = df[num].fillna(column_mean)

In [24]:
def calculate_mi(series):
    """Return the mutual information score between `series` and `df.converted`.

    The target column is read from the module-level `df` at call time.
    """
    target = df.converted
    return mutual_info_score(series, target)

In [25]:
# Mutual information of every categorical feature with the target,
# sorted from the highest score to the lowest
df_mi = (
    df[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name="MI")
)
df_mi

Unnamed: 0,MI
tags,0.20663
lead_quality,0.107821
last_activity,0.084493
lead_profile,0.082869
last_notable_activity,0.07386
lead_source,0.06156
lead_origin,0.056251
what_is_your_current_occupation,0.053389
specialization,0.014624
city,0.012024


* We drop the following categorical features, which are the least important ones (lowest mutual information with `converted`)

In [28]:
# Categorical features removed from the frame before modelling.
dropped_columns = [
    'do_not_call',
    'through_recommendations',
    'what_matters_most_to_you_in_choosing_a_course',
    'digital_advertisement',
    'newspaper_article',
    'search',
    'receive_more_updates_about_our_courses',
    'update_me_on_supply_chain_content',
    'magazine',
    'i_agree_to_pay_the_amount_through_cheque',
]

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=dropped_columns, errors='ignore')

In [29]:
# Number of columns currently in df
df.shape[1]

27

### Correlation coefficient (linear relationship between the numerical variables and the target variable)

In [31]:
# Pairwise correlation of each numerical feature with the target column
df[numerical].corrwith(df.converted)

totalvisits                    0.030091
total_time_spent_on_website    0.362483
page_views_per_visit          -0.003294
asymmetrique_activity_score    0.123491
asymmetrique_profile_score     0.160700
dtype: float64

In [33]:
# Categorical features kept for modelling
new_categorical = [
    'lead_origin',
    'lead_source',
    'do_not_email',
    'last_activity',
    'country',
    'specialization',
    'how_did_you_hear_about_x_education',
    'what_is_your_current_occupation',
    'tags',
    'lead_quality',
    'lead_profile',
    'city',
    'asymmetrique_activity_index',
    'asymmetrique_profile_index',
    'a_free_copy_of_mastering_the_interview',
    'last_notable_activity',
]

In [34]:
# Hold out 20% of the rows as the test set ...
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# ... then set aside 33% of the remaining rows for validation
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.33,
    random_state=11,
)

# Target arrays for each partition
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [35]:
# Remove the target column from every partition so it cannot be used as a feature
for partition in (df_train, df_val, df_test):
    del partition['converted']

### FEATURE ENGINEERING

* One Hot Encoding

In [36]:
# One dict per training row, holding the selected categorical and numerical features
train_dic = df_train[new_categorical + numerical].to_dict(orient='records')

# Preview only the first records; displaying the whole list floods the
# notebook with one dict per training row.
train_dic[:2]

[{'lead_origin': 'lead_add_form',
  'lead_source': 'reference',
  'do_not_email': 'no',
  'last_activity': 'email_opened',
  'country': 'india',
  'specialization': 'media_and_advertising',
  'how_did_you_hear_about_x_education': 'select',
  'what_is_your_current_occupation': 'unemployed',
  'tags': 'want_to_take_admission_but_has_financial_problems',
  'lead_quality': 'low_in_relevance',
  'lead_profile': 'potential_lead',
  'city': 'mumbai',
  'asymmetrique_activity_index': '02.medium',
  'asymmetrique_profile_index': '02.medium',
  'a_free_copy_of_mastering_the_interview': 'no',
  'last_notable_activity': 'email_opened',
  'totalvisits': 0.0,
  'total_time_spent_on_website': 0,
  'page_views_per_visit': 0.0,
  'asymmetrique_activity_score': 14.306252489048187,
  'asymmetrique_profile_score': 16.344882516925527},
 {'lead_origin': 'landing_page_submission',
  'lead_source': 'direct_traffic',
  'do_not_email': 'yes',
  'last_activity': 'sms_sent',
  'country': 'india',
  'specializatio

In [40]:
# Fit the vectorizer on the training records only.
# sparse=False makes transform() return a dense array.
dv = DictVectorizer(sparse=False)
dv.fit(train_dic)

In [41]:
# Encode the training records into the feature matrix
X_train = dv.transform(train_dic)

### Logistic Regression Model

* Training Logistic Regression

In [42]:
# Logistic regression with the liblinear solver; random_state is fixed for reproducibility
model = LogisticRegression(solver='liblinear',random_state=1)
model.fit(X_train,y_train)

* Validation on the hold-out validation set

In [44]:
# Encode the validation rows with the vectorizer already fitted on the
# training records (transform only, no re-fit)
val_dic = df_val[new_categorical+numerical].to_dict(orient='records')
X_val = dv.transform(val_dic)

In [45]:
# Keep column 1 of predict_proba: the probability of the second class,
# presumably converted == 1 (confirm with model.classes_)
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([3.02457766e-04, 9.38885449e-01, 9.41555341e-01, ...,
       3.98445853e-01, 3.87249833e-01, 3.51191295e-03])

In [46]:
# Hard decision at the 0.5 threshold. Uses >= so the accuracy is computed with
# the same rule as the confusion-matrix cell (y_pred >= t with t = 0.5).
converted = y_pred >= 0.5

### Evaluating the Logistic Regression model

* Accuracy

In [47]:
# Accuracy: fraction of validation rows whose predicted label matches the actual one
is_correct = (y_val == converted)
accuracy = is_correct.mean()
print("accuracy: ", accuracy)

accuracy:  0.925


* Confusion Matrix

In [48]:
# Decision threshold applied to the predicted probabilities
t = 0.5

# Boolean masks over the validation rows: predicted class ...
predict_converted = y_pred >= t
predict_no_converted = y_pred < t

# ... and actual class
actual_converted = y_val == 1
actual_no_converted = y_val == 0

# Count the rows falling into each cell of the confusion matrix
true_positive = np.logical_and(predict_converted, actual_converted).sum()
false_positive = np.logical_and(predict_converted, actual_no_converted).sum()
false_negative = np.logical_and(predict_no_converted, actual_converted).sum()
true_negative = np.logical_and(predict_no_converted, actual_no_converted).sum()

In [49]:
# Rows are the actual class (not converted, converted);
# columns are the predicted class in the same order
actual_negative_row = [true_negative, false_positive]
actual_positive_row = [false_negative, true_positive]
confusion_table = np.array([actual_negative_row, actual_positive_row])
confusion_table

array([[1410,   90],
       [  93,  847]])

In [50]:
# Each cell as a percentage of all rows counted in the table
confusion_table / confusion_table.sum()*100

array([[57.78688525,  3.68852459],
       [ 3.81147541, 34.71311475]])

* Precision

In [51]:
# Precision: share of the predicted conversions that actually converted
predicted_positive = false_positive + true_positive
P = true_positive / predicted_positive
P

0.9039487726787621

* Recall

In [52]:
# Recall: share of the actual conversions that the model identified
actual_positive = true_positive + false_negative
R = true_positive / actual_positive
R

0.9010638297872341

* False Positive Rate (the x-axis of the ROC curve) at the 0.5 threshold

In [53]:
# False positive rate: share of the actual non-conversions predicted as converted
actual_negative = false_positive + true_negative
FPR = false_positive / actual_negative
print('False Positive Rate= ',FPR*100,'%')

False Positive Rate=  6.0 %
