In [1]:
#import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#read the data

url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

df = pd.read_csv(url)

In [2]:
categorical = list(df.dtypes[df.dtypes==object].index)
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [3]:
numerical = list(df.dtypes[df.dtypes != object].index)
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [4]:
df[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [5]:
df[categorical] = df[categorical].fillna('NA')

In [6]:
df[numerical] = df[numerical].fillna(0.0)

In [7]:
#Q1 mode for the industry varaible
df.industry.mode()

0    retail
Name: industry, dtype: object

In [8]:
#Correlation matricx for the numerial variables
corr_matrix = np.zeros([5,5])
i = 0
for col in numerical:
    corr_matrix[i,:] = df[numerical].corrwith(df[col]).abs()
    i = i+1
corr_matrix

array([[1.        , 0.00977029, 0.02356522, 0.004879  , 0.43591366],
       [0.00977029, 1.        , 0.02703647, 0.01560955, 0.05313144],
       [0.02356522, 0.02703647, 1.        , 0.00988818, 0.37457252],
       [0.004879  , 0.01560955, 0.00988818, 1.        , 0.1936735 ],
       [0.43591366, 0.05313144, 0.37457252, 0.1936735 , 1.        ]])

In [9]:
df[numerical].corrwith(df.number_of_courses_viewed).values

array([ 1.        ,  0.00977029, -0.02356522, -0.004879  ,  0.43591366])

In [10]:
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [11]:
#Q2 What are the two values with the biggest correlation
intCount_leadScore = corr_matrix[2,3]
intCount_leadScore

np.float64(0.0098881824969131)

In [12]:
numCoursViewed_leadscore = corr_matrix[0,3]
numCoursViewed_leadscore

np.float64(0.004878998354681263)

In [13]:
numCoursViewed_intCount = corr_matrix[0,2]
numCoursViewed_intCount

np.float64(0.023565222882888103)

In [14]:
annInc_intCount = corr_matrix[1,2]
annInc_intCount

np.float64(0.027036472404814327)

In [15]:
max(np.array([intCount_leadScore, numCoursViewed_leadscore, numCoursViewed_intCount, annInc_intCount]))

np.float64(0.027036472404814327)

In [16]:
# Q2 Ans: max is annual_income vs interaction_count

In [17]:
# split the data into training, validation, and test set
from sklearn.model_selection import train_test_split

In [18]:
df_full_train, df_test = train_test_split(df[numerical + categorical], test_size=0.2, random_state=42)
#df_full_train.tail()

In [19]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, shuffle=False)
#df_val

In [20]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [21]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values
y_train.shape, y_val.shape, y_test.shape

((876,), (293,), (293,))

In [22]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [27]:
# Q3 Mutual information
from sklearn.metrics import mutual_info_score

In [32]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train['converted'])

In [33]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)

In [41]:
round(mi,2)
#mi

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [43]:
#Q3 Ans: lead_sourse has the highest MI

In [44]:
#Q4 

from sklearn.feature_extraction import DictVectorizer

In [45]:
dv = DictVectorizer(sparse=False)

In [59]:
train_dict = df_train.to_dict(orient='records')
#train_dict
X_train = dv.fit_transform(train_dict)
X_train.shape

(876, 31)

In [66]:
val_dict = df_val.to_dict(orient='records')
#val_dict
X_val = dv.transform(val_dict)
X_val.shape

(293, 31)

In [68]:
# Train logistic regression
from sklearn.linear_model import LogisticRegression

In [69]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [70]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [77]:
#model.intercept_
#model.coef_

In [81]:
y_pred = model.predict_proba(X_val)[:, 1]
#y_pred

In [87]:
converted = (y_pred >=0.5).astype(int)

In [112]:
acc = (y_val == converted).mean()
#acc
round(acc,2)
acc

np.float64(0.757679180887372)

In [93]:
# Q4 Ans: accuracy = 0.76, close ans = 0.74

In [109]:
#Q5 Least useful feature with feature elimination technique
features = list(df_train.columns)
#features


['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'lead_source',
 'industry',
 'employment_status',
 'location']

In [118]:


for value in features:
    df_tr = df_train.copy()
    del df_tr[value]
    dv = DictVectorizer(sparse=False)
    train_dict = df_tr.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted = (y_pred >=0.5).astype(int)
    acc = (y_val == converted).mean()
    acc_value.append(acc)
    #acc_value.append(acc)

np.array(0.757679180887372 - np.array(acc_value))


array([ 0.00367918, -0.00032082, -0.00032082, -0.00032082, -0.00032082,
        0.16040956, -0.07849829,  0.01365188,  0.00341297, -0.00341297,
        0.        , -0.00341297,  0.        ,  0.16040956, -0.07849829,
        0.01365188,  0.00341297, -0.00341297,  0.        , -0.00341297,
        0.        ,  0.16040956, -0.07849829,  0.01365188,  0.00341297,
       -0.00341297,  0.        , -0.00341297,  0.        ,  0.16040956,
       -0.07849829,  0.01365188,  0.00341297, -0.00341297,  0.        ,
       -0.00341297,  0.        ,  0.16040956, -0.07849829,  0.01365188,
        0.00341297, -0.00341297,  0.        , -0.00341297,  0.        ,
        0.16040956, -0.07849829,  0.01365188,  0.00341297, -0.00341297,
        0.        , -0.00341297,  0.        ,  0.16040956, -0.07849829,
        0.01365188,  0.00341297, -0.00341297,  0.        , -0.00341297,
        0.        ])

In [119]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media',
       'number_of_courses_viewed'], dtype=object)

In [103]:
#Q6 Train regularized logistic regression
c_value = [0.01, 0.1, 1, 10, 100]
acc_value = []

for c in c_value:
    dv = DictVectorizer(sparse=False)
    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted = (y_pred >=0.5).astype(int)
    acc = (y_val == converted).mean()
    acc_value.append(round(acc,3))
    #acc_value.append(acc)

np.array(acc_value)
    

array([0.754, 0.758, 0.758, 0.758, 0.758])

In [104]:
#Q6 ans: c=0.1 leades to the best accuracy