In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [118]:
df = pd.read_csv('course_lead_scoring.csv')
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [119]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [120]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

# Question 1

In [121]:
dict_dtypes = df.dtypes.to_dict()
dict_dtypes

{'lead_source': dtype('O'),
 'industry': dtype('O'),
 'number_of_courses_viewed': dtype('int64'),
 'annual_income': dtype('float64'),
 'employment_status': dtype('O'),
 'location': dtype('O'),
 'interaction_count': dtype('int64'),
 'lead_score': dtype('float64'),
 'converted': dtype('int64')}

In [122]:
categorical = [k for k,v in dict_dtypes.items() if str(v) == 'object']
numerical = [k for k,v in dict_dtypes.items() if str(v) in(['int64', 'float64'])]
numerical.remove('converted')

In [123]:
for c in categorical:
    df[c] = df[c].fillna('NA')
for c in numerical: 
    df[c] = df[c].fillna(0.0)

In [124]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [125]:
most_frequent_industry = df.industry.value_counts(ascending=False)

## Answer 1

In [126]:
most_frequent_industry

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

# Question 2

In [127]:
df[numerical]

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.80
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62
...,...,...,...,...
1457,1,0.0,4,0.53
1458,3,65259.0,2,0.24
1459,1,45688.0,3,0.02
1460,5,71016.0,0,0.25


In [128]:
corr_numerical = df[numerical].corr()

In [129]:
corr_numerical

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [130]:
s = corr_numerical.stack()
s = s.abs()
top5 = s.nlargest(5)
top5

        

number_of_courses_viewed  number_of_courses_viewed    1.000000
annual_income             annual_income               1.000000
interaction_count         interaction_count           1.000000
lead_score                lead_score                  1.000000
annual_income             interaction_count           0.027036
dtype: float64

## Answer 2

In [131]:
biggest_correlation = top5[-1]
biggest_correlation

  biggest_correlation = top5[-1]


0.02703647240481443

# Split the data

In [132]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42) 

In [133]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del(df_train['converted'])
del(df_val['converted'])
del(df_test['converted'])

# Question 3

In [134]:
from sklearn.metrics import mutual_info_score

In [135]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [136]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False).round(2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

## Answer 3

In [137]:
biggest_mi = mi.sort_values(ascending=False).round(2).index[0]
biggest_mi

'lead_source'

# Question 4

In [138]:
from sklearn.feature_extraction import DictVectorizer

In [139]:
train_dicts = df_train[categorical+numerical].to_dict(orient='records')

In [140]:
df_train.head(10)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
2,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
3,,technology,1,74956.0,employed,europe,3,0.34
4,organic_search,retail,3,59335.0,student,australia,1,0.98
5,social_media,retail,4,50961.0,student,south_america,1,1.0
6,social_media,finance,0,0.0,unemployed,africa,3,0.73
7,events,retail,3,45170.0,student,middle_east,3,0.38
8,organic_search,,0,73717.0,employed,asia,2,0.28
9,organic_search,education,2,61872.0,student,australia,1,0.01


In [141]:
# DictVectorized infers from data how we want to transform the entire dataframe
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [142]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [143]:
val_dicts = df_val[categorical+numerical].to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)

In [144]:
test_dicts = df_test[categorical+numerical].to_dict(orient='records')
X_test = dv.fit_transform(test_dicts)

In [145]:
dicts_full_train = df_full_train[categorical+numerical].to_dict(orient='records')
X_full_train = dv.fit_transform(dicts_full_train)

In [146]:
y_full_train = df_full_train.converted.values

In [147]:
from sklearn.linear_model import LogisticRegression

In [148]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [149]:
model.coef_, model.intercept_[0]

(array([[-1.77843877e-05, -1.47154423e-02,  3.39095225e-02,
          2.66248432e-03,  1.15238518e-02, -1.02527697e-01,
         -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
         -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
         -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,
          5.12012528e-02,  2.01511698e-02, -1.20346284e-02,
         -1.16021521e-02, -1.15251880e-01,  7.95303436e-02,
         -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
         -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
          5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
          4.53752887e-01]]),
 -0.06914728027824993)

In [150]:
y_pred = model.predict_proba(X_val)


In [151]:
y_pred = y_pred[:, 1]


In [152]:
converted_decision = (y_pred >= 0.5)


## Answer 4

In [153]:
accuaracy_q4 = round((converted_decision == y_test).mean(),2)
accuaracy_q4

0.64

# Question 5

In [154]:
accuracy_original = accuaracy_q4

In [155]:
accuracy_diffs = {k: 0 for k in categorical+numerical}

In [156]:
features = categorical+numerical
features

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [157]:
for feature in features:
    features_small = features.copy()
    features_small.remove(feature)
    print(features_small)
    dicts_small_train = df_train[features_small].to_dict(orient='records')
    X_small_train = dv.fit_transform(dicts_small_train)
    dicts_small_val = df_val[features_small].to_dict(orient='records')
    X_small_val = dv.fit_transform(dicts_small_val)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_small_train, y_train)
    y_pred_small = model.predict_proba(X_small_val)[:, 1]
    accuracy_small = round(((y_pred_small>= 0.5) == y_test).mean(),2)
    accuracy_diffs[feature] += abs(accuracy_original-accuracy_small)
    

['industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'employment_status', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'employment_status', 'location', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'lead_score']
['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count']


## Answer 5

In [158]:
accuracy_diffs


{'lead_source': 0.010000000000000009,
 'industry': 0.0,
 'employment_status': 0.010000000000000009,
 'location': 0.010000000000000009,
 'number_of_courses_viewed': 0.030000000000000027,
 'annual_income': 0.050000000000000044,
 'interaction_count': 0.030000000000000027,
 'lead_score': 0.010000000000000009}

In [159]:
min(accuracy_diffs['industry'], accuracy_diffs['employment_status'], accuracy_diffs['lead_score'])

0.0

# Question 6

In [189]:
Cs = [0.01, 0.1, 1, 10, 100]
accuracies_reg = {f'{i}': 0 for i in Cs}

In [None]:
for c in Cs:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy = (converted_decision == y_val).mean()
    accuracies_reg[str(c)] += round(accuracy,3)

C=0.01, Coeff: [[-1.44269496e-05 -1.50706426e-02  3.29340037e-02  7.74137467e-04
   8.60929738e-03 -1.04877363e-01 -2.55340247e-02  4.82213629e-02
  -2.09010925e-02 -1.49527146e-02 -4.12113462e-03 -1.03656251e-02
  -3.32670234e-02 -1.67103148e-02  2.63709089e-01  4.70729819e-02
   1.90076423e-02 -1.30314747e-02 -1.35024862e-02 -1.16598119e-01
   7.77556500e-02 -3.12617789e-02  3.59346197e-03 -1.25310608e-02
  -1.20323048e-02 -6.91618829e-03  6.41786556e-03  4.22307621e-03
  -3.49558482e-02 -2.54295685e-02  4.16472803e-01]]
C=0.1, Coeff: [[-1.74490535e-05 -1.47654943e-02  3.38345780e-02  2.46367391e-03
   1.12234076e-02 -1.02865198e-01 -2.49449781e-02  4.92809607e-02
  -2.02255436e-02 -1.35960447e-02 -3.12399242e-03 -9.38544148e-03
  -3.19788924e-02 -1.61351006e-02  3.06513056e-01  5.08044646e-02
   2.00466245e-02 -1.21507220e-02 -1.18145222e-02 -1.15493479e-01
   7.94084814e-02 -3.01054154e-02  3.92285139e-03 -1.15570032e-02
  -1.13389219e-02 -5.74437767e-03  8.07445648e-03  5.44547970

## Answer 6

In [188]:
accuracies_reg

{'0.01': 0.7, '0.1': 0.7, '1': 0.7, '10': 0.7, '100': 0.7}