In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Raw CSV location of the course lead-scoring dataset; the download cell below reads `data`.
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/'
    'master/course_lead_scoring.csv'
)

In [4]:
!wget $data -O data-week-3-hw.csv

--2025-10-13 21:55:41--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘data-week-3-hw.csv’


2025-10-13 21:55:41 (31.9 MB/s) - ‘data-week-3-hw.csv’ saved [80876/80876]



In [5]:
# Read the file written by the wget cell, report its dimensions, then preview the first rows.
csv_path = "data-week-3-hw.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

(1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Data Preparation

In [6]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Columns stored with the object dtype are treated as categorical features.
categorical = [col for col, dtype in df.dtypes.items() if dtype == 'object']
print(categorical)

['lead_source', 'industry', 'employment_status', 'location']


In [8]:
# Every non-object column counts as numerical; at this point that still includes `converted`.
numerical = [col for col, dtype in df.dtypes.items() if dtype != 'object']
print(numerical)

['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


In [9]:
# Missing values among the categorical columns only.
df[categorical].isna().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [10]:
# Replace missing categorical values with the literal string 'NA'.
df[categorical] = df[categorical].fillna(value='NA')

In [11]:
# Missing values among the numerical columns before imputation.
df[numerical].isna().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [12]:
# Impute missing incomes with 0.0 (bracket access instead of attribute assignment).
df['annual_income'] = df['annual_income'].fillna(value=0.0)

In [13]:
# Re-check the numerical columns after filling annual_income.
df[numerical].isna().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

### Q1

In [14]:
# Distinct industry labels (includes the 'NA' placeholder filled in earlier).
df['industry'].unique()

array(['NA', 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [15]:
# Most frequent value of the industry column.
df['industry'].mode()

0    retail
Name: industry, dtype: object

### Q2

In [16]:
# Pairwise correlation matrix of the numerical columns.
numeric_frame = df[numerical]
numeric_frame.corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [17]:
# Q2 answer: annual_income and interaction_count (correlation 0.027036 in the matrix above)

#### Split the data

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the rows as the test set; the seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Row counts of the two partitions.
df_full_train.shape[0], df_test.shape[0]

(1169, 293)

In [21]:
# Carve a validation set out of the full-train part: 25% of it, with the same seed.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [22]:
# Row counts of train / validation / test.
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(876, 293, 293)

In [23]:
# Drop the shuffled original index from each partition so rows are numbered from 0 again.
df_train, df_test, df_val = (
    part.reset_index(drop=True) for part in (df_train, df_test, df_val)
)

In [24]:
# Pull the target column out of each partition as a NumPy array.
y_train, y_val, y_test = (
    part['converted'].values for part in (df_train, df_val, df_test)
)

In [25]:
# Remove the target from the feature frames in place so it cannot be used as a feature.
for frame in (df_train, df_val, df_test):
    del frame['converted']

In [26]:
# First training row, to confirm `converted` is gone.
df_train.iloc[:1]

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03


### Q3

In [27]:
from sklearn.metrics import mutual_info_score

In [28]:
def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the notebook-level `y_train`, so `series` is
    expected to be a column of `df_train` (same row order and length).
    """
    mi_with_target = mutual_info_score(series, y_train)
    return mi_with_target

In [29]:
# Score every categorical training column against the target, then show two decimals.
mi = df_train[categorical].apply(mutual_info_converted_score)
mi.round(2)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

### Q4

In [30]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [31]:
# Recompute the numerical list from df_train, which no longer contains `converted`.
numerical = [col for col, dtype in df_train.dtypes.items() if dtype != 'object']

In [32]:
# One dict per training row, restricted to the model's feature columns.
feature_columns = categorical + numerical
train_dicts = df_train[feature_columns].to_dict(orient='records')

In [33]:
# Inspect the first training record to check the dict layout fed to DictVectorizer.
train_dicts[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [34]:
# sparse=False makes the vectorizer return a dense NumPy array instead of a SciPy sparse matrix.
dv = DictVectorizer(sparse=False)

In [35]:
# Learn the feature mapping from the training records, then encode those same records.
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [36]:
# Validation rows as dicts, using the same feature columns as training.
val_feature_frame = df_val[categorical + numerical]
val_dicts = val_feature_frame.to_dict(orient='records')
val_dicts[0]

{'lead_source': 'paid_ads',
 'industry': 'healthcare',
 'employment_status': 'unemployed',
 'location': 'europe',
 'number_of_courses_viewed': 3,
 'annual_income': 52220.0,
 'interaction_count': 1,
 'lead_score': 0.07}

In [37]:
# Encode validation records with the vectorizer already fitted on the training data (no refit).
X_val = dv.transform(val_dicts)

In [38]:
# Q4 baseline classifier; the Q5 and Q6 cells reuse this same solver / max_iter / random_state.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [39]:
# Fit on the encoded training matrix; the fitted estimator is displayed as the cell output.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [40]:
# Hard class predictions for the validation set.
convert_decision = model.predict(X_val)

In [41]:
# Share of validation rows predicted correctly; kept unrounded, only the display is rounded.
accuracy = np.mean(y_val == convert_decision)
round(accuracy, 2)

np.float64(0.7)

In [42]:
# Column 1 of predict_proba is the probability/score of the positive class.
val_probabilities = model.predict_proba(X_val)
y_pred = val_probabilities[:, 1]
# Threshold the score at 0.5 to obtain a boolean decision per validation row.
convert_decision = y_pred >= 0.5

In [43]:
# Accuracy of the thresholded probabilities.
np.mean(y_val == convert_decision)

np.float64(0.6996587030716723)

In [44]:
# Pair every encoded feature name with its fitted coefficient, rounded to 3 decimals.
{
    name: coef
    for name, coef in zip(dv.get_feature_names_out(), model.coef_[0].round(3))
}

{'annual_income': np.float64(-0.0),
 'employment_status=NA': np.float64(-0.015),
 'employment_status=employed': np.float64(0.034),
 'employment_status=self_employed': np.float64(0.003),
 'employment_status=student': np.float64(0.012),
 'employment_status=unemployed': np.float64(-0.103),
 'industry=NA': np.float64(-0.025),
 'industry=education': np.float64(0.049),
 'industry=finance': np.float64(-0.02),
 'industry=healthcare': np.float64(-0.013),
 'industry=manufacturing': np.float64(-0.003),
 'industry=other': np.float64(-0.009),
 'industry=retail': np.float64(-0.032),
 'industry=technology': np.float64(-0.016),
 'interaction_count': np.float64(0.311),
 'lead_score': np.float64(0.051),
 'lead_source=NA': np.float64(0.02),
 'lead_source=events': np.float64(-0.012),
 'lead_source=organic_search': np.float64(-0.012),
 'lead_source=paid_ads': np.float64(-0.115),
 'lead_source=referral': np.float64(0.08),
 'lead_source=social_media': np.float64(-0.03),
 'location=NA': np.float64(0.004),
 'l

### Q5

In [52]:
features = categorical + numerical


def _validation_accuracy(columns):
    """Train the Q4 pipeline on `columns` only and return its validation accuracy.

    A fresh DictVectorizer and LogisticRegression (liblinear, C=1.0,
    max_iter=1000, random_state=42) are built on every call. All intermediates
    stay local to the function, so the notebook-level `y_pred` from the Q4
    cells is no longer overwritten by this cell.

    Parameters
    ----------
    columns : list of str
        Column names present in both `df_train` and `df_val`.

    Returns
    -------
    Fraction of rows in the validation set whose predicted class equals `y_val`.
    """
    dicts_train = df_train[columns].to_dict(orient='records')
    dicts_val = df_val[columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    X_train_subset = vectorizer.fit_transform(dicts_train)
    X_val_subset = vectorizer.transform(dicts_val)

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_train_subset, y_train)

    return (y_val == clf.predict(X_val_subset)).mean()


# Recompute the all-features baseline here rather than reading `accuracy` from a
# distant cell: the comparison then always uses the same data and settings as
# the ablated models, whatever order the notebook cells were executed in.
baseline_accuracy = _validation_accuracy(features)

# One (feature, accuracy-without-that-feature) pair per feature.
scores = [
    (feature, _validation_accuracy([f for f in features if f != feature]))
    for feature in features
]

# Accuracy lost (positive) or gained (negative) when the feature is removed.
for feature, score in scores:
    print(f"{feature}: {baseline_accuracy - score:.20f}")


# Raw validation accuracy of each reduced model.
for feature, score in scores:
    print(f"{feature}: {score}")

lead_source: -0.00341296928327650129
industry: 0.00000000000000000000
employment_status: 0.00341296928327639026
location: -0.01023890784982939284
number_of_courses_viewed: 0.14334470989761094462
annual_income: -0.15358361774744033745
interaction_count: 0.14334470989761094462
lead_score: -0.00682593856655289155
lead_source: 0.7030716723549488
industry: 0.6996587030716723
employment_status: 0.6962457337883959
location: 0.7098976109215017
number_of_courses_viewed: 0.5563139931740614
annual_income: 0.8532423208191127
interaction_count: 0.5563139931740614
lead_score: 0.7064846416382252


### Q6

In [46]:
# Candidate values for C (the `C` argument of LogisticRegression).
reg = [0.01, 0.1, 1, 10, 100]
scores = []

for val in reg:
    # Loop-local names (`model_c`, `y_pred_c`) keep the Q4 `model` and `y_pred`
    # intact, so cells that read them (e.g. the coefficient table) still
    # describe the C=1.0 model after this sweep has run.
    model_c = LogisticRegression(solver='liblinear', C=val, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)

    y_pred_c = model_c.predict(X_val)
    score = (y_val == y_pred_c).mean()

    scores.append((val, score))

# Validation accuracy for each C, in the order tried.
for val, score in scores:
    print(f"{val}: {score}")

0.01: 0.6996587030716723
0.1: 0.6996587030716723
1: 0.6996587030716723
10: 0.6996587030716723
100: 0.6996587030716723


In [49]:
# max() returns the first maximal item, so ties resolve to the earliest entry in `scores`.
best_pair = max(scores, key=lambda pair: pair[1])
best_C, highest_accuracy = best_pair
best_C, highest_accuracy

(0.01, np.float64(0.6996587030716723))

In [48]:
# Diagnostic: print the first five coefficients per C to see why accuracy looks the same for all.
for val in reg:
    # `model_c` is loop-local so the Q4 `model` is not replaced by the last C tried.
    model_c = LogisticRegression(solver='liblinear', C=val, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)
    print(val, model_c.coef_[0][:5])  # checking why accuracy looks same for all
    
    

0.01 [-1.44269496e-05 -1.50706426e-02  3.29340037e-02  7.74137467e-04
  8.60929738e-03]
0.1 [-1.74490535e-05 -1.47654943e-02  3.38345780e-02  2.46367391e-03
  1.12234076e-02]
1 [-1.77843877e-05 -1.47154423e-02  3.39095225e-02  2.66248432e-03
  1.15238518e-02]
10 [-1.78182832e-05 -1.47102082e-02  3.39168287e-02  2.68270276e-03
  1.15543333e-02]
100 [-1.78216761e-05 -1.47096825e-02  3.39175574e-02  2.68472802e-03
  1.15573859e-02]
