In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 16:19:51--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 16:19:51 (29.4 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
# Read the downloaded CSV into the untouched "raw" frame and preview its first rows.
raw_leads_df = pd.read_csv(filepath_or_buffer="course_lead_scoring.csv")
raw_leads_df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Column groups used by the cleaning and analysis cells.
# NOTE: "converted" is the target; it is listed with the numerical columns here
# so that it takes part in the missing-value fill and the correlation matrix.
numerical_columns = [
    "annual_income",
    "lead_score",
    "interaction_count",
    "number_of_courses_viewed",
    "converted",
]
categorical_columns = [
    "lead_source",
    "industry",
    "employment_status",
    "location",
]

In [5]:
# Fill missing values: 0.0 in the numerical columns and the literal 'NA' in the
# categorical ones. fillna returns a new frame, so raw_leads_df stays untouched.
fill_values = {
    **dict.fromkeys(numerical_columns, 0.0),
    **dict.fromkeys(categorical_columns, 'NA'),
}
cleaned_leads_df = raw_leads_df.fillna(value=fill_values)

# Remaining null count per column (all zeros once the fill has worked).
cleaned_leads_df.isnull().sum()


Unnamed: 0,0
lead_source,0
industry,0
number_of_courses_viewed,0
annual_income,0
employment_status,0
location,0
interaction_count,0
lead_score,0
converted,0


**Question 1**

In [6]:
# Frequency of each industry value, most common first.
cleaned_leads_df["industry"].value_counts()

Unnamed: 0_level_0,count
industry,Unnamed: 1_level_1
retail,203
finance,200
other,198
healthcare,187
education,187
technology,179
manufacturing,174
,134


Answer: "retail"

**Question 2**

In [7]:
# Pairwise correlations between the numerical columns.
correlation_matrix = cleaned_leads_df[numerical_columns].corr()

# Blank out the diagonal (each column's correlation with itself) so it does not
# dominate the ranking. DataFrame.mask returns a new frame; writing into
# `.values` with np.fill_diagonal is avoided because that array is read-only
# under pandas Copy-on-Write and the write is silently lost if it is a copy.
diagonal = np.eye(len(correlation_matrix), dtype=bool)
correlation_matrix = correlation_matrix.mask(diagonal)

# Alias kept so that any cell still using the original misspelled name works.
correlation_martrix = correlation_matrix

# Long format: one row per (level_0, level_1) pair with the correlation in
# column 0, sorted from highest to lowest (NaN diagonal entries sort last).
sorted_correlation_matrix = (
    correlation_matrix
    .unstack()
    .reset_index()
    .sort_values([0], ascending=False)
)

# Correlations of interaction_count with every other numerical column.
sorted_correlation_matrix[sorted_correlation_matrix["level_0"] == "interaction_count"]


Unnamed: 0,level_0,level_1,0
14,interaction_count,converted,0.374573
10,interaction_count,annual_income,0.027036
11,interaction_count,lead_score,0.009888
13,interaction_count,number_of_courses_viewed,-0.023565
12,interaction_count,interaction_count,


In [8]:
# Correlations of number_of_courses_viewed with every other numerical column.
is_courses_viewed = sorted_correlation_matrix["level_0"].eq("number_of_courses_viewed")
sorted_correlation_matrix[is_courses_viewed]

Unnamed: 0,level_0,level_1,0
19,number_of_courses_viewed,converted,0.435914
15,number_of_courses_viewed,annual_income,0.00977
16,number_of_courses_viewed,lead_score,-0.004879
17,number_of_courses_viewed,interaction_count,-0.023565
18,number_of_courses_viewed,number_of_courses_viewed,


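Answer: `interaction_count` and `annual_income` (correlation ≈ 0.027, the largest among the feature pairs shown above when the target `converted` is excluded)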

**Question 3**

In [11]:
from sklearn.model_selection import train_test_split

# Work on a copy so the cleaned frame itself is never modified by the split.
full_df = cleaned_leads_df.copy()

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation. Both splits use the same fixed seed.
df_full_train, df_test = train_test_split(
    full_df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

# Row counts: full, train, test, validation.
tuple(len(frame) for frame in (full_df, df_train, df_test, df_val))

(1462, 876, 293, 293)

In [12]:
# Build the target arrays from `full_df`, which always keeps the `converted`
# column, by looking up each split's row labels. This keeps the cell safe to
# re-run: the previous version read the column from the split frames and then
# deleted it in place, so a second execution failed with an AttributeError.
# The lookup relies on the row labels being unique, which holds for the default
# integer index produced by read_csv and preserved by the split.
y_train = full_df.loc[df_train.index, "converted"].values
y_val = full_df.loc[df_val.index, "converted"].values
y_test = full_df.loc[df_test.index, "converted"].values

# Remove the target from the feature frames so it cannot leak into the model.
# errors="ignore" turns the drop into a no-op when the column is already gone.
df_train = df_train.drop(columns="converted", errors="ignore")
df_val = df_val.drop(columns="converted", errors="ignore")
df_test = df_test.drop(columns="converted", errors="ignore")

df_train.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98


In [19]:
from sklearn.metrics import mutual_info_score

# Mutual information between each categorical feature and the target on the
# training split, rounded to two decimals, one record per column.
scores = [
    {
        "column_name": column,
        "mutual_score": mutual_info_score(df_train[column], y_train).round(2),
    }
    for column in categorical_columns
]

pd.DataFrame(scores)

Unnamed: 0,column_name,mutual_score
0,lead_source,0.04
1,industry,0.01
2,employment_status,0.01
3,location,0.0


Answer: lead_source

**Question 4**

In [26]:
from sklearn.feature_extraction import DictVectorizer

# NOTE: this rebinds `numerical_columns` without the target "converted", so
# from here on the name refers to model features only.
numerical_columns = [
    "annual_income",
    "lead_score",
    "interaction_count",
    "number_of_courses_viewed",
]
feature_columns = numerical_columns + categorical_columns


def to_feature_records(frame):
    """Return the model feature columns of `frame` as a list of per-row dicts."""
    return frame[feature_columns].to_dict(orient='records')


dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training split only; validation and test are
# transformed with the vocabulary learned from training.
train_dict = to_feature_records(df_train)
X_train = dv.fit_transform(train_dict)

val_dict = to_feature_records(df_val)
X_val = dv.transform(val_dict)

test_dict = to_feature_records(df_test)
X_test = dv.transform(test_dict)

In [27]:
from sklearn.linear_model import LogisticRegression

# Logistic regression settings, collected in one place for readability.
logreg_params = {
    "solver": 'liblinear',
    "C": 1.0,
    "max_iter": 1000,
    "random_state": 42,
}
model = LogisticRegression(**logreg_params)

# Fit on the training split; as the last expression this also displays the estimator.
model.fit(X_train, y_train)

In [41]:
# Second column of predict_proba for each validation row, turned into hard 0/1
# labels by thresholding at 0.5.
decision_threshold = 0.5
y_pred_probabilities = model.predict_proba(X_val)[:, 1]
y_pred_decision = y_pred_probabilities >= decision_threshold
y_pred = y_pred_decision.astype(int)

# Accuracy is the share of validation rows whose predicted label matches y_val.
correct_predictions = y_pred == y_val
validation_accuracy = correct_predictions.mean()

validation_accuracy.round(2)

np.float64(0.7)

Answer: 0.74 (the computed validation accuracy is 0.70; 0.74 is selected as the closest of the offered options)

**Question 5**