In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import urllib.request

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression

## Getting the data

In [2]:
def get_data(file_path, dir_path, remote_path):
    """Load the CSV at ``file_path``, downloading it from ``remote_path`` if it is missing.

    Parameters
    ----------
    file_path : pathlib.Path
        Local location of the CSV file; acts as a cache between runs.
    dir_path : pathlib.Path
        Directory that is created (including parents) before downloading.
    remote_path : str
        URL passed to ``urllib.request.urlretrieve`` when ``file_path``
        does not exist yet.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv(file_path)``.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` or ``pd.read_csv`` raise; a failed
    download never leaves a file at ``file_path``.
    """
    if file_path.exists():
        print(f"The path '{file_path}' exists.")
    else:
        dir_path.mkdir(parents=True, exist_ok=True)
        # file_path is not required to sit directly inside dir_path, so make
        # sure its own parent exists as well.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"Downloading data from {remote_path}...")
        # Download next to the target and move into place afterwards: an
        # interrupted transfer must not leave a truncated file that the
        # `exists()` check above would accept as a valid cache on the next run.
        partial_path = file_path.with_name(file_path.name + ".part")
        try:
            urllib.request.urlretrieve(remote_path, partial_path)
            partial_path.replace(file_path)
        finally:
            # No-op after a successful move; removes leftovers after a failure.
            partial_path.unlink(missing_ok=True)
    return pd.read_csv(file_path)

In [3]:
# Upstream source of the dataset and the local cache location it is stored at.
data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
directory_path = Path('data')
file_path = directory_path.joinpath('course_lead_scoring.csv')

# Read from the local cache, downloading first when the file is absent.
df = get_data(file_path=file_path, dir_path=directory_path, remote_path=data_url)
df.head()

The path 'data/course_lead_scoring.csv' exists.


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Show the first rows with columns as rows so that every field fits on screen.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


## Data preparation

In [5]:
# Columns stored with the 'object' dtype are treated as categorical features.
categorical = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'object'
]
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [6]:
# Numeric feature columns; the target column 'converted' is deliberately not listed.
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [7]:
# Count the missing values in each column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [8]:
# Impute missing values in place on `df`:
#  - categorical columns get the literal string 'NA' (it becomes its own category),
#  - numerical columns get 0.
# Re-running this cell is harmless because no missing values remain afterwards.
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0)

In [9]:
# Re-inspect the first rows after the missing values were filled.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [10]:
# Most frequent value(s) of the `industry` column.
df.industry.mode()

0    retail
Name: industry, dtype: object

In [11]:
# Frequency of each `industry` value, most common first.
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [39]:
# Pairwise correlation between the numerical features (pandas default: Pearson).
corr_matrix = df.loc[:, numerical].corr()
# to_string() keeps the row labels by default and avoids column truncation.
print(corr_matrix.to_string())

                          number_of_courses_viewed  annual_income  interaction_count  lead_score
number_of_courses_viewed                  1.000000       0.009770          -0.023565   -0.004879
annual_income                             0.009770       1.000000           0.027036    0.015610
interaction_count                        -0.023565       0.027036           1.000000    0.009888
lead_score                               -0.004879       0.015610           0.009888    1.000000


In [13]:
## Split the data

In [14]:

# 60/20/20 split: 20% of all rows go to test, then 25% of the remaining 80%
# (i.e. 20% of all rows) go to validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Drop the shuffled original indices so every frame is numbered from 0.
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Detach the target from the three modelling frames; df_full_train keeps its
# `converted` column because it is still used for exploratory statistics.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

df_full_train.converted.value_counts(normalize=True)


def mutual_info_score_converted_score(series):
    """Return the mutual information between ``series`` and ``df_full_train.converted``.

    Reads the notebook-level ``df_full_train`` frame; ``series`` is passed to
    ``mutual_info_score`` as the first argument and the target as the second.
    """
    target = df_full_train.converted
    return mutual_info_score(series, target)

# Score each categorical column against the target (one call per column).
mi = df_full_train[categorical].apply(mutual_info_score_converted_score, axis=0)

# Highest mutual information first.
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

In [17]:
# Turn each row into a feature dict; the vectorizer is fitted on the training
# rows only and then reused unchanged for the validation rows.
dv = DictVectorizer(sparse=False)
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

# Fit the classifier (fit() returns the estimator itself).
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42).fit(x_train, y_train)

# Second predict_proba column, thresholded at 0.5, compared with the labels.
# NOTE(review): `churn_decision` looks like a name carried over from a churn
# notebook; it is kept so that any cell reading it keeps working.
y_pred = model.predict_proba(x_val)[:, 1]
churn_decision = y_pred >= 0.5
accuracy = np.mean(y_val == churn_decision)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.70


In [18]:
def train_and_evaluate(features_to_use, c_value=1.0):
    """Fit a logistic regression on ``df_train`` and return validation accuracy.

    The function reads the notebook-level ``df_train``, ``df_val``, ``y_train``
    and ``y_val`` objects.

    Parameters
    ----------
    features_to_use : list of str
        Column names selected from ``df_train`` / ``df_val``.
    c_value : float, default 1.0
        Passed to ``LogisticRegression`` as ``C``. The default reproduces the
        previously hard-coded value, so existing calls behave as before.

    Returns
    -------
    float
        Fraction of validation rows whose thresholded prediction equals ``y_val``.
    """
    # Encode the rows as feature dicts; the vectorizer is fitted on training
    # rows only so that validation rows go through the exact same mapping.
    train_dicts = df_train[features_to_use].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)

    val_dicts = df_val[features_to_use].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(solver='liblinear', C=c_value, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Second predict_proba column, thresholded at 0.5.
    y_pred = model.predict_proba(X_val)[:, 1]
    predictions = (y_pred >= 0.5).astype(int)
    accuracy = (y_val == predictions).mean()

    return accuracy

In [20]:
# Reference score: the model trained on every categorical and numerical feature.
print("Training baseline model with all features...")
baseline_accuracy = train_and_evaluate(features_to_use=categorical + numerical)
print(f"Baseline accuracy (all features): {baseline_accuracy:.6f}\n")

Training baseline model with all features...
Baseline accuracy (all features): 0.699659



In [28]:
# Leave-one-feature-out ablation: retrain without each feature in turn and
# compare the validation accuracy with the baseline.
all_features = categorical + numerical
results = []
for feature_to_remove in all_features:
    features_subset = [name for name in all_features if name != feature_to_remove]
    accuracy_without = train_and_evaluate(features_subset)

    # difference = baseline - accuracy without the feature:
    #   > 0  the feature helps (accuracy drops when it is removed)
    #   < 0  the feature hurts (accuracy improves when it is removed)
    difference = baseline_accuracy - accuracy_without

    results.append(dict(
        feature=feature_to_remove,
        accuracy_without=accuracy_without,
        difference=difference,
    ))

results_df = pd.DataFrame(results).sort_values('difference', ascending=True)
# The table is followed by three blank lines.
print(results_df.to_string(index=False), end="\n\n\n\n")

# The feature whose removal changes accuracy the least, in either direction.
smallest_change_label = results_df['difference'].abs().idxmin()
least_useful_feature = results_df.loc[smallest_change_label]
print("Least useful feature (smallest absolute difference):", end="\n\n")
print(least_useful_feature)
print(f"\nFeature name: {least_useful_feature['feature']}")
print(f"Difference: {least_useful_feature['difference']}")

                 feature  accuracy_without  difference
           annual_income          0.853242   -0.153584
                location          0.709898   -0.010239
              lead_score          0.706485   -0.006826
             lead_source          0.703072   -0.003413
                industry          0.699659    0.000000
       employment_status          0.696246    0.003413
number_of_courses_viewed          0.556314    0.143345
       interaction_count          0.556314    0.143345



Least useful feature (smallest absolute difference):

feature             industry
accuracy_without    0.699659
difference               0.0
Name: 1, dtype: object

Feature name: industry
Difference: 0.0


In [34]:
def accuracy_training(c_value):
    """Train with regularization parameter ``C=c_value`` and report validation accuracy.

    Uses every column in ``categorical + numerical`` and reads the notebook-level
    ``df_train``, ``df_val``, ``y_train`` and ``y_val`` objects.

    Returns a dict with the keys ``'c_value'`` (the argument as given) and
    ``'accuracy'`` (validation accuracy rounded to 3 decimals).
    """
    feature_columns = categorical + numerical

    # Fit the vectorizer on training rows, then reuse it for validation rows.
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(
        df_train[feature_columns].to_dict(orient='records')
    )
    val_matrix = vectorizer.transform(
        df_val[feature_columns].to_dict(orient='records')
    )

    classifier = LogisticRegression(solver='liblinear', C=c_value, max_iter=1000, random_state=42)
    classifier.fit(train_matrix, y_train)

    # Second predict_proba column, thresholded at 0.5.
    positive_proba = classifier.predict_proba(val_matrix)[:, 1]
    predicted_positive = positive_proba >= 0.5
    accuracy = (y_val == predicted_positive).mean()

    return dict(c_value=c_value, accuracy=round(accuracy, 3))

In [35]:
# Sweep the regularization parameter C and collect one result dict per value.
c_value_list = [0.01, 0.1, 1, 10, 100]
accuracy_results = [accuracy_training(c) for c in c_value_list]

In [36]:
# Best accuracy first; ties are broken in favour of the smaller C.
accuracy_results_df = (
    pd.DataFrame(accuracy_results)
    .sort_values(by=['accuracy', 'c_value'], ascending=[False, True])
)

print("\n" + "=" * 40)
print("Results sorted by accuracy (best first):")
print(accuracy_results_df.to_string(index=False))


Results sorted by accuracy (best first):
 c_value  accuracy
    0.01       0.7
    0.10       0.7
    1.00       0.7
   10.00       0.7
  100.00       0.7
