### Classification

In [12]:
import pandas as pd
import matplotlib.pyplot as plt

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-11 20:15:24--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-11 20:15:24 (10.5 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [3]:
# Load the downloaded lead-scoring CSV from the working directory.
df = pd.read_csv(filepath_or_buffer='course_lead_scoring.csv')

In [7]:
# Display the dtype pandas inferred for every column of the loaded frame.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [4]:
# Per column: is at least one value missing?
df.isnull().any(axis=0)

lead_source                  True
industry                     True
number_of_courses_viewed    False
annual_income                True
employment_status            True
location                     True
interaction_count           False
lead_score                  False
converted                   False
dtype: bool

In [8]:
# Replacement for missing values, keyed by column: the four categorical
# columns get the literal label 'NA', the numeric income column gets 0.0.
fill_values = {
    'lead_source': 'NA',
    'industry': 'NA',
    'employment_status': 'NA',
    'location': 'NA',
    'annual_income': 0.0,
}

for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

#### Industry Mode Value

In [9]:
# Most frequent value(s) of the industry column.
df['industry'].mode(dropna=True)

0    retail
Name: industry, dtype: object

In [10]:
# Frequency of every industry label, most common first.
df['industry'].value_counts(sort=True, ascending=False)

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

##

#### Correlation matrix

In [16]:
# Numerical features whose pairwise Pearson correlations are inspected.
num_cols = [
    'lead_score',
    'interaction_count',
    'annual_income',
    'number_of_courses_viewed',
]

df[num_cols].corr(method='pearson')

Unnamed: 0,lead_score,interaction_count,annual_income,number_of_courses_viewed
lead_score,1.0,0.009888,0.01561,-0.004879
interaction_count,0.009888,1.0,0.027036,-0.023565
annual_income,0.01561,0.027036,1.0,0.00977
number_of_courses_viewed,-0.004879,-0.023565,0.00977,1.0


#### Mutual information score

In [40]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the feature columns.
y = df['converted'].values
X = df.drop(columns='converted')

# 60/20/20 split: first hold out 20% as the test set, then take 25% of the
# remaining 80% (i.e. 20% of the full data) as the validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [41]:
from sklearn.metrics import mutual_info_score

# Mutual information between the target and each categorical feature,
# computed on the training split only and rounded to two decimals.
mutual_info_score_dict = {
    feature: round(mutual_info_score(y_train, X_train[feature]), 2)
    for feature in ['lead_source', 'industry', 'employment_status', 'location']
}


In [42]:
# Rank the categorical features from lowest to highest mutual information.
sorted(mutual_info_score_dict.items(), key=lambda item: item[1], reverse=False)

[('location', 0.0),
 ('industry', 0.01),
 ('employment_status', 0.01),
 ('lead_source', 0.04)]

### Logistic regression model

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Categorical features that get one-hot encoded; every other column is
# passed through to the classifier unchanged (remainder="passthrough").
cat_cols = ['lead_source', 'industry', 'employment_status', 'location']

# Baseline model: one-hot encoding followed by logistic regression.
# handle_unknown="ignore" makes the encoder emit an all-zero row for a
# category that was not present at fit time, instead of raising when the
# validation/test data contains a label the training split never saw.
log_reg_pipe = Pipeline(
    [
        (
            'oneHotEncoder',
            ColumnTransformer(
                [("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
                remainder="passthrough",
            ),
        ),
        (
            "log_reg",
            LogisticRegression(solver="liblinear", C=1.0, max_iter=1_000, random_state=42),
        ),
    ]
)


In [44]:
# Fit the baseline pipeline on the training split.
log_model = log_reg_pipe.fit(X=X_train, y=y_train)

In [48]:
# Hard class predictions for the validation split.
y_pred = log_model.predict(X=X_val)

In [52]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the model trained on all features; used as the
# reference value in the feature-elimination comparison.
full_model_score = accuracy_score(y_true=y_val, y_pred=y_pred)

### Feature elimination

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes the named columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list of str or None, default=None
        Columns to remove in ``transform``. ``None`` leaves the input
        untouched. Exposing this as a constructor argument lets
        ``get_params`` / ``set_params`` / ``clone`` see it, so the setting
        survives cloning (e.g. in cross-validation or grid search).
    """

    def __init__(self, columns_to_drop=None):
        # Stored unmodified, as the scikit-learn estimator contract requires.
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None, columns_to_drop=None):
        """Record the columns to drop; nothing is learned from ``X``.

        ``columns_to_drop`` is still accepted here so existing calls such as
        ``pipe.fit(X, y, column_dropper__columns_to_drop=[...])`` keep
        working; when given, it overrides the constructor value.

        Returns
        -------
        self
        """
        if columns_to_drop is not None:
            self.columns_to_drop = columns_to_drop
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Columns that are not present in ``X`` are skipped silently
        (``errors='ignore'``). When no columns are configured, ``X`` itself
        is returned.
        """
        if self.columns_to_drop is None:
            return X
        return X.drop(columns=self.columns_to_drop, errors='ignore')
    
# Feature-elimination pipeline: drop the requested columns, one-hot encode
# the categorical columns listed in cat_cols, then fit logistic regression.
# handle_unknown="ignore" makes the encoder emit an all-zero row for a
# category that was not present at fit time instead of raising at predict time.
log_reg_feature_elim_pipe = Pipeline(
    [
        ("column_dropper", ColumnDropper()),
        (
            'oneHotEncoder',
            ColumnTransformer(
                [("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
                remainder="passthrough",
            ),
        ),
        (
            "log_reg",
            LogisticRegression(solver="liblinear", C=1.0, max_iter=1_000, random_state=42),
        ),
    ]
)


In [66]:

# Absolute change in validation accuracy when a single feature is removed,
# keyed by the removed feature's name.
score_diff_dict = {}

cat_cols_orig = ['lead_source', 'industry', 'employment_status', 'location']

for elim_col in ['industry', 'employment_status', 'lead_score']:

    # Encode only the categorical columns that survive the drop. For a
    # numeric elim_col the filter removes nothing, so all four remain; a
    # fresh list is built each time rather than aliasing cat_cols_orig.
    cat_cols = [col for col in cat_cols_orig if col != elim_col]

    print(cat_cols, elim_col)

    # The pipeline is rebuilt per iteration because the encoder's column
    # list depends on which feature is being eliminated.
    # handle_unknown="ignore" keeps predict from raising on a category that
    # was absent from the training split.
    log_reg_feature_elim_pipe = Pipeline(
        [
            ("column_dropper", ColumnDropper()),
            (
                'oneHotEncoder',
                ColumnTransformer(
                    [("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
                    remainder="passthrough",
                ),
            ),
            (
                "log_reg",
                LogisticRegression(solver="liblinear", C=1.0, max_iter=1_000, random_state=42),
            ),
        ]
    )

    # NOTE: log_model and y_pred are rebound here, so after this cell they
    # no longer refer to the full-feature model that produced full_model_score.
    log_model = log_reg_feature_elim_pipe.fit(
        X_train, y_train, column_dropper__columns_to_drop=[elim_col]
    )
    y_pred = log_model.predict(X_val)

    feature_elim_score = accuracy_score(y_val, y_pred)

    score_diff_dict[elim_col] = abs(full_model_score - feature_elim_score)



['lead_source', 'employment_status', 'location'] industry
['lead_source', 'industry', 'location'] employment_status
['lead_source', 'industry', 'employment_status', 'location'] lead_score


#### Regularized logistic regression

In [78]:
# Validation accuracy for each inverse-regularization strength C.
c_param_accuracy_dict = {}

cat_cols = ['lead_source', 'industry', 'employment_status', 'location']

for C_val in [0.01, 0.1, 1, 10, 100]:
    # Same pipeline as the baseline model, with only C varied.
    # handle_unknown="ignore" keeps predict from raising on a category that
    # was absent from the training split.
    log_reg_pipe = Pipeline(
        [
            (
                'oneHotEncoder',
                ColumnTransformer(
                    [("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
                    remainder="passthrough",
                ),
            ),
            (
                "log_reg",
                LogisticRegression(solver="liblinear", C=C_val, max_iter=1_000, random_state=42),
            ),
        ]
    )

    # NOTE: log_reg_pipe, log_model and y_pred are rebound on every
    # iteration; after the loop they belong to the last C value (100).
    log_model = log_reg_pipe.fit(X_train, y_train)
    y_pred = log_model.predict(X_val)

    c_param_accuracy_dict[C_val] = accuracy_score(y_val, y_pred)

In [76]:
# Rank the C values from lowest to highest validation accuracy.
sorted(c_param_accuracy_dict.items(), key=lambda item: item[1], reverse=False)

[(0.01, 0.6996587030716723),
 (0.1, 0.6996587030716723),
 (1, 0.6996587030716723),
 (10, 0.6996587030716723),
 (100, 0.6996587030716723)]