In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif

In [2]:
# 1) Load data
# The lead-scoring CSV is read straight from the public datasets repository.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Sanity check: overall dimensions followed by the column names.
print("Rows, cols:", df.shape)
print(list(df.columns))

# The label column is 'converted'; tally its values (NaN counted too) to
# confirm it is present and see the class balance.
target_counts = df['converted'].value_counts(dropna=False)
print(target_counts)

Rows, cols: (1462, 9)
['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']
1    905
0    557
Name: converted, dtype: int64


In [3]:
# 2) Check missing values
# Count NaNs per column, then report only the columns that actually have gaps,
# largest count first.
missing_counts = df.isna().sum()
columns_with_gaps = missing_counts.loc[missing_counts > 0]
print(
    "Missing counts per column (top 20):\n",
    columns_with_gaps.sort_values(ascending=False).head(20),
)

Missing counts per column (top 20):
 annual_income        181
industry             134
lead_source          128
employment_status    100
location              63
dtype: int64


In [4]:
# 3) Impute missing values:
#    - text-like columns (object / string / category) => NaN becomes the label 'NA'
#    - every other column (numeric)                   => NaN becomes 0.0
# Work on a copy so the raw frame `df` is left untouched.
df_imputed = df.copy()

for col in df_imputed.columns:
    col_dtype = df_imputed[col].dtype
    if isinstance(col_dtype, pd.CategoricalDtype):
        # fillna() on a categorical raises unless the fill value is already
        # one of its categories, so register 'NA' first.
        column = df_imputed[col]
        if 'NA' not in column.cat.categories:
            column = column.cat.add_categories('NA')
        df_imputed[col] = column.fillna('NA')
    elif pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        # Checking the dtype via pandas' helpers also catches the dedicated
        # string dtypes, which do not compare equal to 'O' and would otherwise
        # fall through to the numeric branch.
        df_imputed[col] = df_imputed[col].fillna('NA')
    else:
        # numeric
        df_imputed[col] = df_imputed[col].fillna(0.0)

In [5]:
# 4) Q1: mode of 'industry'
# mode() can return several tied values; keep the first one it reports.
# Computed on the imputed frame, so the 'NA' placeholder is a possible answer.
industry_modes = df_imputed['industry'].mode()
industry_mode = industry_modes.iloc[0]
print("Mode of 'industry':", industry_mode)


Mode of 'industry': retail


In [6]:
# 5) Correlation matrix for numerical features
# Restrict to numeric columns and compute their pairwise correlations.
numeric_frame = df_imputed.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()
corr = numeric_frame.corr()
print("Numeric columns:", num_cols)
print("Correlation matrix (top):\n", corr)

# Candidate feature pairs for Q2; each one is looked up in the matrix above.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]
for a, b in pairs:
    # Guard: report (instead of failing) when a name is missing from the matrix.
    if a not in corr.index or b not in corr.columns:
        print(f"One of {a} or {b} not numeric/present.")
        continue
    print(f"corr({a},{b}) = {corr.loc[a,b]:.4f}")

Numeric columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
Correlation matrix (top):
                           number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  
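corr(interaction_count,lead_score) = 0.0099
corr(number_of_courses_viewed,lead_score) = -0.0049
corr(number_of_courses_viewed,interaction_count) = -0.0236
corr(annual_income,interaction_count) = 0.0270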

In [7]:
# 6) Split to train / val / test (60/20/20), seed=42
# The split is done on a copy of the imputed frame; the label is separated from
# the features at the end of the cell.
df_full = df_imputed.copy()

# Label column; stop immediately if the data does not contain it.
target = 'converted'
if target not in df_full.columns:
    raise RuntimeError("Target 'converted' not found")

# Step 1: hold out 20% of all rows as the test set, stratified on the label.
train_val, test = train_test_split(
    df_full,
    test_size=0.2,
    random_state=42,
    stratify=df_full[target],
)
# Step 2: split the remaining 80% into 75% / 25%, which is 60% / 20% of the
# full data, again stratified on the label.
train, val = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
    stratify=train_val[target],
)

print("Sizes: train, val, test =", len(train), len(val), len(test))

# Feature matrices exclude the label; labels are cast to int.
X_train, y_train = train.drop(columns=[target]), train[target].astype(int)
X_val, y_val = val.drop(columns=[target]), val[target].astype(int)
X_test, y_test = test.drop(columns=[target]), test[target].astype(int)

Sizes: train, val, test = 876 293 293


In [8]:
# 7) Q3: mutual information between converted and categorical variables (training set only)
# Identify categorical columns. The dtype is tested with pandas' helpers so that
# object, dedicated string, and category columns are all recognised.
cat_cols = [
    c for c in X_train.columns
    if isinstance(X_train[c].dtype, pd.CategoricalDtype)
    or pd.api.types.is_object_dtype(X_train[c].dtype)
    or pd.api.types.is_string_dtype(X_train[c].dtype)
]
print("Categorical columns:", cat_cols)

# mutual_info_classif needs numeric input, so each column is turned into
# integer codes with pd.factorize (one code per distinct label).
mi_scores = {}
for col in cat_cols:
    raw = X_train[col]
    # Label missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal text 'nan', after which a fillna('NA') would never fire.
    x = raw.astype(object).where(raw.notna(), 'NA').astype(str)
    codes, uniques = pd.factorize(x)
    mi = mutual_info_classif(codes.reshape(-1, 1), y_train, discrete_features=True, random_state=42)
    mi_scores[col] = float(mi[0])

# Round scores to two decimals for reporting; ranking uses the unrounded values.
mi_scores_rounded = {k: round(v, 2) for k, v in mi_scores.items()}
print("Mutual information (rounded):", mi_scores_rounded)

# Find the categorical variable with highest MI (first one wins on a tie).
best_cat = max(mi_scores, key=mi_scores.get)
print("Best categorical MI:", best_cat, mi_scores_rounded[best_cat])

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Mutual information (rounded): {'lead_source': 0.03, 'industry': 0.01, 'employment_status': 0.01, 'location': 0.0}
Best categorical MI: lead_source 0.03


In [9]:
# 8) Q4: Logistic regression with one-hot encoding of categorical variables
# Every category level gets its own indicator column (no level is dropped).
# Levels not seen during fit are encoded as all zeros (handle_unknown='ignore').
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the encoder on the training categories only, then reuse it for val/test
# so all three matrices share the same column layout.
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_val_cat = ohe.transform(X_val[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

# Numeric columns (remaining). Note: this rebinds `num_cols` to the feature
# columns that are not in `cat_cols`.
num_cols = [c for c in X_train.columns if c not in cat_cols]

X_train_num = X_train[num_cols].values
X_val_num = X_val[num_cols].values
X_test_num = X_test[num_cols].values

# Concatenate: numeric features first, one-hot columns after.
X_train_enc = np.hstack([X_train_num, X_train_cat])
X_val_enc = np.hstack([X_val_num, X_val_cat])
X_test_enc = np.hstack([X_test_num, X_test_cat])

# Fit logistic regression with given params
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)

# Predict on validation set and report accuracy rounded to two decimals.
y_val_pred = model.predict(X_val_enc)
val_acc = accuracy_score(y_val, y_val_pred)
print("Validation accuracy (Q4):", round(val_acc, 2))

Validation accuracy (Q4): 0.73


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Self-contained run of the regularisation sweep: the data is reloaded, imputed,
# split and encoded again here, so this cell rebinds `df`, `train`, `X_train`, ...

# 1. Load dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# 2. Handle missing values: 'NA' for text-like columns, 0.0 for the rest.
for col in df.columns:
    col_dtype = df[col].dtype
    if isinstance(col_dtype, pd.CategoricalDtype):
        # fillna() on a categorical raises unless 'NA' is already a category.
        column = df[col]
        if 'NA' not in column.cat.categories:
            column = column.cat.add_categories('NA')
        df[col] = column.fillna('NA')
    elif pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        # Also matches pandas' dedicated string dtypes, which are not equal to 'O'.
        df[col] = df[col].fillna('NA')
    else:
        df[col] = df[col].fillna(0.0)

# 3. Split into train / val / test (60/20/20)
target = 'converted'
df_full = df.copy()

# 20% test first, then 25% of the remaining 80% as validation; both stratified.
train_val, test = train_test_split(df_full, test_size=0.2, random_state=42, stratify=df_full[target])
train, val = train_test_split(train_val, test_size=0.25, random_state=42, stratify=train_val[target])

X_train = train.drop(columns=[target])
y_train = train[target].astype(int)
X_val = val.drop(columns=[target])
y_val = val[target].astype(int)

# 4. Identify categorical and numerical columns
cat_cols = [
    c for c in X_train.columns
    if isinstance(X_train[c].dtype, pd.CategoricalDtype)
    or pd.api.types.is_object_dtype(X_train[c].dtype)
    or pd.api.types.is_string_dtype(X_train[c].dtype)
]
num_cols = [c for c in X_train.columns if c not in cat_cols]

# 5. One-hot encode categorical features (encoder fitted on train only)
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_val_cat = ohe.transform(X_val[cat_cols])

X_train_num = X_train[num_cols].values
X_val_num = X_val[num_cols].values

X_train_enc = np.hstack([X_train_num, X_train_cat])
X_val_enc = np.hstack([X_val_num, X_val_cat])

# 6. Train regularized logistic regression for different C values
Cs = [0.01, 0.1, 1, 10, 100]
results_C = {}

for C in Cs:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_enc, y_train)
    y_val_pred = model.predict(X_val_enc)
    # Store a plain Python float (rounded to 3 decimals) so the dict prints
    # cleanly regardless of the installed NumPy version.
    acc_val = round(float(accuracy_score(y_val, y_val_pred)), 3)
    results_C[C] = acc_val
    print(f"C={C} -> Validation accuracy: {acc_val}")

# 7. Determine best C (smallest C in case of tie)
# The top accuracy is computed once, then the smallest C that reaches it is kept.
best_acc = max(results_C.values())
best_C = min(c for c, a in results_C.items() if a == best_acc)

print("\nValidation accuracies per C:", results_C)
print(f"Best C (choose smallest if tie): {best_C}")


C=0.01 -> Validation accuracy: 0.734
C=0.1 -> Validation accuracy: 0.73
C=1 -> Validation accuracy: 0.73
C=10 -> Validation accuracy: 0.73
C=100 -> Validation accuracy: 0.73

Validation accuracies per C: {0.01: 0.734, 0.1: 0.73, 1: 0.73, 10: 0.73, 100: 0.73}
Best C (choose smallest if tie): 0.01
