In [5]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Location of the lead-scoring CSV; pandas downloads it directly from this URL.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

# Raw dataset, one row per lead.
df = pd.read_csv(filepath_or_buffer=url)

In [7]:
# Per-column count of missing values, taken before any imputation is applied.
print("Missing values before:", df.isnull().sum(), sep="\n")

Missing values before:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [8]:
# Display the dtype of every column (object columns vs. numeric columns).
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [9]:
# Standardise column names: lower-case, with spaces replaced by underscores.
normalized_names = df.columns.str.lower().str.replace(" ", "_")
df.columns = normalized_names

# Split the columns by dtype: text-like columns vs. numeric columns.
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

In [10]:
# Apply the same lower-case / underscore normalisation to the string values.
# Columns that are not of object dtype (e.g. pandas "category") are left as they are.
for name in cat_cols:
    if df[name].dtype != "object":
        continue
    df[name] = df[name].str.lower().str.replace(" ", "_")

In [11]:
# Impute missing values: the literal string "NA" for categorical columns,
# 0.0 for numeric columns (categoricals first, then numerics).
for columns, filler in ((cat_cols, "NA"), (num_cols, 0.0)):
    df[columns] = df[columns].fillna(filler)

In [12]:
# Question 1: most frequent observation (mode) of the "industry" column.
col = "industry"
if col not in df.columns:
    raise ValueError(f"Column '{col}' not found. Available columns: {list(df.columns)}")

# Frequency table, computed once and reused below (missing values are counted too).
industry_counts = df[col].value_counts(dropna=False)

mode_value = industry_counts.idxmax()
mode_count = industry_counts.max()

print(f"Most frequent observation (mode) for '{col}': {mode_value} (count={mode_count})")

# Top 10 most frequent values, displayed as the cell output.
industry_counts.head(10)

Most frequent observation (mode) for 'industry': retail (count=203)


industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [13]:
# Question 2: which of the listed numeric pairs has the largest correlation?
corr_matrix = df[num_cols].corr()

# Candidate pairs to compare.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

pair_corrs = {}
missing_pairs = []

# Look each pair up in the correlation matrix; remember pairs whose columns are absent.
for a, b in pairs:
    if a in corr_matrix.index and b in corr_matrix.columns:
        pair_corrs[(a, b)] = corr_matrix.loc[a, b]
    else:
        missing_pairs.append((a, b))

print("Selected pair correlations:")
for (a, b), v in pair_corrs.items():
    print(f"{a} ↔ {b}: corr={v:.6f} | abs={abs(v):.6f}")

if missing_pairs:
    print("\nWarning: These pairs were not found in the dataframe (column missing):")
    for p in missing_pairs:
        print(" -", p)

if pair_corrs:
    # Rank by magnitude so strong negative correlations are considered as well.
    best_pair = max(pair_corrs.items(), key=lambda kv: abs(kv[1]))
    print(f"\n► Pair with the largest absolute correlation (among the given options): {best_pair[0]} = {best_pair[1]:.6f}")
else:
    print("\nNo valid pairs found. Check column names in the dataset.")

# --- Train / validation / test split (60% / 20% / 20%) ---

TARGET_COL = "converted"
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found. Available columns: {list(df.columns)}")

# Convert the target to integer labels (handles 'yes'/'no' or '0'/'1' strings).
y_raw = df[TARGET_COL]
if not pd.api.types.is_numeric_dtype(y_raw):
    # Attempt numeric coercion first
    y_num = pd.to_numeric(y_raw, errors="coerce")
    if y_num.isna().any():
        # Map common string labels
        map_dict = {"yes": 1, "no": 0, "y": 1, "n": 0, "true": 1, "false": 0, "1": 1, "0": 0}
        y_num = y_raw.astype(str).str.lower().map(map_dict)
    # Fail loudly on labels that cannot be interpreted: silently relabelling them
    # as 0 would train the model on fabricated negatives.
    unmapped = y_num.isna()
    if unmapped.any():
        bad_labels = sorted(y_raw[unmapped].astype(str).unique())
        raise ValueError(
            f"Target column '{TARGET_COL}' has {int(unmapped.sum())} value(s) "
            f"that cannot be converted to 0/1: {bad_labels}"
        )
    y = y_num.astype(int).values
else:
    y = y_raw.astype(int).values

# Feature matrix without target
X = df.drop(columns=[TARGET_COL])

# First split: train (60%) and temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second split: temp into val (20%) and test (20%) -> each is half of temp (i.e., 0.2 of total)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Shapes:")
print("  X_train:", X_train.shape, "y_train:", y_train.shape)
print("  X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)
print("  X_test: ", X_test.shape,  "y_test: ", y_test.shape)

# Quick sanity check that target is not inside features
assert TARGET_COL not in X_train.columns
assert TARGET_COL not in X_val.columns
assert TARGET_COL not in X_test.columns


Selected pair correlations:
interaction_count ↔ lead_score: corr=0.009888 | abs=0.009888
number_of_courses_viewed ↔ lead_score: corr=-0.004879 | abs=0.004879
number_of_courses_viewed ↔ interaction_count: corr=-0.023565 | abs=0.023565
annual_income ↔ interaction_count: corr=0.027036 | abs=0.027036

► Pair with the largest absolute correlation (among the given options): ('annual_income', 'interaction_count') = 0.027036
Shapes:
  X_train: (877, 8) y_train: (877,)
  X_val:   (292, 8) y_val:   (292,)
  X_test:  (293, 8) y_test:  (293,)


In [14]:
# Question 3: mutual information between each candidate categorical variable
# and the target, computed on the training split only.

# Identify categorical columns in the TRAIN split only
cat_cols_train = list(X_train.select_dtypes(include=["object", "category"]).columns)


def mi_with_y(y, series):
    """Return `mutual_info_score(y, series)`.

    Thin wrapper with no extra validation: `y` and `series` are passed
    straight through as the two labelings to compare.
    """
    return mutual_info_score(y, series)


candidates = ["industry", "location", "lead_source", "employment_status"]

mi_raw = {}      # unrounded scores: used to pick the winner
mi_results = {}  # scores rounded to 2 decimals: used for display only
missing = []

for col in candidates:
    if col not in X_train.columns:
        missing.append(col)
        continue
    # Columns not detected as categorical are cast to strings so that each
    # distinct raw value is treated as its own category.
    values = X_train[col] if col in cat_cols_train else X_train[col].astype(str)
    mi_raw[col] = mi_with_y(y_train, values)
    mi_results[col] = round(mi_raw[col], 2)

print("Mutual information (rounded to 2 decimals):")
for k, v in mi_results.items():
    print(f"  {k}: {v}")

if missing:
    print("\nWarning: Missing columns in training data:", missing)

if mi_raw:
    # Rank on the unrounded scores: two variables that round to the same value
    # would otherwise be ordered by their position in `candidates`, not by score.
    best_col = max(mi_raw, key=mi_raw.get)
    best = (best_col, mi_results[best_col])
    print(f"\n► Variable with the largest MI (among the given options): {best[0]} (score={best[1]})")
else:
    print("\nNo candidate variables found in training data.")

Mutual information (rounded to 2 decimals):
  industry: 0.02
  location: 0.0
  lead_source: 0.03
  employment_status: 0.02

► Variable with the largest MI (among the given options): lead_source (score=0.03)


In [15]:
# Question 4: logistic regression on all features, scored on the validation split.

# Column groups are taken from the dtypes of the TRAIN frame.
cat_cols_train = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols_train = X_train.select_dtypes(include=[np.number, "bool"]).columns.tolist()

# Categorical columns are one-hot encoded; numeric columns pass through unchanged.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, cat_cols_train),
        ("num", "passthrough", num_cols_train),
    ]
)

# Preprocessing and classifier chained into one estimator.
log_reg = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
logreg_clf = Pipeline(steps=[("preprocess", preprocessor), ("model", log_reg)])

# Fit on the training split, then predict the validation split.
logreg_clf.fit(X_train, y_train)
y_val_pred = logreg_clf.predict(X_val)

val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Validation accuracy (rounded to 2 decimals): {round(val_acc, 2)}")

Validation accuracy: 0.7432
Validation accuracy (rounded to 2 decimals): 0.74


In [16]:
# Question 5: which feature's removal changes validation accuracy the least?


def _build_logreg_pipeline(features):
    """Return an unfitted one-hot + logistic-regression pipeline for `features`.

    The categorical / numeric column groups are derived from the dtypes of
    `features`, so the same builder serves the full frame and any column subset.
    """
    cat_names = list(features.select_dtypes(include=["object", "category"]).columns)
    num_names = list(features.select_dtypes(include=[np.number, "bool"]).columns)

    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_names),
            ("num", "passthrough", num_names),
        ]
    )
    return Pipeline(
        steps=[
            ("preprocess", pre),
            ("model", LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)),
        ]
    )


# Baseline: all features (same configuration as Q4).
baseline_clf = _build_logreg_pipeline(X_train)
baseline_preprocessor = baseline_clf.named_steps["preprocess"]

baseline_clf.fit(X_train, y_train)
baseline_pred = baseline_clf.predict(X_val)
baseline_acc = accuracy_score(y_val, baseline_pred)
print("Baseline validation accuracy (all features):", baseline_acc)


def eval_without(feature_to_drop: str):
    """Return validation accuracy of the model trained without `feature_to_drop`.

    Returns `np.nan` when the feature is not a column of `X_train`.
    """
    if feature_to_drop not in X_train.columns:
        return np.nan  # feature not present

    cols_after_drop = [c for c in X_train.columns if c != feature_to_drop]

    clf = _build_logreg_pipeline(X_train[cols_after_drop])
    clf.fit(X_train[cols_after_drop], y_train)
    return accuracy_score(y_val, clf.predict(X_val[cols_after_drop]))


candidates = ["industry", "employment_status", "lead_score"]

rows = []
for f in candidates:
    acc_wo = eval_without(f)
    diff = baseline_acc - acc_wo if pd.notnull(acc_wo) else np.nan
    rows.append(
        {
            "feature": f,
            "val_acc_without": acc_wo,
            "difference (baseline - without)": diff,
            "abs_difference": abs(diff) if pd.notnull(diff) else np.nan,
        }
    )

# Stable sort: features with equal |difference| keep their order from `candidates`,
# so the reported winner does not depend on the sort algorithm.
results_df = pd.DataFrame(rows).sort_values("abs_difference", ascending=True, kind="stable")
print("\nPer-feature removal results (sorted by absolute difference):")
print(results_df.to_string(index=False))

if results_df["abs_difference"].notna().any():
    least_impact_row = results_df.iloc[0]
    print(
        f"\n► Least impact (smallest |difference|): '{least_impact_row['feature']}' "
        f"(difference={least_impact_row['difference (baseline - without)']})"
    )

    # Make ties visible instead of silently reporting only the first row.
    is_tied = np.isclose(results_df["abs_difference"], least_impact_row["abs_difference"])
    tied_features = results_df.loc[is_tied, "feature"].tolist()
    if len(tied_features) > 1:
        print(
            f"  Note: {tied_features} are tied on |difference|; "
            "the first one in candidate order is reported above."
        )

Baseline validation accuracy (all features): 0.7431506849315068

Per-feature removal results (sorted by absolute difference):
          feature  val_acc_without  difference (baseline - without)  abs_difference
         industry         0.743151                         0.000000        0.000000
       lead_score         0.743151                         0.000000        0.000000
employment_status         0.746575                        -0.003425        0.003425

► Least impact (smallest |difference|): 'industry' (difference=0.0)


In [17]:
# Question 6: compare validation accuracy across several values of C.

Cs_q6 = [0.01, 0.1, 1, 10, 100]

# Column groups from the TRAIN frame, stored under Q6-specific names so the
# variables from earlier questions are left untouched.
cat_cols_q6 = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols_q6 = X_train.select_dtypes(include=[np.number, "bool"]).columns.tolist()

records = []

for C in Cs_q6:
    # A new preprocessor and model are built for every C.
    pre_q6 = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_q6),
            ("num", "passthrough", num_cols_q6),
        ]
    )
    model_q6 = LogisticRegression(solver="liblinear", C=C, max_iter=1000, random_state=42)
    clf_q6 = Pipeline(steps=[("preprocess", pre_q6), ("model", model_q6)])

    # Fit on train, predict on validation, keep both exact and 3-decimal accuracy.
    y_val_pred_q6 = clf_q6.fit(X_train, y_train).predict(X_val)
    acc_q6 = accuracy_score(y_val, y_val_pred_q6)
    acc_q6_round = round(acc_q6, 3)
    records.append(dict(C=C, val_acc=acc_q6, val_acc_rounded_3=acc_q6_round))

results_q6 = pd.DataFrame(records).sort_values("C")
print(results_q6[["C", "val_acc_rounded_3"]].to_string(index=False))

# The winner is picked on the 3-decimal accuracy; among equal scores the
# smallest C is taken.
max_rounded = results_q6["val_acc_rounded_3"].max()
is_best = results_q6["val_acc_rounded_3"] == max_rounded
best_candidates = results_q6[is_best]
best_C_final = float(best_candidates["C"].min())

print(f"\n► Best C (chosen by max rounded accuracy, tie -> smallest C): {best_C_final}")

     C  val_acc_rounded_3
  0.01              0.743
  0.10              0.743
  1.00              0.743
 10.00              0.743
100.00              0.743

► Best C (chosen by max rounded accuracy, tie -> smallest C): 0.01
