# HW - 3

## Data Preparation

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tabulate import tabulate

In [23]:
df = pd.read_csv("course_lead_scoring.csv")

In [24]:
df.head(10)
df.info

<bound method DataFrame.info of          lead_source       industry  number_of_courses_viewed  annual_income  \
0           paid_ads            NaN                         1        79450.0   
1       social_media         retail                         1        46992.0   
2             events     healthcare                         5        78796.0   
3           paid_ads         retail                         2        83843.0   
4           referral      education                         3        85012.0   
...              ...            ...                       ...            ...   
1457        referral  manufacturing                         1            NaN   
1458        referral     technology                         3        65259.0   
1459        paid_ads     technology                         1        45688.0   
1460        referral            NaN                         5        71016.0   
1461  organic_search        finance                         3        92855.0   

     em

In [25]:
# check missing value in all cols
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [26]:
# getting the datatype of each columns

for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(0)
    else:
        df[col] = df[col].fillna('NA')
df.head(10)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
5,events,manufacturing,1,59904.0,,africa,6,0.83,1
6,social_media,technology,0,51283.0,,middle_east,2,0.57,0
7,social_media,,5,62975.0,student,europe,4,0.62,1
8,referral,healthcare,4,38648.0,unemployed,south_america,2,0.86,1
9,paid_ads,other,3,59866.0,student,australia,3,0.43,1


## Question 1

In [27]:
df["industry"].mode()

0    retail
Name: industry, dtype: object

## Question 2

### correlation matrix

In [28]:
corr_list = [
    ["interaction_count", "lead_score"],
    ["number_of_courses_viewed", "lead_score"],
    ["number_of_courses_viewed", "interaction_count"],
    ["annual_income", "lead_score"]
]

max_value = -1
max_corr = []

for corr in corr_list:
    corr_value = df[corr[0]].corr(df[corr[1]])
    if abs(corr_value) >= max_value:
        max_corr = corr
        max_value = abs(corr_value)

print(f"The biggest correlation is between {max_corr[0]} and {max_corr[1]}: {max_value:.3f}")


The biggest correlation is between number_of_courses_viewed and interaction_count: 0.024


### Split the data

In [29]:
seed = 42
d_test = 0.2
d_val = 0.2

# drop 'converted' col from df
df_full_train, df_test = train_test_split(df.drop(columns='converted'), test_size=d_test, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=d_val, random_state=seed)

print(f'Shape of full training set: {df_full_train.shape}')
print(f'Shape of validation set: {df_val.shape}')
print(f'Shape of test set: {df_test.shape}')
df_train

Shape of full training set: (1169, 8)
Shape of validation set: (234, 8)
Shape of test set: (293, 8)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
69,organic_search,healthcare,4,69608.0,,australia,2,0.49
1010,paid_ads,other,5,69765.0,self_employed,north_america,2,0.32
159,paid_ads,manufacturing,0,64593.0,unemployed,europe,2,0.25
156,paid_ads,technology,3,78148.0,employed,middle_east,2,0.80
1427,paid_ads,,3,37109.0,unemployed,middle_east,6,0.50
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


## Question 3

`mutual_info_score` measures **how much knowing one variable reduces uncertainty about another**.
It’s a measure from **information theory**, used to detect any **statistical dependence** between two variables — not just linear (like correlation), but *any* relationship.

Formally:
$$
I(X; Y) = \sum_{x,y} p(x, y) \log \frac{p(x, y)}{p(x)p(y)}
$$

**Correlation** → measures linear alignment.

**Mutual Information** → measures shared information, any dependency — linear or not.

| Plot Type                 | Looks Like     | Correlation | Mutual Info |
| ------------------------- | -------------- | ----------- | ----------- |
| 🔼 Straight line (Y = 2X) | ⬈              | High        | High        |
| 🔵 Parabola (Y = X²)      | ⬆️ then ⬇️     | Near 0      | High        |
| 🔀 Random (no pattern)    | scattered dots | 0           | 0           |


In [30]:
labels = df.converted[df_train.index]

feature_of_interest = ['industry', 'location', 'lead_source', 'employment_status']

categorical = df[feature_of_interest].select_dtypes(exclude="number").columns.to_list()

mi_scores = df_train[categorical].apply(lambda series: mutual_info_score(series, labels))

mi_scores_sorted = mi_scores.sort_values(ascending=False)

print(mi_scores_sorted.round(2))

highest_score_variable = mi_scores_sorted.idxmax()
highest_score_value = mi_scores_sorted.max()

print(f"The variable with the biggest mutual information score is '{highest_score_variable}' with a score of {highest_score_value:.2f}.")

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64
The variable with the biggest mutual information score is 'lead_source' with a score of 0.03.


## Question 4

In [31]:
numerical = df_train.select_dtypes(include="number").columns.to_list()
categorical = df_train.select_dtypes(exclude="number").columns.to_list()
features = categorical + numerical

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(X=train_dict)
X_val = dv.transform(X=val_dict)
X_test = dv.transform(X=test_dict)

y_train = df.converted[df_train.index].to_numpy()
y_val = df.converted[df_val.index].to_numpy()
y_test = df.converted[df_test.index].to_numpy()

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {round(accuracy, 2)}')

Validation Accuracy: 0.71


## Question 5

In [32]:
features_of_interest = ['industry', 'employment_status', 'lead_score']

results = []

accuracy_differences = {}

for feature in features_of_interest:
    features_excluded = [f for f in features_of_interest if f != feature]
    
    train_dict_excluded = df_train[features_excluded].to_dict(orient="records")
    val_dict_excluded = df_val[features_excluded].to_dict(orient="records")

    X_train_excluded = dv.fit_transform(X=train_dict_excluded)
    X_val_excluded = dv.transform(X=val_dict_excluded)
    
    model.fit(X_train_excluded, y_train)
    y_val_pred_excluded = model.predict(X_val_excluded)
    
    accuracy_excluded = accuracy_score(y_val, y_val_pred_excluded)
    
    accuracy_difference = accuracy - accuracy_excluded
    accuracy_differences[feature] = accuracy_difference
    
    results.append([feature, round(accuracy, 4), round(accuracy_excluded, 4), round(accuracy_difference, 4)])

results_df = pd.DataFrame(results, columns=["Feature", "Accuracy With Feature", "Accuracy Without Feature", "Accuracy Difference"])

print("\nAccuracy Results:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', floatfmt=".4f"))

least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
smallest_difference_value = accuracy_differences[least_useful_feature]

print(f'\nThe least useful feature is "{least_useful_feature}" with a difference of {smallest_difference_value:.4f}.')


Accuracy Results:
+---+-------------------+-----------------------+--------------------------+---------------------+
|   |      Feature      | Accuracy With Feature | Accuracy Without Feature | Accuracy Difference |
+---+-------------------+-----------------------+--------------------------+---------------------+
| 0 |     industry      |        0.7051         |          0.6282          |       0.0769        |
| 1 | employment_status |        0.7051         |          0.6624          |       0.0427        |
| 2 |    lead_score     |        0.7051         |          0.641           |       0.0641        |
+---+-------------------+-----------------------+--------------------------+---------------------+

The least useful feature is "employment_status" with a difference of 0.0427.


## Question 6

In [33]:
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=seed)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    rounded_accuracy = round(accuracy, 3)
    
    print(f'Accuracy with C={C}: {rounded_accuracy}')
    
    if rounded_accuracy > best_accuracy or (rounded_accuracy == best_accuracy and (best_C is None or C < best_C)):
        best_accuracy = rounded_accuracy
        best_C = C

print(f'The best C value is {best_C} with an accuracy of {best_accuracy:.3f}.')

Accuracy with C=0.01: 0.705
Accuracy with C=0.1: 0.705
Accuracy with C=1: 0.705
Accuracy with C=10: 0.705
Accuracy with C=100: 0.705
The best C value is 0.01 with an accuracy of 0.705.
