In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the lead-scoring CSV into a DataFrame and preview its first rows.
csv_path = "course_lead_scoring.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

In [None]:
def fill_na(df, column, value):
    """Return a copy of ``df`` with missing values in ``column`` replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left untouched; the work is done on a copy so
        that re-running a cell never changes data an earlier cell displayed.
    column : str or list of str
        A single column label or a list of labels to fill.
    value : scalar
        Replacement for missing entries in the selected column(s).

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected column(s) filled.
    """
    # Copy first: assigning into the caller's frame would silently mutate it.
    filled = df.copy()
    filled[column] = filled[column].fillna(value)
    return filled

# Fill the listed text columns with the literal 'NA' and annual_income with 0.
for na_columns, na_replacement in (
    (['lead_source', 'industry', 'employment_status', 'location'], 'NA'),
    ('annual_income', 0),
):
    df = fill_na(df, na_columns, na_replacement)

### Question 1

What is the most frequent observation (mode) for the column `industry`?

- NA
- technology
- healthcare
- __retail__

In [None]:
# Count how often each value of the `industry` column occurs.
df['industry'].value_counts()

In [None]:
# Numerical columns are listed by hand; categorical ones are every column pandas typed as object.
num_features = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
cat_features = list(df.select_dtypes(include=['object']).columns)

df_numeric = df[num_features]
df_categorical = df[cat_features]

# Pairwise correlation between the numerical features.
corr_matrix = df_numeric.corr()
print(corr_matrix)

# Same matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

What are the two features that have the biggest correlation?

- interaction_count and lead_score
- number_of_courses_viewed and lead_score
- __number_of_courses_viewed and interaction_count__
- annual_income and lead_score

In [None]:
# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Replace the shuffled row labels with a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes the target column from each frame and returns it in one step.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [None]:
def mutual_info_cat(df, y, cat_features):
    """Map each listed feature to its mutual information with ``y``.

    Every score is ``mutual_info_score(y, df[feature])`` rounded to two
    decimal places. Returns a dict keyed by feature name.
    """
    return {
        name: round(mutual_info_score(y, df[name]), 2)
        for name in cat_features
    }

# Score every categorical feature against the target, using the training split only.
mutual_info_cat(df_train, y_train, cat_features)

Which of these variables has the biggest mutual information score?
  
- industry
- location
- __lead_source__
- employment_status

In [None]:
# Turn each row into a dict and vectorise it. The vocabulary is learned from
# the training rows only and then reused unchanged for the validation rows.
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [None]:
# Baseline logistic regression with fixed hyper-parameters and seed.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Column 1 of predict_proba: presumably the probability of converted == 1.
y_pred = model.predict_proba(X_val)[:, 1]
# Hard decision at the 0.5 threshold; accuracy is the share of rows where it matches the label.
converted_descision = y_pred >= 0.5
(y_val == converted_descision).mean()

What accuracy did you get?

- 0.64
- __0.74__
- 0.84
- 0.94

In [None]:
def train_model(df_train, df_val, y_train, y_val, C=1.0):
    """Fit a logistic regression on ``df_train`` and return validation accuracy.

    Both frames are converted to row dicts and encoded with a DictVectorizer
    fitted on the training rows only. Validation predictions are thresholded
    at 0.5 and compared with ``y_val``.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Feature frames for training and validation.
    y_train, y_val : array-like
        Targets aligned with the rows of the corresponding frame.
    C : float, default 1.0
        Passed straight through to ``LogisticRegression``.

    Returns
    -------
    float
        Fraction of validation rows whose thresholded prediction equals ``y_val``.
    """
    vectorizer = DictVectorizer(sparse=False)
    features_train = vectorizer.fit_transform(df_train.to_dict(orient='records'))
    features_val = vectorizer.transform(df_val.to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    classifier.fit(features_train, y_train)

    positive_proba = classifier.predict_proba(features_val)[:, 1]
    converted_decision = positive_proba >= 0.5
    return (y_val == converted_decision).mean()

In [None]:
# Leave-one-feature-out comparison: retrain without each candidate feature and
# report how far validation accuracy moves from the all-features baseline.
all_features = [
    'lead_source',
    'industry',
    'number_of_courses_viewed',
    'annual_income',
    'employment_status',
    'location',
    'interaction_count',
    'lead_score',
]

baseline_accuracy = train_model(df_train, df_val, y_train, y_val)
print(f"baseline accuracy: {baseline_accuracy}")

for dropped in ['industry', 'employment_status', 'lead_score']:
    kept = [name for name in all_features if name != dropped]
    # Restrict both frames to the kept columns so train and validation match.
    accuracy = train_model(df_train[kept], df_val[kept], y_train, y_val)
    print(
        f"without {dropped}: accuracy={accuracy}, "
        f"difference={baseline_accuracy - accuracy}"
    )


Which of the following features has the smallest difference?

- __'industry'__
- 'employment_status'
- 'lead_score'

In [None]:
# Retrain once per value of C and print the validation accuracy under its label.
for c in (0.01, 0.1, 1, 10, 100):
    print(f"C={c}")
    accuracy_for_c = train_model(df_train, df_val, y_train, y_val, C=c)
    print(accuracy_for_c)

Which of these `C` leads to the best accuracy on the validation set?

- __0.01__
- 0.1
- 1
- 10
- 100

**Note**: If there are multiple options, select the smallest `C`.