In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the lead-scoring dataset from the notebook's working directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# CSV has to be placed next to the notebook (or the path adjusted) before running.
df = pd.read_csv('course_lead_scoring.csv')

# Preview the first rows only instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'course_lead_scoring.csv'

### Data preparation

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Feature groups referenced by the rest of the notebook.
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Lower-case the text columns and swap spaces for underscores, then label
# whatever is still missing as 'NA'.
for col in categorical:
    cleaned = df[col].str.lower().str.replace(' ', '_')
    df[col] = cleaned.fillna('NA')

# Check that no missing values are left in the categorical columns.
df[categorical].isna().sum()

In [None]:
# Fill missing numeric values with 0.0 across all numerical columns in one step.
df[numerical] = df[numerical].fillna(0.0)

# Check that no missing values are left in the numerical columns.
df[numerical].isna().sum()

### QUESTION 1

In [None]:
# Most frequent value(s) of the industry column.
df['industry'].mode()

### QUESTION 2

In [None]:
# Pairwise correlation of the numerical features, shown to 3 decimals.
df[numerical].corr().round(decimals=3)

### QUESTION 3

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Two-stage split: hold out 20% of the rows for testing, then take 25% of the
# remainder (roughly 20% of the full data) for validation; the rest is training.
train_full_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, validation_df = train_test_split(
    train_full_df, test_size=0.25, random_state=42
)

# Row counts of the three partitions.
tuple(len(part) for part in (train_df, validation_df, test_df))

In [None]:
# Separate the target from each split. pop() returns the column and removes it
# from the frame, so the feature frames no longer carry the label.
y_train = train_df.pop('converted').values
y_validation = validation_df.pop('converted').values
y_test = test_df.pop('converted').values

In [None]:
# Mutual information between each categorical feature and the target.
from sklearn.metrics import mutual_info_score


def mutual_info_converted_score(y):
    """Return the mutual information between a feature column and the training target.

    Args:
        y: one categorical feature column of ``train_df`` (despite the name,
            this is the feature, not the label).

    Returns:
        The value of ``mutual_info_score(y, y_train)``.
    """
    return mutual_info_score(y, y_train)


# Score on the training split only: train_full_df also contains the validation
# rows, and using them here would leak validation data into feature analysis.
# The rows of train_df line up with y_train because y_train was taken from it.
mi = train_df[categorical].apply(mutual_info_converted_score)

# Scores from most to least informative, rounded to 2 decimals.
mi.sort_values(ascending=False).round(2)

### QUESTION 4

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [None]:
# One dict per training row, restricted to the selected features.
dicts = train_df[categorical + numerical].to_dict(orient='records')

# Learn the feature layout from the training rows, then encode them as a
# dense matrix (sparse=False).
div = DictVectorizer(sparse=False)
div.fit(dicts)
X_train = div.transform(dicts)

# Encoded training matrix.
X_train

In [None]:
# Column names of the encoded matrix, in the order the vectorizer emits them.
div.get_feature_names_out()

In [None]:
# Logistic regression on the full encoded feature set.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Encode the validation rows with the vectorizer fitted on the training data.
validation_dict = validation_df[categorical + numerical].to_dict(orient='records')
X_validation = div.transform(validation_dict)

# Keep the second predict_proba column (the class listed second in
# model.classes_; presumably converted == 1 -- confirm).
y_predictions = model.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions.shape[0])

In [None]:
# Validation accuracy: threshold the probabilities at 0.5 and take the share of
# predictions that match the labels.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision = (y_predictions >= 0.5).astype(int)
converted_mean = (converted_decision == y_validation).mean()
round(converted_mean, 2)

### QUESTION 5

In [None]:
# Reference model for the feature-elimination comparison: all features, C=1.0.
model_1 = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_1.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row.
y_predictions_model_1 = model_1.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_1.shape[0])

In [None]:
# Validation accuracy of the reference model at a 0.5 threshold.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_1 = (y_predictions_model_1 >= 0.5).astype(int)
converted_mean_model_1 = (converted_decision_model_1 == y_validation).mean()
converted_mean_model_1

In [None]:
# Global accuracy
# Stored under its own name because converted_mean_model_1 is reassigned later
# in the notebook (regularisation section).
original_accuracy = converted_mean_model_1
original_accuracy

### Excluding industry feature

In [None]:
# Categorical features with 'industry' left out, in their original order.
categorical_no_industry = [name for name in categorical if name != 'industry']
categorical_no_industry

In [None]:
# One dict per training row, without the industry feature.
dicts_no_industry = train_df[categorical_no_industry + numerical].to_dict(orient='records')

# A separate vectorizer, so the encoded columns match the reduced feature set.
div_no_industry = DictVectorizer(sparse=False)
div_no_industry.fit(dicts_no_industry)
X_train_no_industry = div_no_industry.transform(dicts_no_industry)

# Encoded training matrix without industry.
X_train_no_industry

In [None]:
# Column names of the encoded matrix built without the industry feature.
div_no_industry.get_feature_names_out()

In [None]:
# Repeat the earlier split with the same sizes and random_state. This rebuilds
# the split frames with their `converted` column, which an earlier cell deleted
# and which the cells below read again.
train_full_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, validation_df = train_test_split(
    train_full_df, test_size=0.25, random_state=42
)

# Row counts of the three partitions.
tuple(len(part) for part in (train_df, validation_df, test_df))

In [None]:
# Training target, read before the frame is narrowed to feature columns.
y_train_no_industry = train_df['converted'].values

# Training features without industry; drop() returns a new frame, so train_df
# itself is left untouched.
no_industry_train_df = train_df.drop(columns=['industry'])[categorical_no_industry + numerical]
no_industry_train_df

In [None]:
# Validation target, read before the frame is narrowed to feature columns.
y_validation_no_industry = validation_df['converted'].values

# Validation features without industry; validation_df itself is left untouched.
no_industry_validation_df = validation_df.drop(columns=['industry'])[categorical_no_industry + numerical]
no_industry_validation_df

In [None]:
# Same hyper-parameters as the reference model, trained without industry.
model_no_industry = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_industry.fit(X_train_no_industry, y_train_no_industry)

In [None]:
# Encode the validation rows with the vectorizer fitted without industry.
validation_dict_no_industry = validation_df[categorical_no_industry + numerical].to_dict(orient='records')
X_validation_no_industry = div_no_industry.transform(validation_dict_no_industry)

# Second predict_proba column for every validation row.
y_predictions_no_industry = model_no_industry.predict_proba(X_validation_no_industry)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation_no_industry.shape[0], y_predictions_no_industry.shape[0])

In [None]:
# Validation accuracy of the model trained without industry (0.5 threshold).
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_no_industry = (y_predictions_no_industry >= 0.5).astype(int)
converted_mean_no_industry = (converted_decision_no_industry == y_validation_no_industry).mean()
converted_mean_no_industry

In [None]:
# The difference between the two accuracies
# (reference minus no-industry: a positive value means accuracy dropped
# when industry was removed).
original_accuracy - converted_mean_no_industry

### Excluding employment_status feature

In [None]:
# Categorical features with 'employment_status' left out, in their original order.
categorical_no_employment = [name for name in categorical if name != 'employment_status']
categorical_no_employment

In [None]:
# One dict per training row, without the employment_status feature.
dicts_no_employment = train_df[categorical_no_employment + numerical].to_dict(orient='records')

# A separate vectorizer, so the encoded columns match the reduced feature set.
div_no_employment = DictVectorizer(sparse=False)
div_no_employment.fit(dicts_no_employment)
X_train_no_employment = div_no_employment.transform(dicts_no_employment)

# Encoded training matrix without employment_status.
X_train_no_employment

In [None]:
# Column names of the encoded matrix built without the employment_status feature.
div_no_employment.get_feature_names_out()

In [None]:
# Training target, read before the frame is narrowed to feature columns.
y_train_no_employment = train_df['converted'].values

# Training features without employment_status; train_df itself is left untouched.
no_employment_train_df = train_df.drop(columns=['employment_status'])[categorical_no_employment + numerical]
no_employment_train_df

In [None]:
# Validation target, read before the frame is narrowed to feature columns.
y_validation_no_employment = validation_df['converted'].values

# Validation features without employment_status; validation_df itself is left untouched.
no_employment_validation_df = validation_df.drop(columns=['employment_status'])[categorical_no_employment + numerical]
no_employment_validation_df

In [None]:
# Same hyper-parameters as the reference model, trained without employment_status.
model_no_employment = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_employment.fit(X_train_no_employment, y_train_no_employment)

In [None]:
# Encode the validation rows with the vectorizer fitted without employment_status.
validation_dict_no_employment = validation_df[categorical_no_employment + numerical].to_dict(orient='records')
X_validation_no_employment = div_no_employment.transform(validation_dict_no_employment)

# Second predict_proba column for every validation row.
y_predictions_no_employment = model_no_employment.predict_proba(X_validation_no_employment)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation_no_employment.shape[0], y_predictions_no_employment.shape[0])

In [None]:
# Validation accuracy of the model trained without employment_status (0.5 threshold).
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_no_employment = (y_predictions_no_employment >= 0.5).astype(int)
converted_mean_no_employment = (converted_decision_no_employment == y_validation_no_employment).mean()
converted_mean_no_employment

In [None]:
# The difference between the two accuracies
# (reference minus no-employment_status: a positive value means accuracy
# dropped when employment_status was removed).
original_accuracy - converted_mean_no_employment

### Excluding lead_score feature

In [None]:
# Numerical features with 'lead_score' left out, in their original order.
numerical_no_score = [name for name in numerical if name != 'lead_score']
numerical_no_score

In [None]:
# One dict per training row, without the lead_score feature.
dicts_no_score = train_df[categorical + numerical_no_score].to_dict(orient='records')

# A separate vectorizer, so the encoded columns match the reduced feature set.
div_no_score = DictVectorizer(sparse=False)
div_no_score.fit(dicts_no_score)
X_train_no_score = div_no_score.transform(dicts_no_score)

# Encoded training matrix without lead_score.
X_train_no_score

In [None]:
# Column names of the encoded matrix built without the lead_score feature.
div_no_score.get_feature_names_out()

In [None]:
# Training target, read before the frame is narrowed to feature columns.
y_train_no_score = train_df['converted'].values

# Training features without lead_score; train_df itself is left untouched.
no_score_train_df = train_df.drop(columns=['lead_score'])[categorical + numerical_no_score]
no_score_train_df

In [None]:
# Validation target, read before the frame is narrowed to feature columns.
y_validation_no_score = validation_df['converted'].values

# Validation features without lead_score; validation_df itself is left untouched.
no_score_validation_df = validation_df.drop(columns=['lead_score'])[categorical + numerical_no_score]
no_score_validation_df

In [None]:
# Same hyper-parameters as the reference model, trained without lead_score.
model_no_score = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_score.fit(X_train_no_score, y_train_no_score)

In [None]:
# Encode the validation rows with the vectorizer fitted without lead_score.
validation_dict_no_score = validation_df[categorical + numerical_no_score].to_dict(orient='records')
X_validation_no_score = div_no_score.transform(validation_dict_no_score)

# Second predict_proba column for every validation row.
y_predictions_no_score = model_no_score.predict_proba(X_validation_no_score)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation_no_score.shape[0], y_predictions_no_score.shape[0])

In [None]:
# Validation accuracy of the model trained without lead_score (0.5 threshold).
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_no_score = (y_predictions_no_score >= 0.5).astype(int)
converted_mean_no_score = (converted_decision_no_score == y_validation_no_score).mean()
converted_mean_no_score

In [None]:
# The difference between the two accuracies
# (reference minus no-lead_score: a positive value means accuracy dropped
# when lead_score was removed).
original_accuracy - converted_mean_no_score

### QUESTION 6

#### MODEL 1: C = 0.01

In [None]:
# C=0.01 is the smallest C tried (C is the inverse regularisation strength,
# so this is the most strongly regularised model).
reg_model_1 = LogisticRegression(
    solver='liblinear',
    C=0.01,
    max_iter=1000,
    random_state=42,
)
reg_model_1.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row (C=0.01 model).
y_predictions_model_1 = reg_model_1.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_1.shape[0])

In [None]:
# Validation accuracy of model 1 (C=0.01) at a 0.5 threshold, to 3 decimals.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_1 = (y_predictions_model_1 >= 0.5).astype(int)
converted_mean_model_1 = (converted_decision_model_1 == y_validation).mean()
round(converted_mean_model_1, 3)

#### MODEL 2: C = 0.1

In [None]:
# Second point of the regularisation sweep: C=0.1.
reg_model_2 = LogisticRegression(
    solver='liblinear',
    C=0.1,
    max_iter=1000,
    random_state=42,
)
reg_model_2.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row (C=0.1 model).
y_predictions_model_2 = reg_model_2.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_2.shape[0])

In [None]:
# Validation accuracy of model 2 (C=0.1) at a 0.5 threshold, to 3 decimals.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_2 = (y_predictions_model_2 >= 0.5).astype(int)
converted_mean_model_2 = (converted_decision_model_2 == y_validation).mean()
round(converted_mean_model_2, 3)

#### MODEL 3: C = 1

In [None]:
# Third point of the regularisation sweep: C=1.
reg_model_3 = LogisticRegression(
    solver='liblinear',
    C=1,
    max_iter=1000,
    random_state=42,
)
reg_model_3.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row (C=1 model).
y_predictions_model_3 = reg_model_3.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_3.shape[0])

In [None]:
# Validation accuracy of model 3 (C=1) at a 0.5 threshold, to 3 decimals.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_3 = (y_predictions_model_3 >= 0.5).astype(int)
converted_mean_model_3 = (converted_decision_model_3 == y_validation).mean()
round(converted_mean_model_3, 3)

#### MODEL 4: C = 10

In [None]:
# Fourth point of the regularisation sweep: C=10.
reg_model_4 = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
reg_model_4.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row (C=10 model).
y_predictions_model_4 = reg_model_4.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_4.shape[0])

In [None]:
# Validation accuracy of model 4 (C=10) at a 0.5 threshold, to 3 decimals.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_4 = (y_predictions_model_4 >= 0.5).astype(int)
converted_mean_model_4 = (converted_decision_model_4 == y_validation).mean()
round(converted_mean_model_4, 3)

#### MODEL 5: C = 100

In [None]:
# Last point of the regularisation sweep: C=100, the largest C tried
# (weakest regularisation).
reg_model_5 = LogisticRegression(
    solver='liblinear',
    C=100,
    max_iter=1000,
    random_state=42,
)
reg_model_5.fit(X_train, y_train)

In [None]:
# Second predict_proba column for every validation row (C=100 model).
y_predictions_model_5 = reg_model_5.predict_proba(X_validation)[:, 1]

# Sanity check: one prediction per validation row.
(X_validation.shape[0], y_predictions_model_5.shape[0])

In [None]:
# Validation accuracy of model 5 (C=100) at a 0.5 threshold, to 3 decimals.
# The int cast is applied to the variable itself; a bare `.astype(int)` call
# returns a new array and leaves the original unchanged.
converted_decision_model_5 = (y_predictions_model_5 >= 0.5).astype(int)
converted_mean_model_5 = (converted_decision_model_5 == y_validation).mean()
round(converted_mean_model_5, 3)