In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [48]:
# Load the course lead-scoring dataset from the public datasets repository
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [50]:
# Object-dtype columns are treated as the categorical features
categorical_columns = df.select_dtypes(include='object').columns.tolist()
# Every remaining column is numerical (this still includes the `converted` target)
numerical_columns = df.select_dtypes(exclude='object').columns.tolist()

In [51]:
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [52]:
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [53]:
# Names of the numerical columns that contain at least one missing value
numerical_missing_values = (
    df[numerical_columns]
    .isna()
    .any()
    .loc[lambda has_missing: has_missing]
    .index
    .tolist()
)

In [54]:
numerical_missing_values

['annual_income']

In [55]:
# Missing categorical values become the explicit category 'NA'
df = df.assign(**{col: df[col].fillna('NA') for col in categorical_columns})

In [56]:
# Verify no missing values remain
df[categorical_columns].isnull().sum()

lead_source          0
industry             0
employment_status    0
location             0
dtype: int64

In [57]:
# Fill gaps in the affected numerical columns with 0.0, one column at a time
for column_name in numerical_missing_values:
    df[column_name] = df[column_name].fillna(0.0)

In [58]:
df[numerical_columns].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


## Question 1

What is the most frequent observation (mode) for the column `industry`?

- `NA`
- `technology`
- `healthcare`
- **_`retail`_**

In [60]:
# mode() returns a Series of the most frequent value(s); take the first entry
df['industry'].mode().iloc[0]

'retail'

## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `interaction_count` and `lead_score`
- `number_of_courses_viewed` and `lead_score`
- `number_of_courses_viewed` and `interaction_count`
- _**`annual_income` and `interaction_count`**_


In [61]:
# Pairwise correlation coefficients between all columns listed in numerical_columns
correlation_matrix = df.loc[:, numerical_columns].corr()

In [62]:
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


## Split the Data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
- Make sure the target value `y` is not in your dataframe.

In [63]:
# Hold out 20% of all rows as the test set; the remaining 80% is the "full train" set
df_full_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

In [64]:
# Carve the validation set out of the full train set:
# 25% of the remaining 80% is 20% of the whole dataset, giving a 60/20/20 split
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)

In [65]:
len(df), len(df_train), len(df_val), len(df_test), len(df_full_train)

(1462, 876, 293, 293, 1169)

In [66]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original row labels
df_train, df_val, df_test, df_full_train = (
    frame.reset_index(drop=True)
    for frame in (df_train, df_val, df_test, df_full_train)
)

In [67]:
# Pull the `converted` target out of each split as a NumPy array
y_train, y_val, y_test = (
    split['converted'].values for split in (df_train, df_val, df_test)
)

In [68]:
# Remove the target from every feature frame so it cannot be used as a model input.
# Each frame is rebound explicitly rather than looping with a variable named `df`
# and dropping in place: such a loop leaves the global `df` pointing at df_test,
# so re-running any earlier cell that reads `df` would silently use the wrong data.
df_train = df_train.drop(columns=['converted'])
df_val = df_val.drop(columns=['converted'])
df_test = df_test.drop(columns=['converted'])

## Question 3

- Calculate the mutual information score between `y` and the other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the highest mutual information score?

- `industry`
- `location`
- _**`lead_source`**_
- `employment_status`

In [71]:
# Categorical features are the object-dtype columns of the training frame
categorical_columns = df_train.select_dtypes(include='object').columns.tolist()

# Mutual information between the target and each categorical feature;
# factorize() turns the category labels into integer codes first
mi_scores = {
    col: mutual_info_score(y_train, pd.factorize(df_train[col])[0])
    for col in categorical_columns
}

# Highest score first, rounded to 2 decimals for reporting
mi_series = pd.Series(mi_scores).sort_values(ascending=False).round(2)
print(mi_series)

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64


In [72]:
def df_mutual_info_score(series):
    """Return the mutual information score between the global `y_train` and `series`.

    Intended for use with `DataFrame.apply`, which passes one column at a time.
    """
    return mutual_info_score(labels_true=y_train, labels_pred=series)

In [73]:
# Unrounded mutual information per categorical feature, highest first
mi = pd.Series(
    {
        col: mutual_info_score(y_train, pd.factorize(df_train[col])[0])
        for col in categorical_columns
    }
)
mi.sort_values(ascending=False)

lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

## Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.64
- _**0.74**_
- 0.84
- 0.94

In [76]:
# The target must not be used as a feature: keep every numerical column except `converted`
numerical_columns = [name for name in numerical_columns if name != 'converted']

In [77]:
# One dict per training row (feature name -> value), the record format DictVectorizer consumes
train_dicts = df_train.loc[:, categorical_columns + numerical_columns].to_dict('records')

In [78]:
# Preview only the first few records; displaying the full list floods the notebook output
train_dicts[:5]

[{'lead_source': 'paid_ads',
  'industry': 'retail',
  'employment_status': 'student',
  'location': 'middle_east',
  'number_of_courses_viewed': 0,
  'annual_income': 58472.0,
  'interaction_count': 5,
  'lead_score': 0.03},
 {'lead_source': 'organic_search',
  'industry': 'manufacturing',
  'employment_status': 'student',
  'location': 'middle_east',
  'number_of_courses_viewed': 3,
  'annual_income': 71738.0,
  'interaction_count': 6,
  'lead_score': 0.77},
 {'lead_source': 'paid_ads',
  'industry': 'technology',
  'employment_status': 'employed',
  'location': 'north_america',
  'number_of_courses_viewed': 3,
  'annual_income': 81973.0,
  'interaction_count': 2,
  'lead_score': 0.59},
 {'lead_source': 'NA',
  'industry': 'technology',
  'employment_status': 'employed',
  'location': 'europe',
  'number_of_courses_viewed': 1,
  'annual_income': 74956.0,
  'interaction_count': 3,
  'lead_score': 0.34},
 {'lead_source': 'organic_search',
  'industry': 'retail',
  'employment_status': 

In [79]:
# DictVectorizer one-hot encodes string-valued features and passes numeric ones through;
# sparse=False makes it return a dense NumPy array instead of a SciPy sparse matrix
dv = DictVectorizer(sparse=False)

In [80]:
# Learn the feature vocabulary from the training records and encode them in one step
X_train = dv.fit_transform(train_dicts)

In [81]:
# One dict per validation row, using the same feature columns as the training records
val_dicts = df_val.loc[:, categorical_columns + numerical_columns].to_dict('records')

In [82]:
# Encode the validation rows with the vocabulary learned on the training set (no re-fitting)
X_val = dv.transform(val_dicts)

In [84]:
# Logistic regression with the exact parameters the assignment prescribes for reproducibility
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [85]:
# Fit the model on the encoded training features and the `converted` target
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [92]:
# Predicted class probabilities for the validation rows, one column per class;
# column 1 is thresholded in the next cell to obtain the predicted label
y_pred = model.predict_proba(X_val)

In [93]:
# Turn the column-1 probability into a hard 0/1 label using a 0.5 threshold
y_pred_pos = np.greater_equal(y_pred[:, 1], 0.5).astype(int)

In [102]:
# Accuracy = share of validation rows whose predicted label equals the true label
accuracy = (y_pred_pos == y_val).mean()

In [103]:
accuracy

np.float64(0.6996587030716723)

## Question 5
- Let's find the least useful feature using the _feature elimination_ technique.
- Train a model using the same features and parameters as Q4 (without rounding).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of the following features has the smallest difference?

- _**`industry`**_
- `employment_status`
- `lead_score`

In [104]:
# Feature elimination: retrain the Q4 model once per candidate feature with that
# feature left out, and record how much the validation accuracy changes compared
# with the full-feature baseline `accuracy` computed for Q4.
features_to_exclude = ['industry', 'employment_status', 'lead_score']


def accuracy_without(feature):
    """Return the validation accuracy of the Q4 model trained without `feature`.

    A fresh DictVectorizer and LogisticRegression are created on every call, so
    the Q4 objects (`dv`, `model`, `X_train`, `X_val`, `y_pred`, ...) are left
    untouched and earlier cells can be re-run without picking up a reduced
    feature set.
    """
    current_features = [f for f in (categorical_columns + numerical_columns) if f != feature]
    vectorizer = DictVectorizer(sparse=False)
    X_train_subset = vectorizer.fit_transform(df_train[current_features].to_dict(orient='records'))
    X_val_subset = vectorizer.transform(df_val[current_features].to_dict(orient='records'))
    subset_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    subset_model.fit(X_train_subset, y_train)
    # Same 0.5 decision threshold on the column-1 probability as in Q4
    subset_pred = (subset_model.predict_proba(X_val_subset)[:, 1] >= 0.5).astype(int)
    return np.mean(y_val == subset_pred)


# Positive difference: dropping the feature lowered accuracy; negative: it raised it
accuracy_diffs = {
    feature: accuracy - accuracy_without(feature)
    for feature in features_to_exclude
}



In [105]:
accuracy_diffs

{'industry': np.float64(0.0),
 'employment_status': np.float64(0.0034129692832763903),
 'lead_score': np.float64(-0.0068259385665528916)}

## Question 6

- Now let's train a regularized logistic regression
- Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
- Train all models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation data set?
- _**0.01**_
- 0.1
- 1
- 10
- 100

In [108]:
# Regularisation sweep: same features and pipeline as Q4, varying only C
C_values = [0.01, 0.1, 1, 10, 100]

# The encoded matrices do not depend on C, so build them once instead of
# re-creating the records and re-fitting the vectorizer on every iteration
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Validation accuracy per C, stored unrounded; apply round(value, 3) when reporting
C_accuracies = {}
for C in C_values:
    # Train Logistic Regression model with current C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    # Threshold the column-1 probability at 0.5 to get hard labels
    y_pred = model.predict_proba(X_val)
    y_pred_pos = (y_pred[:, 1] >= 0.5).astype(int)
    C_accuracies[C] = np.mean(y_val == y_pred_pos)

In [109]:
C_accuracies

{0.01: np.float64(0.6996587030716723),
 0.1: np.float64(0.6996587030716723),
 1: np.float64(0.6996587030716723),
 10: np.float64(0.6996587030716723),
 100: np.float64(0.6996587030716723)}