# 03-homework

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

%matplotlib inline

In [2]:
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [3]:
df = pd.read_csv(data)

## Data preparation
Check if the missing values are presented in the features.

If there are missing values:
* For caterogiral features, replace them with 'NA'
* For numerical features, replace with with 0.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [None]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [None]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [None]:
# check missing values
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
# for categorical features, replace them with 'NA'
df['lead_source'] = df['lead_source'].fillna('NA')
df['industry'] = df['industry'].fillna('NA')
df['employment_status'] = df['employment_status'].fillna('NA')
df['location'] = df['location'].fillna('NA')

# for numerical features, replace with with 0.0
df['annual_income'] = df['annual_income'].fillna(0.0)

## Question 1
What is the most frequent observation (mode) for the column industry?
* NA
* technology
* healthcare
* retail

In [None]:
df['industry'].unique()

array(['NA', 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [None]:
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [None]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?
* interaction_count and lead_score
* number_of_courses_viewed and lead_score
* number_of_courses_viewed and interaction_count
* annual_income and lead_score

Only consider the pairs above when answering this question.

In [None]:
numeric_columns = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score' ]

corr_matrix = df[numeric_columns].corr()

corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [None]:
long_df = corr_matrix.stack().reset_index()
long_df.columns = ['feature1', 'feature2', 'correlation']
# remove self-correlations
long_df = long_df[long_df['feature1'] != long_df['feature2']]

# remove duplicate pairs
long_df = long_df[long_df['feature1'] < long_df['feature2']]

# sort by correlation descending
long_df = long_df.sort_values(by='correlation', ascending=False)

long_df

Unnamed: 0,feature1,feature2,correlation
6,annual_income,interaction_count,0.027036
7,annual_income,lead_score,0.01561
11,interaction_count,lead_score,0.009888
4,annual_income,number_of_courses_viewed,0.00977
12,lead_score,number_of_courses_viewed,-0.004879
8,interaction_count,number_of_courses_viewed,-0.023565


| feature1            | feature2                 | correlation  |
|---------------------|--------------------------|--------------|
| interaction_count   | lead_score               | 0.009888     |
| lead_score          | number_of_courses_viewed | -0.004879    |
| interaction_count   | number_of_courses_viewed | -0.023565    |
| annual_income       | interaction_count        | 0.027036     |

The biggest correlation is annual_income and interaction_count

## Split the data

Split your data in train/val/test sets with 60%/20%/20% distribution.

Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

Make sure that the target value y is not in your dataframe.

In [5]:
# split the data to 80%/20% first
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)
# then split the 80% to 60%/20%
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=42, shuffle=False)

# reset index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# move the target value to y
y_train = df_train['converted']
y_val = df_val['converted']
y_test = df_test['converted']

# drop the target value from X
X_train = df_train.drop('converted', axis=1)
X_val = df_val.drop('converted', axis=1)
X_test = df_test.drop('converted', axis=1)

# check total rows
print(len(X_train), len(X_val), len(X_test))
print(len(y_train), len(y_val), len(y_test))

876 293 293
876 293 293


## Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.

Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?
* industry
* location
* lead_source
* employment_status

In [7]:
# separate numeric and categorical columns
numeric_columns = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score' ]
categorical_columns = ['lead_source', 'industry', 'employment_status', 'location']


In [None]:
mi_scores = []

for col in categorical_columns:
    mi = mutual_info_score(X_train[col], y_train)
    mi_scores.append(mi)
    print(f"{col}: {mi}")

print("highest mutual info score:", categorical_columns[np.argmax(mi_scores)], max(mi_scores))


lead_source: 0.023478670209835295
industry: 0.00841637534916051
employment_status: 0.013245799888513363
location: 0.0009809640862368935
highest mutual info score: lead_source 0.023478670209835295


## Question 4
* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
* To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    * Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?
* 0.64
* 0.74
* 0.84
* 0.94

In [None]:
train_dict = df_train[categorical_columns + numeric_columns].to_dict(orient='records')

print(len(train_dict))

train_dict

876


[{'lead_source': 'paid_ads',
  'industry': 'NA',
  'employment_status': 'unemployed',
  'location': 'south_america',
  'number_of_courses_viewed': 1,
  'annual_income': 79450.0,
  'interaction_count': 4,
  'lead_score': 0.94},
 {'lead_source': 'social_media',
  'industry': 'retail',
  'employment_status': 'employed',
  'location': 'south_america',
  'number_of_courses_viewed': 1,
  'annual_income': 46992.0,
  'interaction_count': 1,
  'lead_score': 0.8},
 {'lead_source': 'events',
  'industry': 'healthcare',
  'employment_status': 'unemployed',
  'location': 'australia',
  'number_of_courses_viewed': 5,
  'annual_income': 78796.0,
  'interaction_count': 3,
  'lead_score': 0.69},
 {'lead_source': 'paid_ads',
  'industry': 'retail',
  'employment_status': 'NA',
  'location': 'australia',
  'number_of_courses_viewed': 2,
  'annual_income': 83843.0,
  'interaction_count': 1,
  'lead_score': 0.87},
 {'lead_source': 'referral',
  'industry': 'education',
  'employment_status': 'self_employed

In [None]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [None]:
X_train = dv.transform(train_dict)

In [None]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [None]:
print(X_train.shape)


(876, 31)


In [None]:
X_train

array([[7.9450e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [4.6992e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [7.8796e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e+00],
       ...,
       [6.1676e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        4.0000e+00],
       [5.2389e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        4.0000e+00],
       [6.9518e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [None]:
val_dict = df_val[categorical_columns + numeric_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred1 = model.predict(X_val)

y_pred = y_pred.round(2)

y_pred

array([0.54, 0.69, 0.55, 0.95, 0.77, 0.67, 0.83, 0.83, 0.21, 0.85, 0.78,
       0.55, 0.35, 0.3 , 0.86, 0.59, 0.62, 0.54, 0.54, 0.84, 0.58, 0.35,
       0.75, 0.63, 0.77, 0.94, 0.8 , 0.75, 0.97, 0.71, 0.83, 0.55, 0.64,
       0.92, 0.69, 0.59, 0.52, 0.48, 0.55, 0.71, 0.65, 0.75, 0.5 , 0.62,
       0.36, 0.53, 0.76, 0.57, 0.56, 0.77, 0.77, 0.54, 0.89, 0.54, 0.7 ,
       0.87, 0.76, 0.8 , 0.87, 0.79, 0.56, 0.43, 0.81, 0.62, 0.72, 0.91,
       0.43, 0.23, 0.56, 0.4 , 0.98, 0.88, 0.34, 0.87, 0.74, 0.95, 0.75,
       0.86, 0.64, 0.88, 0.77, 0.76, 0.65, 0.68, 0.69, 0.51, 0.62, 0.46,
       0.64, 0.57, 0.76, 0.9 , 0.78, 0.71, 0.66, 0.92, 0.82, 0.52, 0.85,
       0.7 , 0.75, 0.54, 0.76, 0.36, 0.78, 0.29, 0.9 , 0.52, 0.79, 0.66,
       0.9 , 0.84, 0.73, 0.88, 0.61, 0.88, 0.67, 0.6 , 0.85, 0.92, 0.98,
       0.73, 0.94, 0.41, 0.37, 0.77, 0.91, 0.83, 0.81, 0.76, 0.52, 0.9 ,
       0.7 , 0.77, 0.82, 0.91, 0.78, 0.79, 0.77, 0.81, 0.81, 0.85, 0.83,
       0.43, 0.58, 0.88, 0.57, 0.95, 0.86, 0.92, 0.

In [None]:
converted_pred = (y_pred > 0.5).astype(int)

converted_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0])

In [None]:
(y_val == converted_pred).mean().round(2)

0.72

## Question 5

Let's find the least useful feature using the feature elimination technique.

Train a model using the same features and parameters as in Q4 (without rounding).

Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?
* 'industry'
* 'employment_status'
* 'lead_score'

Note: The difference doesn't have to be positive.

In [8]:
full_columns = categorical_columns + numeric_columns

In [9]:
def train_data(df_train, y_train, df_val, y_val, exclude_column=''):
    fc = full_columns.copy()
    #print(fc)
    if exclude_column != '':
        # remove exclude_column from fc
        fc.remove(exclude_column)
    #print(fc)

    train_dict = df_train[fc].to_dict(orient='records')
    val_dict = df_val[fc].to_dict(orient='records')
    #print(train_dict)

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    X_train = dv.transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    #y_pred = model.predict_proba(X_val)[:, 1]
    #converted_pred = (y_pred > 0.5).astype(int)
    #accuracy = (y_val == converted_pred).mean()
    y_pred = model.predict(X_val)
    accuracy = (y_val == y_pred).mean()
    
    return accuracy

In [16]:
exclude_accuracy = []

full_acc = train_data(df_train, y_train, df_val, y_val, "")



In [17]:
for c in full_columns:
    print(c)
    acc = train_data(df_train, y_train, df_val, y_val, c)

    exclude_accuracy.append(
        {c: acc, 'diff': full_acc - acc}
    )

    #break

lead_source
industry
employment_status
location
number_of_courses_viewed
annual_income
interaction_count
lead_score


In [18]:
exclude_accuracy

[{'lead_source': 0.7167235494880546, 'diff': 0.0},
 {'industry': 0.7133105802047781, 'diff': 0.0034129692832765013},
 {'employment_status': 0.7167235494880546, 'diff': 0.0},
 {'location': 0.7167235494880546, 'diff': 0.0},
 {'number_of_courses_viewed': 0.6279863481228669, 'diff': 0.0887372013651877},
 {'annual_income': 0.8668941979522184, 'diff': -0.15017064846416384},
 {'interaction_count': 0.621160409556314, 'diff': 0.09556313993174059},
 {'lead_score': 0.7167235494880546, 'diff': 0.0}]

In [153]:
acc_dict = {list(d.keys())[0]: list(d.values())[0] for d in exclude_accuracy}

full_acc = acc_dict['full']
for feature in full_columns:
    diff = full_acc - acc_dict[feature]
    print(f"{feature}: {diff}")


lead_source: 0.0
industry: 0.0034129692832765013
employment_status: 0.0
location: 0.0
number_of_courses_viewed: 0.0887372013651877
annual_income: -0.15017064846416384
interaction_count: 0.09556313993174059
lead_score: 0.0


## Question 6

Now let's train a regularized logistic regression.

Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].

Train models using all the features as in Q4.

Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?
* 0.01
* 0.1
* 1
* 10
* 100

Note: If there are multiple options, select the smallest C.

In [21]:
def train_data_c(df_train, y_train, df_val, y_val, c=1.0):

    train_dict = df_train.to_dict(orient='records')
    val_dict = df_val.to_dict(orient='records')
    #print(train_dict)

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    X_train = dv.transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    #y_pred = model.predict_proba(X_val)[:, 1]
    #converted_pred = (y_pred > 0.5).astype(int)
    #accuracy = (y_val == converted_pred).mean()
    y_pred = model.predict(X_val)
    accuracy = (y_val == y_pred).mean()
    
    return accuracy

In [22]:
list_c = [0.01, 0.1, 1, 10, 100]

for c in list_c:
    acc = train_data_c(df_train, y_train, df_val, y_val, c)
    print(f"C={c}: {acc}")

C=0.01: 0.7713310580204779
C=0.1: 0.7747440273037542
C=1: 0.7747440273037542
C=10: 0.7747440273037542
C=100: 0.7747440273037542
