In [79]:
# Import Packages and Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
print(df.head())


    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [80]:
# Data preparation
# Check if the missing values are presented in the features.
# If there are missing values:
# For caterogiral features, replace them with 'NA'
# For numerical features, replace with with 0.0



# inspect missing values
df.isna().sum()
missing = ['lead_source','industry','annual_income','employment_status','location']
df[missing].nunique()

# assign N/A values
missing_categorical = ['lead_source','industry','employment_status','location']
missing_numerial = ['annual_income']

df[missing_categorical] = df[missing_categorical].fillna('NA')
df[missing_numerial] = df[missing_numerial].fillna(0)


# apply the correct data type

# inspect
#print(df.dtypes)
#print(df.nunique())

categorical = ['lead_source', 'industry','employment_status','location']
numerical = list(set(df.columns) - set(categorical) - set('converted')) # everything else except 'converted'

print(f"categorical: {categorical}")
print(f"numerical: {numerical}")

df[categorical] = df[categorical].astype('category')

print(df.dtypes)

categorical: ['lead_source', 'industry', 'employment_status', 'location']
numerical: ['lead_score', 'converted', 'number_of_courses_viewed', 'interaction_count', 'annual_income']
lead_source                 category
industry                    category
number_of_courses_viewed       int64
annual_income                float64
employment_status           category
location                    category
interaction_count              int64
lead_score                   float64
converted                      int64
dtype: object


In [81]:
# Q1 What is the most frequent observation (mode) for the column industry?
df['industry'].value_counts(ascending=False)

# Answer is 'retail'


industry
retail           203
finance          200
other            198
education        187
healthcare       187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [82]:
# Q2 What are the two features that have the biggest correlation?

print(f'interaction_count and lead_score: {df['interaction_count'].corr(df['lead_score'])}')
print(f'number_of_courses_viewed and lead_score: {df['number_of_courses_viewed'].corr(df['lead_score'])}')
print(f'number_of_courses_viewed and interaction_count: {df['number_of_courses_viewed'].corr(df['interaction_count'])}')
print(f'annual_income and interaction_count: {df['annual_income'].corr(df['interaction_count'])}')

# Answer is 'annual_income and interaction_count'

interaction_count and lead_score: 0.009888182496913084
number_of_courses_viewed and lead_score: -0.004878998354681257
number_of_courses_viewed and interaction_count: -0.023565222882888117
annual_income and interaction_count: 0.02703647240481436


In [83]:
# Split the data

df_alltrain, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df, test_size=0.25, random_state=42)

df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True)

y_train = df_train['converted'].values
df_train = df_train.drop(columns='converted')

y_val = df_val['converted'].values
df_val = df_val.drop(columns='converted')

y_test = df_test['converted'].values
df_test = df_test.drop(columns='converted')

In [84]:
# Q3 Which of these variables has the biggest mutual information score?
# industry
# location
# lead_source
# employment_status

# Make sure that the target value y is not in your dataframe.

columns_to_inspect = ['industry','location','lead_source','employment_status']
for c in columns_to_inspect:
    print(f'mutual score of {c}: {round(mutual_info_score(y_train, df_train[c]),2)}')


# Answer: lead_source: 0.025993724013756367


mutual score of industry: 0.01
mutual score of location: 0.0
mutual score of lead_source: 0.03
mutual score of employment_status: 0.01


In [85]:
# Q4 Logistic regression - What accuracy did you get?

# First transform the data and apply one hot encoding

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)


# Fit the model

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:,1]

# Get accuracy

accuracy = round(((y_pred >= 0.5) == y_val).mean(),4)
print(f"accuracy: {accuracy}")

accuracy: 0.7295


In [86]:
# Q5 Which of following feature has the smallest difference?

# Let's find the _least_ useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

# a: 'industry'
# b: 'employment_status'
# c: 'lead_score'

# Test importance, not used
#params = dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))
#sorted(params.items(), key=lambda item: item[1], reverse=True)

to_drop = ['industry','employment_status','lead_score']

# a
for elem in to_drop:
    dv_a = DictVectorizer(sparse=False)
    train_dict_a = df_train.drop(columns=elem).to_dict(orient='records')
    X_train_a = dv_a.fit_transform(train_dict_a)
    val_dict_a = df_val.drop(columns=elem).to_dict(orient='records')
    X_val_a = dv.transform(val_dict_a)
    model_a = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_a.fit(X_train_a, y_train)
    y_pred_a = model.predict_proba(X_val_a)[:,1]
    accuracy_a = round(((y_pred_a >= 0.5) == y_val).mean(),4)
    print(f"drop {elem} accuracy difference: {accuracy - accuracy_a}")



drop industry accuracy difference: -0.0026999999999999247
drop employment_status accuracy difference: 0.0027000000000000357
drop lead_score accuracy difference: -0.005499999999999949


In [90]:
# Q6 Which of these C leads to the best accuracy on the validation set?

# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

C = [0.01, 0.1, 1, 10, 100]
for c in C:
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)
    y_pred_c = model_c.predict_proba(X_val)[:,1]
    accuracy_c = ((y_pred_c >= 0.5) == y_val).mean()
    print(f"C = {c}, accuracy: {accuracy_c}")


C = 0.01, accuracy: 0.726775956284153
C = 0.1, accuracy: 0.7295081967213115
C = 1, accuracy: 0.7295081967213115
C = 10, accuracy: 0.7295081967213115
C = 100, accuracy: 0.7295081967213115
