In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the lead-scoring dataset from the working directory and display it.
df = pd.read_csv('course_lead_scoring.csv')
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


### Data preparation

In [3]:
# Number of missing values per column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
# Column dtypes: `object` columns are handled as categorical below, the rest as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
# Feature groups used throughout the notebook.
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Normalise category labels (lower-case, underscores instead of spaces) and
# mark missing categories with the explicit label 'NA'.
for col in categorical:
    normalised = df[col].str.lower().str.replace(' ', '_')
    df[col] = normalised.fillna('NA')

# Sanity check: no missing categorical values should remain.
df[categorical].isnull().sum()

lead_source          0
industry             0
employment_status    0
location             0
dtype: int64

In [6]:
# Replace missing numerical values with 0.0, one column at a time.
for column in numerical:
    filled = df[column].fillna(0.0)
    df[column] = filled

# Sanity check: no missing numerical values should remain.
df[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64

### QUESTION 1

In [7]:
# Most frequent value of `industry` (missing values were already replaced by 'NA').
df['industry'].mode()

0    retail
Name: industry, dtype: object

### QUESTION 2

In [8]:
# Pairwise correlation between the numerical features, rounded to 3 decimals.
df.loc[:, numerical].corr().round(decimals=3)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.01,-0.024,-0.005
annual_income,0.01,1.0,0.027,0.016
interaction_count,-0.024,0.027,1.0,0.01
lead_score,-0.005,0.016,0.01,1.0


### QUESTION 3

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# 60/20/20 split: hold out 20% of the rows for test, then 25% of the remaining
# 80% (i.e. 20% of all rows) for validation. Both calls use the same seed.
train_full_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, validation_df = train_test_split(train_full_df, random_state=42, test_size=0.25)

tuple(len(part) for part in (train_df, validation_df, test_df))

(876, 293, 293)

In [11]:
# Pull the target out of every split: `pop` returns the `converted` column and
# removes it from the frame, so the feature frames no longer contain the label.
y_train = train_df.pop('converted').values
y_validation = validation_df.pop('converted').values
y_test = test_df.pop('converted').values

In [12]:
# mutual information
from sklearn.metrics import mutual_info_score

def mutual_info_converted_score(y):
    """Return the mutual information between one feature column and the training target.

    Despite its name, `y` is the categorical feature column passed in by
    `DataFrame.apply`; it is scored against the training labels `y_train`.
    """
    return mutual_info_score(y, y_train)

# Score on the training split only. `train_full_df` still contains the
# validation rows, so ranking features on it would leak validation data.
# `y_train` was extracted from `train_df`, so rows and labels are in the same order.
mi = train_df[categorical].apply(mutual_info_converted_score)

# mutual information in sorted order, rounded to 2 decimals
round(mi.sort_values(ascending=False), 2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

### QUESTION 4

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [14]:
# One-hot encode the training features: every row becomes a {column: value}
# record, and DictVectorizer expands string values into indicator columns.
div = DictVectorizer(sparse=False)
dicts = train_df[categorical + numerical].to_dict(orient='records')
X_train = div.fit_transform(dicts)

# dense feature matrix
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [15]:
# Names of the columns of X_train, in column order, as learned by the fitted vectorizer.
div.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [16]:
# Logistic regression on the one-hot encoded training matrix.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [17]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit), then score them with the trained model.
validation_dict = validation_df[categorical + numerical].to_dict(orient='records')
X_validation = div.transform(validation_dict)

# Second column of predict_proba: predicted probability of `converted == 1`.
y_predictions = model.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions.shape[0]

(293, 293)

In [18]:
# Validation accuracy at a 0.5 decision threshold.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision = (y_predictions >= 0.5)
converted_mean = (converted_decision == y_validation).mean()
round(converted_mean, 2)

np.float64(0.7)

### QUESTION 5

In [19]:
# Baseline for the feature-elimination comparison: same settings as the
# Question 4 model, trained on all features.
model_1 = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_1.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [20]:
# Baseline predicted probability of `converted == 1` for each validation row.
y_predictions_model_1 = model_1.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_1.shape[0]

(293, 293)

In [21]:
# Baseline validation accuracy at a 0.5 decision threshold (left unrounded so
# that small differences between feature sets stay visible).
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_model_1 = (y_predictions_model_1 >= 0.5)
converted_mean_model_1 = (converted_decision_model_1 == y_validation).mean()
converted_mean_model_1

np.float64(0.6996587030716723)

In [22]:
# Global accuracy: validation accuracy of the all-features baseline model,
# kept under its own name as the reference for the feature-elimination runs below.
original_accuracy = converted_mean_model_1
original_accuracy

np.float64(0.6996587030716723)

### Excluding industry feature

In [23]:
# Categorical features with `industry` left out (original order preserved).
categorical_no_industry = [name for name in categorical if name != 'industry']
categorical_no_industry

['lead_source', 'employment_status', 'location']

In [24]:
# One-hot encode the training features without `industry`, using a dedicated
# vectorizer so the column layout matches this reduced feature set.
div_no_industry = DictVectorizer(sparse=False)
dicts_no_industry = train_df[categorical_no_industry + numerical].to_dict(orient='records')
X_train_no_industry = div_no_industry.fit_transform(dicts_no_industry)

# dense feature matrix
X_train_no_industry

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 23))

In [25]:
# Column names of X_train_no_industry; no `industry=...` indicators should appear.
div_no_industry.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'interaction_count', 'lead_score', 'lead_source=NA',
       'lead_source=events', 'lead_source=organic_search',
       'lead_source=paid_ads', 'lead_source=referral',
       'lead_source=social_media', 'location=NA', 'location=africa',
       'location=asia', 'location=australia', 'location=europe',
       'location=middle_east', 'location=north_america',
       'location=south_america', 'number_of_courses_viewed'], dtype=object)

In [26]:
# Repeat the split with the same parameters as before. `df` still has the
# `converted` column, so the rebuilt frames carry the target again (it was
# deleted from the earlier splits); the identical seed yields the same rows.
train_full_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, validation_df = train_test_split(train_full_df, random_state=42, test_size=0.25)

tuple(len(part) for part in (train_df, validation_df, test_df))

(876, 293, 293)

In [27]:
# Training features without `industry`. Selecting the wanted columns already
# leaves out `industry` (and the target), so no separate drop is needed.
no_industry_train_df = train_df[categorical_no_industry + numerical].copy()

# Reuse the labels extracted right after the original split: they are in the
# same row order as the frame X_train_no_industry was built from, so the
# alignment does not depend on a later re-split reproducing the same rows.
y_train_no_industry = y_train.copy()
no_industry_train_df

Unnamed: 0,lead_source,employment_status,location,number_of_courses_viewed,annual_income,interaction_count,lead_score
1077,paid_ads,student,middle_east,0,58472.0,5,0.03
463,organic_search,student,middle_east,3,71738.0,6,0.77
842,paid_ads,employed,north_america,3,81973.0,2,0.59
835,,employed,europe,1,74956.0,3,0.34
837,organic_search,student,australia,3,59335.0,1,0.98
...,...,...,...,...,...,...,...
725,organic_search,employed,australia,1,43907.0,4,0.33
401,social_media,employed,north_america,3,64969.0,1,0.18
957,,employed,asia,3,89042.0,4,0.75
992,social_media,self_employed,europe,1,0.0,1,0.65


In [28]:
# Validation features without `industry`. Selecting the wanted columns already
# leaves out `industry` (and the target), so no separate drop is needed.
no_industry_validation_df = validation_df[categorical_no_industry + numerical].copy()

# Reuse the validation labels extracted right after the original split instead
# of reading them back from a re-split frame.
y_validation_no_industry = y_validation.copy()
no_industry_validation_df

Unnamed: 0,lead_source,employment_status,location,number_of_courses_viewed,annual_income,interaction_count,lead_score
662,paid_ads,unemployed,europe,3,52220.0,1,0.07
600,organic_search,unemployed,middle_east,3,59656.0,4,0.65
477,events,self_employed,north_america,0,57134.0,4,0.13
1057,events,,asia,0,0.0,0,0.03
891,referral,unemployed,south_america,1,54103.0,3,0.16
...,...,...,...,...,...,...,...
1367,social_media,self_employed,africa,1,55222.0,1,0.25
1390,paid_ads,employed,middle_east,1,20326.0,3,0.81
419,organic_search,employed,south_america,1,74166.0,2,0.01
114,organic_search,self_employed,africa,2,39103.0,3,0.60


In [29]:
# Same model settings as the baseline, trained without the `industry` feature.
model_no_industry = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_industry.fit(X_train_no_industry, y_train_no_industry)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [30]:
# Encode the validation rows with the no-industry vectorizer (transform only),
# then predict the probability of `converted == 1` for each row.
validation_dict_no_industry = validation_df[categorical_no_industry + numerical].to_dict(orient='records')
X_validation_no_industry = div_no_industry.transform(validation_dict_no_industry)

y_predictions_no_industry = model_no_industry.predict_proba(X_validation_no_industry)[:, 1]
X_validation_no_industry.shape[0], y_predictions_no_industry.shape[0]

(293, 293)

In [31]:
# Validation accuracy of the no-industry model at a 0.5 decision threshold.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_no_industry = (y_predictions_no_industry >= 0.5)
converted_mean_no_industry = (converted_decision_no_industry == y_validation_no_industry).mean()
converted_mean_no_industry

np.float64(0.6996587030716723)

In [32]:
# Baseline accuracy minus no-industry accuracy: a positive value means that
# dropping `industry` lowered validation accuracy.
original_accuracy - converted_mean_no_industry

np.float64(0.0)

### Excluding employment_status feature

In [33]:
# Categorical features with `employment_status` left out (original order preserved).
categorical_no_employment = [name for name in categorical if name != 'employment_status']
categorical_no_employment

['lead_source', 'industry', 'location']

In [34]:
# One-hot encode the training features without `employment_status`, using a
# dedicated vectorizer so the column layout matches this reduced feature set.
div_no_employment = DictVectorizer(sparse=False)
dicts_no_employment = train_df[categorical_no_employment + numerical].to_dict(orient='records')
X_train_no_employment = div_no_employment.fit_transform(dicts_no_employment)

# dense feature matrix
X_train_no_employment

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 26))

In [35]:
# Column names of X_train_no_employment; no `employment_status=...` indicators should appear.
div_no_employment.get_feature_names_out()

array(['annual_income', 'industry=NA', 'industry=education',
       'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'interaction_count', 'lead_score',
       'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [36]:
# Training features without `employment_status`. Selecting the wanted columns
# already leaves out that feature (and the target), so no separate drop is needed.
no_employment_train_df = train_df[categorical_no_employment + numerical].copy()

# Reuse the labels extracted right after the original split: they are in the
# same row order as the frame X_train_no_employment was built from.
y_train_no_employment = y_train.copy()
no_employment_train_df

Unnamed: 0,lead_source,industry,location,number_of_courses_viewed,annual_income,interaction_count,lead_score
1077,paid_ads,retail,middle_east,0,58472.0,5,0.03
463,organic_search,manufacturing,middle_east,3,71738.0,6,0.77
842,paid_ads,technology,north_america,3,81973.0,2,0.59
835,,technology,europe,1,74956.0,3,0.34
837,organic_search,retail,australia,3,59335.0,1,0.98
...,...,...,...,...,...,...,...
725,organic_search,other,australia,1,43907.0,4,0.33
401,social_media,retail,north_america,3,64969.0,1,0.18
957,,education,asia,3,89042.0,4,0.75
992,social_media,manufacturing,europe,1,0.0,1,0.65


In [37]:
# Validation features without `employment_status`. Selecting the wanted columns
# already leaves out that feature (and the target), so no separate drop is needed.
no_employment_validation_df = validation_df[categorical_no_employment + numerical].copy()

# Reuse the validation labels extracted right after the original split instead
# of reading them back from a re-split frame.
y_validation_no_employment = y_validation.copy()
no_employment_validation_df

Unnamed: 0,lead_source,industry,location,number_of_courses_viewed,annual_income,interaction_count,lead_score
662,paid_ads,healthcare,europe,3,52220.0,1,0.07
600,organic_search,technology,middle_east,3,59656.0,4,0.65
477,events,manufacturing,north_america,0,57134.0,4,0.13
1057,events,other,asia,0,0.0,0,0.03
891,referral,retail,south_america,1,54103.0,3,0.16
...,...,...,...,...,...,...,...
1367,social_media,healthcare,africa,1,55222.0,1,0.25
1390,paid_ads,,middle_east,1,20326.0,3,0.81
419,organic_search,technology,south_america,1,74166.0,2,0.01
114,organic_search,technology,africa,2,39103.0,3,0.60


In [38]:
# Same model settings as the baseline, trained without `employment_status`.
model_no_employment = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_employment.fit(X_train_no_employment, y_train_no_employment)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [39]:
# Encode the validation rows with the no-employment vectorizer (transform only),
# then predict the probability of `converted == 1` for each row.
validation_dict_no_employment = validation_df[categorical_no_employment + numerical].to_dict(orient='records')
X_validation_no_employment = div_no_employment.transform(validation_dict_no_employment)

y_predictions_no_employment = model_no_employment.predict_proba(X_validation_no_employment)[:, 1]
X_validation_no_employment.shape[0], y_predictions_no_employment.shape[0]

(293, 293)

In [40]:
# Validation accuracy of the no-employment model at a 0.5 decision threshold.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_no_employment = (y_predictions_no_employment >= 0.5)
converted_mean_no_employment = (converted_decision_no_employment == y_validation_no_employment).mean()
converted_mean_no_employment

np.float64(0.6962457337883959)

In [41]:
# Baseline accuracy minus no-employment accuracy: a positive value means that
# dropping `employment_status` lowered validation accuracy.
original_accuracy - converted_mean_no_employment

np.float64(0.0034129692832763903)

### Excluding lead_score feature

In [42]:
# Numerical features with `lead_score` left out (original order preserved).
numerical_no_score = [name for name in numerical if name != 'lead_score']
numerical_no_score

['number_of_courses_viewed', 'annual_income', 'interaction_count']

In [66]:
# One-hot encode the training features without `lead_score`, using a dedicated
# vectorizer so the column layout matches this reduced feature set.
div_no_score = DictVectorizer(sparse=False)
dicts_no_score = train_df[categorical + numerical_no_score].to_dict(orient='records')
X_train_no_score = div_no_score.fit_transform(dicts_no_score)

# dense feature matrix
X_train_no_score

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 30))

In [67]:
# Column names of X_train_no_score; `lead_score` should not appear.
div_no_score.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [68]:
# Training features without `lead_score`. Selecting the wanted columns already
# leaves out that feature (and the target), so no separate drop is needed.
no_score_train_df = train_df[categorical + numerical_no_score].copy()

# Reuse the labels extracted right after the original split: they are in the
# same row order as the frame X_train_no_score was built from.
y_train_no_score = y_train.copy()
no_score_train_df

Unnamed: 0,lead_source,industry,employment_status,location,number_of_courses_viewed,annual_income,interaction_count
1077,paid_ads,retail,student,middle_east,0,58472.0,5
463,organic_search,manufacturing,student,middle_east,3,71738.0,6
842,paid_ads,technology,employed,north_america,3,81973.0,2
835,,technology,employed,europe,1,74956.0,3
837,organic_search,retail,student,australia,3,59335.0,1
...,...,...,...,...,...,...,...
725,organic_search,other,employed,australia,1,43907.0,4
401,social_media,retail,employed,north_america,3,64969.0,1
957,,education,employed,asia,3,89042.0,4
992,social_media,manufacturing,self_employed,europe,1,0.0,1


In [69]:
# Validation features without `lead_score`. Selecting the wanted columns already
# leaves out that feature (and the target), so no separate drop is needed.
no_score_validation_df = validation_df[categorical + numerical_no_score].copy()

# Reuse the validation labels extracted right after the original split instead
# of reading them back from a re-split frame.
y_validation_no_score = y_validation.copy()
no_score_validation_df

Unnamed: 0,lead_source,industry,employment_status,location,number_of_courses_viewed,annual_income,interaction_count
662,paid_ads,healthcare,unemployed,europe,3,52220.0,1
600,organic_search,technology,unemployed,middle_east,3,59656.0,4
477,events,manufacturing,self_employed,north_america,0,57134.0,4
1057,events,other,,asia,0,0.0,0
891,referral,retail,unemployed,south_america,1,54103.0,3
...,...,...,...,...,...,...,...
1367,social_media,healthcare,self_employed,africa,1,55222.0,1
1390,paid_ads,,employed,middle_east,1,20326.0,3
419,organic_search,technology,employed,south_america,1,74166.0,2
114,organic_search,technology,self_employed,africa,2,39103.0,3


In [70]:
# Same model settings as the baseline, trained without `lead_score`.
model_no_score = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_score.fit(X_train_no_score, y_train_no_score)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [71]:
# Encode the validation rows with the no-score vectorizer (transform only),
# then predict the probability of `converted == 1` for each row.
validation_dict_no_score = validation_df[categorical + numerical_no_score].to_dict(orient='records')
X_validation_no_score = div_no_score.transform(validation_dict_no_score)

y_predictions_no_score = model_no_score.predict_proba(X_validation_no_score)[:, 1]
X_validation_no_score.shape[0], y_predictions_no_score.shape[0]

(293, 293)

In [72]:
# Validation accuracy of the no-score model at a 0.5 decision threshold.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_no_score = (y_predictions_no_score >= 0.5)
converted_mean_no_score = (converted_decision_no_score == y_validation_no_score).mean()
converted_mean_no_score

np.float64(0.7064846416382252)

In [73]:
# Baseline accuracy minus no-score accuracy: a negative value means that
# dropping `lead_score` raised validation accuracy.
original_accuracy - converted_mean_no_score

np.float64(-0.0068259385665528916)

### QUESTION 6

#### MODEL 1: C = 0.01

In [74]:
# Regularisation sweep, model 1: C = 0.01 (in scikit-learn a smaller C means
# stronger regularisation). Trained on the full feature matrix.
reg_model_1 = LogisticRegression(
    solver='liblinear',
    C=0.01,
    max_iter=1000,
    random_state=42,
)
reg_model_1.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [75]:
# Predicted probability of `converted == 1` on the validation set for C = 0.01.
y_predictions_model_1 = reg_model_1.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_1.shape[0]

(293, 293)

In [76]:
# model 1 accuracy (C = 0.01) at a 0.5 decision threshold, rounded to 3 decimals.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
# NOTE(review): these `*_model_1` names are also used by the Question 5
# baseline; `original_accuracy` must be captured before this cell runs.
converted_decision_model_1 = (y_predictions_model_1 >= 0.5)
converted_mean_model_1 = (converted_decision_model_1 == y_validation).mean()
round(converted_mean_model_1, 3)

np.float64(0.7)

#### MODEL 2: C = 0.1

In [54]:
# Regularisation sweep, model 2: C = 0.1, trained on the full feature matrix.
reg_model_2 = LogisticRegression(
    solver='liblinear',
    C=0.1,
    max_iter=1000,
    random_state=42,
)
reg_model_2.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [55]:
# Predicted probability of `converted == 1` on the validation set for C = 0.1.
y_predictions_model_2 = reg_model_2.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_2.shape[0]

(293, 293)

In [56]:
# model 2 accuracy (C = 0.1) at a 0.5 decision threshold, rounded to 3 decimals.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_model_2 = (y_predictions_model_2 >= 0.5)
converted_mean_model_2 = (converted_decision_model_2 == y_validation).mean()
round(converted_mean_model_2, 3)

np.float64(0.7)

#### MODEL 3: C = 1

In [57]:
# Regularisation sweep, model 3: C = 1, trained on the full feature matrix.
reg_model_3 = LogisticRegression(
    solver='liblinear',
    C=1,
    max_iter=1000,
    random_state=42,
)
reg_model_3.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [58]:
# Predicted probability of `converted == 1` on the validation set for C = 1.
y_predictions_model_3 = reg_model_3.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_3.shape[0]

(293, 293)

In [59]:
# model 3 accuracy (C = 1) at a 0.5 decision threshold, rounded to 3 decimals.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_model_3 = (y_predictions_model_3 >= 0.5)
converted_mean_model_3 = (converted_decision_model_3 == y_validation).mean()
round(converted_mean_model_3, 3)

np.float64(0.7)

#### MODEL 4: C = 10

In [60]:
# Regularisation sweep, model 4: C = 10, trained on the full feature matrix.
reg_model_4 = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
reg_model_4.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [61]:
# Predicted probability of `converted == 1` on the validation set for C = 10.
y_predictions_model_4 = reg_model_4.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_4.shape[0]

(293, 293)

In [62]:
# model 4 accuracy (C = 10) at a 0.5 decision threshold, rounded to 3 decimals.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_model_4 = (y_predictions_model_4 >= 0.5)
converted_mean_model_4 = (converted_decision_model_4 == y_validation).mean()
round(converted_mean_model_4, 3)

np.float64(0.7)

#### MODEL 5: C = 100

In [63]:
# Regularisation sweep, model 5: C = 100, trained on the full feature matrix.
reg_model_5 = LogisticRegression(
    solver='liblinear',
    C=100,
    max_iter=1000,
    random_state=42,
)
reg_model_5.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [64]:
# Predicted probability of `converted == 1` on the validation set for C = 100.
y_predictions_model_5 = reg_model_5.predict_proba(X_validation)[:, 1]
X_validation.shape[0], y_predictions_model_5.shape[0]

(293, 293)

In [65]:
# model 5 accuracy (C = 100) at a 0.5 decision threshold, rounded to 3 decimals.
# The boolean decisions compare correctly against the 0/1 labels, so no
# integer cast is needed.
converted_decision_model_5 = (y_predictions_model_5 >= 0.5)
converted_mean_model_5 = (converted_decision_model_5 == y_validation).mean()
round(converted_mean_model_5, 3)

np.float64(0.7)