#### In this project we try to predict whether an employee will leave their current company within the next few years, based on the available employee data.
> First we pre-process the data to convert the Categorical values to Numeric using label encoding

> Then we apply various Feature Selection methods to Select the top voted features

> We initially run the classifiers with their default hyperparameters and benchmark the accuracy scores.

> We then use GridSearchCV to tune the hyperparameters for all the Classifiers and obtain the best values.

> We re-run the classifiers using the obtained parameter values on the top-voted features to find the improved accuracy scores.

In [None]:
## import necessary library
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Data Pre-processing:

In [None]:
# Load the employee dataset from the course repository on GitHub (needs network access)
data = pd.read_csv('https://raw.githubusercontent.com/SimonFWH/APP-MATH-CONC-FOR-MACH-LEARN-Labs/main/Term%20Project/Employee.csv')
# Bare last expression so the notebook renders the frame as the cell output
data

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [None]:
## Summarise column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [None]:
## Count the missing values in each column
data.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

## Label encoding

In [None]:
## Label-encode the categorical columns and record each column's class -> code mapping
Catcols=['Education','City','Gender','EverBenched']

# Encode a copy so that `data` keeps the raw strings and this cell is safe to re-run.
# Encoding `data` in place meant a second run re-encoded the integer codes and
# printed identity mappings such as {0: 0, 1: 1}.
df = data.copy()

# One encoder per column. Each mapping is read from the encoder that actually
# produced the codes, so the printed mapping cannot drift from the encoded data.
label_encoders = {}
label_mappings = {}
for col in Catcols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
    label_mappings[col] = dict(zip(encoder.classes_, range(len(encoder.classes_))))

# Module-level names kept exactly as originally spelled (including the
# 'eduation' typo) so that any other cell referring to them keeps working.
eduation_column_classes = data['Education'].unique()
city_column_classes = data['City'].unique()
gender_column_classes = data['Gender'].unique()
everbenched_column_classes = data['EverBenched'].unique()

eduation_label_mapping = label_mappings['Education']
city_label_mapping = label_mappings['City']
gender_label_mapping = label_mappings['Gender']
everbenched__label_mapping = label_mappings['EverBenched']

# As before, `le` ends up fitted on the last encoded column
le = label_encoders[Catcols[-1]]

print("Mapping of encoded columns: ")
print(eduation_label_mapping)
print(city_label_mapping)
print(gender_label_mapping)
print(everbenched__label_mapping)

# Show the encoded frame and confirm every column is now numeric
print(df)
df.info()

Mapping of encoded columns: 
{'Bachelors': 0, 'Masters': 1, 'PHD': 2}
{'Bangalore': 0, 'New Delhi': 1, 'Pune': 2}
{'Female': 0, 'Male': 1}
{'No': 0, 'Yes': 1}
      Education  JoiningYear  City  PaymentTier  Age  Gender  EverBenched  \
0             0         2017     0            3   34       1            0   
1             0         2013     2            1   28       0            0   
2             0         2014     1            3   38       0            0   
3             1         2016     0            3   27       1            0   
4             1         2017     2            3   24       1            1   
...         ...          ...   ...          ...  ...     ...          ...   
4648          0         2013     0            3   26       0            0   
4649          1         2013     2            2   37       1            0   
4650          1         2018     1            3   27       1            0   
4651          0         2012     0            3   30       1           

In [None]:
# Split the encoded frame into the target (y) and the predictor columns (X)
y = df['LeaveOrNot']
X = df.drop(columns=['LeaveOrNot'])
print(X)
print(y.to_frame())

# Column names in X order, reused when the selector votes are tabulated
feature_name = X.columns.tolist()

# Number of features each selector is asked to keep.
# NOTE(review): data.info() reports 9 columns including the target, so X has
# 8 predictors and num_feats=8 asks the selectors to keep all of them -
# confirm that this is intended.
num_feats = 8

      Education  JoiningYear  City  PaymentTier  Age  Gender  EverBenched  \
0             0         2017     0            3   34       1            0   
1             0         2013     2            1   28       0            0   
2             0         2014     1            3   38       0            0   
3             1         2016     0            3   27       1            0   
4             1         2017     2            3   24       1            1   
...         ...          ...   ...          ...  ...     ...          ...   
4648          0         2013     0            3   26       0            0   
4649          1         2013     2            2   37       1            0   
4650          1         2018     1            3   27       1            0   
4651          0         2012     0            3   30       1            1   
4652          0         2015     0            3   33       1            1   

      ExperienceInCurrentDomain  
0                             0  
1      

# Feature Selection:

## Filter Feature Selection - Pearson Correlation

In [None]:
def cor_selector(X, y, num_feats):
    """Pick the features with the strongest absolute Pearson correlation to y.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns (numeric).
    y : array-like
        Target values, one per row of X.
    num_feats : int
        Number of features to keep. Values <= 0 select nothing; values larger
        than the number of columns select every column.

    Returns
    -------
    cor_support : list of bool
        One flag per column of X, in column order; True when the column is selected.
    cor_feature : list
        Labels of the selected columns, ordered from weakest to strongest
        absolute correlation.
    """
    feature_name = X.columns.tolist()

    # Guard: a slice of [-0:] is the *whole* array, so num_feats == 0 used to
    # select every feature instead of none.
    if num_feats <= 0:
        return [False] * len(feature_name), []

    # Pearson correlation of each column with the target
    cor_list = [np.corrcoef(X[col], y)[0, 1] for col in feature_name]

    # A nan correlation (e.g. a constant column) is treated as 0
    cor_list = [0 if np.isnan(cor) else cor for cor in cor_list]

    # argsort is ascending, so the last num_feats positions are the strongest
    top_positions = np.argsort(np.abs(cor_list))[-num_feats:]
    cor_feature = X.iloc[:, top_positions].columns.tolist()

    selected = set(cor_feature)
    cor_support = [name in selected for name in feature_name]
    return cor_support, cor_feature

In [None]:
# Pearson-correlation vote: boolean mask over X's columns plus the selected names
cor_support, cor_feature = cor_selector(X, y,num_feats)

## Filter Feature Selection - Chi-Square

In [None]:
def chi_squared_selector(X, y, num_feats):
    """Keep the num_feats features with the highest chi-squared score against y.

    X is min-max scaled before scoring. Returns (chi_support, chi_feature):
    the boolean mask over X's columns reported by SelectKBest and the list
    of the selected column names.
    """
    scaled = MinMaxScaler().fit_transform(X)
    selector = SelectKBest(score_func=chi2, k=num_feats).fit(scaled, y)

    chi_support = selector.get_support()
    chi_feature = X.columns[chi_support].tolist()
    return chi_support, chi_feature

In [None]:
# Chi-squared vote: boolean mask over X's columns plus the selected names
chi_support, chi_feature = chi_squared_selector(X, y,num_feats)

## Wrapper Feature Selection - Recursive Feature Elimination

In [None]:
def rfe_selector(X, y, num_feats):
    """Select num_feats features by recursive feature elimination.

    A default LogisticRegression is wrapped in RFE (step=10, verbose=5) and
    fitted on the min-max scaled copy of X. Returns (rfe_support, rfe_feature):
    the boolean mask over X's columns and the list of selected column names.
    """
    scaled = MinMaxScaler().fit_transform(X)

    # Local name chosen so that it does not shadow this function's own name
    eliminator = RFE(
        estimator=LogisticRegression(),
        n_features_to_select=num_feats,
        step=10,
        verbose=5,
    )
    eliminator.fit(scaled, y)

    rfe_support = eliminator.get_support()
    rfe_feature = X.columns[rfe_support].tolist()
    return rfe_support, rfe_feature

In [None]:
# Recursive-feature-elimination vote: boolean mask over X's columns plus the selected names
rfe_support, rfe_feature = rfe_selector(X, y,num_feats)

## Embedded Selection - Logistic Regression

In [None]:
def embedded_log_reg_selector(X, y, num_feats):
    """Select features through SelectFromModel around a logistic regression.

    The model is fitted on the min-max scaled copy of X and max_features is
    set to num_feats. Returns (lr_support, lr_feature): the boolean mask over
    X's columns and the list of selected column names.

    NOTE(review): LogisticRegression() is used with its defaults, which in
    scikit-learn means an L2 penalty, so this is not an L1/Lasso selector -
    confirm that this is intended.
    """
    scaled = MinMaxScaler().fit_transform(X)
    selector = SelectFromModel(LogisticRegression(), max_features=num_feats).fit(scaled, y)

    lr_support = selector.get_support()
    lr_feature = X.columns[lr_support].tolist()
    return lr_support, lr_feature

In [None]:
# Logistic-regression (SelectFromModel) vote: boolean mask over X's columns plus the selected names
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)

## Tree based - Random Forest

In [None]:
def embedded_rf_selector(X, y, num_feats, random_state=0):
    """Select features through SelectFromModel around a random forest.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns (used unscaled).
    y : array-like
        Target values, one per row of X.
    num_feats : int
        Passed to SelectFromModel as max_features.
    random_state : int or None, default 0
        Seed for the forest. The forest was previously unseeded, so the
        selected features could change from one run to the next; the default
        matches the random_state=0 used by the classifiers in this notebook.
        Pass None to get the old unseeded behaviour.

    Returns
    -------
    rf_support : numpy.ndarray of bool
        Mask over X's columns; True where the feature is selected.
    rf_feature : list
        Labels of the selected columns.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rf_selector = SelectFromModel(forest, max_features=num_feats)
    rf_selector.fit(X, y)

    rf_support = rf_selector.get_support()
    rf_feature = X.loc[:, rf_support].columns.tolist()
    return rf_support, rf_feature

In [None]:
# Random-forest (SelectFromModel) vote: boolean mask over X's columns plus the selected names
embeded_rf_support, embeded_rf_feature = embedded_rf_selector(X, y, num_feats)

## Tree based - Light GBM

In [None]:
def embedded_lgbm_selector(X, y, num_feats):
    """Select features through SelectFromModel around a LightGBM classifier.

    The classifier is fitted on X as given (no scaling) and max_features is
    set to num_feats. Returns (lgb_support, lgb_feature): the boolean mask
    over X's columns and the list of selected column names.
    """
    lgbm_params = dict(
        n_estimators=500,
        learning_rate=.05,
        num_leaves=32,
        colsample_bytree=.2,
        reg_alpha=3,
        reg_lambda=1,
        min_split_gain=.01,
        min_child_weight=40,
    )
    selector = SelectFromModel(LGBMClassifier(**lgbm_params), max_features=num_feats)
    selector.fit(X, y)

    lgb_support = selector.get_support()
    lgb_feature = X.columns[lgb_support].tolist()
    return lgb_support, lgb_feature

In [None]:
# LightGBM (SelectFromModel) vote: boolean mask over X's columns plus the selected names
embeded_lgbm_support, embeded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)

## Feature Selection Summary

In [None]:
# One row per feature, one boolean "vote" column per selection method
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embedded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgbm_support})
# Count how many methods selected each feature. Only the boolean vote columns
# are summed: summing the whole frame also tries to reduce the string 'Feature'
# column, which triggers a pandas warning and fails on newer pandas versions.
feature_selection_df['Total'] = feature_selection_df.drop(columns='Feature').sum(axis=1)
# Rank by number of votes (ties broken by feature name, descending) and renumber from 1
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
# Display the num_feats highest-voted features
feature_selection_df.head(num_feats)

  return reduction(axis=axis, out=out, **passkwargs)


Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,JoiningYear,True,True,True,True,True,True,6
2,Age,True,True,True,False,True,True,5
3,PaymentTier,True,True,True,True,False,False,4
4,Gender,True,True,True,True,False,False,4
5,ExperienceInCurrentDomain,True,True,True,False,False,True,4
6,City,True,True,True,True,False,False,4
7,EverBenched,True,True,True,False,False,False,3
8,Education,True,True,True,False,False,False,3


# Classification:

In [None]:
# Normalising and Splitting the dataset into Train and Test
# Split first, then fit the scaler on the training rows only. Fitting MinMaxScaler
# on the full dataset before splitting lets test-set minima/maxima leak into the
# features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, shuffle=True)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any other cell that still reads X_norm (scaled with the train-fitted scaler)
X_norm = scaler.transform(X)

## Logistic Regression

In [None]:
# Defining and Training the logistic Regression Model
# Baseline logistic regression with default hyperparameters
lr_model = LogisticRegression(random_state=0).fit(X_train, y_train)

# Accuracy on the held-out test split
lr_predictions = lr_model.predict(X_test)
lr_score = accuracy_score(y_test, lr_predictions)
print("Accuracy: ", lr_score)

# Confusion matrix for the same test predictions
lr_cm = confusion_matrix(y_test, lr_predictions)
print("Confusion Matrix:\n", lr_cm)

Accuracy:  0.7024704618689581
Confusion Matrix:
 [[538  81]
 [196 116]]


### Logistic Regression with GridSearch

In [None]:
# Hyperparameters to tune with GridSearch
# Candidate hyperparameter values for logistic regression
param_grid = dict(
    C=[1, 5, 20, 100, 500],
    tol=[0.001, 0.0001, 0.00001],
    class_weight=['balanced', None],
    fit_intercept=[True, False],
)

# Exhaustive search over the grid, scored by accuracy, using all available cores
lr_model = LogisticRegression(random_state=0)
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the search on the training split and predict the test split
grid_search.fit(X_train, y_train)
lr_predictions = grid_search.predict(X_test)

# Best parameter combination and its mean cross-validated accuracy
# (best_score_ is computed on the training folds, not on the test split)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

## Decision Tree

In [None]:
#Defining and Training the Decision Tree Model
# Baseline decision tree with default hyperparameters
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Size of the fitted tree
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

# Hard class predictions and positive-class probabilities for both splits
train_predictions = tree.predict(X_train)
predictions = tree.predict(X_test)
train_probs = tree.predict_proba(X_train)[:,1]
probs = tree.predict_proba(X_test)[:,1]

# ROC AUC on train and test; the baseline scores a constant prediction of 1
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, probs)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1] * len(y_test))}')

# Test-set accuracy
y_pred = tree.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {dt_accuracy}")

# Test-set confusion matrix
dt_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", dt_cm)

### Decision Tree with GridSearch

In [None]:
# Hyperparameters to tune with GridSearch
# Candidate hyperparameter values for the decision tree
param_dict = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1,10),
    min_samples_split=range(2,10),
    min_samples_leaf=range(1,5),
)

# Exhaustive search over the grid, scored by accuracy, using all available cores
dt = DecisionTreeClassifier(random_state=0)
dt_grid = GridSearchCV(
    estimator=dt,
    param_grid=param_dict,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the search on the training split and predict the test split
dt_grid.fit(X_train, y_train)
dt_predictions = dt_grid.predict(X_test)

# Best parameter combination and its mean cross-validated accuracy
print("Best parameters:", dt_grid.best_params_)
print("Best score:", dt_grid.best_score_)

## Random Forest

In [None]:
#Defining and Training the Random Forest Classifiesr
# Baseline random forest: 100 trees, sqrt(n_features) candidates per split
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    random_state=0,
    verbose=1,
    n_jobs=-1,
)
# Left as the cell's last expression so the fitted estimator is displayed
rf_model.fit(X_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


RandomForestClassifier(max_features='sqrt', n_jobs=-1, random_state=0,
                       verbose=1)

In [None]:
#Node count and Maximum depth of the tree
# Node count and depth of every tree in the fitted forest
n_nodes = [estimator.tree_.node_count for estimator in rf_model.estimators_]
max_depths = [estimator.tree_.max_depth for estimator in rf_model.estimators_]

# Report the averages, truncated to whole numbers
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 1286
Average maximum depth 20


In [None]:
#Make Random Forest predictions
# Positive-class probabilities from the random forest for both splits
train_rf_probs = rf_model.predict_proba(X_train)[:, 1]
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# Hard class predictions for both splits
train_rf_predictions = rf_model.predict(X_train)
rf_predictions = rf_model.predict(X_test)


In [None]:
#Plot ROC AUC Score
# ROC AUC on train and test; the baseline scores a constant prediction of 1
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_rf_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, rf_probs)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1] * len(y_test))}')

# Test-set accuracy
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rf_accuracy}")

# Test-set confusion matrix
rf_cm = confusion_matrix(y_test, rf_predictions)
print("Confusion Matrix:\n", rf_cm)

### Random Forest with GridSearch

In [None]:
# Hyperparameters to tune with GridSearch
# Candidate hyperparameter values for the random forest
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by accuracy, using all available cores
rf = RandomForestClassifier(random_state=0)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy
print(f"Best parameters: {rf_grid.best_params_}")
print(f"Best score: {rf_grid.best_score_}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best score: 0.8503485602944361


## Stochastic Gradient Descent

In [None]:
# Defining and Training the Stochastic Gradient Descent Model
# Baseline SGD classifier with default hyperparameters
sdg_model = SGDClassifier(random_state=0).fit(X_train, y_train)

# Accuracy on the held-out test split
sdg_predictions = sdg_model.predict(X_test)
sgd_score = accuracy_score(y_test, sdg_predictions)
print("Accuracy: ", sgd_score)

# Confusion matrix for the same test predictions
sgd_cm = confusion_matrix(y_test, sdg_predictions)
print("Confusion Matrix:\n", sgd_cm)

Accuracy:  0.7056928034371643
Confusion Matrix:
 [[561  58]
 [216  96]]


### Stochastic Gradient Descent with GridSearch

In [None]:
# Hyperparameters to tune with GridSearch
# Candidate hyperparameter values for the SGD classifier
param_grid = dict(
    loss=['hinge', 'modified_huber', 'squared_hinge', 'perceptron'],
    alpha=[0.001, 0.0001, 0.00001],
    tol=[0.001, 0.0001, 0.00001],
    shuffle=[True, False],
    class_weight=['balanced', None],
    fit_intercept=[True, False],
    early_stopping=[True, False],
)

# Exhaustive search over the grid; every candidate is fitted with max_iter=10000
sdg_model = SGDClassifier(random_state=0, max_iter=10000)
grid_search = GridSearchCV(
    estimator=sdg_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the search on the training split and predict the test split
grid_search.fit(X_train, y_train)
sdg_predictions = grid_search.predict(X_test)

# Best parameter combination and its mean cross-validated accuracy
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'alpha': 0.0001, 'class_weight': None, 'early_stopping': False, 'fit_intercept': False, 'loss': 'hinge', 'shuffle': False, 'tol': 0.0001}
Best score: 0.7168207404200043


## Support Vector Machine

In [None]:
# Defining and Training the Support Vector Machine Model
# Baseline support vector machine with default hyperparameters
svm_model = SVC(random_state=0).fit(X_train, y_train)

# Accuracy of the SVM on the held-out test split
svm_predictions = svm_model.predict(X_test)
svm_score = accuracy_score(y_test, svm_predictions)
print("Accuracy: ", svm_score)

# Confusion matrix for the same test predictions
svm_cm = confusion_matrix(y_test, svm_predictions)
print("Confusion Matrix:\n", svm_cm)

Accuracy:  0.8098818474758325
Confusion Matrix:
 [[590  29]
 [148 164]]


### Support Vector Machine with GridSearch

In [None]:
# Hyperparameters to tune with GridSearch
# Candidate hyperparameter values for the SVM
param_grid = dict(
    C=[1, 5, 20, 100, 500],
    gamma=[.001, .01, .1, 1, 'scale', 'auto'],
    kernel=['linear', 'rbf'],
    shrinking=[True, False],
)

# Exhaustive search over the grid, scored by accuracy, using all available cores
svm_model = SVC(random_state=0)
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the search on the training split and predict the test split
grid_search.fit(X_train, y_train)
svm_predictions = grid_search.predict(X_test)

# Best parameter combination and its mean cross-validated accuracy
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best parameters: {'C': 5, 'gamma': 1, 'kernel': 'rbf', 'shrinking': False}
Best score: 0.8430904957783071


# Dropping some features based on Feature Selection Analysis:

In [None]:
# Dropping the 2 lowest rated features 'EverBenched' and 'Education'
# Drop the two lowest-voted features ('EverBenched', 'Education') along with the target
X = df.drop(["EverBenched", "Education", "LeaveOrNot"], axis='columns')

# Split first, then fit the scaler on the training rows only. Fitting MinMaxScaler
# on the full dataset before splitting lets test-set minima/maxima leak into the
# features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, shuffle=True)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any other cell that still reads X_norm (scaled with the train-fitted scaler)
X_norm = scaler.transform(X)

## Running Classifiers with optimum values obtained via GridSearchCV on data with reduced features

### Logistic Regression

In [None]:
# Defining and Training the logistic Regression Model
# Logistic regression refitted on the reduced feature set with fixed hyperparameters
lr_params = dict(C=5, tol=0.001, class_weight=None, fit_intercept=False)
lr_model = LogisticRegression(random_state=0, **lr_params).fit(X_train, y_train)

# Accuracy on the held-out test split
lr_predictions = lr_model.predict(X_test)
lr_score = accuracy_score(y_test, lr_predictions)
print("Accuracy: ", lr_score)

# Confusion matrix for the same test predictions
lr_cm = confusion_matrix(y_test, lr_predictions)
print("Confusion Matrix:\n", lr_cm)

Accuracy:  0.7035445757250268
Confusion Matrix:
 [[545  74]
 [202 110]]


### Decision Tree

In [None]:
#Defining and Training the Decision Tree Model
# Decision tree refitted on the reduced feature set with fixed hyperparameters
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=7,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=0,
).fit(X_train, y_train)

# Size of the fitted tree
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

# Hard class predictions and positive-class probabilities for both splits
train_predictions = tree.predict(X_train)
predictions = tree.predict(X_test)
train_probs = tree.predict_proba(X_train)[:,1]
probs = tree.predict_proba(X_test)[:,1]

# ROC AUC on train and test; the baseline scores a constant prediction of 1
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, probs)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1] * len(y_test))}')

Decision tree has 127 nodes with maximum depth 7.
Train ROC AUC Score: 0.8574884720598969
Test ROC AUC  Score: 0.7837625823288181
Baseline ROC AUC: 0.5


In [None]:
# Calculate the accuracy score
# Accuracy and confusion matrix of the decision tree's test predictions
dt_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
dt_cm = confusion_matrix(y_true=y_test, y_pred=predictions)

print(f"Accuracy: {dt_accuracy}")
print("Confusion Matrix:\n", dt_cm)

Accuracy: 0.8098818474758325
Confusion Matrix:
 [[583  36]
 [141 171]]


### Random Forest

In [None]:
#Defining and Training the Random Forest Classifiesr with
# Random forest refitted on the reduced feature set with fixed hyperparameters
rf_params = dict(
    n_estimators=50,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
)
rf_model = RandomForestClassifier(random_state=0, n_jobs=-1, verbose=1, **rf_params)
rf_model.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities on the training split
train_rf_predictions = rf_model.predict(X_train)
train_rf_probs = rf_model.predict_proba(X_train)[:, 1]

# Same for the test split
rf_predictions = rf_model.predict(X_test)
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# ROC AUC on train and test; the baseline scores a constant prediction of 1
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_rf_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, rf_probs)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1] * len(y_test))}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Train ROC AUC Score: 0.8978861828036562
Test ROC AUC  Score: 0.8016703947640944
Baseline ROC AUC: 0.5


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished


In [None]:
# Calculate the accuracy score
# Accuracy and confusion matrix of the random forest's test predictions
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_predictions)

print(f"Accuracy: {rf_accuracy}")
print("Confusion Matrix:\n", rf_cm)

Accuracy: 0.8066595059076263
Confusion Matrix:
 [[570  49]
 [131 181]]


### Stochastic Gradient Descent

In [None]:
# Defining and Training the Stochastic Gradient Descent Model
# SGD classifier refitted on the reduced feature set with the tuned hyperparameters.
# max_iter=10000 matches the estimator that GridSearchCV tuned; leaving it at
# the default would refit a different configuration from the one selected.
sdg_model = SGDClassifier(random_state=0,
                          max_iter=10000,
                          alpha=0.0001,
                          class_weight=None,
                          early_stopping=False,
                          fit_intercept=False,
                          loss='hinge',
                          shuffle=False,
                          tol=0.0001)
sdg_model.fit(X_train, y_train)

# Accuracy on the held-out test split
sdg_predictions = sdg_model.predict(X_test)
sgd_score = sdg_model.score(X_test, y_test)
print("Accuracy: ", sgd_score)

# Confusion matrix for the same test predictions
sgd_cm = confusion_matrix(y_test,sdg_predictions)
print("Confusion Matrix:\n", sgd_cm)

Accuracy:  0.7089151450053706
Confusion Matrix:
 [[529  90]
 [181 131]]


### Support Vector Machine

In [None]:
# Defining and Training the Support Vector Machine Model
# Support vector machine refitted on the reduced feature set with fixed hyperparameters
svm_params = dict(C=5, gamma=1, kernel='rbf', shrinking=False)
svm_model = SVC(random_state=0, **svm_params).fit(X_train, y_train)

# Accuracy of the SVM on the held-out test split
svm_predictions = svm_model.predict(X_test)
svm_score = accuracy_score(y_test, svm_predictions)
print("Accuracy: ", svm_score)

# Confusion matrix for the same test predictions
svm_cm = confusion_matrix(y_test, svm_predictions)
print("Confusion Matrix:\n", svm_cm)

Accuracy:  0.8055853920515574
Confusion Matrix:
 [[581  38]
 [143 169]]
