## ESG controversy analysis - Modeling

In [1]:
# Import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier

import bnlearn as bn
from sklearn.metrics import classification_report

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go 
from plotly.graph_objects import Layout
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

### Modeling

In [2]:
# Import data
os.chdir(
    r"//Users/mlvos/Desktop/Moritz/Education/Erasmus University/Master/Master Thesis_code/"
)

df_merged = pd.read_csv("data/merged_data.csv", index_col=['id', 'year'])

In [3]:
# Define the columns to be one-hot encoded
categorical_cols = df_merged.select_dtypes(include=['object']).columns.tolist()
categorical_cols = categorical_cols[1:]

### Logistic Regression (perhaps with imputation and regularisation)

In [4]:
# Split the data into training and testing sets
#df_merged.dropna(axis=0, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_merged.drop(['ISIN Code', 'ESG Combined Score', 'GICS Industry Group Name', 'country',
                                                                    'Environmental Controversies Count','Social Controversies Count',
                                                                    'Governance Controversies Count',
                                                                    'Governance_controversy_binary',
                                                                    'Social_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Governance_controversy_binary'], axis=1), 
                                                    df_merged['Social_controversy_binary'], stratify=df_merged['Social_controversy_binary'], test_size=0.3, random_state=42)

In [5]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

# One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('preprocessor', preprocessor), 
                 ('imputer', KNNImputer(metric='nan_euclidean')),
                 ('smote', SMOTE(random_state=42)),
                 ('lr', LogisticRegression(max_iter=100, solver='saga', class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))])

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {'imputer__n_neighbors': [7],
              'smote__sampling_strategy': ['not minority'],  # minority
              'lr__C': [0.001],
              'lr__penalty': ['l1'],  # 'elasticnet', 'l1', 'l2'
              }

In [6]:
# Fit the pipeline with GridSearchCV to the training data
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [7]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [8]:
# Check paramters of best performing model
best_params

{'imputer__n_neighbors': 7,
 'lr__C': 0.001,
 'lr__penalty': 'l1',
 'smote__sampling_strategy': 'not minority'}

In [9]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.6651261997867046
Precision: 0.38850771869639794
Recall: 0.6642228739002932
F1 score: 0.4902597402597403


In [10]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_log_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_log_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_log_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_log_auc.update_xaxes(constrain='domain')
fig_log_auc.show()

Random Forest

In [11]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('scaler', StandardScaler()), 
                 #('smote', SMOTE(random_state=42)),
                #  ('lr', LogisticRegression(max_iter=10000, 
                #                            solver='saga',
                #                            random_state=42))])
                # ('preprocessor', preprocessor),
                ('imputer', KNNImputer(metric='nan_euclidean')), 
                ('classifier', RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))])

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
    'classifier__n_estimators': [100],
    'classifier__min_samples_split': [7],
    'classifier__max_depth': [7],
    'classifier__criterion': ['gini'],
    'classifier__min_samples_split': [7],
    'imputer__n_neighbors': [5],
}

In [12]:
# Create the grid search object
grid_search = GridSearchCV(pipe, param_grid, cv=2, scoring='f1', verbose=2)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] END classifier__criterion=gini, classifier__max_depth=7, classifier__min_samples_split=7, classifier__n_estimators=100, imputer__n_neighbors=5; total time=   1.2s
[CV] END classifier__criterion=gini, classifier__max_depth=7, classifier__min_samples_split=7, classifier__n_estimators=100, imputer__n_neighbors=5; total time=   1.2s


In [13]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))

Best parameters:  {'classifier__criterion': 'gini', 'classifier__max_depth': 7, 'classifier__min_samples_split': 7, 'classifier__n_estimators': 100, 'imputer__n_neighbors': 5}
Train score:  0.6208499140501437
Test score:  0.6238532110091743


In [14]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.6651261997867046
Precision: 0.38850771869639794
Recall: 0.6642228739002932
F1 score: 0.4902597402597403


In [15]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_rf_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_rf_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_rf_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_rf_auc.update_xaxes(constrain='domain')
fig_rf_auc.show()

Light Gradient Boosting

In [16]:
df_merged_gbm = df_merged.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [17]:
# Define the columns to be one-hot encoded
categorical_cols_gbm = df_merged_gbm.select_dtypes(include=['object']).columns.tolist()
categorical_cols_gbm = categorical_cols[:]

In [18]:
X_train_gbm, X_test_gbm, y_train_gbm, y_test_gbm = train_test_split(df_merged_gbm.drop(['ISINCode', 'ESGCombinedScore', 'GICSIndustryGroupName', 'country',
                                                                    'EnvironmentalControversiesCount','SocialControversiesCount',
                                                                    'GovernanceControversiesCount',
                                                                    'Governance_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Governance_controversy_binary',
                                                                    'RecentGovernanceControversies',
                                                                    'RecentSocialControversies',
                                                                    'Social_controversy_binary'], axis=1), 
                                                    df_merged_gbm['Social_controversy_binary'], stratify=df_merged_gbm['Social_controversy_binary'], test_size=0.3, random_state=42)

In [19]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train_gbm)

# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols_gbm)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('scaler', StandardScaler()), 
                 #('smote', SMOTE(random_state=42)),
                #  ('lr', LogisticRegression(max_iter=10000, 
                #                            solver='saga',
                #                            random_state=42))])
                #('preprocessor', preprocessor), # without preprocessing much higher dont know why!!
                ('classifier', LGBMClassifier(class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))])

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
    'classifier__n_estimators': [300],
    'classifier__learning_rate': [0.01],
    'classifier__max_depth': [20]
}

In [20]:
# Fit the pipeline with GridSearchCV to the training data
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train_gbm, y_train_gbm)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [21]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [22]:
best_params

{'classifier__learning_rate': 0.01,
 'classifier__max_depth': 20,
 'classifier__n_estimators': 300}

In [23]:
# Predict on y test
y_pred_gbm = best_model.predict(X_test_gbm)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test_gbm, y_pred_gbm))
print('Precision:', precision_score(y_test_gbm, y_pred_gbm))
print('Recall:', recall_score(y_test_gbm, y_pred_gbm))
print('F1 score:', f1_score(y_test_gbm, y_pred_gbm))

Accuracy: 0.7980803412726626
Precision: 0.5681818181818182
Recall: 0.6964809384164223
F1 score: 0.6258234519104086


In [24]:
# Predict on y test
y_pred_gbm = best_model.predict_proba(X_test_gbm)[::,1]

# Assuming y_pred and y_true are the predicted and true labels, respectively
fpr, tpr, _ = roc_curve(y_test_gbm, y_pred_gbm)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test_gbm, y_pred_gbm)

# first attempt
roc_auc_score(y_test_gbm,y_pred_gbm)

fig_rf_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_rf_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_rf_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_rf_auc.update_xaxes(constrain='domain')
fig_rf_auc.show()

In [26]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test_gbm, y_test_gbm))

Best parameters:  {'classifier__learning_rate': 0.01, 'classifier__max_depth': 20, 'classifier__n_estimators': 300}
Train score:  0.6289612105506471
Test score:  0.6258234519104086


Neural Network

In [27]:
# # # One-hot encode vairables
# # preprocessor = ColumnTransformer(
# #     transformers=[
# #         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
# #     ]
# # )

# pipeline = Pipeline([#('preprocessor', preprocessor),
#                      ('imputer', KNNImputer(metric='nan_euclidean')), 
#                      ('mlp', MLPClassifier(solver='adam', verbose=False))])

# param_grid = {
#     'mlp__alpha': [0.0001, 0.0002, 0.0003, 0.001, 0.002, 0.003, 0.01, 0.1, 0.3, 0.5],
#     'mlp__learning_rate_init': [0.0001, 0.0002, 0.0003, 0.001, 0.002, 0.003, 0.01, 0.1, 0.3, 0.5],
#     'mlp__hidden_layer_sizes': [(5,), (10,), (15,), (20,)],
#     'mlp__max_iter': [2000],
#     'mlp__activation': ['relu', 'logistic'],
#     'imputer__n_neighbors': [5, 7, 10]
# }

# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=6, scoring='f1', verbose=1)
# grid_search.fit(X_train, y_train)

In [28]:
# print("Best parameters: ", grid_search.best_params_)
# print("Train score: ", grid_search.best_score_)
# print("Test score: ", grid_search.score(X_test, y_test))

In [29]:
# # Use the best estimator from GridSearchCV to predict on the testing data
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

In [30]:
# # Predict on y test
# y_pred = best_model.predict(X_test)

# # Evaluate the model performance
# print('Accuracy:', accuracy_score(y_test, y_pred))
# print('Precision:', precision_score(y_test, y_pred))
# print('Recall:', recall_score(y_test, y_pred))
# print('F1 score:', f1_score(y_test, y_pred))

In [31]:
# # Predict on y test
# y_pred = best_model.predict_proba(X_test)[::,1]

# fpr, tpr, _ = roc_curve(y_test,  y_pred)

# # first attempt
# roc_auc_score(y_test,y_pred)

# fig_nn_auc = px.area(
#     x=fpr, y=tpr,
#     title=f'ROC Curve (AUC={auc:.2f})',
#     labels=dict(x='False Positive Rate', y='True Positive Rate'),
#     width=700, height=500
# )
# fig_nn_auc.add_shape(
#     type='line', line=dict(dash='dash'),
#     x0=0, x1=1, y0=0, y1=1
# )

# fig_nn_auc.update_yaxes(scaleanchor="x", scaleratio=1)
# fig_nn_auc.update_xaxes(constrain='domain')
# fig_nn_auc.show()

Naive Bayes Classifier

In [32]:
# # # One-hot encode vairables
# # preprocessor = ColumnTransformer(
# #     transformers=[
# #         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
# #     ]
# # )

# # Define the pipeline
# pipeline = Pipeline([
#     # ('preprocessor', preprocessor),
#     ('imputer', KNNImputer(metric='nan_euclidean')),
#     ('smote', SMOTE(random_state=42)),
#     ('classifier', MultinomialNB()) # Naive Bayes classifier
# ])


# # Define the hyperparameters to tune
# parameters = {
#     'imputer__n_neighbors': [3, 5, 7, 10, 15, 20],
#     'smote__sampling_strategy': ['minority', 'not minority'],
#     'classifier__alpha': list(np.logspace(-40.0, 3.0, 200)) # Smoothing parameter for Naive Bayes
# }

# # Perform grid search cross-validation to find the best hyperparameters
# grid_search = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
# grid_search.fit(X_train, y_train)

In [33]:
# # Use the best estimator from GridSearchCV to predict on the testing data
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

In [34]:
# best_params

In [35]:
# # Predict on y test
# y_pred = best_model.predict(X_test)

# # Evaluate the model performance
# print('Accuracy:', accuracy_score(y_test, y_pred))
# print('Precision:', precision_score(y_test, y_pred))
# print('Recall:', recall_score(y_test, y_pred))
# print('F1 score:', f1_score(y_test, y_pred))

In [36]:
# # Predict on y test
# y_pred = best_model.predict_proba(X_test)[::,1]

# fpr, tpr, _ = roc_curve(y_test,  y_pred)

# # first attempt
# roc_auc_score(y_test,y_pred)

# fig_nbc_auc = px.area(
#     x=fpr, y=tpr,
#     title=f'ROC Curve (AUC={auc:.2f})',
#     labels=dict(x='False Positive Rate', y='True Positive Rate'),
#     width=700, height=500
# )
# fig_nbc_auc.add_shape(
#     type='line', line=dict(dash='dash'),
#     x0=0, x1=1, y0=0, y1=1
# )

# fig_nbc_auc.update_yaxes(scaleanchor="x", scaleratio=1)
# fig_nbc_auc.update_xaxes(constrain='domain')
# fig_nbc_auc.show()

Bayesian Network Classifier

In [37]:
# # Define structure of Bayesian network using structure learning
# structure = bn.structure_learning.fit(df_merged)

# # Define the parameter grid for the Bayesian network classifier
# param_grid = {
#     'alpha': [0.1, 1, 10],
#     'beta': [0.1, 1, 10]
# }

# # Define the pipeline
# pipeline = Pipeline([
#     ('structure', structure),
#     ('cpds', bn.parameter_learning.fit),
#     ('clf', bn.BayesianNetworkClassifier())
# ])

# # Perform grid search to find the best hyperparameters
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and the classification report
# print("Best hyperparameters:", grid_search.best_params_)
# print("Classification report:")
# print(classification_report(y_test, grid_search.predict(y_test)))

Quadratic Discriminant Analysis

In [39]:
# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

pipeline = Pipeline([#('preprocessor', preprocessor), 
                     ('imputer', KNNImputer(metric='nan_euclidean')),
                     ('qda', QuadraticDiscriminantAnalysis())])

param_grid = {
    'imputer__n_neighbors': [7],
    'qda__reg_param': list(np.logspace(-40.0, 3.0, 200))
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




In [40]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [41]:
best_params

{'imputer__n_neighbors': 7, 'qda__reg_param': 0.047686116977144546}

In [42]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.7987913259864913
Precision: 0.5944625407166124
Recall: 0.5351906158357771
F1 score: 0.5632716049382717


In [43]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_qdc_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_qdc_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_qdc_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_qdc_auc.update_xaxes(constrain='domain')
fig_qdc_auc.show()

Support Vector Machine

In [None]:
# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

# ('preprocessor', preprocessor),


In [45]:
pipeline = Pipeline([('imputer', KNNImputer(metric='nan_euclidean')),
                     ('smote', SMOTE(random_state=42)),
                     ('svm', SVC(probability=True))])

param_grid = {
    'imputer__n_neighbors': [3],
    'smote__sampling_strategy': ['not minority'], #'minority',
    'svm__C': [0.5455594781168515],#list(np.logspace(-40.0, 5.0, 200)),
    'svm__kernel': ['poly'] #'linear', 'poly', 'rbf'
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=6, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END imputer__n_neighbors=3, smote__sampling_strategy=not minority, svm__C=0.5455594781168515, svm__kernel=poly; total time=   6.0s
[CV] END imputer__n_neighbors=3, smote__sampling_strategy=not minority, svm__C=0.5455594781168515, svm__kernel=poly; total time=   6.1s
[CV] END imputer__n_neighbors=3, smote__sampling_strategy=not minority, svm__C=0.5455594781168515, svm__kernel=poly; total time=   6.2s
[CV] END imputer__n_neighbors=3, smote__sampling_strategy=not minority, svm__C=0.5455594781168515, svm__kernel=poly; total time=   6.3s
[CV] END imputer__n_neighbors=3, smote__sampling_strategy=not minority, svm__C=0.5455594781168515, svm__kernel=poly; total time=   6.6s


In [46]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [47]:
best_params

{'imputer__n_neighbors': 3,
 'smote__sampling_strategy': 'not minority',
 'svm__C': 0.5455594781168515,
 'svm__kernel': 'poly'}

In [48]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.778883753999289
Precision: 0.6595744680851063
Recall: 0.18181818181818182
F1 score: 0.2850574712643678


In [49]:
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))

Best parameters:  {'imputer__n_neighbors': 3, 'smote__sampling_strategy': 'not minority', 'svm__C': 0.5455594781168515, 'svm__kernel': 'poly'}
Train score:  0.7726653493210484
Test score:  0.778883753999289


In [50]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_svm_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_svm_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_svm_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_svm_auc.update_xaxes(constrain='domain')
fig_svm_auc.show()