## ESG controversy analysis - Modeling

In [1]:
# Import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
import re

from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc
from sklearn.ensemble import AdaBoostClassifier

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import plotly.express as px
import plotly.graph_objs as go 
from plotly.graph_objects import Layout
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN

from sklearn.metrics import confusion_matrix

In [2]:
import plotly.io as pio

# Define Custom Theme
pio.templates['master_thesis'] = go.layout.Template(
    layout=go.Layout(
        font = dict(family= 'Times New Roman', size = 30),
        xaxis = dict(zeroline = True, 
                    linewidth = 1, 
                    linecolor = 'black', 
                    title_font=dict(size=35),
                    mirror=False,
                    showline=True,
                    gridcolor='white'),
        yaxis = dict(zeroline = False,
                    rangemode = 'tozero', 
                    linewidth = 1, 
                    linecolor = 'black', 
                    title_font=dict(size=35), 
                    mirror=False,
                    showline=True,
                    gridcolor='white'),
        colorway=['#0055B3', '#FF2400', '#028A0F'],
        margin=dict(l=100, r=0, t=0, b=100),
        legend=dict(yanchor="top",
            y=0.98,
            xanchor="left",
            x=1.03,
            title = None,
            font=dict(size= 20),
            bordercolor="Black",
            borderwidth=1),
        plot_bgcolor='rgb(242, 242, 242)',
        ),
)
pio.templates.default = 'master_thesis'

### Modeling

In [3]:
# Import data
os.chdir(
    r"//Users/mlvos/Desktop/Moritz/Education/Erasmus University/Master/Master Thesis_code/"
)

df_merged = pd.read_csv("data/merged_data.csv", index_col=['id', 'year'])

In [4]:
# Define the columns to be one-hot encoded
categorical_cols = df_merged.select_dtypes(include=['object']).columns.tolist()
categorical_cols = categorical_cols[1:]

In [5]:
# Create empty dataframe that compares output

df_results = pd.read_csv("data/results_governance.csv")
# df_results = pd.DataFrame(columns=[
#                           'Model', 'Parameters', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Best params'])

**Logistic Regression (perhaps with imputation and regularisation)**

In [6]:
# Split the data into training and testing sets
#df_merged.dropna(axis=0, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_merged.drop(['ISIN Code', 'ESG Controversies Score', 'ESG Combined Score', 'GICS Industry Group Name', 'country',
                                                                    'Environmental Controversies Count','Social Controversies Count',
                                                                    'Governance Controversies Count',
                                                                    'Governance_controversy_binary',
                                                                    'Social_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Recent Governance Controversies',
                                                                    'Recent Social Controversies',
                                                                    'Governance_controversy_binary'], axis=1), 
                                                    df_merged['Governance_controversy_binary'], stratify=df_merged['Governance_controversy_binary'], test_size=0.3, random_state=42)

In [7]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

# One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('preprocessor', preprocessor), 
                 ('imputer', KNNImputer(metric='nan_euclidean')),
                 ('smote', SMOTE(sampling_strategy = 'minority', random_state=42)),
                 #('ada', ADASYN()),
                 ('lr', LogisticRegression(random_state=42, max_iter=5000, solver='saga', class_weight={0: class_weights[0], 1: class_weights[1]}))]) 
# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
   'imputer__n_neighbors': [3],
   'lr__C': [10],
   'lr__penalty': ['l1']
}        

In [8]:
# Fit the pipeline with GridSearchCV to the training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [9]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [10]:
# Check paramters of best performing model
best_params

{'imputer__n_neighbors': 3, 'lr__C': 10, 'lr__penalty': 'l1'}

In [11]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.042472759617522796
Precision: 0.02711864406779661
Recall: 0.9917355371900827
F1 score: 0.05279366476022877


In [12]:
# Append to dataframe
df_results = df_results.append({'Model': 'LR', 'Accuracy': accuracy_score(y_test, y_pred),
                                'Parameters': pipe.named_steps,
                                'Precision': precision_score(y_test, y_pred),
                                'Recall': recall_score(y_test, y_pred),
                                'F1 Score': f1_score(y_test, y_pred),
                                'AUC': roc_auc_score(y_test, y_pred),
                                'Best params': best_params},
                               ignore_index=True)

#print(df_results)

# write to csv
df_results.to_csv(r"data/results_governance.csv")

  df_results = df_results.append({'Model': 'LR', 'Accuracy': accuracy_score(y_test, y_pred),


In [13]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

fpr_lr, tpr_lr = fpr, tpr

# Compute AUC (Area Under the Curve)
auc_ = roc_auc_score(y_test, y_pred)

auc_lr = auc_

fig_log_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc_:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_log_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_log_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_log_auc.update_xaxes(constrain='domain')
fig_log_auc.show()

fig_log_auc.write_image("images/results/ROC_governance_lr.png")

In [56]:
# Get number of positive classes and rows in data set
positives = list(df_merged[df_merged['Governance_controversy_binary'] > 0].shape)[0]
rows = list(df_merged.shape)[0]
baseline = positives/rows

In [57]:
precision, recall, thresholds = precision_recall_curve(y_test,  y_pred)
auc_precision_recall = auc(recall, precision)

In [58]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test,  y_pred)

precision_lr, recall_lr = precision, recall

auc_precision_recall = auc(recall, precision)

auc_precision_recall_lr = auc_precision_recall

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc_precision_recall:.2f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline: (AUC={baseline:.2f})',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.0,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

fig.write_image("images/results/PRC_governance_lr.png")

In [59]:
fig = go.Figure(layout=go.Layout(width=700, height=500
    ))#title=go.layout.Title(text="Precision and Recall by Threshold")
fig.add_trace(go.Scatter(name="Precision", x=thresholds, y=precision[:-1]))
fig.add_trace(go.Scatter(name="Recall", x=thresholds, y=recall[:-1]))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")
fig

fig.write_image("images/results/PCR_threshold_governance_lr.png")

In [60]:
# Generate a range of threshold values
thresholds = np.arange(0.01, 1.00, 0.01)

# Loop through threshold values and calculate F1 score for each
f1_scores = []
for threshold in thresholds:
    y_pred_2 = (y_pred >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_2))

# Find optimal threshold value that maximizes F1 score
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Evaluate the model performance
y_pred_2 = (y_pred >= optimal_threshold).astype(int)
print('Accuracy:', accuracy_score(y_test, y_pred_2))
print('Precision:', precision_score(y_test, y_pred_2))
print('Recall:', recall_score(y_test, y_pred_2))
print('F1 score:', f1_score(y_test, y_pred_2))

Accuracy: 0.966310873915944
Precision: 0.24793388429752067
Recall: 0.12396694214876033
F1 score: 0.1652892561983471


In [61]:
# Print optimal threshold
print('optimal threshold:', optimal_threshold)

optimal threshold: 0.01


In [62]:
# Display confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_2).ravel()
print('tn:', tn, 'fp:', fp, 'fn:', fn, 'tp:', tp)

tn: 8661 fp: 91 fn: 212 tp: 30


**Light Gradient Boosting**

In [63]:
df_merged_gbm = df_merged.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_merged_gbm.dropna(inplace=True)

In [64]:
# Define the columns to be one-hot encoded
categorical_cols_gbm = df_merged_gbm.select_dtypes(include=['object']).columns.tolist()
categorical_cols_gbm = categorical_cols[:]

In [65]:
X_train_gbm, X_test_gbm, y_train_gbm, y_test_gbm = train_test_split(df_merged_gbm.drop(['ISINCode', 'ESGCombinedScore', 'ESGControversiesScore', 'GICSIndustryGroupName', 'country',
                                                                    'EnvironmentalControversiesCount','SocialControversiesCount',
                                                                    'GovernanceControversiesCount',
                                                                    'Governance_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Governance_controversy_binary',
                                                                    'RecentGovernanceControversies',
                                                                    'RecentSocialControversies',
                                                                    'Social_controversy_binary'], axis=1), 
                                                    df_merged_gbm['Governance_controversy_binary'], stratify=df_merged_gbm['Governance_controversy_binary'], test_size=0.3, random_state=42)

In [66]:
#('r', SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))), 
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train_gbm)

# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols_gbm)
#     ]
# )


# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('scaler', StandardScaler()),
                ('imputer', KNNImputer(metric='nan_euclidean')),
                ('smote', SMOTE(sampling_strategy = 'minority', random_state=42)),
                #('preprocessor', preprocessor), # without preprocessing much higher dont know why!!
                #('resample', TomekLinks(sampling_strategy='majority')), #SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))),
                #('ada', ADASYN()), #gives quite a balanced result - p .25, recall .26
                ('classifier', LGBMClassifier(random_state=42))]) #class_weight={0: class_weights[0], 1: class_weights[1]}

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
    'classifier__learning_rate': [0.01],
    'classifier__max_depth': [10],
    'classifier__n_estimators': [700],
    'imputer__n_neighbors': [3],
    'classifier__boosting_type': ['gbdt'],
    'classifier__num_leaves': [31]
}

In [67]:
# Fit the pipeline with GridSearchCV to the training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train_gbm, y_train_gbm)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [68]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [69]:
best_params

{'classifier__boosting_type': 'gbdt',
 'classifier__learning_rate': 0.01,
 'classifier__max_depth': 10,
 'classifier__n_estimators': 700,
 'classifier__num_leaves': 31,
 'imputer__n_neighbors': 3}

In [70]:
# Predict on y test
y_pred_gbm = best_model.predict(X_test_gbm)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test_gbm, y_pred_gbm))
print('Precision:', precision_score(y_test_gbm, y_pred_gbm))
print('Recall:', recall_score(y_test_gbm, y_pred_gbm))
print('F1 score:', f1_score(y_test_gbm, y_pred_gbm))

Accuracy: 0.9536194191590811
Precision: 0.4375
Recall: 0.13592233009708737
F1 score: 0.20740740740740743


In [71]:
# Append to dataframe
df_results = df_results.append({'Model': 'GBM', 'Accuracy': accuracy_score(y_test_gbm, y_pred_gbm),
                                'Parameters': pipe.named_steps,
                                'Precision': precision_score(y_test_gbm, y_pred_gbm),
                                'Recall': recall_score(y_test_gbm, y_pred_gbm),
                                'F1 Score': f1_score(y_test_gbm, y_pred_gbm),
                                'AUC': roc_auc_score(y_test_gbm, y_pred_gbm),
                                'Best params': best_params},
                               ignore_index=True)

#print(df_results)

# write to csv
df_results.to_csv(r"data/results_governance.csv")


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [72]:
# Predict on y test
y_pred_gbm = best_model.predict_proba(X_test_gbm)[::,1]

# Assuming y_pred and y_true are the predicted and true labels, respectively
fpr, tpr, _ = roc_curve(y_test_gbm,  y_pred_gbm)

fpr_lgbm, tpr_lgbm = fpr, tpr

# Compute AUC (Area Under the Curve)
auc_ = roc_auc_score(y_test_gbm, y_pred_gbm)

auc_lgbm = auc_

fig_rf_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc_:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_rf_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_rf_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_rf_auc.update_xaxes(constrain='domain')
fig_rf_auc.show()

fig_rf_auc.write_image("images/results/ROC_governance_lgbm.png")

In [73]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test_gbm, y_test_gbm))

Best parameters:  {'classifier__boosting_type': 'gbdt', 'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__n_estimators': 700, 'classifier__num_leaves': 31, 'imputer__n_neighbors': 3}
Train score:  0.16198260181003926
Test score:  0.20740740740740743


In [74]:
# Predict on y test
y_pred = best_model.predict_proba(X_test_gbm)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test_gbm,  y_pred_gbm)

precision_lgbm, recall_lgbm = precision, recall

auc_precision_recall = auc(recall, precision)

auc_precision_recall_lgbm = auc_precision_recall

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc_precision_recall:.2f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline:{baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.0,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

fig.write_image("images/results/PRC_governance_lgbm.png")

In [75]:
fig = go.Figure(layout=go.Layout(width=700, height=500
    ))#title=go.layout.Title(text="Precision and Recall by Threshold")
fig.add_trace(go.Scatter(name="Precision", x=thresholds, y=precision[:-1]))
fig.add_trace(go.Scatter(name="Recall", x=thresholds, y=recall[:-1]))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")

fig.show()

fig.write_image("images/final/governance/PCR_threshold_governance_lgbm.png")
pio.write_image(fig, 'images/final/governance/PCR_threshold_governance_lgbm.png', scale=5, width=700, height=500)

In [76]:
from sklearn.inspection import permutation_importance

# Calculate permutation feature importance
result = permutation_importance(
    grid_search, X_test_gbm, y_test_gbm, n_repeats=10, random_state=42, n_jobs=6
)

feature_names = X_train_gbm.columns

# Put the result in a dataframe
forest_importances = pd.DataFrame(result.importances_mean, columns=['importance'], index=feature_names)

# Add standard deviations to the dataframe
forest_importances['std'] = result.importances_std

# Sort the dataframe by largest values
forest_importances.sort_values(by='importance', ascending=False, inplace=True)

In [77]:
# Store top 20 in dataframe
feature_import = pd.DataFrame(forest_importances['importance'][:20])
feature_import.index = ['Net Assets', 
                                           'Total Assets', 
                                           'Announced Layoffs/Total Employees Score',
                                           'Flexible Working Hours Score',
                                           'Human Rights Contractor Score',
                                           'Policy Data Privacy Score',
                                           'Board Individual Reelection Score',
                                           'Targets Emissions Score',
                                           'Shareholders Approval Stock Comensation Score',
                                           'Board Functions Policy Score',
                                           'Executive Compensation Objectives Score',
                                           'Anti Takeover Score',
                                           'Employees Health and Safety Score',
                                           'Average Board Tensure Score',
                                           'Health Safety Policy Score',
                                           'Board Background ans Skills Score',
                                           'Total Liabilites',
                                           'Training and Development Policy Score',
                                           'Board Size Score',
                                           'Resource Reduction Policy Score']

In [78]:
fig = px.bar(feature_import, y="importance", width=800, 
                           height=900) #, title="Feature importances using permutation on full model"
fig.update_yaxes(title = 'Importance')
fig.update_xaxes(title = None)

fig.update_layout(margin=dict(l=120, r=20, t=20, b=600),)

fig.show()

fig.write_image("images/final/governance/feature_importance_governance_lgbm.png")
pio.write_image(fig, 'images/final/governance/feature_importance_governance_lgbm.png', scale=5, width=800, height=900)

In [79]:
# Generate a range of threshold values
thresholds = np.arange(0.01, 1.00, 0.01)

# Loop through threshold values and calculate F1 score for each
f1_scores = []
for threshold in thresholds:
    y_pred_2 = (y_pred >= threshold).astype(int)
    f1_scores.append(f1_score(y_test_gbm, y_pred_2))

# Find optimal threshold value that maximizes F1 score
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Evaluate the model performance
y_pred_2 = (y_pred >= optimal_threshold).astype(int)
print('Accuracy:', accuracy_score(y_test_gbm, y_pred_2))
print('Precision:', precision_score(y_test_gbm, y_pred_2))
print('Recall:', recall_score(y_test_gbm, y_pred_2))
print('F1 score:', f1_score(y_test_gbm, y_pred_2))

Accuracy: 0.929345470307759
Precision: 0.3
Recall: 0.4368932038834951
F1 score: 0.3557312252964427


In [80]:
# Print optimal threshold
print('optimal threshold:', optimal_threshold)

optimal threshold: 0.2


In [81]:
# Display confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test_gbm, y_pred_2).ravel()
print('tn:', tn, 'fp:', fp, 'fn:', fn, 'tp:', tp)

tn: 2099 fp: 105 fn: 58 tp: 45


**Quadratic Discriminant Analysis**

In [82]:
# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

pipeline = Pipeline([#('preprocessor', preprocessor), 
                     ('imputer', KNNImputer(metric='nan_euclidean')),
                     ('smote', SMOTE(sampling_strategy = 'minority', random_state=42)),
                     #('ada', ADASYN()),
                     ('qda', QuadraticDiscriminantAnalysis())])

param_grid = {
    'imputer__n_neighbors': [3],
    'qda__reg_param': [0.6723357536499335] #list(np.logspace(0.0, 1.0, 5))
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=6, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [83]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [84]:
best_params

{'imputer__n_neighbors': 3, 'qda__reg_param': 0.6723357536499335}

In [85]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.9674227262619524
Precision: 0.2524271844660194
Recall: 0.10743801652892562
F1 score: 0.15072463768115943


In [86]:
# Append to dataframe
df_results = df_results.append({'Model': 'QDA', 'Accuracy': accuracy_score(y_test, y_pred),
                                'Parameters': pipeline.named_steps,
                                'Precision': precision_score(y_test, y_pred),
                                'Recall': recall_score(y_test, y_pred),
                                'F1 Score': f1_score(y_test, y_pred),
                                'AUC': roc_auc_score(y_test, y_pred),
                                'Best params': best_params},
                               ignore_index=True)

#print(df_results)

# write to csv
df_results.to_csv(r"data/results_governance.csv")


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [87]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

fpr_qda, tpr_qda = fpr, tpr

# Compute AUC (Area Under the Curve)
auc_ = roc_auc_score(y_test, y_pred)

auc_qda = auc_

fig_qdc_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc_:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_qdc_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_qdc_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_qdc_auc.update_xaxes(constrain='domain')
fig_qdc_auc.show()

fig_qdc_auc.write_image("images/results/ROC_governance_qda.png")

In [88]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))

Best parameters:  {'imputer__n_neighbors': 3, 'qda__reg_param': 0.6723357536499335}
Train score:  0.9693114129139861
Test score:  0.9674227262619524


In [89]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test,  y_pred)

precision_qda, recall_qda = precision, recall

auc_precision_recall = auc(recall, precision)

auc_precision_recall_qda = auc_precision_recall

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc_precision_recall:.2f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline: {baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.0,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

fig.write_image("images/results/PRC_governance_qda.png")

In [90]:
fig = go.Figure(layout=go.Layout(width=700, height=500
    ))#title=go.layout.Title(text="Precision and Recall by Threshold")
fig.add_trace(go.Scatter(name="Precision", x=thresholds, y=precision[:-1]))
fig.add_trace(go.Scatter(name="Recall", x=thresholds, y=recall[:-1]))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")
fig

fig.write_image("images/results/PCR_threshold_governance_qda.png")

In [91]:
# Generate a range of threshold values
thresholds = np.arange(0.01, 1.00, 0.01)

# Loop through threshold values and calculate F1 score for each
f1_scores = []
for threshold in thresholds:
    y_pred_2 = (y_pred >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_2))

# Find optimal threshold value that maximizes F1 score
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Evaluate the model performance
y_pred_2 = (y_pred >= optimal_threshold).astype(int)
print('Accuracy:', accuracy_score(y_test, y_pred_2))
print('Precision:', precision_score(y_test, y_pred_2))
print('Recall:', recall_score(y_test, y_pred_2))
print('F1 score:', f1_score(y_test, y_pred_2))

Accuracy: 0.966310873915944
Precision: 0.24793388429752067
Recall: 0.12396694214876033
F1 score: 0.1652892561983471


In [92]:
# Print optimal threshold
print('optimal threshold:', optimal_threshold)

optimal threshold: 0.01


In [93]:
# Display confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_2).ravel()
print('tn:', tn, 'fp:', fp, 'fn:', fn, 'tp:', tp)

tn: 8661 fp: 91 fn: 212 tp: 30


### Combine Figures

In [94]:
# Combined ROC Curve
fig = go.Figure()

fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=600, height=500,
)
#title = 'ROC Curves for the Environmental Pillar'

fig.add_trace(go.Scatter(x=fpr_lr, y=tpr_lr, name=f"LR (AUC={auc_lr:.2f})", mode='lines'))
fig.add_trace(go.Scatter(x=fpr_lgbm, y=tpr_lgbm, name=f"GB (AUC={auc_lgbm:.2f})", mode='lines'))
fig.add_trace(go.Scatter(x=fpr_qda, y=tpr_qda, name=f"QDA (AUC={auc_qda:.2f})", mode='lines'))

fig.show()

fig.write_image("images/final/governance/ROC_governance_combined.png")

pio.write_image(fig, 'images/final/governance/ROC_governance_combined.png', scale=5, width=600, height=500)

In [95]:
# Combined PRC Curve
fig = go.Figure()

fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)

fig.update_layout(
    xaxis_title='Recall',
    yaxis_title='Precision',
    yaxis=dict(scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=600, height=500,
)

#title = 'PRC Curves for the Environmental Pillar'

fig.add_annotation(text=f'Baseline: {baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.65,
                    y=0.01,
                    bordercolor='black',
                    borderwidth=1,
                    font=dict(size=25,),)

fig.add_trace(go.Scatter(x=recall_lr, y=precision_lr, name=f"LR (AUC={auc_lr:.2f})", mode='lines'))
fig.add_trace(go.Scatter(x=recall_lgbm, y=precision_lgbm, name=f"GB (AUC={auc_lgbm:.2f})", mode='lines'))
fig.add_trace(go.Scatter(x=recall_qda, y=precision_qda, name=f"QDA (AUC={auc_qda:.2f})", mode='lines'))

fig.show()

fig.write_image("images/final/governance/PRC_governance_combined.png")

pio.write_image(fig, 'images/final/governance/PRC_governance_combined.png', scale=5, width=600, height=500)