## ESG controversy analysis - Modeling

In [1]:
# Import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
#from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier, LGBMRegressor
import re

from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.ensemble import AdaBoostClassifier

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import plotly.express as px
import plotly.graph_objs as go 
from plotly.graph_objects import Layout
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

### Modeling

In [2]:
# Import data
os.chdir(
    r"//Users/mlvos/Desktop/Moritz/Education/Erasmus University/Master/Master Thesis_code/"
)

df_merged = pd.read_csv("data/merged_data.csv", index_col=['id', 'year'])

In [3]:
# Define the columns to be one-hot encoded
categorical_cols = df_merged.select_dtypes(include=['object']).columns.tolist()
categorical_cols = categorical_cols[1:]

In [4]:
# Create empty dataframe that compares output

df_results = pd.read_csv("data/results_environmental.csv")
# df_results = pd.DataFrame(columns=[
#                           'Model', 'Parameters', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Best params'])

**Logistic Regression (perhaps with imputation and regularisation)**

In [5]:
# Split the data into training and testing sets
#df_merged.dropna(axis=0, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_merged.drop(['ISIN Code', 'GICS Industry Group Name', 'country',
                                                                    'Environmental Controversies Count','Social Controversies Count',
                                                                    'Governance Controversies Count',
                                                                    'Governance_controversy_binary',
                                                                    'Social_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Governance_controversy_binary'], axis=1), 
                                                    df_merged['Environmental_controversy_binary'], stratify=df_merged['Environmental_controversy_binary'], test_size=0.3, random_state=42)

In [6]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

# One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('preprocessor', preprocessor), 
                 ('imputer', KNNImputer(metric='nan_euclidean')),
                 ('smote', SMOTE(random_state=42)),
                 ('lr', LogisticRegression(max_iter=1000, solver='saga', class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))])

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {'imputer__n_neighbors': [3, 5, 7],
              'smote__sampling_strategy': ['minority', 'not minority'],
              'lr__C': [0.001, 0.1, 1],
              'lr__penalty': ['elasticnet', 'l1', 'l2'], 
              }

In [7]:
# Fit the pipeline with GridSearchCV to the training data
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=2, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


36 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mlvos/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mlvos/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/Users/mlvos/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1471, in fit
    raise ValueError(
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

        nan        nan 0.0284040

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('imputer', KNNImputer()),
                                       ('smote', SMOTE(random_state=42)),
                                       ('lr',
                                        LogisticRegression(class_weight={0: 0.5064681179707486,
                                                                         1: 39.151119402985074},
                                                           max_iter=1000,
                                                           random_state=42,
                                                           solver='saga'))]),
             n_jobs=6,
             param_grid={'imputer__n_neighbors': [3, 5, 7],
                         'lr__C': [0.001, 0.1, 1],
                         'lr__penalty': ['elasticnet', 'l1', 'l2'],
                         'smote__sampling_strategy': ['minority',
                                                      'not minority']},
             scoring='f1', verbose

In [8]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [9]:
# Check paramters of best performing model
best_params

{'imputer__n_neighbors': 7,
 'lr__C': 0.001,
 'lr__penalty': 'l1',
 'smote__sampling_strategy': 'not minority'}

In [16]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.8243273293306649
Precision: 0.028332260141661302
Recall: 0.3826086956521739
F1 score: 0.052757793764988015


In [17]:
# Append to dataframe
df_results = df_results.append({'Model': 'LR', 'Accuracy': accuracy_score(y_test, y_pred),
                                'Parameters': pipe.named_steps,
                                'Precision': precision_score(y_test, y_pred),
                                'Recall': recall_score(y_test, y_pred),
                                'F1 Score': f1_score(y_test, y_pred),
                                'AUC': roc_auc_score(y_test, y_pred),
                                'Best params': best_params},
                               ignore_index=True)

print(df_results)

# write to csv
df_results.to_csv(r"data/results_environmental.csv")

  Model                                         Parameters  Accuracy  \
0    LR  {'imputer': KNNImputer(), 'smote': SMOTE(rando...  0.824327   

  Precision    Recall  F1 Score       AUC  \
0  0.028332  0.382609  0.052758  0.606329   

                                         Best params  
0  {'imputer__n_neighbors': 7, 'lr__C': 0.001, 'l...  



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [18]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_log_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_log_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_log_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_log_auc.update_xaxes(constrain='domain')
fig_log_auc.show()

In [19]:
# Get number of positive classes and rows in data set
positives = list(df_merged[df_merged['Environmental_controversy_binary'] > 0].shape)[0]
rows = list(df_merged.shape)[0]
baseline = positives/rows

In [20]:
print(positives)
print(rows)
print(baseline)

383
29979
0.012775609593382035


In [21]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test,  y_pred)

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline: (AUC={baseline:.2f})',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.1,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

**Light Gradient Boosting**

In [22]:
df_merged_gbm = df_merged.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_merged_gbm.dropna(inplace=True)

In [23]:
# Define the columns to be one-hot encoded
categorical_cols_gbm = df_merged_gbm.select_dtypes(include=['object']).columns.tolist()
categorical_cols_gbm = categorical_cols[:]

In [24]:
X_train_gbm, X_test_gbm, y_train_gbm, y_test_gbm = train_test_split(df_merged_gbm.drop(['ISINCode', 'GICSIndustryGroupName', 'country',
                                                                    'EnvironmentalControversiesCount','SocialControversiesCount',
                                                                    'GovernanceControversiesCount',
                                                                    'Governance_controversy_binary',
                                                                    'Environmental_controversy_binary',
                                                                    'Governance_controversy_binary',
                                                                    'RecentGovernanceControversies',
                                                                    'RecentSocialControversies',
                                                                    'Social_controversy_binary'], axis=1), 
                                                    df_merged_gbm['Environmental_controversy_binary'], stratify=df_merged_gbm['Environmental_controversy_binary'], test_size=0.3, random_state=42)

In [25]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN


 #('r', SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))), 
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train_gbm)

# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols_gbm)
#     ]
# )


# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('scaler', StandardScaler()),
                 #('imputer', KNNImputer(metric='nan_euclidean')),
                 #('smote', SMOTE(random_state=42)),
                #('preprocessor', preprocessor), # without preprocessing much higher dont know why!!
                ('resample', TomekLinks(sampling_strategy='majority')), #SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))),
                #('ada', ADASYN()), #gives quite a balanced result - p .25, recall .26
                ('classifier', LGBMClassifier(class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))]) #class_weight={0: class_weights[0], 1: class_weights[1]}

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
    #'imputer__n_neighbors': [3, 5, 7],
    #'smote__sampling_strategy': ['minority', 'not minority'], #'minority', 
    'classifier__n_estimators': [100, 300],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5]
}

In [26]:
# Fit the pipeline with GridSearchCV to the training data
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=2, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train_gbm, y_train_gbm)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('resample',
                                        TomekLinks(sampling_strategy='majority')),
                                       ('classifier',
                                        LGBMClassifier(class_weight={0: 0.5079260237780714,
                                                                     1: 32.041666666666664},
                                                       random_state=42))]),
             n_jobs=6,
             param_grid={'classifier__learning_rate': [0.01, 0.1],
                         'classifier__max_depth': [3, 5],
                         'classifier__n_estimators': [100, 300]},
             scoring='f1', verbose=1)

In [27]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [28]:
best_params

{'classifier__learning_rate': 0.01,
 'classifier__max_depth': 5,
 'classifier__n_estimators': 300}

In [29]:
# Predict on y test
y_pred_gbm = best_model.predict(X_test_gbm)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test_gbm, y_pred_gbm))
print('Precision:', precision_score(y_test_gbm, y_pred_gbm))
print('Recall:', recall_score(y_test_gbm, y_pred_gbm))
print('F1 score:', f1_score(y_test_gbm, y_pred_gbm))

Accuracy: 0.9388816644993498
Precision: 0.1111111111111111
Recall: 0.4166666666666667
F1 score: 0.17543859649122806


In [30]:
# Append to dataframe
df_results = df_results.append({'Model': 'GBM', 'Accuracy': accuracy_score(y_test_gbm, y_pred_gbm),
                                'Precision': precision_score(y_test_gbm, y_pred_gbm),
                                'Recall': recall_score(y_test_gbm, y_pred_gbm),
                                'F1 Score': f1_score(y_test_gbm, y_pred_gbm),
                                'AUC': roc_auc_score(y_test_gbm, y_pred_gbm),
                                'Best params': best_params},
                               ignore_index=True)

print(df_results)

# write to csv
df_results.to_csv(r"data/results_environmental.csv")

  Model                                         Parameters  Accuracy  \
0    LR  {'imputer': KNNImputer(), 'smote': SMOTE(rando...  0.824327   
1   GBM                                                NaN  0.938882   

  Precision    Recall  F1 Score       AUC  \
0  0.028332  0.382609  0.052758  0.606329   
1  0.111111  0.416667  0.175439  0.681913   

                                         Best params  
0  {'imputer__n_neighbors': 7, 'lr__C': 0.001, 'l...  
1  {'classifier__learning_rate': 0.01, 'classifie...  



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [31]:
# Predict on y test
y_pred_gbm = best_model.predict_proba(X_test_gbm)[::,1]

# Assuming y_pred and y_true are the predicted and true labels, respectively
fpr, tpr, _ = roc_curve(y_test_gbm, y_pred_gbm)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test_gbm, y_pred_gbm)

# first attempt
roc_auc_score(y_test_gbm,y_pred_gbm)

fig_rf_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_rf_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_rf_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_rf_auc.update_xaxes(constrain='domain')
fig_rf_auc.show()

In [32]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test_gbm, y_test_gbm))

Best parameters:  {'classifier__learning_rate': 0.01, 'classifier__max_depth': 5, 'classifier__n_estimators': 300}
Train score:  0.2239669421487604
Test score:  0.17543859649122806


In [33]:
# Predict on y test
y_pred = best_model.predict_proba(X_test_gbm)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test_gbm,  y_pred_gbm)

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline:{baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.1,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

**Quadratic Discriminant Analysis**

In [34]:
# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
#     ]
# )

pipeline = Pipeline([#('preprocessor', preprocessor), 
                     ('imputer', KNNImputer(metric='nan_euclidean')),
                     ('qda', QuadraticDiscriminantAnalysis())])

param_grid = {
    'imputer__n_neighbors': [3, 5, 7, 10],
    'qda__reg_param': list(np.logspace(-40.0, 3.0, 50))
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('imputer', KNNImputer()),
                                       ('qda',
                                        QuadraticDiscriminantAnalysis())]),
             n_jobs=-1,
             param_grid={'imputer__n_neighbors': [3, 5, 7, 10],
                         'qda__reg_param': [1e-40, 7.543120063354608e-40,
                                            5.689866029018282e-39,
                                            4.291934260128761e-38,
                                            3.2374575428176266e-37,
                                            2.442053094548635e-36,
                                            1.8420699693267014e-35,
                                            1.3894954943731246e-34,
                                            1....
                                            1.4563484775012385e-27,
                                            1.0985411419875527e-26,
                                       

In [35]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [36]:
best_params

{'imputer__n_neighbors': 3, 'qda__reg_param': 2.3299518105153814}

In [37]:
# Predict on y test
y_pred = best_model.predict(X_test)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))

Accuracy: 0.9872136980209029
Precision: 0.0
Recall: 0.0
F1 score: 0.0



invalid value encountered in power


invalid value encountered in log


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [38]:
# Append to dataframe
df_results = df_results.append({'Model': 'QDA', 'Accuracy': accuracy_score(y_test, y_pred),
                                'Parameters': pipe.named_steps,
                                'Precision': precision_score(y_test, y_pred),
                                'Recall': recall_score(y_test, y_pred),
                                'F1 Score': f1_score(y_test, y_pred),
                                'AUC': roc_auc_score(y_test, y_pred),
                                'Best params': best_params},
                               ignore_index=True)

print(df_results)

# write to csv
df_results.to_csv(r"data/results_environmental.csv")


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



  Model                                         Parameters  Accuracy  \
0    LR  {'imputer': KNNImputer(), 'smote': SMOTE(rando...  0.824327   
1   GBM                                                NaN  0.938882   
2   QDA  {'resample': TomekLinks(sampling_strategy='maj...  0.987214   

  Precision    Recall  F1 Score       AUC  \
0  0.028332  0.382609  0.052758  0.606329   
1  0.111111  0.416667  0.175439  0.681913   
2       0.0       0.0       0.0       0.5   

                                         Best params  
0  {'imputer__n_neighbors': 7, 'lr__C': 0.001, 'l...  
1  {'classifier__learning_rate': 0.01, 'classifie...  
2  {'imputer__n_neighbors': 3, 'qda__reg_param': ...  



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [39]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

fpr, tpr, _ = roc_curve(y_test,  y_pred)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_pred)

# first attempt
roc_auc_score(y_test,y_pred)

fig_qdc_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_qdc_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_qdc_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_qdc_auc.update_xaxes(constrain='domain')
fig_qdc_auc.show()


invalid value encountered in power


invalid value encountered in log



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))

Best parameters:  {'imputer__n_neighbors': 5, 'qda__reg_param': 0.30888435964774974}
Train score:  0.9871336623340818
Test score:  0.970647098065377


In [None]:
# Predict on y test
y_pred = best_model.predict_proba(X_test)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test,  y_pred)

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline: {baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.0,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()

**AdaBoost**

In [None]:
# Compute class weights for logistic regression
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train_gbm)

# # One-hot encode vairables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols_gbm)
#     ]
# )

# Create a pipeline with two steps: StandardScaler and LogisticRegression
pipe = Pipeline([#('scaler', StandardScaler()), 
                 ('imputer', KNNImputer(metric='nan_euclidean')), 
                 ('smote', SMOTE(random_state=42)),
                #  ('lr', LogisticRegression(max_iter=10000, 
                #                            solver='saga',
                #                            random_state=42))])
                #('preprocessor', preprocessor), # without preprocessing much higher dont know why!!
                ('classifier', AdaBoostClassifier())])

# Define a param_grid for GridSearchCV that includes the regularization parameter C
param_grid = {
    'imputer__n_neighbors': [3, 5, 7],
    'smote__sampling_strategy': ['minority', 'not minority'],
    'classifier__n_estimators': [100, 300, 500],
    'classifier__algorithm': ['SAMME.R', 'SAMME'],
    'classifier__learning_rate': [0.01, 0.1, 1]
}

In [None]:
# Fit the pipeline with GridSearchCV to the training data
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=2, verbose = 1, n_jobs=6, scoring = 'f1')
grid_search.fit(X_train_gbm, y_train_gbm)

Fitting 2 folds for each of 108 candidates, totalling 216 fits
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=100, imputer__n_neighbors=3, smote__sampling_strategy=minority; total time=  23.0s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=100, imputer__n_neighbors=5, smote__sampling_strategy=minority; total time=  23.4s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=100, imputer__n_neighbors=3, smote__sampling_strategy=minority; total time=  26.4s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=100, imputer__n_neighbors=5, smote__sampling_strategy=minority; total time=  26.9s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=100, imputer__n_neighbors=7, smote__sampling_strategy=minority; total time=  12.7s
[CV] END classifier__algorithm=SAMME.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=500, imputer__n_neighbors=5, smote__sampling_strategy=minority; total time=  15.0s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=300, imputer__n_neighbors=7, smote__sampling_strategy=not minority; total time=  41.0s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=500, imputer__n_neighbors=5, smote__sampling_strategy=minority; total time=  14.1s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=300, imputer__n_neighbors=7, smote__sampling_strategy=not minority; total time=  45.1s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estimators=500, imputer__n_neighbors=7, smote__sampling_strategy=minority; total time=  11.6s
[CV] END classifier__algorithm=SAMME.R, classifier__learning_rate=0.01, classifier__n_estima



108 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mlvos/opt/anaconda3/envs/python39/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mlvos/opt/anaconda3/envs/python39/lib/python3.10/site-packages/imblearn/pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/mlvos/opt/anaconda3/envs/python39/lib/python3.10/site-packages/imblearn/pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/mlvos/opt/anaconda3/envs/py

In [None]:
# Use the best estimator from GridSearchCV to predict on the testing data
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
best_params

{'classifier__algorithm': 'SAMME.R',
 'classifier__learning_rate': 1,
 'classifier__n_estimators': 500,
 'imputer__n_neighbors': 7,
 'smote__sampling_strategy': 'not minority'}

In [None]:
# Predict on y test
y_pred_gbm = best_model.predict(X_test_gbm)

# Evaluate the model performance
print('Accuracy:', accuracy_score(y_test_gbm, y_pred_gbm))
print('Precision:', precision_score(y_test_gbm, y_pred_gbm))
print('Recall:', recall_score(y_test_gbm, y_pred_gbm))
print('F1 score:', f1_score(y_test_gbm, y_pred_gbm))

Accuracy: 0.9857682899710919
Precision: 0.2903225806451613
Recall: 0.0782608695652174
F1 score: 0.12328767123287673


In [None]:
# Append to dataframe
df_results = df_results.append({'Model': 'ADA', 'Accuracy': accuracy_score(y_test, y_pred),
                                'Parameters': pipe.named_steps,
                                'Precision': precision_score(y_test, y_pred),
                                'Recall': recall_score(y_test, y_pred),
                                'F1 Score': f1_score(y_test, y_pred),
                                'AUC': roc_auc_score(y_test, y_pred),
                                'Best params': best_params},
                               ignore_index=True)

print(df_results)

# write to csv
df_results.to_csv(r"data/results_environmental.csv")

In [None]:
# Predict on y test
y_pred_gbm = best_model.predict_proba(X_test_gbm)[::,1]

# Assuming y_pred and y_true are the predicted and true labels, respectively
fpr, tpr, _ = roc_curve(y_test_gbm, y_pred_gbm)

# Compute AUC (Area Under the Curve)
auc = roc_auc_score(y_test_gbm, y_pred_gbm)

# first attempt
roc_auc_score(y_test_gbm,y_pred_gbm)

fig_rf_auc = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc:.2f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig_rf_auc.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig_rf_auc.update_yaxes(scaleanchor="x", scaleratio=1)
fig_rf_auc.update_xaxes(constrain='domain')
fig_rf_auc.show()

In [None]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Train score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test_gbm, y_test_gbm))

Best parameters:  {'classifier__algorithm': 'SAMME.R', 'classifier__learning_rate': 1, 'classifier__n_estimators': 500, 'imputer__n_neighbors': 7, 'smote__sampling_strategy': 'not minority'}
Train score:  0.1480216617065912
Test score:  0.12328767123287673


In [None]:
# Predict on y test
y_pred = best_model.predict_proba(X_test_gbm)[::,1]

precision, recall, thresholds = precision_recall_curve(y_test_gbm,  y_pred_gbm)

fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=baseline, y1=baseline
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.add_annotation(text=f'Baseline: {baseline:.4f}',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=1.0,
                    y=baseline,
                    bordercolor='black',
                    borderwidth=1)

fig.show()