### Import Packages

In [21]:
import pandas as pd
import numpy as np
import json
import warnings

warnings.filterwarnings("ignore")

from preprocessing_helper import *
from machine_learning_helper import *
from itertools import combinations,product

# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn import ensemble
# from sklearn.metrics import roc_auc_score,accuracy_score,classification_report,plot_confusion_matrix,roc_curve,recall_score,confusion_matrix
# from sklearn.decomposition import PCA
# from sklearn.neural_network import MLPClassifier
# from sklearn.preprocessing import label_binarize

### I. Data Preprocessing

In [22]:
# Read Corp list with processed sentences, not applicable for new companies 
df = pd.read_csv("Processed_Corp_List.csv")
df = sentenceCleaning(df)
df2 = pd.read_csv('Labelled_Answers.csv')
# Filter irrelevant data points (non 2021 or PDF processing error)
df2 = df2[df2.Q3 != 'NA_2021']
df2 = df2[df2.Q3 != 'PDF_Error']
# Merge company data with labels 
df3 = pd.merge(df, df2, how='inner', left_on='IssuerName', right_on='IssuerName')
df3.head()

Unnamed: 0.1,Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q3,Q4,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14
0,0,Repsol SA,ES0173516115,REP,Spain,Energy,Integrated Oil & Gas,2021,https://www.repsol.com/content/dam/repsol-corp...,[2 0 2 1 REPSOL Group Integrated Management Re...,,Yes,Yes,Established Carbon Transition Plan,Yes,,Yes,Yes,Yes,Yes
1,1,OMV AG,AT0000743059,OMV,Austria,Energy,Integrated Oil & Gas,2021,https://www.omv.com/services/downloads/00/omv....,[Sustainability Report 2021 Non-Financial Repo...,,Yes,Yes,Established Carbon Transition Plan,Yes,Yes,Yes,Yes,,Yes
2,2,TotalEnergies SE,FR0000120271,TTE,France,Energy,Integrated Oil & Gas,2021,https://totalenergies.com/system/files/documen...,[Universal Registration Document 2021including...,,Yes,Yes,Established Carbon Transition Plan,,,Yes,Yes,Yes,Yes
3,4,Eni SpA,IT0003132476,ENI,Italy,Energy,Integrated Oil & Gas,2021,https://www.eni.com/assets/documents/eng/just-...,[Eni for 2021 A just transition ##PAGE_BREAK##...,Yes,Yes,Yes,Plans to Transition to Low Carbon Environment,Yes,Yes,Yes,Yes,Yes,Yes
4,5,Woodside Energy Group Ltd.,AU0000224040,WDS,Australia,Energy,Oil & Gas Exploration & Production,2021,https://www.woodside.com/docs/default-source/i...,[SUSTAINABLE DEVELOPMENT REPORT ##PAGE_BREAK##...,,Yes,Yes,Plans to Transition to Low Carbon Environment,,Yes,,Yes,,Yes


In [23]:
# Map labels to 0, 1 and 2 for ML 
df3 = df3[['IssuerName', 'Ticker', 'CountryOfIncorporation', 'GICSSector', 'GICSSubIndustry', 'Q3', 'Q4', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Processed_Sentences']]
for i in ['Q3', 'Q4', 'Q7', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14']:
    df3[i] = df3[i].replace({np.nan: 0, 'Yes': 1, "NA ": 0})
df3['Q8'] = df3['Q8'].replace({np.nan: 0, 'Plans to Transition to Low Carbon Environment': 1, 'Established Carbon Transition Plan': 2})

In [24]:
# Load question : keyword_list mappings
f = open("question_keywords.json", "r")
question_keywords = json.loads(f.read())
f.close()

### II. Base Machine Learning Models

In [25]:
df_cols = ['DecisionTree', 'RandomForest', 'ExtraTrees', 'LogisticRegression', 'GradientBoosting', 'SupportVectorMachines', 'KNeighbours', 'NaiveBayes', 'Stacking']
indexes = ['Q3', 'Q4', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14']
combined_model_df = pd.DataFrame(columns = df_cols, index = indexes)
combined_model_results = {}
for qn_name in indexes:
    combined_model_results[qn_name] = []

#### [A] Decision Trees

In [26]:
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for num in list(product(['gini','entropy'],range(1,10))):
   decision_tree=DecisionTreeClassifier(criterion=num[0],max_depth=num[1], random_state=123)
   models.append(decision_tree) 
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'DecisionTree'] = max(best_roc, combined_model_df.at[qn_name, 'DecisionTree'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.7182142857142857. Feature engineering: getCountVectVaderDf, Model: DecisionTreeClassifier(max_depth=1, random_state=123) 
Best weighted ROC AUC for Q4: 0.760909090909091. Feature engineering: getCountVectDf, Model: DecisionTreeClassifier(max_depth=2, random_state=123) 
Best weighted ROC AUC for Q7: 0.8550000000000001. Feature engineering: getTfidfVaderDf, Model: DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=123) 
Best weighted ROC AUC for Q8: 0.6887070105820106. Feature engineering: getTfidfVaderDf, Model: DecisionTreeClassifier(max_depth=3, random_state=123) 
Best weighted ROC AUC for Q9: 0.6578571428571429. Feature engineering: getTfidfDf, Model: DecisionTreeClassifier(max_depth=3, random_state=123) 
Best weighted ROC AUC for Q10: 0.6995535714285714. Feature engineering: getCountVectVaderDf, Model: DecisionTreeClassifier(max_depth=3, random_state=123) 
Best weighted ROC AUC for Q11: 0.7537037037037038. Feature engineering: getCo

#### [B] Logistic Regression

In [27]:
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
logit_classifier=LogisticRegression(random_state=123)
models.append(logit_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'LogisticRegression'] = max(best_roc, combined_model_df.at[qn_name, 'LogisticRegression'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.7172321428571429. Feature engineering: getTfidfVaderDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q4: 0.89. Feature engineering: getTfidfDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q7: 0.9051785714285714. Feature engineering: getCountVectDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q8: 0.6510769400352734. Feature engineering: getTfidfDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q9: 0.7698214285714287. Feature engineering: getCountVectDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q10: 0.6525. Feature engineering: getTfidfVaderDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q11: 0.912962962962963. Feature engineering: getTfidfVaderDf, Model: LogisticRegression(random_state=123) 
Best weighted ROC AUC for Q12: 0.8658730158730158. Feature engineering: getCountVectDf, Model: LogisticReg

#### [C] SVM 

In [28]:
from sklearn.svm import SVC
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for kernels in ['rbf','poly']:
    svm_classifier=SVC(kernel=kernels,probability=True, random_state=123)
    models.append(svm_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'SupportVectorMachines'] = max(best_roc, combined_model_df.at[qn_name, 'SupportVectorMachines'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.7536607142857144. Feature engineering: getTfidfVaderDf, Model: SVC(kernel='poly', probability=True, random_state=123) 
Best weighted ROC AUC for Q4: 0.78. Feature engineering: getTfidfVaderDf, Model: SVC(kernel='poly', probability=True, random_state=123) 
Best weighted ROC AUC for Q7: 0.9051785714285714. Feature engineering: getCountVectDf, Model: SVC(probability=True, random_state=123) 
Best weighted ROC AUC for Q8: 0.680056216931217. Feature engineering: getCountVectDf, Model: SVC(probability=True, random_state=123) 
Best weighted ROC AUC for Q9: 0.47696428571428573. Feature engineering: getTfidfVaderDf, Model: SVC(kernel='poly', probability=True, random_state=123) 
Best weighted ROC AUC for Q10: 0.6753571428571428. Feature engineering: getTfidfVaderDf, Model: SVC(kernel='poly', probability=True, random_state=123) 
Best weighted ROC AUC for Q11: 0.8993055555555556. Feature engineering: getTfidfVaderDf, Model: SVC(kernel='poly', probability=True, random

#### [D] K-Neighbours

In [29]:
from sklearn.neighbors import KNeighborsClassifier
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for num in range(5,20):
    neighbors_classifier=KNeighborsClassifier(n_neighbors=num)
    models.append(neighbors_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'KNeighbours'] = max(best_roc, combined_model_df.at[qn_name, 'KNeighbours'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.755625. Feature engineering: getTfidfVaderDf, Model: KNeighborsClassifier(n_neighbors=9) 
Best weighted ROC AUC for Q4: 0.8068181818181819. Feature engineering: getCountVectDf, Model: KNeighborsClassifier() 
Best weighted ROC AUC for Q7: 0.8875. Feature engineering: getCountVectDf, Model: KNeighborsClassifier(n_neighbors=18) 
Best weighted ROC AUC for Q8: 0.6609413580246914. Feature engineering: getTfidfVaderDf, Model: KNeighborsClassifier(n_neighbors=6) 
Best weighted ROC AUC for Q9: 0.7009821428571428. Feature engineering: getTfidfVaderDf, Model: KNeighborsClassifier(n_neighbors=6) 
Best weighted ROC AUC for Q10: 0.6763392857142857. Feature engineering: getTfidfVaderDf, Model: KNeighborsClassifier(n_neighbors=16) 
Best weighted ROC AUC for Q11: 0.8998842592592593. Feature engineering: getTfidfVaderDf, Model: KNeighborsClassifier(n_neighbors=6) 
Best weighted ROC AUC for Q12: 0.843968253968254. Feature engineering: getTfidfVaderDf, Model: KNeighborsClas

#### [E] Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
classifiers = [MultinomialNB(), GaussianNB(), BernoulliNB()]
for classifier in classifiers:
   models.append(classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'NaiveBayes'] = max(best_roc, combined_model_df.at[qn_name, 'NaiveBayes'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.6740178571428571. Feature engineering: getCountVectVaderDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q4: 0.86. Feature engineering: getTfidfDf, Model: MultinomialNB() 
Best weighted ROC AUC for Q7: 0.8994642857142857. Feature engineering: getCountVectDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q8: 0.6745535714285714. Feature engineering: getCountVectDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q9: 0.7651785714285715. Feature engineering: getTfidfDf, Model: MultinomialNB() 
Best weighted ROC AUC for Q10: 0.6310714285714285. Feature engineering: getCountVectDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q11: 0.8914351851851852. Feature engineering: getCountVectDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q12: 0.8739682539682541. Feature engineering: getCountVectDf, Model: BernoulliNB() 
Best weighted ROC AUC for Q13: 0.96875. Feature engineering: getTfidfDf, Model: MultinomialNB() 
Best weighted ROC AUC for Q14: 0.78412698

### III. Ensemble Learning Models

#### [A] Random Forest

In [31]:
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for i in [50, 100, 150]:
   rf_classifier=ensemble.RandomForestClassifier(n_estimators=i, random_state=123)
   models.append(rf_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'RandomForest'] = max(best_roc, combined_model_df.at[qn_name, 'RandomForest'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.6916964285714287. Feature engineering: getTfidfDf, Model: RandomForestClassifier(n_estimators=50, random_state=123) 
Best weighted ROC AUC for Q4: 0.9. Feature engineering: getTfidfVaderDf, Model: RandomForestClassifier(n_estimators=150, random_state=123) 
Best weighted ROC AUC for Q7: 0.9051785714285714. Feature engineering: getCountVectDf, Model: RandomForestClassifier(n_estimators=50, random_state=123) 
Best weighted ROC AUC for Q8: 0.7085416666666668. Feature engineering: getCountVectVaderDf, Model: RandomForestClassifier(n_estimators=50, random_state=123) 
Best weighted ROC AUC for Q9: 0.74125. Feature engineering: getCountVectDf, Model: RandomForestClassifier(n_estimators=150, random_state=123) 
Best weighted ROC AUC for Q10: 0.6546428571428572. Feature engineering: getCountVectVaderDf, Model: RandomForestClassifier(random_state=123) 
Best weighted ROC AUC for Q11: 0.9021990740740741. Feature engineering: getTfidfDf, Model: RandomForestClassifier(r

#### [B] Extra Trees

In [32]:
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for num in list(product(['gini','entropy'],[50, 100, 150])):
  et_classifier=ensemble.ExtraTreesClassifier(n_estimators=num[1], criterion=num[0], random_state=123)
  models.append(et_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'ExtraTrees'] = max(best_roc, combined_model_df.at[qn_name, 'ExtraTrees'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.6756249999999999. Feature engineering: getTfidfDf, Model: ExtraTreesClassifier(criterion='entropy', random_state=123) 
Best weighted ROC AUC for Q4: 0.9. Feature engineering: getTfidfDf, Model: ExtraTreesClassifier(n_estimators=50, random_state=123) 
Best weighted ROC AUC for Q7: 0.9108928571428571. Feature engineering: getCountVectDf, Model: ExtraTreesClassifier(random_state=123) 
Best weighted ROC AUC for Q8: 0.7070227072310407. Feature engineering: getCountVectDf, Model: ExtraTreesClassifier(random_state=123) 
Best weighted ROC AUC for Q9: 0.7469642857142856. Feature engineering: getTfidfDf, Model: ExtraTreesClassifier(criterion='entropy', random_state=123) 
Best weighted ROC AUC for Q10: 0.6691071428571428. Feature engineering: getCountVectVaderDf, Model: ExtraTreesClassifier(criterion='entropy', random_state=123) 
Best weighted ROC AUC for Q11: 0.8965277777777778. Feature engineering: getCountVectDf, Model: ExtraTreesClassifier(n_estimators=50, rand

#### [C] Gradient boosting

In [33]:
feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for combi in list(product(['deviance', 'exponential'],[50, 100, 150])):
    gb_classifier=ensemble.GradientBoostingClassifier(random_state=123, loss=combi[0], n_estimators=combi[1])
    models.append(gb_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'GradientBoosting'] = max(best_roc, combined_model_df.at[qn_name, 'GradientBoosting'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.6465178571428571. Feature engineering: getCountVectVaderDf, Model: GradientBoostingClassifier(loss='exponential', random_state=123) 
Best weighted ROC AUC for Q4: 0.925. Feature engineering: getTfidfVaderDf, Model: GradientBoostingClassifier(loss='exponential', n_estimators=50,
                           random_state=123) 
Best weighted ROC AUC for Q7: 0.8404464285714285. Feature engineering: getTfidfDf, Model: GradientBoostingClassifier(loss='exponential', n_estimators=50,
                           random_state=123) 
Best weighted ROC AUC for Q8: 0.6763326719576719. Feature engineering: getCountVectVaderDf, Model: GradientBoostingClassifier(random_state=123) 
Best weighted ROC AUC for Q9: 0.6675. Feature engineering: getCountVectDf, Model: GradientBoostingClassifier(loss='exponential', n_estimators=50,
                           random_state=123) 
Best weighted ROC AUC for Q10: 0.6260714285714285. Feature engineering: getTfidfVaderDf, Model: GradientBo

Q3: ('getTfidfVaderDf', KNeighborsClassifier(n_neighbors=9), 0.755625)
Q4: ('getTfidfVaderDf', GradientBoostingClassifier(loss='exponential', n_estimators=50,
                           random_state=123), 0.925)
Q7: ('getCountVectDf', ExtraTreesClassifier(random_state=123), 0.9108928571428571)
Q8: ('getCountVectVaderDf', RandomForestClassifier(n_estimators=50, random_state=123), 0.7085416666666668)
Q9: ('getCountVectDf', LogisticRegression(random_state=123), 0.7698214285714287)
Q10: ('getCountVectVaderDf', DecisionTreeClassifier(max_depth=3, random_state=123), 0.6995535714285714)
Q11: ('getTfidfVaderDf', LogisticRegression(random_state=123), 0.912962962962963)
Q12: ('getTfidfVaderDf', ExtraTreesClassifier(criterion='entropy', n_estimators=50, random_state=123), 0.8936507936507937)
Q13: ('getCountVectDf', RandomForestClassifier(random_state=123), 0.9707175925925926)
Q14: ('getCountVectVaderDf', ExtraTreesClassifier(random_state=123), 0.8403968253968255)


In [42]:
combined_model_df

Unnamed: 0,DecisionTree,RandomForest,ExtraTrees,LogisticRegression,GradientBoosting,SupportVectorMachines,KNeighbours,NaiveBayes
Q3,0.718214,0.691696,0.675625,0.717232,0.646518,0.753661,0.755625,0.674018
Q4,0.760909,0.9,0.9,0.89,0.925,0.78,0.806818,0.86
Q7,0.855,0.905179,0.910893,0.905179,0.840446,0.905179,0.8875,0.899464
Q8,0.688707,0.708542,0.707023,0.651077,0.676333,0.680056,0.660941,0.674554
Q9,0.657857,0.74125,0.746964,0.769821,0.6675,0.476964,0.700982,0.765179
Q10,0.699554,0.654643,0.669107,0.6525,0.626071,0.675357,0.676339,0.631071
Q11,0.753704,0.902199,0.896528,0.912963,0.854745,0.899306,0.899884,0.891435
Q12,0.73381,0.882857,0.893651,0.865873,0.803175,0.854127,0.843968,0.873968
Q13,0.924074,0.970718,0.96875,0.96875,0.9125,0.96875,0.921528,0.96875
Q14,0.818016,0.806746,0.840397,0.789603,0.801349,0.806587,0.797857,0.784127


#### [D] Stacking

In [34]:
model = ensemble.GradientBoostingClassifier()
print(type(model).__name__)

GradientBoostingClassifier


In [62]:
combined_model_df.at['Q3', 'Stacking'] = 3
combined_model_df

Unnamed: 0,DecisionTree,RandomForest,ExtraTrees,LogisticRegression,GradientBoosting,SupportVectorMachines,KNeighbours,NaiveBayes,Stacking
Q3,0.718214,0.691696,0.675625,0.717232,0.646518,0.753661,0.755625,0.674018,3
Q4,0.760909,0.9,0.9,0.89,0.925,0.78,0.806818,0.86,2
Q7,0.855,0.905179,0.910893,0.905179,0.840446,0.905179,0.8875,0.899464,2
Q8,0.688707,0.708542,0.707023,0.651077,0.676333,0.680056,0.660941,0.674554,2
Q9,0.657857,0.74125,0.746964,0.769821,0.6675,0.476964,0.700982,0.765179,2
Q10,0.699554,0.654643,0.669107,0.6525,0.626071,0.675357,0.676339,0.631071,2
Q11,0.753704,0.902199,0.896528,0.912963,0.854745,0.899306,0.899884,0.891435,2
Q12,0.73381,0.882857,0.893651,0.865873,0.803175,0.854127,0.843968,0.873968,2
Q13,0.924074,0.970718,0.96875,0.96875,0.9125,0.96875,0.921528,0.96875,2
Q14,0.818016,0.806746,0.840397,0.789603,0.801349,0.806587,0.797857,0.784127,2


In [50]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from  sklearn.model_selection import StratifiedKFold

feature_engineering = [getCountVectDf, getCountVectVaderDf, getTfidfDf, getTfidfVaderDf]
models = []
for qn_name in question_keywords:
    combined_model_results[qn_name].sort(key=lambda x: x[2], reverse=True)
    sorted_models = list(map(lambda x: (type(x[1]).__name__, x[1]), combined_model_results[qn_name]))[:3]
    stacking_classifier = StackingClassifier(estimators=sorted_models, final_estimator=LogisticRegression())
    models.append(stacking_classifier)
model_results = stratifiedKFoldModelEvaluation(models, feature_engineering, question_keywords, df3)
for qn_name in model_results:
   best_roc = model_results[qn_name][0][2]
   combined_model_df.at[qn_name, 'Stacking'] = max(best_roc, combined_model_df.at[qn_name, 'Stacking'])
   combined_model_results[qn_name].extend(model_results[qn_name])

Best weighted ROC AUC for Q3: 0.5493750000000001. Feature engineering: getCountVectVaderDf, Model: StackingClassifier(estimators=[('DecisionTreeClassifier',
                                DecisionTreeClassifier(max_depth=3,
                                                       random_state=123)),
                               ('KNeighborsClassifier',
                                KNeighborsClassifier(n_neighbors=16)),
                               ('SVC',
                                SVC(kernel='poly', probability=True,
                                    random_state=123))],
                   final_estimator=LogisticRegression()) 
Best weighted ROC AUC for Q4: 0.8300000000000001. Feature engineering: getTfidfVaderDf, Model: StackingClassifier(estimators=[('DecisionTreeClassifier',
                                DecisionTreeClassifier(max_depth=3,
                                                       random_state=123)),
                               ('KNeighborsClassifier'

In [73]:
for qn_name in question_keywords:
    combined_model_results[qn_name].sort(key=lambda x: x[2], reverse=True)
    top_result = combined_model_results[qn_name][0]
    print(f"{qn_name}: {top_result}")

Q3: ('getTfidfVaderDf', KNeighborsClassifier(n_neighbors=9), 0.755625)
Q4: ('getTfidfVaderDf', GradientBoostingClassifier(loss='exponential', n_estimators=50,
                           random_state=123), 0.925)
Q7: ('getCountVectDf', ExtraTreesClassifier(random_state=123), 0.9108928571428571)
Q8: ('getCountVectVaderDf', RandomForestClassifier(n_estimators=50, random_state=123), 0.7085416666666668)
Q9: ('getCountVectDf', LogisticRegression(random_state=123), 0.7698214285714287)
Q10: ('getCountVectVaderDf', DecisionTreeClassifier(max_depth=3, random_state=123), 0.6995535714285714)
Q11: ('getTfidfVaderDf', GradientBoostingClassifier(loss='exponential', n_estimators=150,
                           random_state=123), 0.9254629629629629)
Q12: ('getTfidfVaderDf', ExtraTreesClassifier(criterion='entropy', n_estimators=50, random_state=123), 0.8936507936507937)
Q13: ('getCountVectDf', RandomForestClassifier(random_state=123), 0.9707175925925926)
Q14: ('getCountVectVaderDf', ExtraTreesClassifie

In [72]:
combined_model_df

Unnamed: 0,DecisionTree,RandomForest,ExtraTrees,LogisticRegression,GradientBoosting,SupportVectorMachines,KNeighbours,NaiveBayes,Stacking
Q3,0.718214,0.691696,0.675625,0.717232,0.646518,0.753661,0.755625,0.674018,0.549375
Q4,0.760909,0.9,0.9,0.89,0.925,0.78,0.806818,0.86,0.83
Q7,0.855,0.905179,0.910893,0.905179,0.840446,0.905179,0.8875,0.899464,0.905179
Q8,0.688707,0.708542,0.707023,0.651077,0.676333,0.680056,0.660941,0.674554,0.698092
Q9,0.657857,0.74125,0.746964,0.769821,0.6675,0.476964,0.700982,0.765179,0.729821
Q10,0.699554,0.654643,0.669107,0.6525,0.626071,0.675357,0.676339,0.631071,0.621786
Q11,0.753704,0.902199,0.896528,0.912963,0.854745,0.899306,0.899884,0.891435,0.909028
Q12,0.73381,0.882857,0.893651,0.865873,0.803175,0.854127,0.843968,0.873968,0.815556
Q13,0.924074,0.970718,0.96875,0.96875,0.9125,0.96875,0.921528,0.96875,0.961343
Q14,0.818016,0.806746,0.840397,0.789603,0.801349,0.806587,0.797857,0.784127,0.817698


#### Question-Model Scores
- Q3 - SVM: 0.753661
- Q4 - GradientBoosting: 0.925	
- Q7 - ExtraTrees: 0.910893
- Q8 - RandomForest: 0.708542
- Q9 - LogisticRegression: 0.769821
- Q10 - Decision Tree: 0.699554	
- Q11 - LogisticRegression: 0.912963
- Q12 - ExtraTrees: 0.893651	
- Q13 - RandomForest: 0.970718
- Q14 - ExtraTrees: 0.840397
