In [None]:
import os
import sys
import importlib
from pathlib import Path

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up at most five ancestors of the working directory and keep the first
# one whose directory name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list (instead of indexing
# parents[0..4]) avoids an IndexError when the working directory is shallow,
# and `.name` replaces the POSIX-only split on '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    ancestor_name = ancestor.name
    if (code_dir_name in ancestor_name) and (unwanted_subdir_name not in ancestor_name):
        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [None]:
# Fail early with a readable message instead of the TypeError that
# Path(None) would raise below.
if code_dir is None:
    raise FileNotFoundError(
        "Could not locate the code directory among the parents of the current "
        "working directory; run the notebook from inside the project tree."
    )

# code_dir
# Normalise to exactly one trailing slash so that re-running this cell does
# not keep appending '/' to the path.
code_dir = f"{str(code_dir).rstrip('/')}/"

# MAIN DIR (parent of the code directory)
main_dir = f'{str(Path(code_dir).parents[0])}/'

# Make project modules importable; skip the append on re-runs so sys.path
# does not accumulate duplicates.
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir
llm_path = f'{data_dir}Language Models'


In [None]:
import string
import re
import time
import tqdm
import json
import csv
import glob
import pickle
import random
import unicodedata
import multiprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Plot style plus pandas display settings (show every row/column, wide
# console, centred headers, three decimals for floats).
plt.style.use('ggplot')
for option_name, option_value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 5000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
    'display.float_format': '{:.3f}'.format,
}.items():
    pd.set_option(option_name, option_value)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.feature_selection import (SelectFdr, SelectFpr,
                                       SelectFromModel, SelectFwe,
                                       SelectKBest, SelectPercentile, chi2,
                                       f_classif, f_regression,
                                       mutual_info_classif,
                                       mutual_info_regression)
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import (LogisticRegression,
                                  PassiveAggressiveClassifier, Perceptron,
                                  SGDClassifier)
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              BaggingRegressor, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier, StackingClassifier,
                              StackingRegressor, VotingClassifier,
                              VotingRegressor)
from sklearn.model_selection import (GridSearchCV, KFold, LeaveOneOut,
                                     RandomizedSearchCV,
                                     RepeatedStratifiedKFold, ShuffleSplit,
                                     StratifiedKFold,
                                     StratifiedShuffleSplit,
                                     cross_val_score, cross_validate,
                                     learning_curve, train_test_split)
from sklearn.utils.validation import (check_is_fitted, column_or_1d,
                                      has_fit_parameter)
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score,
                             balanced_accuracy_score, brier_score_loss,
                             classification_report, cohen_kappa_score,
                             confusion_matrix, f1_score, log_loss,
                             make_scorer, matthews_corrcoef,
                             precision_recall_curve, precision_score,
                             recall_score, roc_auc_score)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import (EditedNearestNeighbours, NearMiss,
                                     RandomUnderSampler, TomekLinks)
from xgboost import XGBClassifier

In [None]:
# Validation split ratios
n_jobs = 1
train_ratio = 0.75
test_ratio = 0.10
validation_ratio = 0.15
# Fraction of all rows held out from training (test + validation together).
test_split = test_size = 1 - train_ratio
# test_ratio expressed as a share of the held-out rows.
validation_split = test_ratio / (test_ratio + validation_ratio)

# Cross-validation
random.seed(42)
np.random.seed(42)
random_state = 42
partition = True
cv = RepeatedStratifiedKFold(
    n_splits=10, n_repeats=3, random_state=random_state)
# Resampling
class_weight = 'balanced'
resampling_enabled = True
# The SMOTE-based samplers are seeded explicitly, like `rus`/`ros` below, so
# their output does not depend on how much of the global NumPy random stream
# earlier cells have consumed.
resample_enn = SMOTEENN(
    enn=EditedNearestNeighbours(sampling_strategy='majority'),
    random_state=random_state)
resample_tome = SMOTETomek(
    tomek=TomekLinks(sampling_strategy='majority'),
    random_state=random_state)
# Undersampling
rus = RandomUnderSampler(random_state=random_state, replacement=True)
# `tl` is the Tomek-links under-sampler. It was previously built as a
# RandomOverSampler, which is the wrong class for this section and does not
# accept sampling_strategy='majority'.
tl = TomekLinks(sampling_strategy='majority')
nm = NearMiss()
# Oversampling
ros = RandomOverSampler(random_state=random_state)
smote = SMOTE(random_state=random_state)
# Sampling Used
resampling_method = resample_tome

t = time.time()
cores = multiprocessing.cpu_count()
model_sizes = [300, 100]
# Metric used to name the "best threshold"/"best score" columns below.
scoring = 'recall'
# Scorer names for multi-metric search; the first entry is the refit metric
# wherever `scores[0]` is used.
scores = ['recall', 'accuracy', 'f1', 'roc_auc', 'explained_variance', 'matthews_corrcoef']
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
}
# Column names of the results table.
metrics_list = [
    'Mean Validation Score',
    'Explained Variance',
    'Accuracy',
    'Precision',
    'Recall',
    'F1-score',
    'ROC',
    'AUC',
    'Matthews Correlation Coefficient',
    f'{scoring.title()} Best Threshold',
    f'{scoring.title()} Best Score',
    'Log Loss/Cross Entropy',
    'Classification Report',
    'Confusion Matrix',
    'Accuracy_opt',
    'Precision_opt',
    'Recall_opt',
    'F1-score_opt',
    'Matthews Correlation Coefficient_opt',
    'Classification Report_opt',
    'Confusion Matrix_opt',
]

# Output locations and file names for fitted models and result tables.
models_save_path = f'{data_dir}classification models/'
table_save_path = f'{data_dir}output tables/'
pickle_file_name = 'Classifiers Table.pkl'
csv_file_name = 'Classifiers Table.csv'
excel_file_name = 'Classifiers Table.xlsx'
latex_file_name = 'Classifiers Table.tex'
markdown_file_name = 'Classifiers Table.md'

# Target columns to model and the column holding the input text.
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'

## Vectorizers

In [None]:
# Each grid key is '<pipeline step name>__<parameter>'; the step name is the
# key under which the vectorizer is registered in `vectorizers_pipe`.

### CountVectorizer
count = CountVectorizer()
params_count_pipe = {
    'CountVectorizer__analyzer': ['word'],
    'CountVectorizer__ngram_range': [(1, 3)],
    'CountVectorizer__lowercase': [True, False],
    'CountVectorizer__max_df': [0.9, 0.85, 0.8, 0.75, 0.70],
    'CountVectorizer__min_df': [0.10, 0.15, 0.2, 0.25, 0.30],
}

### TfidfVectorizer
tfidf = TfidfVectorizer()
params_tfidf_pipe = {
#     'TfidfVectorizer__stop_words': ['english'],
    'TfidfVectorizer__analyzer': ['word'],
    'TfidfVectorizer__ngram_range': [(1, 3)],
    'TfidfVectorizer__lowercase': [True, False],
#     'TfidfVectorizer__max_features': [None, 5000, 10000, 50000],
    # Exactly two underscores separate step and parameter; the former
    # three-underscore key addressed a non-existent '_use_idf' parameter.
    'TfidfVectorizer__use_idf': [True],
#     'TfidfVectorizer__smooth_idf': [True, False],
    'TfidfVectorizer__max_df': [0.9, 0.85, 0.8, 0.75, 0.70],
    'TfidfVectorizer__min_df': [0.10, 0.15, 0.2, 0.25, 0.30],
}

### BOW FeatureUnion
bow = FeatureUnion(
    transformer_list=[('CountVectorizer', count), ('TfidfVectorizer', tfidf)]
)
# The union is registered as the step 'UnionBOW', so the nested vectorizer
# parameters must be addressed as 'UnionBOW__<transformer>__<parameter>'.
params_bow_pipe = {
    f'UnionBOW__{param_name}': candidates
    for param_name, candidates in {**params_count_pipe, **params_tfidf_pipe}.items()
}

## Vectorizers Dict
# Maps pipeline step name -> [estimator, parameter grid].
vectorizers_pipe = {
    'CountVectorizer': [count, params_count_pipe],
    'TfidfVectorizer': [tfidf, params_tfidf_pipe],
    'UnionBOW': [bow, params_bow_pipe],
    # "UnionWordEmbedding": [em, params_em_pipe],
}


## Selectors

In [None]:
# Selectors
selector = SelectKBest(score_func=chi2, k='all')
selector_name = selector.__class__.__name__

# model_selector = SelectFromModel()
# model_selector_name = model_selector.__class__.__name__

# Scoring functions usable by SelectKBest / SelectPercentile, which accept
# functions returning either (scores, pvalues) or scores alone.
selector_score_funcs = [f_classif, chi2, mutual_info_classif, f_regression, mutual_info_regression]
# SelectFpr / SelectFdr / SelectFwe threshold on p-values, so only functions
# returning (scores, pvalues) are valid there; the mutual_info_* functions
# return scores only and would leave `pvalues_` unset.
selector_pvalue_score_funcs = [f_classif, chi2, f_regression]

### SelectKBest
selectkbest = SelectKBest()
params_selectkbest_pipe = {
    'SelectKBest__score_func': list(selector_score_funcs),
    'SelectKBest__k': ['all'],
}

### SelectPercentile
selectpercentile = SelectPercentile()
params_selectpercentile_pipe = {
    'SelectPercentile__score_func': list(selector_score_funcs),
}

### SelectFpr
selectfpr = SelectFpr()
params_selectfpr_pipe = {
    'SelectFpr__score_func': list(selector_pvalue_score_funcs),
}

### SelectFdr
selectfdr = SelectFdr()
params_selectfdr_pipe = {
    'SelectFdr__score_func': list(selector_pvalue_score_funcs),
}

### SelectFwe
selectfwe = SelectFwe()
params_selectfwe_pipe = {
    'SelectFwe__score_func': list(selector_pvalue_score_funcs),
}

## Selectors Dict
# Maps pipeline step name -> [estimator, parameter grid].
selectors_pipe = {
    'SelectKBest': [selectkbest, params_selectkbest_pipe],
    'SelectPercentile': [selectpercentile, params_selectpercentile_pipe],
    'SelectFpr': [selectfpr, params_selectfpr_pipe],
    'SelectFdr': [selectfdr, params_selectfdr_pipe],
    'SelectFwe': [selectfwe, params_selectfwe_pipe],
}

## Classifiers

In [None]:
# Classifiers
### Dummy Classifier
dummy = DummyClassifier()
# Fixed-parameter presets, one per baseline strategy.
params_dummy_freq, params_dummy_stratified, params_dummy_uniform = (
    {'strategy': strategy_name, 'random_state': random_state}
    for strategy_name in ('most_frequent', 'stratified', 'uniform')
)
# Search grid, keyed by '<step name>__<parameter>'.
params_dummy_pipe = {
    f'DummyClassifier__{param}': candidates
    for param, candidates in (
        ('strategy', ['stratified', 'most_frequent', 'prior', 'uniform', 'constant']),
        ('random_state', [random_state]),
    )
}

### Multinomial Naive Bayes
# Restored from the commented-out version: `nb`, `params_nb` and
# `params_nb_pipe` are referenced by the voting/stacking grids and by
# `classifiers_pipe` further down, so leaving them undefined raised NameError.
nb = MultinomialNB()
params_nb = {'alpha': 0.1, 'fit_prior': True, 'class_prior': None}
params_nb_pipe = {
    'MultinomialNB__fit_prior': [True],
    'MultinomialNB__alpha': [0.1, 0.2, 0.3],
}

### Bernoulli Naive Bayes
bnb = BernoulliNB()
params_bnb = {'alpha': 0.1, 'fit_prior': True, 'class_prior': None}
params_bnb_pipe = {
    'BernoulliNB__fit_prior': [True],
    'BernoulliNB__alpha': [0.1, 0.2, 0.3],
}

### Gaussian Naive Bayes
# NOTE(review): GaussianNB does not accept sparse matrices, which is what the
# text vectorizers emit - confirm the features are densified before this step.
gnb = GaussianNB()
params_gnb = {'var_smoothing': 1e-9}
params_gnb_pipe = {
    'GaussianNB__var_smoothing': [1e-9],
}

### KNeighbors Classifier
knn = KNeighborsClassifier()
# Fixed-parameter preset.
params_knn = dict(
    n_neighbors=3,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
)
# Search grid, keyed by '<step name>__<parameter>'.
params_knn_pipe = {
    f'KNeighborsClassifier__{param}': candidates
    for param, candidates in [
        ('weights', ['uniform']),
        ('n_neighbors', [2, 5, 15]),
        ('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        ('leaf_size', [30, 50, 100, 200, 300, 500]),
        ('p', [1, 2, 3, 4, 5]),
        ('metric', ['minkowski', 'euclidean', 'cosine', 'correlation']),
        ('metric_params', [None, {'p': 2}, {'p': 3}]),
    ]
}

### Logistic Regression
lr = LogisticRegression()
# Fixed-parameter preset.
params_lr = {
    'penalty': 'l2',
    'dual': False,
    'tol': 0.0001,
    'C': 1.0,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'class_weight': class_weight,
    'random_state': random_state,
    'solver': 'liblinear',
    'max_iter': 100,
    'multi_class': 'ovr',
    'verbose': 0,
    'warm_start': False,
    'n_jobs': 1,
}
# Search grid, keyed by '<step name>__<parameter>'.
# The former 'LogisticRegression__algorithm' entry was removed:
# LogisticRegression has no `algorithm` parameter (the listed values belong
# to KNeighborsClassifier), so set_params rejected the whole grid.
params_lr_pipe = {
    'LogisticRegression__penalty': ['l2'],
    'LogisticRegression__random_state': [random_state],
    'LogisticRegression__max_iter': [100, 200, 300, 500, 1000],
    'LogisticRegression__multi_class': ['ovr', 'multinomial'],
    'LogisticRegression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

### Passive Aggressive
pa = PassiveAggressiveClassifier()
# Fixed-parameter preset.
params_pa = dict(
    C=1.0,
    fit_intercept=True,
    max_iter=1000,
    tol=0.0001,
    class_weight=class_weight,
    verbose=0,
    random_state=random_state,
    loss='hinge',
    n_jobs=1,
)
# Search grid, keyed by '<step name>__<parameter>'.
params_pa_pipe = {
    f'PassiveAggressiveClassifier__{param}': candidates
    for param, candidates in [
        ('loss', ['hinge', 'squared_hinge']),
        ('random_state', [random_state]),
        ('fit_intercept', [True, False]),
        ('class_weight', [None, 'balanced']),
        ('max_iter', [100, 200, 300, 500, 1000]),
    ]
}

### Stochastic Gradient Descent
sgd = SGDClassifier()
# Fixed-parameter preset.
params_sgd = dict(
    fit_intercept=True,
    max_iter=1000,
    tol=0.0001,
    class_weight=class_weight,
    verbose=0,
    random_state=random_state,
    loss='hinge',
    n_jobs=1,
)
# Search grid, keyed by '<step name>__<parameter>'.
params_sgd_pipe = {
    f'SGDClassifier__{param}': candidates
    for param, candidates in [
        ('loss', ['hinge', 'squared_hinge']),
        ('random_state', [random_state]),
        ('fit_intercept', [True, False]),
        ('class_weight', [None, 'balanced']),
        ('max_iter', [100, 200, 300, 500, 1000]),
    ]
}

### SVM
svm = LinearSVC()
# Fixed-parameter preset.
params_svm = dict(
    penalty='l2',
    loss='hinge',
    dual=True,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=class_weight,
    random_state=random_state,
    max_iter=1000,
    multi_class='ovr',
    verbose=0,
)
# Search grid, keyed by '<step name>__<parameter>'.
params_svm_pipe = {
    f'LinearSVC__{param}': candidates
    for param, candidates in [
        ('penalty', ['l2']),
        ('loss', ['hinge', 'squared_hinge']),
        ('random_state', [random_state]),
        ('max_iter', [100, 200, 300, 500, 1000]),
        ('fit_intercept', [True, False]),
        ('class_weight', [None, 'balanced']),
        ('multi_class', ['ovr', 'crammer_singer']),
    ]
}

### Decision Tree
dt = DecisionTreeClassifier()
# Fixed-parameter preset.
params_dt = {
    'criterion': 'gini',
    'splitter': 'best',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': random_state,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
}
# Search grid, keyed by '<step name>__<parameter>'.
params_dt_pipe = {
    'DecisionTreeClassifier__max_depth': [5, 10],
    'DecisionTreeClassifier__criterion': ['gini', 'entropy'],
    'DecisionTreeClassifier__random_state': [random_state],
    'DecisionTreeClassifier__splitter': ['best', 'random'],
    # 'auto' was dropped: for classifiers it was an alias of 'sqrt' (a
    # duplicate candidate) and newer scikit-learn releases reject it.
    'DecisionTreeClassifier__max_features': [None, 'sqrt', 'log2'],
}

### Random Forest
rf = RandomForestClassifier()
# Fixed-parameter preset.
params_rf = {
    'n_estimators': 10,
    'criterion': 'log_loss',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': 1,
    'random_state': random_state,
    'verbose': 0,
    'warm_start': False,
    'class_weight': class_weight,
}
# Search grid, keyed by '<step name>__<parameter>'.
params_rf_pipe = {
    'RandomForestClassifier__n_estimators': [10, 20],
    'RandomForestClassifier__n_jobs': [-1],
    'RandomForestClassifier__max_depth': [5, 10],
    # The parameter is `max_features`; the misspelt 'max_feature' key made
    # set_params reject the grid. linspace pins the last fraction to exactly
    # 1.0, the upper bound allowed for a float max_features.
    'RandomForestClassifier__max_features': [*np.linspace(0.1, 1.0, 10)],
    'RandomForestClassifier__random_state': [random_state],
    'RandomForestClassifier__class_weight': [None, 'balanced'],
}

### Extra Trees
et = ExtraTreesClassifier()
# Fixed-parameter preset.
params_et = {
    'n_estimators': 10,
    'criterion': 'log_loss',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': 1,
    'random_state': random_state,
    'verbose': 0,
    'warm_start': False,
    'class_weight': class_weight,
}
# Search grid, keyed by '<step name>__<parameter>'.
params_et_pipe = {
    'ExtraTreesClassifier__n_estimators': [10, 20],
    'ExtraTreesClassifier__n_jobs': [-1],
    'ExtraTreesClassifier__max_depth': [5, 10],
    # Same spelling fix as for the random forest grid above.
    'ExtraTreesClassifier__max_features': [*np.linspace(0.1, 1.0, 10)],
    'ExtraTreesClassifier__random_state': [42, 200],
    'ExtraTreesClassifier__criterion': ['gini', 'entropy', 'log_loss'],
    'ExtraTreesClassifier__class_weight': [None, 'balanced'],
}

### Gradient Boosting
gbc = GradientBoostingClassifier()
# Fixed-parameter preset.
params_gbc = {
    # 'log_loss' is the current name of the former 'deviance' loss.
    'loss': 'log_loss',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 1.0,
    'criterion': 'friedman_mse',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_depth': 3,
    'min_impurity_decrease': 0.0,
    'init': None,
    'random_state': random_state,
    'max_features': None,
    'verbose': 0,
    'max_leaf_nodes': None,
    'warm_start': False,
}
# Search grid, keyed by '<step name>__<parameter>'.
params_gbc_pipe = {
    'GradientBoostingClassifier__max_depth': [5, 10],
    # Gradient boosting splits with regression criteria; 'gini'/'entropy'
    # are not valid values for this estimator.
    'GradientBoostingClassifier__criterion': ['friedman_mse', 'squared_error'],
    'GradientBoostingClassifier__random_state': [random_state],
    'GradientBoostingClassifier__n_estimators': [10, 20],
    'GradientBoostingClassifier__loss': ['log_loss', 'exponential'],
    'GradientBoostingClassifier__subsample': [*np.arange(0.1, 1.1, 0.1)],
    # 'auto' was dropped: it is deprecated/removed in newer scikit-learn.
    'GradientBoostingClassifier__max_features': [None, 'sqrt', 'log2'],
}

### AdaBoost
ada = AdaBoostClassifier()
# Fixed-parameter preset.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version.
params_ada = {
    'base_estimator': None,
    'n_estimators': 50,
    'learning_rate': 1.0,
    'algorithm': 'SAMME.R',
    'random_state': random_state,
}
# Search grid, keyed by '<step name>__<parameter>'.
# The former 'max_depth' and 'criterion' entries were removed:
# AdaBoostClassifier has no such parameters (they belong to tree
# estimators), so set_params rejected the whole grid.
params_ada_pipe = {
    'AdaBoostClassifier__random_state': [random_state],
    'AdaBoostClassifier__n_estimators': [50, 100, 150],
    'AdaBoostClassifier__base_estimator': [
        SVC(probability=True, kernel='linear'),
        LogisticRegression(),
        MultinomialNB(),
    ],
}

### XGBoost
xgb = XGBClassifier()
# Fixed-parameter preset.
params_xgb = {
    'nthread':4, #when use hyperthread, xgboost may become slower
    'objective':'binary:logistic',
    'learning_rate': 0.05, #so called `eta` value
    'max_depth': 6,
    'min_child_weight': 11,
    'silent': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'n_estimators': 1000, #number of trees, change it to 1000 for better results
    'missing':-999,
    'seed': 1337,
    'eval_metric': 'auc',
    'sample_type': 'weighted',
    'verbosity': '0',
}
# Search grid. Keys must start with the pipeline step name, which is the
# `classifiers_pipe` key 'XGBClassifier'; the former 'xgb__' prefix did not
# match any step and made set_params reject the grid.
params_xgb_pipe = {
    'XGBClassifier__max_depth': [5, 10],
    'XGBClassifier__learning_rate': [0.05],
    'XGBClassifier__n_estimators': [1000],
    'XGBClassifier__seed': [42],
    'XGBClassifier__nthread': [1, 2, 3, 4],
    'XGBClassifier__objective': ['binary:logitraw', 'binary:logistic', 'binary:hinge'],
    'XGBClassifier__eval_metric': ['auc', 'rmse', 'rmsle', 'logloss'],
    'XGBClassifier__sample_type': ['weighted', 'uniform'],
}

### MLP Classifier
mlpc = MLPClassifier()
# Fixed-parameter preset for the classifier.
params_mlpc = dict(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=random_state,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
)

mlpr = MLPRegressor()
# The regressor preset carries exactly the same values (independent copy).
params_mlpr = dict(params_mlpc)

# Search grids for both estimators; they differ only in the step-name prefix.
params_mlpc_pipe, params_mlpr_pipe = (
    {
        f'{step_name}__hidden_layer_sizes': [(100,), (50,), (25,), (10,), (5,), (1,)],
        f'{step_name}__activation': ['identity', 'logistic', 'tanh', 'relu'],
        f'{step_name}__solver': ['lbfgs', 'sgd', 'adam'],
        f'{step_name}__learning_rate': ['constant', 'invscaling', 'adaptive'],
        f'{step_name}__random_state': [random_state],
    }
    for step_name in ('MLPClassifier', 'MLPRegressor')
)

## Stacking and Voting Classifiers
# Default base learners used to construct both ensembles.
estimators = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    (
        'Logistic Regression',
        LogisticRegression(random_state=42, class_weight='balanced'),
    ),
]

### Voting Classifier
voting_classifier = VotingClassifier(estimators=estimators)
# One complete candidate value for `estimators`: a list of unique
# (name, classifier) pairs. The previous grid listed bare 3-tuples as separate
# candidates, repeated the name 'dummy', and included a regressor, none of
# which VotingClassifier accepts. A fresh MultinomialNB is used so this
# section does not rely on a module-level `nb`.
# NOTE(review): the fixed-parameter dicts that used to be the third tuple
# element are not applied here; configure the estimators beforehand if those
# settings are wanted. GaussianNB needs dense input - confirm it belongs here.
# Soft voting additionally requires every member to expose predict_proba.
voting_pool = [
    ('dummy', dummy),
    ('nb', MultinomialNB()),
    ('bnb', bnb),
    ('gnb', gnb),
    ('knn', knn),
    ('lr', lr),
    ('pa', pa),
    ('sgd', sgd),
    ('svm', svm),
    ('dt', dt),
    ('rf', rf),
    ('gbc', gbc),
    ('ada', ada),
    ('xgb', xgb),
    ('mlpc', mlpc),
]
params_voting_pipe = {
    'VotingClassifier__estimators': [voting_pool],
    'VotingClassifier__voting': ['hard', 'soft'],
    # A weight vector must have one entry per estimator in the pool.
    'VotingClassifier__weights': [None, [*range(1, len(voting_pool) + 1)]],
}

### Stacking Classifier
stacking_classifier = StackingClassifier(estimators=estimators)
# final_estimator = LogisticRegression(random_state=random_state, class_weight=class_weight)
# NOTE(review): `final_estimator` is defined but not passed to the
# StackingClassifier above - confirm whether that is intended.
final_estimator = RandomForestClassifier(
    random_state=42, class_weight={0: 1, 1: 2}
)
# One complete candidate value for `estimators` (see the voting pool above).
stacking_pool = [
    ('dummy', dummy),
    ('nb', MultinomialNB()),
    ('bnb', bnb),
    ('gnb', gnb),
    ('knn', knn),
    ('lr', lr),
    ('pa', pa),
    ('sgd', sgd),
    ('svm', svm),
    ('dt', dt),
    ('rf', rf),
    ('gbc', gbc),
    ('ada', ada),
    ('mlpc', mlpc),
]
params_stacking_pipe = {
    # The parameter is `estimators` (plural) and takes the whole list.
    'StackingClassifier__estimators': [stacking_pool],
    'StackingClassifier__cv': [3, 5, 7, 9, 11, 13, 15],
    'StackingClassifier__n_jobs': [-1],
    # 'auto' lets each base learner use whichever prediction method it
    # supports; the explicit methods only work if every learner has them.
    'StackingClassifier__stack_method': ['auto', 'predict_proba', 'decision_function'],
    'StackingClassifier__passthrough': [True, False],
}

## Classifiers Pipe dict
# Maps pipeline step name -> [estimator, parameter grid]. The keys double as
# the prefix of every grid key ('<step name>__<parameter>').
# MLPRegressor is intentionally not registered: it predicts continuous
# values, which the classification scorers and classification_report used in
# the search loop cannot evaluate. `mlpr` and `params_mlpr_pipe` stay defined
# above for any regression use.
classifiers_pipe = {
    'DummyClassifier': [dummy, params_dummy_pipe],
    'MultinomialNB': [nb, params_nb_pipe],
    'BernoulliNB': [bnb, params_bnb_pipe],
    'GaussianNB': [gnb, params_gnb_pipe],
    'KNeighborsClassifier': [knn, params_knn_pipe],
    'LogisticRegression': [lr, params_lr_pipe],
    'PassiveAggressiveClassifier': [pa, params_pa_pipe],
    'SGDClassifier': [sgd, params_sgd_pipe],
    'LinearSVC': [svm, params_svm_pipe],
    'DecisionTreeClassifier': [dt, params_dt_pipe],
    'RandomForestClassifier': [rf, params_rf_pipe],
    'GradientBoostingClassifier': [gbc, params_gbc_pipe],
    'AdaBoostClassifier': [ada, params_ada_pipe],
    'XGBClassifier': [xgb, params_xgb_pipe],
    'MLPClassifier': [mlpc, params_mlpc_pipe],
    'VotingClassifier': [voting_classifier, params_voting_pipe],
    'StackingClassifier': [stacking_classifier, params_stacking_pipe],
}


In [None]:
def _texts_and_labels(frame, col, text_col):
    """Return (texts, labels) for one partition as a string array and a 1-D int64 array."""
    texts = np.array(list(frame[str(text_col)].astype('str').values))
    labels = column_or_1d(frame[str(col)].astype('int64').values, warn=True)
    return texts, labels


def split_data(df_manual, col, text_col, train_ratio=0.75, test_ratio=0.10, validation_ratio=0.15):
    """Split the labelled frame into train, test and validation partitions.

    Parameters
    ----------
    df_manual : pandas.DataFrame
        Labelled data. Side effect: rows with a missing value in 'Warmth',
        'Competence' or `text_col` are dropped from this frame IN PLACE.
    col : str
        Name of the label column; its values are cast to int64.
    text_col : str
        Name of the text column; its values are cast to str.
    train_ratio, test_ratio, validation_ratio : float, optional
        Shares of the data for each partition. The defaults reproduce the
        previously hard-coded 0.75 / 0.10 / 0.15 split.

    Returns
    -------
    tuple
        (train, X_train, y_train, test, X_test, y_test,
         validate, X_validate, y_validate), where the un-prefixed names are
        DataFrames, X_* are arrays of text and y_* are 1-D label arrays.

    Notes
    -----
    Both splits use the module-level `random_state`.
    """
    # Share of rows held out from training, and the part of that hold-out
    # that becomes the test set (the remainder is the validation set).
    holdout_fraction = 1 - train_ratio
    test_fraction_of_holdout = test_ratio / (test_ratio + validation_ratio)

    # BOW Split
    print('Splitting data into training and test sets.')
    df_manual.dropna(subset=['Warmth', 'Competence', text_col], how='any', inplace=True)

    train, holdout = train_test_split(
        df_manual,
        test_size=holdout_fraction,
        train_size=1 - holdout_fraction,
        random_state=random_state,
    )

    # train_test_split returns (remainder, test_size part): the remainder of
    # the hold-out is the validation set, the `test_size` part is the test set.
    validate, test = train_test_split(
        holdout, test_size=test_fraction_of_holdout, random_state=random_state
    )

    X_train, y_train = _texts_and_labels(train, col, text_col)
    X_test, y_test = _texts_and_labels(test, col, text_col)
    X_validate, y_validate = _texts_and_labels(validate, col, text_col)

    return train, X_train, y_train, test, X_test, y_test, validate, X_validate, y_validate

In [None]:
# Load the pickled training DataFrame and renumber its index from 0.
manual_pickle_path = f'{data_dir}df_manual_for_trainning.pkl'
df_manual = pd.read_pickle(manual_pickle_path).reset_index(drop=True)


In [None]:
# Exhaustive search: for every target column, grid-search each
# vectorizer x selector x classifier pipeline and report the best model.
print('Using GridSearchCV')

for col in tqdm.tqdm(analysis_columns):
    print('-' * 20)
    print('\n')
    print(f'============================ STARTING PROCESSING {col.upper()} ============================')
    print('\n')
    print('-' * 20)
    # Proceed only if at least one row carries a label value that occurs
    # more than 50 times in this column.
    if (len(df_manual[df_manual[str(col)].map(df_manual[str(col)].value_counts() > 50)]) != 0):

        # BOW Split
        train, X_train, y_train, test, X_test, y_test, validate, X_validate, y_validate = split_data(df_manual, col, text_col)


        # Vectorization
        for vectorizer_name, vectorizer_and_params in vectorizers_pipe.items():
            vectorizer = vectorizer_and_params[0]
            vectorizer_params = vectorizer_and_params[1]
            print(f'Vectorizer: {vectorizer_name.upper()}')
            print('~'*40)
            
            # Selection
            # NOTE(review): this loop rebinds the module-level `selector` and
            # `selector_name` defined in the selectors cell.
            for selector_name, selector_and_params in selectors_pipe.items():
                selector = selector_and_params[0]
                selector_params = selector_and_params[1]

                # Classification
                for classifier_name, classifier_and_params in classifiers_pipe.items():
                    classifier = classifier_and_params[0]
                    classifier_params = classifier_and_params[1]
                    print(f'Classifier: {classifier_name.upper()}')
                    print('~'*40)

                    # Pipeline
                    ## Steps
                    # Step names must match the prefixes used in the parameter grids.
                    steps = [
                        (vectorizer_name, vectorizer),
                        (selector_name, selector),
                        (classifier_name, classifier)
                    ]
                    ## Params
                    # Merge the three grids into one; keys are '<step name>__<parameter>'.
                    param_grid = {
                        **vectorizer_params,
                        **selector_params,
                        **classifier_params,
                    }
                    ## Pipeline
                    pipe = Pipeline(steps=steps)

                    ## Vectorizers, selectors, classifiers
                    # NOTE(review): these assignments rebind `vectorizer` and
                    # `selector` to pipeline slices. Both names are only reset by
                    # the outer loops, so on the next classifier iteration `steps`
                    # is built from the slices rather than the bare estimators -
                    # likely a bug; consider distinct names for the slices.
                    vectorizer = pipe[:-2]
                    selector = pipe[:-1]
                    classifier = pipe[:]

                    # Search
                    # Multi-metric search; the model is refit on the first metric in `scores`.
                    # NOTE(review): n_jobs is hard-coded to -1 here although a
                    # module-level `n_jobs` setting exists.
                    search = GridSearchCV(
                        estimator=pipe,
                        param_grid=param_grid,
                        n_jobs=-1,
                        scoring=scores,
                        cv=cv,
                        refit=scores[0],
                        return_train_score=True,
                    )

                    # Fit SearchCV
                    searchcv = search.fit(X_train, y_train)

                    # Best Parameters
                    best_index = searchcv.best_index_
                    # NOTE(review): sorted() over the cv_results_ dict yields only
                    # its sorted keys, not the result values.
                    cv_results = sorted(searchcv.cv_results_)
                    best_params = searchcv.best_params_
                    classifier = searchcv.best_estimator_
                    y_train_pred = classifier.predict(X_train)
                    best_score = searchcv.best_score_
                    n_splits = searchcv.n_splits_

                    print('=' * 20)
                    print(f'Best index for {scores[0]}: {best_index}')
                    print(f'Best classifier for {scores[0]}: {classifier}')
                    print(f'Best y_train_pred for {scores[0]}: {y_train_pred}')
                    print(f'Best score for {scores[0]}: {best_score}')
                    print(f'Number of splits for {scores[0]}: {n_splits}')

                    print('-' * 20)
                    # The report is computed on training predictions, while the
                    # confusion matrix below is drawn from the test set.
                    report = classification_report(y_train, y_train_pred)
                    print(f'Classification Report:\n{report}')
                    ConfusionMatrixDisplay.from_estimator(
                        searchcv, X_test, y_test, xticks_rotation="vertical"
                    )
                    plt.tight_layout()
                    plt.show()
                    print('=' * 20)

                    # Make the predictions
                    score = searchcv.score(X_test, y_test)
                    y_test_pred = searchcv.predict(X_test)
                    # Positive-class probabilities (column 1) where available.
                    # NOTE(review): if neither attribute exists, the *_prob_pred
                    # names stay unset (or keep values from a previous iteration).
                    if hasattr(searchcv, 'predict_proba'):
                        y_test_prob_pred = searchcv.predict_proba(X_test)[:, 1]
                        y_validate_prob_pred = searchcv.predict_proba(X_validate)[:, 1]
                    elif hasattr(searchcv, '_predict_proba_lr'):
                        y_test_prob_pred = searchcv._predict_proba_lr(X_test)[:, 1]
                        y_validate_prob_pred = searchcv._predict_proba_lr(X_validate)[:, 1]

                    # Fit Best Model
                    print(f'Fitting {classifier}.')
                    # NOTE(review): set_params is fed the estimator's own get_params()
                    # output, and the fit below uses the same training data that
                    # GridSearchCV's refit already used - presumably redundant; confirm.
                    classifier.set_params(**classifier.get_params())
                    classifier = classifier.fit(X_train, y_train)
