In [None]:
import os  # isort:skip # fmt:skip # noqa # nopep8
import sys  # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up at most five ancestors of the working directory and pick the first
# one whose folder name contains `code_dir_name` but not `unwanted_subdir_name`.
# Slicing the list of parents avoids an IndexError when the working directory
# has fewer than five ancestors, and `.name` avoids splitting on a hard-coded '/'.
for parent_dir in list(Path.cwd().parents)[:5]:

    parent_path = parent_dir.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(parent_dir)
        break

if code_dir is None:
    # Never put None on sys.path; report the problem instead.
    print(
        f'Could not find a "{code_dir_name}" directory above {Path.cwd()}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # isort:skip # fmt:skip # noqa # nopep8
from supervised_estimators_pipe import * # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Plot the confusion matrix of the fitted estimator on the training data.
# NOTE(review): this cell uses `col`, `vectorizer_name`, `classifier_name`,
# `estimator`, `X_train` and `y_train`, none of which are assigned by an earlier
# cell in this notebook, and `close_plots` / `show_and_close_plots` are defined
# in later cells — confirm the intended position of this cell.
print('-'*20)
print('Training Confusion Matrix:\n')
close_plots()
fig, ax = plt.subplots()
ax.set_title(
    f'{str(col)} - Training Confusion Matrix - {vectorizer_name} + {classifier_name}'
)
train_cm = metrics.ConfusionMatrixDisplay.from_estimator(
    estimator, X_train, y_train, ax=ax, cmap=plt.cm.Blues
)
show_and_close_plots()

In [None]:
# Variables
# Sklearn variables
method = 'Supervised'
final_models_save_path = f'{models_save_path}{method} Results/'
t = time.time()
n_jobs = -1
n_splits, n_repeats = 10, 3
random_state = 42
refit = True
class_weight = 'balanced'
# Cross-validation splitter built from the three settings above.
cv = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=random_state,
)
scoring = 'recall'
scores = ['recall', 'accuracy', 'f1', 'roc_auc', 'explained_variance', 'matthews_corrcoef']
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Template of metric names, each initialised to NaN.
metrics_dict = dict.fromkeys(
    [
        'Mean Cross Validation Train Score',
        'Mean Cross Validation Test Score',
        f'Mean Explained Train Variance - {scoring.title()}',
        f'Mean Explained Test Variance - {scoring.title()}',
        'Explained Variance',
        'Accuracy',
        'Balanced Accuracy',
        'Precision',
        'Recall',
        'F1-score',
        'Matthews Correlation Coefficient',
        'Fowlkes–Mallows Index',
        'ROC',
        'AUC',
        f'{scoring.title()} Best Threshold',
        f'{scoring.title()} Best Score',
        'Log Loss/Cross Entropy',
        'Cohen’s Kappa',
        'Geometric Mean',
        'Classification Report',
        'Confusion Matrix',
        'Normalized Confusion Matrix',
    ],
    np.nan,
)

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()

# Pick the compute device: Apple MPS first, then CUDA, then CPU.
# `torch.has_mps` is deprecated in favour of `torch.backends.mps.is_built()`,
# which the original condition already checked, so only the latter is kept.
# `getattr` keeps this working on torch builds that ship no `backends.mps`.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_built() and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')

# Set random seed
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
DetectorFactory.seed = random_state
cores = multiprocessing.cpu_count()

# Pretrained BERT tokenizer and sequence-classification model, moved to `device`.
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(
    bert_model_name, strip_accents=True
)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name
).to(device)

# Plotting variables
pp = pprint.PrettyPrinter(indent=4)
# `pandas` is a classmethod on the tqdm classes, so it is called on the classes
# directly; instantiating a bar first only leaves an empty progress bar behind.
tqdm.tqdm.pandas(desc='progress-bar')
tqdm_auto.tqdm.pandas(desc='progress-bar')
tqdm.notebook.tqdm.pandas(desc='progress-bar')
tqdm_auto.notebook_tqdm.pandas(desc='progress-bar')
# pbar = progressbar.ProgressBar(maxval=10)
# The MacOSX backend only exists on macOS; elsewhere keep the active backend.
if sys.platform == 'darwin':
    mpl.use('MacOSX')
mpl.style.use(f'{code_dir}/setup_module/apa.mplstyle-main/apa.mplstyle')
# NOTE(review): text.usetex needs a working LaTeX installation — confirm it is
# available wherever this notebook runs.
mpl.rcParams['text.usetex'] = True
font = {'family': 'arial', 'weight': 'normal', 'size': 10}
mpl.rc('font', **font)
plt.style.use('tableau-colorblind10')
plt.set_cmap('Blues')

# Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 5000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.2f}'.format)


In [None]:
# FIXME
# Fit best model on validation set using SelectFromModel
# HACK
# NOTE(review): `estimator` is not assigned by any cell above this one, so the
# print below depends on state already present in the kernel — confirm.
print(f'Fitting {estimator}.')
# NOTE(review): sys.exit(0) raises SystemExit, so execution of this cell stops
# here; presumably a deliberate stop while the code below is unfinished — confirm
# whether it should remain, since the function-definition cells come after it.
sys.exit(0)
# estimator.set_params(**estimator.get_params())
# # estimator.fit(X_val, y_val)
# features_important = SelectFromModel(estimator).fit_transform(X_val, y_val)
# estimator = estimator.fit(SelectFromModel(estimator).fit(X_val, y_val), y_val)

# vectorizer = estimator[0]
# selector = estimator[1]
# classifier = estimator[-1]
# if col == 'Warmth':
#     resampler = estimator[-2]

# X_val = vectorizer.fit_transform(X_val)
# X_val = selector.fit_transform(X_val, y_val)
# if col == 'Warmth':
#     X_val, y_val = resampler.fit_resample(X_val, y_val)

# model_selector = SelectFromModel(classifier)
# features_important = model_selector.fit_transform(X_val, y_val)
# classifier = classifier.fit(features_important, y_val)

# Functions

In [None]:
def show_and_close_plots():
    """Call ``plt.show``, then ``plt.clf``, ``plt.cla`` and ``plt.close`` in that order."""
    for pyplot_step in (plt.show, plt.clf, plt.cla, plt.close):
        pyplot_step()


In [None]:
def close_plots():
    """Call ``plt.clf``, ``plt.cla`` and ``plt.close`` in that order, without showing anything."""
    for pyplot_step in (plt.clf, plt.cla, plt.close):
        pyplot_step()


In [None]:
def make_df_metrics(vectorizers_pipe, classifiers_pipe, metrics_list, analysis_columns=None):
    """Build an empty results table indexed by classifier.

    Parameters
    ----------
    vectorizers_pipe : dict
        Its keys become the 'Vectorizer' column level.
    classifiers_pipe : dict
        Its keys become the 'Classifiers' index level.
    metrics_list : iterable
        Metric names; they become the 'Measures' column level.
    analysis_columns : iterable, optional
        Values of the 'Variable' column level. When omitted, the module-level
        ``analysis_columns`` is used.

    Returns
    -------
    pandas.DataFrame
        All-NaN frame with a one-level MultiIndex ('Classifiers') as index and
        a three-level MultiIndex ('Variable', 'Vectorizer', 'Measures') as columns.

    Raises
    ------
    TypeError
        If ``analysis_columns`` is omitted and no module-level value exists.
    """
    if analysis_columns is None:
        # The parameter shadows the module-level name, so the global has to be
        # looked up explicitly (the original self-assignment was a no-op).
        analysis_columns = globals().get('analysis_columns')
    if analysis_columns is None:
        raise TypeError(
            'analysis_columns was not given and no module-level analysis_columns is defined.'
        )

    index = pd.MultiIndex.from_product(
        [list(classifiers_pipe.keys())],
        names=['Classifiers'],
    )
    columns = pd.MultiIndex.from_product(
        [
            list(analysis_columns),
            list(vectorizers_pipe.keys()),
            list(metrics_list),
        ],
        names=['Variable', 'Vectorizer', 'Measures'],
    )
    return pd.DataFrame(index=index, columns=columns)


In [None]:
# Function to load the saved searches, Xy data, feature importances and estimator
def load_Xy_search_cv_estimator(
    col, vectorizer_name, classifier_name,
    compression=None, save_path=None
):
    """Load the pickled artefacts of one (column, vectorizer, classifier) run.

    The previous body was a copy of the saving code: it called ``.read_pickle``
    on frames that were never created, built ``df_val_data`` from ``X_val``
    before it was assigned, and *saved* the feature importances. Every artefact
    is now read from disk.

    Parameters
    ----------
    col, vectorizer_name, classifier_name
        Identify the run; they are interpolated into the file names.
    compression : optional
        Not used when loading; kept so existing calls keep working.
    save_path : str, optional
        Directory prefix of the search and data pickles. Defaults to
        ``f'{final_models_save_path}SearchCV/'``.

    Returns
    -------
    tuple
        ``(grid_search, searchcv, X_train, y_train, y_train_pred, X_test,
        y_test, y_test_pred, y_test_pred_prob, X_val, y_val,
        df_feature_importances, estimator)``. ``df_feature_importances`` is
        ``None`` when its pickle does not exist. The search-CV results frame is
        not part of the returned tuple and is therefore not loaded.

    Notes
    -----
    Unpickling executes code contained in the file; only load files that this
    project wrote itself.
    """
    if compression is None:
        compression = False
    if save_path is None:
        save_path = f'{final_models_save_path}SearchCV/'

    # Load searches
    ## Load grid_search
    with open(
        f'{save_path}{method} Grid Search {str(col)} - {vectorizer_name} + {classifier_name}.pkl', 'rb'
    ) as f:
        grid_search = joblib.load(f)

    ## Load searchcv
    with open(
        f'{save_path}{method} SearchCV {str(col)} - {vectorizer_name} + {classifier_name}.pkl', 'rb'
    ) as f:
        searchcv = joblib.load(f)

    # Load Xy data
    ## Load train data
    df_train_data = pd.read_pickle(
        f'{save_path}{method} df_train_data - {col}_{vectorizer_name}_{classifier_name}.pkl'
    )
    X_train = df_train_data['X_train'].values
    y_train = df_train_data['y_train'].values
    y_train_pred = df_train_data['y_train_pred'].values

    ## Load test data
    df_test_data = pd.read_pickle(
        f'{save_path}{method} df_test_data - {col}_{vectorizer_name}_{classifier_name}.pkl'
    )
    X_test = df_test_data['X_test'].values
    y_test = df_test_data['y_test'].values
    y_test_pred = df_test_data['y_test_pred'].values
    y_test_pred_prob = df_test_data['y_test_pred_prob'].values

    ## Load val data
    df_val_data = pd.read_pickle(
        f'{save_path}{method} df_val_data - {col}_{vectorizer_name}_{classifier_name}.pkl'
    )
    X_val = df_val_data['X_val'].values
    y_val = df_val_data['y_val'].values

    # Load feature importance (the saving code only wrote it when available)
    feature_importances_path = Path(
        f'{save_path}{method} df_feature_importances - {col}_{vectorizer_name}_{classifier_name}.pkl'
    )
    if feature_importances_path.is_file():
        df_feature_importances = pd.read_pickle(feature_importances_path)
    else:
        df_feature_importances = None

    # Load estimator
    with open(
        f'{final_models_save_path}{method} Estimator {str(col)} - {vectorizer_name} + {classifier_name}.pkl', 'rb'
    ) as f:
        estimator = joblib.load(f)

    return (
        grid_search, searchcv,
        X_train, y_train, y_train_pred,
        X_test, y_test, y_test_pred, y_test_pred_prob,
        X_val, y_val,
        df_feature_importances, estimator,
    )


In [None]:
def check_data(
    X_train, y_train, y_train_pred,
    X_test, y_test, y_test_pred, y_test_pred_prob,
    X_val, y_val,
    estimator,
    text_col=None, analysis_columns=None):
    """Sanity-check the three data splits and report their class weights.

    Lengths are checked *within* each split (features, labels and predictions
    must line up). The original compared the splits with each other, which
    fails whenever train, test and validation sets differ in size.

    Parameters
    ----------
    X_train, y_train, y_train_pred
        Training features, labels and predictions.
    X_test, y_test, y_test_pred, y_test_pred_prob
        Testing features, labels, predictions and predicted probabilities.
    X_val, y_val
        Validation features and labels.
    estimator
        Passed to ``check_is_fitted``.
    text_col, analysis_columns : optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val,
        train_class_weights, train_class_weights_ratio, train_class_weights_dict,
        test_class_weights, test_class_weights_ratio, test_class_weights_dict)``.
    """
    check_consistent_length(X_train, y_train, y_train_pred)
    check_consistent_length(X_test, y_test, y_test_pred, y_test_pred_prob)
    check_consistent_length(X_val, y_val)
    check_is_fitted(estimator)

    # scikit-learn documents `classes` as an ndarray, so pass an array, not a list.
    classes = np.array([0, 1])

    # `class_weight` is the module-level setting.
    train_class_weights = compute_class_weight(class_weight=class_weight, classes=classes, y=y_train)
    train_class_weights_ratio = train_class_weights[0]/train_class_weights[1]
    train_class_weights_dict = dict(zip(np.unique(y_train), train_class_weights))

    test_class_weights = compute_class_weight(class_weight=class_weight, classes=classes, y=y_test)
    test_class_weights_ratio = test_class_weights[0]/test_class_weights[1]
    test_class_weights_dict = dict(zip(np.unique(y_test), test_class_weights))

    print('Done splitting data into training, testing, and validation sets.')
    print('='*20)
    print(f'Training set shape: {y_train.shape}')
    print('-'*10)
    print(f'Training set example:\n{X_train[0]}')
    print('~'*10)
    print(f'Testing set shape: {y_test.shape}')
    print('-'*10)
    print(f'Testing set example:\n{X_test[0]}')
    print('~'*10)
    print(f'Validation set shape: {y_val.shape}')
    print('-'*10)
    print(f'Validation set example:\n{X_val[0]}')
    print('~'*10)
    print(f'Training data class weights:\nRatio = {train_class_weights_ratio:.2f} (0 = {train_class_weights[0]:.2f}, 1 = {train_class_weights[1]:.2f})')
    print('-'*10)
    print(f'Testing data class weights:\nRatio = {test_class_weights_ratio:.2f} (0 = {test_class_weights[0]:.2f}, 1 = {test_class_weights[1]:.2f})')
    print('='*20)

    return (
        X_train, y_train,
        X_test, y_test,
        X_val, y_val,
        train_class_weights,
        train_class_weights_ratio,
        train_class_weights_dict,
        test_class_weights,
        test_class_weights_ratio,
        test_class_weights_dict
    )


In [None]:
def compute_metrics_using_estimator(
    estimator, X_test, y_test, col, vectorizer_name, classifier_name,
    return_train_score=None, cv=None
):
    """Cross-validate ``estimator`` on the given data and pickle the score tables.

    Two ``cross_validate`` runs are made: one with ``scoring=None`` and one with
    the module-level ``scores`` list. Both result dicts are converted to
    DataFrames and written under the module-level ``df_save_dir``.

    Parameters
    ----------
    estimator
        Estimator handed to ``cross_validate``.
    X_test, y_test
        Data to cross-validate on.
    col, vectorizer_name, classifier_name
        Used in the pickle file names.
    return_train_score : bool, optional
        Forwarded to ``cross_validate`` (it used to be ignored). Defaults to
        True. When False the returned train means are NaN.
    cv : optional
        Cross-validation strategy. Defaults to the module-level ``cv`` (the
        original self-assignment left it as None).

    Returns
    -------
    tuple
        ``(df_cv_score_recall, cv_train_scores, cv_test_scores,
        cv_train_recall, cv_test_recall,
        cv_train_explained_variance_recall, cv_test_explained_variance_recall)``.
    """
    if return_train_score is None:
        return_train_score = True
    if cv is None:
        # The parameter shadows the module-level splitter; look it up explicitly.
        cv = globals().get('cv')

    def mean_or_nan(cv_results, key):
        # Train scores are absent when return_train_score is False.
        return cv_results[key].mean() if key in cv_results else np.nan

    # Using estimator
    # Cross Validation
    print('-'*20)
    print('Cross Validating without scoring.')
    cv_score_noscoring = cross_validate(
        estimator,
        X_test,
        y_test,
        cv=cv,
        scoring=None,
        return_train_score=return_train_score,
    )

    # Cross Validation with scoring
    print('-'*20)
    print(f'Cross Validating with {scores} scoring.')
    cv_score_recall = cross_validate(
        estimator,
        X_test,
        y_test,
        cv=cv,
        scoring=scores,
        return_train_score=return_train_score,
    )

    # Get mean of cross validation scores
    print('-'*20)
    print('Getting mean and std of cross validation scores.')
    cv_train_scores = mean_or_nan(cv_score_noscoring, 'train_score')
    cv_test_scores = mean_or_nan(cv_score_noscoring, 'test_score')
    cv_train_recall = mean_or_nan(cv_score_recall, 'train_recall')
    cv_test_recall = mean_or_nan(cv_score_recall, 'test_recall')
    cv_train_explained_variance_recall = mean_or_nan(cv_score_recall, 'train_explained_variance')
    cv_test_explained_variance_recall = mean_or_nan(cv_score_recall, 'test_explained_variance')

    # Save cross validation scores to dataframe
    print('-'*20)
    print('Saving cross validation scores to dataframe.')
    df_cv_score_noscoring = pd.DataFrame(cv_score_noscoring)
    df_cv_score_noscoring.to_pickle(f'{df_save_dir}df_cv_score_noscoring - {col}_{vectorizer_name}_{classifier_name}.pkl')
    df_cv_score_recall = pd.DataFrame(cv_score_recall)
    df_cv_score_recall.to_pickle(f'{df_save_dir}df_cv_score_recall - {col}_{vectorizer_name}_{classifier_name}.pkl')

    return (
        df_cv_score_recall,
        cv_train_scores, cv_test_scores,
        cv_train_recall, cv_test_recall,
        cv_train_explained_variance_recall, cv_test_explained_variance_recall
    )


In [None]:
def plot_metrics_with_estimator(
    estimator, X_test, y_test, col, vectorizer_name, classifier_name,
    param_name=None, param_range=None, axis=None, cv=None, n_jobs=None, random_state=None, alpha=None, verbose=None
):
    """Plot and save the learning curve of ``estimator``.

    The figure is written as EPS and PNG under the module-level
    ``plot_save_path`` and then shown and closed.

    Parameters
    ----------
    estimator
        Estimator handed to ``learning_curve``. When ``param_name`` is None its
        ``steps`` attribute is read, so it must expose one in that case.
    X_test, y_test
        Currently unused, see the NOTE(review) in the body.
    col, vectorizer_name, classifier_name
        Used in the title and in the file names.
    param_name, param_range : optional
        Only prepared for the commented-out validation-curve code that follows
        this function body.
    axis : int, optional
        Axis over which scores are averaged. Defaults to 1.
    cv, n_jobs, random_state : optional
        Default to the module-level values of the same name (the original
        self-assignments left them as None).
    alpha : float, optional
        Transparency of the standard-deviation bands. Defaults to 0.1.
    verbose : int, optional
        Forwarded to ``learning_curve``. Defaults to 1.
    """
    if axis is None:
        axis = 1
    # The parameters shadow the module-level settings; look them up explicitly.
    if cv is None:
        cv = globals().get('cv')
    if n_jobs is None:
        n_jobs = globals().get('n_jobs')
    if random_state is None:
        random_state = globals().get('random_state')
    if alpha is None:
        alpha = 0.1
    if verbose is None:
        verbose = 1

    # Make param names and values (only consumed by the commented-out
    # validation-curve code that follows this function body).
    if param_range is None:
        param_range = np.arange(1, 100, 10)
    if param_name is None:
        param_names = [
            name
            for _step_name, step in estimator.steps
            for name, value in step.get_params().items()
            if name != 'random_state'
            and isinstance(value, (list, int, float))
            and not isinstance(value, bool)
        ]
    else:
        # Previously left undefined when a name was supplied.
        param_names = [param_name]

    # Using estimator
    # Learning Curves
    print('Plotting Learning Curve.')
    print('-'*20)
    # NOTE(review): the curve is computed on the module-level X_train / y_train,
    # not on the X_test / y_test arguments — confirm which data is intended.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator,
        X=X_train,
        y=y_train,
        cv=cv,
        n_jobs=n_jobs,
        random_state=random_state,
        shuffle=True,
        scoring=scoring,
        verbose=verbose,
        # train_sizes=np.linspace(0.1, 1.0, 10),
    )
    train_scores_mean = np.mean(train_scores, axis=axis)
    train_scores_std = np.std(train_scores, axis=axis)
    test_scores_mean = np.mean(test_scores, axis=axis)
    test_scores_std = np.std(test_scores, axis=axis)

    close_plots()
    plt.figure()
    plt.suptitle(
        f'{str(col)} - Learning Curves for {scoring.title()} - {vectorizer_name} + {classifier_name}'
        )
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.grid()
    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(
        train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=alpha, color='r'
    )
    plt.fill_between(
        train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=alpha, color='g'
    )
    plt.plot(
        train_sizes, train_scores_mean, 'o-', color='r', label='Training score'
    )
    plt.plot(
        train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score'
    )
    plt.legend(loc='best')

    # Save figure
    for image_save_format in ['eps', 'png']:
        plt.savefig(f'{plot_save_path}{method} {str(col)} - Learning Curve - {vectorizer_name} + {classifier_name}.{image_save_format}', format=image_save_format)
    show_and_close_plots()

    # # Validation Curve
    # for param_name in param_names:
    #     if param_name:
    #         param_title = ' '.join(param_name.split('_')).title()
    #         print(f'Plotting Validation Curve for {param_title}.')
    #         print('-'*20)
    #         train_scores, test_scores = validation_curve(
    #             estimator=estimator,
    #             X=X_train,
    #             y=y_train,
    #             param_name=param_name,
    #             param_range=param_range,
    #             cv=cv,
    #             n_jobs=n_jobs,
    #             scoring=scoring,
    #             verbose=verbose,
    #         )
    #         train_scores_mean = np.mean(train_scores, axis=axis)
    #         train_scores_std = np.std(train_scores, axis=axis)
    #         test_scores_mean = np.mean(test_scores, axis=axis)
    #         test_scores_std = np.std(test_scores, axis=axis)

    #         # Ploting
    #         plt.figure()
    #         plt.suptitle(
    #             f'{str(col)} - Validation Curve for {scoring.title()}on {param_title} - {vectorizer_name} + {classifier_name}'
    #         )
    #         plt.xlabel(param_name)
    #         plt.ylabel('Score')
    #         plt.grid()
    #         plt.fill_between(
    #             param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=alpha, color='r'
    #         )
    #         plt.fill_between(
    #             param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=alpha, color='g'
    #         )
    #         plt.plot(
    #             param_range, train_scores_mean, 'o-', color='r', label='Training score'
    #         )
    #         plt.plot(
    #             param_range, test_scores_mean, 'o-', color='g', label='Cross-validation score'
    #         )
    #         plt.legend(loc='best')
    #         plt.show()

    #         # Save figure
    #         for image_save_format in ['eps', 'png']:
    #             plt.savefig(
    #                 f'{plot_save_path}{method} {str(col)} - Validation Curve for {param_title} - {vectorizer_name} + {classifier_name}.{image_save_format}',
    #                 format=image_save_format
    #             )


In [None]:
def compute_metrics_with_y_pred(
    y_test, y_test_pred, col, vectorizer_name, classifier_name,
    pos_label=None, labels=None
):
    """Compute classification metrics from hard predictions.

    Parameters
    ----------
    y_test, y_test_pred
        True and predicted labels.
    col, vectorizer_name, classifier_name
        Not used here; kept so all metric helpers share one call shape.
    pos_label : optional
        Positive class for precision, recall and F1. Defaults to 1.
    labels : list, optional
        Label set for precision, recall and F1. Defaults to ``[1, 0]``.

    Returns
    -------
    tuple
        ``(explained_variance, accuracy, balanced_accuracy, precision, recall,
        f1, mcc, fm, kappa, gmean, report, cm, cm_normalized)``.
    """
    if pos_label is None:
        pos_label = 1
    if labels is None:
        labels = [1, 0]

    # Using y_pred
    explained_variance = metrics.explained_variance_score(y_test, y_test_pred)
    accuracy = metrics.accuracy_score(y_test, y_test_pred)
    balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_test_pred)
    # pos_label / labels were hard-coded before, so the arguments had no effect.
    precision = metrics.precision_score(y_test, y_test_pred, pos_label=pos_label, labels=labels)
    recall = metrics.recall_score(y_test, y_test_pred, pos_label=pos_label, labels=labels)
    f1 = metrics.f1_score(y_test, y_test_pred, pos_label=pos_label, labels=labels)
    mcc = metrics.matthews_corrcoef(y_test, y_test_pred)
    fm = metrics.fowlkes_mallows_score(y_test, y_test_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_test_pred)
    # Geometric mean wrapped with imblearn's index-balanced-accuracy decorator.
    gmean_iba = imblearn.metrics.make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
    gmean = gmean_iba(y_test, y_test_pred)
    report = metrics.classification_report(y_test, y_test_pred)
    cm = metrics.confusion_matrix(y_test, y_test_pred)
    cm_normalized = metrics.confusion_matrix(y_test, y_test_pred, normalize='true')

    return (
        explained_variance, accuracy, balanced_accuracy, precision,
        recall, f1, mcc, fm, kappa, gmean, report, cm, cm_normalized
    )


In [None]:
# pos_label = 1
# close_plots()
# cm_curve = metrics.ConfusionMatrixDisplay.from_predictions(
#     y_test, y_test_pred
# )
# cm_normalized_curve = metrics.ConfusionMatrixDisplay.from_predictions(
#     y_test, y_test_pred, normalize='true'
# )
# roc_curve = metrics.RocCurveDisplay.from_predictions(
#     y_test, y_test_pred, pos_label=pos_label
# )
# pr_curve = metrics.PrecisionRecallDisplay.from_predictions(
#     y_test, y_test_pred, pos_label=pos_label
# )
# calibration_curve = CalibrationDisplay.from_predictions(
#     y_test, y_test_pred, pos_label=pos_label
# )
# show_and_close_plots()

# # Plots
# plots_dict = {
#     'Confusion Matrix': cm_curve,
#     'Normalized Confusion Matrix': cm_normalized_curve,
#     'ROC Curve': roc_curve,
#     'Precision-Recall Curve': pr_curve,
#     'Calibration Curve': calibration_curve,
# }

# print('=' * 20)
# # close_plots()
# print('Plotting metrics with y_pred_prob:')
# print('='*20)
# for plot_name, plot_ in plots_dict.items():
#     close_plots()
#     print(f'Plotting {plot_name}:')
#     plt.figure()
#     plt.suptitle(
#         f'{str(col)} - {plot_name} - {vectorizer_name} + {classifier_name}'
#         )
#     if plot_name == 'ROC Curve':
#         plt.plot([0, 1], [0, 1], 'r--', lw=1)
#     try:
#         plot_.plot(color='C0')
#     except (TypeError, AttributeError):
#         plot_.plot()
#         try:
#             plt.gca().get_lines()[0].set_color('blue')
#         except IndexError:
#             plot_.plot(cmap=plt.cm.Blues)
#     plt.legend(loc='best')
#     print('=' * 20)

#     # Save Plots
#     print(f'Saving {plot_name}...')
#     for image_save_format in ['eps', 'png']:
#         plt.savefig(
#             f'{plot_save_path}{method} {str(col)} - {plot_name} - {vectorizer_name} + {classifier_name}.{image_save_format}',
#             format=image_save_format, dpi=3000, bbox_inches='tight'
#         )
#     show_and_close_plots()
#     print(f'Saved {plot_name}!')
#     print('=' * 20)

In [None]:
def plot_metrics_with_y_pred(
    y_test, y_test_pred, col, vectorizer_name, classifier_name,
    pos_label=None
):
    """Plot and save diagnostic figures built from ``y_test`` and ``y_test_pred``.

    Produces, in order: the scikit-learn displays (confusion matrix, normalized
    confusion matrix, ROC, precision-recall, calibration), a combined
    ``plot_metric`` figure, and a heatmap of misclassification counts. Each
    figure is saved as EPS and PNG under the module-level ``plot_save_path``.

    Parameters
    ----------
    y_test, y_test_pred
        True labels and the predictions handed to every display.
    col, vectorizer_name, classifier_name
        Used in titles and file names.
    pos_label : optional
        Positive class for the ROC, precision-recall and calibration displays.
        Defaults to 1.
    """
    if pos_label is None:
        pos_label = 1

    def save_current_figure(plot_label):
        # One place for the EPS + PNG export that was repeated three times.
        for image_save_format in ['eps', 'png']:
            plt.savefig(
                f'{plot_save_path}{method} {str(col)} - {plot_label} - {vectorizer_name} + {classifier_name}.{image_save_format}',
                format=image_save_format, dpi=3000, bbox_inches='tight'
            )

    # Displays, all built from y_test_pred
    close_plots()
    cm_display = metrics.ConfusionMatrixDisplay.from_predictions(
        y_test, y_test_pred
    )
    cm_normalized_display = metrics.ConfusionMatrixDisplay.from_predictions(
        y_test, y_test_pred, normalize='true'
    )
    # Local names no longer shadow sklearn's roc_curve / calibration_curve.
    roc_display = metrics.RocCurveDisplay.from_predictions(
        y_test, y_test_pred, pos_label=pos_label
    )
    pr_display = metrics.PrecisionRecallDisplay.from_predictions(
        y_test, y_test_pred, pos_label=pos_label
    )
    calibration_display = CalibrationDisplay.from_predictions(
        y_test, y_test_pred, pos_label=pos_label
    )
    show_and_close_plots()

    # Plots
    plots_dict = {
        'Confusion Matrix': cm_display,
        'Normalized Confusion Matrix': cm_normalized_display,
        'ROC Curve': roc_display,
        'Precision-Recall Curve': pr_display,
        'Calibration Curve': calibration_display,
    }

    print('=' * 20)
    print('Plotting metrics with y_pred_prob:')
    print('='*20)

    # Re-draw every display on its own titled axes and save it.
    for plot_name, plot_ in plots_dict.items():
        close_plots()
        print(f'Plotting {plot_name}:')
        fig, ax = plt.subplots()
        ax.set_title(
            f'{str(col)} - {plot_name} - {vectorizer_name} + {classifier_name}'
            )
        if plot_name == 'ROC Curve':
            # Diagonal reference line.
            ax.plot([0, 1], [0, 1], 'r--', lw=1)
        plot_.plot(ax=ax)
        print('=' * 20)

        # Save Plots
        print(f'Saving {plot_name}...')
        save_current_figure(plot_name)
        show_and_close_plots()
        print(f'Saved {plot_name}!')
        print('=' * 20)

    # Visualisation with plot_metric
    bc = plot_metric.functions.BinaryClassification(
        y_test, y_test_pred, labels=[0, 1])

    # Figures: curves on the first row, confusion matrices on the second.
    close_plots()
    plt.figure(figsize=(15, 10))
    plt.subplot2grid((2, 6), (1, 1), colspan=2)
    bc.plot_confusion_matrix(colorbar=True)
    plt.subplot2grid((2, 6), (1, 3), colspan=2)
    bc.plot_confusion_matrix(normalize=True, colorbar=True)
    plt.subplot2grid(shape=(2, 6), loc=(0, 0), colspan=2)
    bc.plot_roc_curve()
    plt.subplot2grid((2, 6), (0, 2), colspan=2)
    bc.plot_precision_recall_curve()
    plt.subplot2grid((2, 6), (0, 4), colspan=2)
    bc.plot_class_distribution()
    bc.print_report()
    save_current_figure('plot_metric Curves')
    show_and_close_plots()

    # Heatmap of misclassifications: counts per (true, predicted) pair
    print('Plotting Heatmap:')
    close_plots()
    classifications_dict = defaultdict(int)
    for _y_test, _y_test_pred in zip(y_test, y_test_pred):
        if _y_test != _y_test_pred:
            classifications_dict[(_y_test, _y_test_pred)] += 1

    if not classifications_dict:
        # With nothing misclassified the frame would be empty and the pivot
        # below would fail on the missing columns.
        print('No misclassifications found; skipping Heatmap.')
        return

    dicts_to_plot = [
        {
            f'True {str(col)} value': _y_test,
            f'Predicted {str(col)} value': _y_test_pred,
            'Number of Classifications': _count,
        }
        for (_y_test, _y_test_pred), _count in classifications_dict.items()
    ]
    df_to_plot = pd.DataFrame(dicts_to_plot)
    df_wide = df_to_plot.pivot_table(
        index=f'True {str(col)} value',
        columns=f'Predicted {str(col)} value',
        values='Number of Classifications'
    )

    plt.figure(figsize=(9,7))
    sns.set(style='ticks', font_scale=1.2)
    sns.heatmap(df_wide, linewidths=1, cmap=plt.cm.Blues, annot=True)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.suptitle(f'{str(col)} Heatmap - {vectorizer_name} + {classifier_name}')
    print('Saving Heatmap...')
    save_current_figure('Heatmap')
    print('Saved Heatmap!')
    show_and_close_plots()



In [None]:
def compute_metrics_with_y_pred_prob(
    y_test, y_test_pred_prob, col, vectorizer_name, classifier_name,
    pos_label=None
):
    """Compute score-based metrics from ``y_test`` and ``y_test_pred_prob``.

    Parameters
    ----------
    y_test
        True labels.
    y_test_pred_prob
        Scores handed to every metric below.
    col, vectorizer_name, classifier_name
        Not used here; kept so all metric helpers share one call shape.
    pos_label : optional
        Positive class for average precision and for the ROC and
        precision-recall curves. Defaults to 1.

    Returns
    -------
    tuple
        ``(average_precision, roc_auc, auc, fpr, tpr, threshold, loss,
        precision_pr, recall_pr, threshold_pr)``.
    """
    if pos_label is None:
        pos_label = 1

    # Using y_pred_prob
    # pos_label was hard-coded to 1 before, so the argument had no effect.
    average_precision = metrics.average_precision_score(y_test, y_test_pred_prob, pos_label=pos_label)
    roc_auc = metrics.roc_auc_score(y_test, y_test_pred_prob)
    fpr, tpr, threshold = metrics.roc_curve(y_test, y_test_pred_prob, pos_label=pos_label)
    auc = metrics.auc(fpr, tpr)
    loss = metrics.log_loss(y_test, y_test_pred_prob)
    precision_pr, recall_pr, threshold_pr = metrics.precision_recall_curve(y_test, y_test_pred_prob, pos_label=pos_label)

    return (
        average_precision, roc_auc, auc,
        fpr, tpr, threshold,loss,
        precision_pr, recall_pr, threshold_pr
    )


In [None]:
def compute_metrics(
    estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
    col, vectorizer_name, classifier_name,
    with_estimator=None, with_y_pred=None, with_y_pred_prob=None,
    cv=None, pos_label=None, verbose=None, n_jobs=None, best_score=None,
):
    """Run the three metric helpers and gather their results.

    Parameters
    ----------
    estimator, X_test, y_test
        Passed to ``compute_metrics_using_estimator``.
    y_test_pred, y_test_pred_prob
        Passed to ``compute_metrics_with_y_pred`` and
        ``compute_metrics_with_y_pred_prob`` respectively.
    col, vectorizer_name, classifier_name
        Passed on to every helper.
    with_estimator, with_y_pred, with_y_pred_prob : bool, optional
        Enable each helper; all default to True. Results of a disabled helper
        are reported as NaN (scalars) or None (arrays, report, frame) instead
        of raising a NameError.
    cv : optional
        Cross-validation strategy; defaults to the module-level ``cv``.
    pos_label : optional
        Positive class, forwarded to the prediction helpers. Defaults to 1.
    verbose, n_jobs : optional
        Accepted for backward compatibility; not used by this function.
    best_score : float, optional
        Value stored under ``'<Scoring> Best Score'``. Falls back to a
        module-level ``best_score`` if one exists, otherwise NaN. It used to
        be read from an undefined name.

    Returns
    -------
    tuple
        ``metrics_dict`` followed by the 30 individual results, in the same
        order as before.
    """
    # The cv parameter shadows the module-level splitter; look it up explicitly.
    if cv is None:
        cv = globals().get('cv')
    if pos_label is None:
        pos_label = 1

    if with_estimator is None:
        with_estimator = True
    if with_y_pred is None:
        with_y_pred = True
    if with_y_pred_prob is None:
        with_y_pred_prob = True

    if best_score is None:
        best_score = globals().get('best_score')
    if best_score is None:
        best_score = np.nan

    # Placeholders so that a disabled helper leaves well-defined values behind.
    df_cv_score_recall = None
    cv_train_scores = cv_test_scores = np.nan
    cv_train_recall = cv_test_recall = np.nan
    cv_train_explained_variance_recall = cv_test_explained_variance_recall = np.nan
    explained_variance = accuracy = balanced_accuracy = precision = np.nan
    recall = f1 = mcc = fm = kappa = gmean = np.nan
    report = cm = cm_normalized = None
    average_precision = roc_auc = auc = loss = np.nan
    fpr = tpr = threshold = None
    precision_pr = recall_pr = threshold_pr = None

    # Get metrics
    print('='*20)
    # Using estimator
    if with_estimator:
        print('Computing metrics using estimator.')
        (
            df_cv_score_recall,
            cv_train_scores, cv_test_scores,
            cv_train_recall, cv_test_recall,
            cv_train_explained_variance_recall, cv_test_explained_variance_recall
        ) = compute_metrics_using_estimator(
            estimator, X_test, y_test, col, vectorizer_name, classifier_name,
            cv=cv,
        )
    # Using y_test_pred
    if with_y_pred:
        print('-'*20)
        print('Computing metrics using y_test_pred.')
        (
            explained_variance, accuracy, balanced_accuracy, precision,
            recall, f1, mcc, fm, kappa, gmean, report, cm, cm_normalized
        ) = compute_metrics_with_y_pred(
            y_test, y_test_pred, col, vectorizer_name, classifier_name,
            pos_label=pos_label,
        )
    # Using y_test_pred_prob
    if with_y_pred_prob:
        print('-'*20)
        print('Computing metrics using y_test_pred_prob.')
        (
            average_precision, roc_auc, auc,
            fpr, tpr, threshold,loss,
            precision_pr, recall_pr, threshold_pr
        ) = compute_metrics_with_y_pred_prob(
            y_test, y_test_pred_prob, col, vectorizer_name, classifier_name,
            pos_label=pos_label,
        )

    #Place metrics into dict
    print('-'*20)
    print('Appending metrics to dict.')
    metrics_dict = {
        'Mean Cross Validation Train Score': float(cv_train_scores),
        'Mean Cross Validation Test Score': float(cv_test_scores),
        f'Mean Cross Validation Train - {scoring.title()}': float(cv_train_recall),
        f'Mean Cross Validation Test - {scoring.title()}': float(cv_test_recall),
        f'Mean Explained Train Variance - {scoring.title()}': float(cv_train_explained_variance_recall),
        f'Mean Explained Test Variance - {scoring.title()}': float(cv_test_explained_variance_recall),
        'Explained Variance': float(explained_variance),
        'Accuracy': float(accuracy),
        'Balanced Accuracy': float(balanced_accuracy),
        'Precision': float(precision),
        'Average Precision': float(average_precision),
        'Recall': float(recall),
        'F1-score': float(f1),
        'Matthews Correlation Coefficient': float(mcc),
        'Fowlkes–Mallows Index': float(fm),
        'ROC': float(roc_auc),
        'AUC': float(auc),
        # NOTE(review): this stores every threshold returned by the ROC helper,
        # not a single "best" one — confirm the intended value.
        f'{scoring.title()} Best Threshold': threshold,
        f'{scoring.title()} Best Score': float(best_score),
        'Log Loss/Cross Entropy': float(loss),
        'Cohen’s Kappa': float(kappa),
        'Geometric Mean': float(gmean),
        'Classification Report': report,
        'Confusion Matrix': cm,
        'Normalized Confusion Matrix': cm_normalized
    }

    return (
        metrics_dict, df_cv_score_recall,
        cv_train_scores, cv_test_scores,
        cv_train_recall, cv_test_recall,
        cv_train_explained_variance_recall, cv_test_explained_variance_recall,
        explained_variance, accuracy, balanced_accuracy, precision, recall,
        f1, mcc, fm, kappa, gmean, report, cm, cm_normalized,
        average_precision, roc_auc, auc, fpr, tpr, threshold,
        loss, precision_pr, recall_pr, threshold_pr,
    )


In [None]:
def plot_metrics(
    estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
    col, vectorizer_name, classifier_name,
    with_estimator=None, with_y_pred=None, with_y_pred_prob=None
):
    """Run the estimator-based and prediction-based plotting helpers.

    ``with_estimator`` and ``with_y_pred`` (both treated as True when None)
    switch the two helpers on or off. ``with_y_pred_prob`` and
    ``y_test_pred_prob`` are accepted but nothing is plotted from them here.
    """
    with_estimator = True if with_estimator is None else with_estimator
    with_y_pred = True if with_y_pred is None else with_y_pred
    with_y_pred_prob = True if with_y_pred_prob is None else with_y_pred_prob

    run_label = (col, vectorizer_name, classifier_name)
    banner = '~'*20

    # Plotting
    print(banner)
    print('Plotting metrics.')
    print(banner)

    # Estimator-based plots
    if with_estimator:
        plot_metrics_with_estimator(estimator, X_test, y_test, *run_label)

    # Plots based on y_test_pred
    if with_y_pred:
        plot_metrics_with_y_pred(y_test, y_test_pred, *run_label)

    print('='*20)


In [None]:
def evaluation(
    estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
    best_score, df_metrics,
    col, vectorizer_name, classifier_name, scoring=None,
):
    """Compute, print, tabulate and plot the test metrics of one pipeline.

    Parameters
    ----------
    estimator, X_test, y_test, y_test_pred, y_test_pred_prob
        Forwarded to ``compute_metrics`` and ``plot_metrics``.
    best_score
        Accepted but not read inside this function.
    df_metrics
        Results table updated in place. It is addressed as
        ``df_metrics.loc[classifier_name, (col, vectorizer_name, metric_name)]``,
        so it presumably has a three-level column index (confirm against the
        code that builds it).
    col, vectorizer_name, classifier_name
        Labels used in the printed header and as table coordinates.
    scoring : str or None
        Defaults to ``'recall'``; not otherwise used in this function.

    Returns
    -------
    tuple
        ``(df_metrics, metrics_dict, df_cv_score_recall)``.
    """
    scoring = 'recall' if scoring is None else scoring

    # compute_metrics returns a long tuple; only its first two items are needed here.
    metrics_dict, df_cv_score_recall, *_other_metrics = compute_metrics(
        estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
        col, vectorizer_name, classifier_name
    )

    double_rule = '=' * 20
    tilde_rule = '~' * 20
    dash_rule = '-' * 20

    # Header and classification report
    print(double_rule)
    print(tilde_rule)
    print(f' Testing Metrics for {str(col)} - {vectorizer_name} + {classifier_name}')
    print(tilde_rule)
    print(f'Classification Report:\n {metrics_dict["Classification Report"]}')
    print(dash_rule)

    for metric_name, raw_value in metrics_dict.items():
        # Anything that converts cleanly becomes a float; reports and
        # matrices fail the conversion and are kept untouched.
        try:
            metric_value = float(raw_value)
        except (TypeError, ValueError):
            metric_value = raw_value

        if isinstance(metric_value, (int, float)):
            print(f'{metric_name}: {round(metric_value, 2)}')
        else:
            print(f'{metric_name}:\n{metric_value}')
        print(dash_rule)

        # Table cell: floats are stored as numbers, everything else as text.
        cell_value = metric_value if isinstance(metric_value, float) else str(metric_value)
        df_metrics.loc[
            (classifier_name), (col, vectorizer_name, metric_name)
        ] = cell_value

    print(double_rule)

    # Plot Metrics
    plot_metrics(
        estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
        col, vectorizer_name, classifier_name,
    )

    return df_metrics, metrics_dict, df_cv_score_recall


In [None]:
# Function to place Xy and CV data in df and save
def save_Xy_search_cv_estimator(
    grid_search, searchcv,
    X_train, y_train, y_train_pred,
    X_test, y_test, y_test_pred, y_test_pred_prob,
    X_val, y_val,
    estimator,
    col, vectorizer_name, classifier_name,
    compression=None, save_path=None
):
    """Persist the search objects, the data splits and the fitted estimator.

    Written, in this order, under ``save_path``:
    the ``grid_search`` object, the fitted ``searchcv``, a DataFrame of
    ``searchcv.cv_results_``, one DataFrame each for the train / test /
    validation data, and finally ``estimator``.

    Parameters
    ----------
    grid_search, searchcv, estimator
        Objects dumped with ``joblib.dump``.
    X_train, y_train, y_train_pred
        Columns of the train DataFrame (presumably equal-length 1-D
        sequences; pandas raises if they cannot be combined).
    X_test, y_test, y_test_pred, y_test_pred_prob
        Columns of the test DataFrame.
    X_val, y_val
        Columns of the validation DataFrame.
    col, vectorizer_name, classifier_name
        Labels embedded in every file name.
    compression
        Passed to ``joblib.dump(compress=...)``; ``None`` means ``False``.
    save_path : str or None
        Path prefix for all files; defaults to
        ``f'{final_models_save_path}SearchCV/'``.

    Raises
    ------
    OSError
        If the output folder cannot be created or a file cannot be written.
    """
    if compression is None:
        compression = False
    if save_path is None:
        save_path = f'{final_models_save_path}SearchCV/'

    # File names are built by string concatenation, so the folder that must
    # exist is the directory part of the prefix. Create it up front instead
    # of failing on the first open() of a fresh run.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    def _dump_object(obj, label):
        # joblib dump of one object, named "<method> <label> <col> - <vectorizer> + <classifier>.pkl".
        with open(
            f'{save_path}{method} {label} {str(col)} - {vectorizer_name} + {classifier_name}.pkl', 'wb'
        ) as f:
            joblib.dump(obj, f, compress=compression)

    def _pickle_frame(data, label):
        # Pickle a DataFrame built from `data`, named "<method> <label> - <col>_<vectorizer>_<classifier>.pkl".
        pd.DataFrame(data).to_pickle(
            f'{save_path}{method} {label} - {col}_{vectorizer_name}_{classifier_name}.pkl'
        )

    # Save search
    _dump_object(grid_search, 'Grid Search')
    _dump_object(searchcv, 'SearchCV')
    _pickle_frame(searchcv.cv_results_, 'df_searchcv_results')

    # Save Xy data
    _pickle_frame(
        {
            'X_train': X_train,
            'y_train': y_train,
            'y_train_pred': y_train_pred,
        },
        'df_train_data',
    )
    _pickle_frame(
        {
            'X_test': X_test,
            'y_test': y_test,
            'y_test_pred': y_test_pred,
            'y_test_pred_prob': y_test_pred_prob,
        },
        'df_test_data',
    )
    _pickle_frame(
        {
            'X_val': X_val,
            'y_val': y_val,
        },
        'df_val_data',
    )

    # Save estimator
    _dump_object(estimator, 'Estimator')


In [None]:
# Save Model
def saving_model_and_table(df_metrics, estimator, col, vectorizer_name, classifier_name):
    """Export the metrics table in five formats and dump the estimator.

    The table is written as ``Classifiers Table.<ext>`` under
    ``table_save_path`` (csv, pkl, xlsx, tex, md, in that order), so each
    call overwrites the previous export. The estimator is dumped with
    joblib under ``final_models_save_path``.

    Parameters
    ----------
    df_metrics
        Table to export; must provide ``to_csv``, ``to_pickle``,
        ``to_excel``, ``to_latex`` and ``to_markdown``.
    estimator
        Object dumped with ``joblib.dump``.
    col, vectorizer_name, classifier_name
        Labels embedded in the estimator file name.
    """
    print(f'Saving Model and Table for {vectorizer_name} + {classifier_name}.')

    # Save metrics df: one exporter per file extension.
    table_stem = f'{table_save_path}Classifiers Table'
    table_writers = (
        ('to_csv', 'csv'),
        ('to_pickle', 'pkl'),
        ('to_excel', 'xlsx'),
        ('to_latex', 'tex'),
        ('to_markdown', 'md'),
    )
    for writer_name, extension in table_writers:
        getattr(df_metrics, writer_name)(f'{table_stem}.{extension}')

    # Save estimator
    estimator_file = (
        f'{final_models_save_path}{method} Estimator {str(col)} - {vectorizer_name} + {classifier_name}.pkl'
    )
    with open(estimator_file, 'wb') as handle:
        joblib.dump(estimator, handle)


In [None]:
# Load fitted models
def get_fitted_estimators():
    """Load every ``.pkl`` file found directly in ``final_models_save_path``.

    The glob pattern is not filtered by target column, vectorizer or
    classifier, so every pickled object in that folder is returned, in the
    order ``glob.glob`` yields the paths.

    Returns
    -------
    list
        The unpickled objects; empty when no file matches.
    """
    def _load(pickle_path):
        # joblib.load unpickles arbitrary objects: only point this at trusted files.
        with open(pickle_path, 'rb') as handle:
            return joblib.load(handle)

    return [
        _load(model_path)
        for model_path in glob.glob(f'{final_models_save_path}*.pkl')
    ]


In [None]:
def comparison_plots(estimators_list, X_test, y_test, col, curves_dict=None, cmap=plt.cm.Blues):
    """Overlay one curve per estimator for each curve type, then save the figures.

    For every entry of ``curves_dict`` a new figure is created, each
    estimator's curve is drawn onto the same axes, the figure is shown and
    then saved as ``eps`` and ``png`` under ``plot_save_path``.

    Parameters
    ----------
    estimators_list : sequence
        Fitted pipelines exposing ``.steps``; the names of the first,
        second and last step form the legend label. Must not be empty.
    X_test, y_test
        Data passed to each display's ``from_estimator``.
    col
        Label used in titles and file names.
    curves_dict : dict or None
        Maps a curve title to a display class with a ``from_estimator``
        constructor. ``None`` selects the ROC, precision-recall and
        calibration displays. (Previously this argument was overwritten and
        therefore ignored.)
    cmap
        Kept for backward compatibility only. It is no longer forwarded:
        these displays hand extra keyword arguments to matplotlib's line
        plot, which has no ``cmap`` property and rejects it.

    Raises
    ------
    AssertionError
        If ``estimators_list`` is empty.
    """
    # Only fall back to the default curves when the caller supplied none.
    if curves_dict is None:
        curves_dict = {
            'ROC Curve': metrics.RocCurveDisplay,
            'Precision Recall Curve': metrics.PrecisionRecallDisplay,
            'Calibration Curve': metrics.CalibrationDisplay,
        }

    assert len(estimators_list) != 0

    for curve_name, curve_package in curves_dict.items():
        print('-' * 20)
        print(f'{str(col)} - {str(curve_name)}')
        fig, ax = plt.subplots()
        ax.set_title(f'{str(col)} - {str(curve_name)}')
        for estimator in estimators_list:
            # All estimators share `ax`, so their curves are overlaid.
            curve_package.from_estimator(
                estimator, X_test, y_test, pos_label=1, ax=ax,
                name=f'{estimator.steps[0][0]} + {estimator.steps[1][0]} + {estimator.steps[-1][0]}'
            )
        show_and_close_plots()

        # Save Plots
        # Use the figure created above instead of relying on the last
        # display object leaking out of the inner loop.
        # NOTE(review): saving happens after show_and_close_plots(), as
        # before; confirm that helper leaves the figure object savable.
        print('Saving plots.')
        for image_save_format in ['eps', 'png']:
            fig.savefig(
                f'{plot_save_path}{method} {str(col)} - All {str(curve_name)}s.{image_save_format}',
                format=image_save_format,
                dpi=3000, bbox_inches='tight'
            )


# Training

### READ DATA

In [None]:
# Load the manually labelled data used for training.
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_trainning.pkl').reset_index(drop=True)
# TODO REMOVE THIS!!!!!!
# Debug-only down-sampling: keep 50 rows per combination of the analysis
# columns so the search finishes quickly. Seeded with the notebook-wide
# `random_state` so that re-running the notebook trains on the same subset.
df_manual = df_manual.groupby(analysis_columns).sample(n=50, random_state=random_state).reset_index(drop = True)

In [None]:
%%time
# Training driver: for every target column, run a successive-halving grid
# search over each vectorizer x selector x resampler x classifier
# combination, report the best configuration and save the artefacts.
print('#'*40)
print('Starting!')
print('#'*40)

# Targets to model and the text column handed to split_data.
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'

with joblib.parallel_backend(backend='multiprocessing', n_jobs=n_jobs):
    # Load Table DF
    # df_metrics = make_df_metrics(vectorizers_pipe, classifiers_pipe, list(metrics_dict.keys()), analysis_columns)
    # NOTE(review): with the line above commented out, `df_metrics` is not
    # assigned anywhere in this cell although evaluation() below needs it.

    for col in tqdm.tqdm(analysis_columns):

        print('-'*20)
        print(f'{"="*30} TRAINING {col.upper()} {"="*30}')
        print('-'*20)
        # Each *_pipe value is indexed as (estimator, param_grid); the
        # totals below count the grid entries across all candidates.
        print(f'Vectorizers to be used ({len(list(vectorizers_pipe.values()))}):\n{list(vectorizers_pipe.keys())}')
        print(f'Total number of vectorizer parameters = {sum(len(step_and_params[1]) for step_and_params in vectorizers_pipe.values())}')
        print(f'Selectors to be used ({len(list(selectors_pipe.values()))}):\n{list(selectors_pipe.keys())}')
        print(f'Total number of selector parameters = {sum(len(step_and_params[1]) for step_and_params in selectors_pipe.values())}')
        print(f'Resamplers to be used ({len(list(resamplers_pipe.keys()))}):\n{list(resamplers_pipe.keys())}')
        print(f'Total number of resamplers parameters = {sum(len(step_and_params[1]) for step_and_params in resamplers_pipe.values())}')
        print(f'Classifers to be used ({len(list(classifiers_pipe.keys()))}):\n{list(classifiers_pipe.keys())}')
        print(f'Total number of classifers parameters = {sum(len(step_and_params[1]) for step_and_params in classifiers_pipe.values())}')

        # Sanity check: at least one row belongs to a label value that occurs more than once.
        assert len(df_manual[df_manual[str(col)].map(df_manual[str(col)].value_counts() > 1)]) != 0

        # Split
        (
            train, X_train, y_train,
            test, X_test, y_test,
            val, X_val, y_val,
            class_weights,
            class_weights_ratio,
            class_weights_dict
        ) = split_data(
            df_manual, col, text_col, analysis_columns,
        )

        for (
            vectorizer_name, vectorizer_and_params
        ), (
            selector_name, selector_and_params
        ), (
            resampler_name, resampler_and_params
        ), (
            classifier_name, classifier_and_params
        ) in tqdm_product(
            vectorizers_pipe.items(), selectors_pipe.items(), resamplers_pipe.items(), classifiers_pipe.items()
        ):

            vectorizer, vectorizer_params = vectorizer_and_params[0], vectorizer_and_params[1]
            selector, selector_params = selector_and_params[0], selector_and_params[1]
            resampler, resampler_params = resampler_and_params[0], resampler_and_params[1]
            classifier, classifier_params = classifier_and_params[0], classifier_and_params[1]

            # Pipeline
            ## Steps: the resampler is only inserted for the 'Warmth' target.
            steps = [
                (vectorizer_name, vectorizer),
                (selector_name, selector),
            ]
            if col == 'Warmth':
                steps.append((resampler_name, resampler))
            steps.append((classifier_name, classifier))

            ## Params
            # NOTE(review): `resampler_params` is never added to the grid, so
            # the resampler always runs with its configured settings even
            # when it is part of the pipeline; confirm this is intended.
            param_grid = {
                **vectorizer_params,
                **selector_params,
                **classifier_params,
            }

            ## Pipeline
            pipe = imblearn.pipeline.Pipeline(steps=steps)

            # Search
            print('-'*20)
            print(f'{"="*30} Using GridSearchCV {"="*30}')
            print('-'*20)
            print(f'GridSearchCV with:\nPipe:\n{pipe}\nParams:\n{param_grid}')
            print('+'*30)

            grid_search = HalvingGridSearchCV(
                estimator=pipe,
                param_grid=param_grid,
                cv=cv,
                n_jobs=n_jobs,
                return_train_score=True,
                verbose=1,
                random_state=random_state,
                refit=refit,
                scoring=scoring,
            )
            # Exhaustive alternative, kept for reference:
            # grid_search = GridSearchCV(
            #     estimator=pipe,
            #     param_grid=param_grid,
            #     cv=cv,
            #     n_jobs=n_jobs,
            #     return_train_score=True,
            #     verbose=1,
            #     # scoring=scores,
            #     # refit=scoring,
            # )

            # Fit SearchCV
            searchcv = grid_search.fit(X_train, y_train)

            # Identify and name steps in estimator.
            # From here on the *_name variables hold class names rather than
            # the *_pipe dictionary keys; they are used in titles and file names.
            vectorizer = searchcv.best_estimator_[0]
            vectorizer_name = vectorizer.__class__.__name__
            selector = searchcv.best_estimator_[1]
            selector_name = selector.__class__.__name__
            classifier = searchcv.best_estimator_[-1]
            classifier_name = classifier.__class__.__name__
            if col == 'Warmth':
                resampler = searchcv.best_estimator_[-2]
                resampler_name = resampler.__class__.__name__

            # Make predictions
            estimator = searchcv.best_estimator_
            y_train_pred = estimator.predict(X_train)
            if hasattr(searchcv, 'predict_proba'):
                searchcv_predict_attr = searchcv.predict_proba
            elif hasattr(searchcv, '_predict_proba_lr'):
                searchcv_predict_attr = searchcv._predict_proba_lr
            else:
                # Fail loudly instead of hitting a NameError or silently
                # reusing the bound method of a previously fitted search.
                raise AttributeError(
                    f'The fitted search for {classifier_name} exposes neither '
                    'predict_proba nor _predict_proba_lr; cannot compute y_test_pred_prob.'
                )
            y_test_pred = searchcv.predict(X_test)
            y_test_pred_prob = searchcv_predict_attr(X_test)[:, 1]

            # Best configuration on CV
            print('='*20)
            best_index = searchcv.best_index_
            best_mean_train_score = searchcv.cv_results_["mean_train_score"][best_index]
            best_std_train_score = searchcv.cv_results_["std_train_score"][best_index]
            # The SDs were previously wrapped in int(), which truncated every
            # value below 1 to 0 before formatting; report them as floats.
            print(
                f'GridSearch - Best mean train score: M = {float(best_mean_train_score):.2f}, SD = {float(best_std_train_score):.2f}\n'
            )
            best_mean_test_score = searchcv.cv_results_["mean_test_score"][best_index]
            best_std_test_score = searchcv.cv_results_["std_test_score"][best_index]
            print(
                f'GridSearch - Best mean test score: M = {float(best_mean_test_score):.2f}, SD = {float(best_std_test_score):.2f}\n'
            )
            # NOTE(review): this rebinds the notebook-level `n_splits` setting
            # to the value reported by the fitted search.
            n_splits = searchcv.n_splits_
            print(
                f'Number of splits: {int(n_splits)}\n'
            )
            print(
                f'Best estimator and parameters:\n{estimator}\n')
            best_params = searchcv.best_params_
            print(
                f'Best parameters:\n{best_params}\n'
            )
            train_report = classification_report(y_train, y_train_pred)
            print(
                f'Training Classification Report:\n{train_report}\n'
            )
            print('-'*20)
            best_train_score = searchcv.best_score_
            print(
                f'Best train score: {float(best_train_score):.2f}\n'
            )
            best_test_score = searchcv.score(X_test, y_test)
            print(
                f'Best test score: {float(best_test_score):.2f}\n'
            )
            print('-'*20)
            print('Training Confusion Matrix:\n')
            close_plots()
            fig, ax = plt.subplots()
            ax.set_title(f'{str(col)} - Training Confusion Matrix - {vectorizer_name} + {classifier_name}')
            train_cm = metrics.ConfusionMatrixDisplay.from_estimator(
                estimator, X_train, y_train, ax=ax, cmap=plt.cm.Blues
            )
            show_and_close_plots()
            print('='*20)

            # Place Xy and CV data in df and save
            save_Xy_search_cv_estimator(
                grid_search, searchcv,
                X_train, y_train, y_train_pred,
                X_test, y_test, y_test_pred, y_test_pred_prob,
                X_val, y_val,
                estimator,
                col, vectorizer_name, classifier_name,
            )

            # HACK REMOVE THIS!!!!!!
            # NOTE(review): this stops the whole cell after the first
            # combination, so nothing below it ever runs. Before removing it,
            # confirm that `best_score` and `df_metrics` are defined: neither
            # is assigned in this cell.
            sys.exit(0)
            # Evaluate Model
            df_metrics, metrics_dict, df_cv_score_recall = evaluation(
                estimator, X_test, y_test, y_test_pred, y_test_pred_prob,
                best_score, df_metrics,
                col, vectorizer_name, classifier_name,
            )

            # Fit best model on validation set
            print(f'Fitting {estimator}.')
            estimator.set_params(**estimator.get_params())
            estimator = estimator.fit(X_val, y_val)

            # Save Vectorizer, Selector, and Classifier
            saving_model_and_table(df_metrics, estimator, col, vectorizer_name, classifier_name)

        # Compare Estimators
        print('='*20)
        print(f'Comparing Estimators for {col}')
        comparison_plots(get_fitted_estimators(), X_test, y_test, col)
        print('='*20)

print('#'*40)
print('DONE!')
print('#'*40)
