In [None]:
import os # isort:skip # fmt:skip # noqa # nopep8 
import sys # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory, and make it importable.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Inspect at most the five closest ancestors of the cwd. Slicing the list
# (rather than indexing parents[i]) avoids an IndexError when the cwd has fewer
# than five ancestors, and `.name` yields the last path component on every OS.
for parent in list(Path.cwd().parents)[:5]:
    if (code_dir_name in parent.name) and (unwanted_subdir_name not in parent.name):
        code_dir = str(parent)
        break

# Never put None on sys.path, and keep re-runs of this cell from adding
# duplicate entries.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # isort:skip # fmt:skip # noqa # nopep8
from supervised_estimators_get_pipe import * # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Variables
# Sklearn variables: shared settings for the searches and reports further down
method = 'Supervised'
final_models_save_path = f'{models_save_path}{method} Results/'
t = time.time()  # timestamp taken when this cell runs
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
# Repeated stratified K-fold: n_splits folds, repeated n_repeats times
cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
scoring = 'recall'
scores = ['recall', 'accuracy', 'f1', 'roc_auc', 'explained_variance', 'matthews_corrcoef']
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Template of metric names; every value starts out as NaN
metrics_dict = dict.fromkeys(
    [
        'Mean Cross Validation Train Score',
        'Mean Cross Validation Test Score',
        f'Mean Explained Train Variance - {scoring.title()}',
        f'Mean Explained Test Variance - {scoring.title()}',
        'Explained Variance',
        'Accuracy',
        'Balanced Accuracy',
        'Precision',
        'Recall',
        'F1-score',
        'Matthews Correlation Coefficient',
        'Fowlkes–Mallows Index',
        'ROC',
        'AUC',
        f'{scoring.title()} Best Threshold',
        f'{scoring.title()} Best Score',
        'Log Loss/Cross Entropy',
        'Cohen’s Kappa',
        'Geometric Mean',
        'Classification Report',
        'Confusion Matrix',
        'Normalized Confusion Matrix',
    ],
    np.nan,
)

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU.
# `torch.backends.mps.is_built()` replaces the deprecated `torch.has_mps` flag.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
DetectorFactory.seed = random_state
cores = multiprocessing.cpu_count()
# Pretrained BERT tokenizer and sequence-classification model, moved to `device`
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(
    bert_model_name, strip_accents=True
)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name
).to(device)

# Plotting variables
pp = pprint.PrettyPrinter(indent=4)
# Register `progress_apply` & co. with pandas. `pandas` is a classmethod, so it
# is called on the classes directly; instantiating a bar first (as in
# `tqdm.notebook.tqdm().pandas(...)`) only leaves a stray empty progress bar.
# Each call re-registers the pandas hooks, so the last one takes effect.
tqdm.tqdm.pandas(desc='progress-bar')
tqdm_auto.tqdm.pandas(desc='progress-bar')
tqdm.notebook.tqdm.pandas(desc='progress-bar')
tqdm_auto.notebook_tqdm.pandas(desc='progress-bar')
# pbar = progressbar.ProgressBar(maxval=10)
# The native MacOSX backend only exists on macOS; elsewhere keep matplotlib's
# default backend instead of failing while switching.
if sys.platform == 'darwin':
    mpl.use('MacOSX')
mpl.style.use(f'{code_dir}/setup_module/apa.mplstyle-main/apa.mplstyle')
# NOTE(review): text.usetex presumably needs a working LaTeX installation — confirm
mpl.rcParams['text.usetex'] = True
font = {'family': 'arial', 'weight': 'normal', 'size': 10}
mpl.rc('font', **font)
plt.style.use('tableau-colorblind10')
plt.set_cmap('Blues')
# Pandas display options: show every row/column, wide output, centered headers
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 5000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.2f}'.format)

# Functions

In [None]:
def show_and_close_plots():
    """Show the pending pyplot output, then clear the figure, clear the axes and close."""
    for pyplot_step in (plt.show, plt.clf, plt.cla, plt.close):
        pyplot_step()


In [None]:
def close_plots():
    """Clear the current figure, clear the current axes and close, without showing anything."""
    for pyplot_step in (plt.clf, plt.cla, plt.close):
        pyplot_step()


In [None]:
def split_data(df, col, text_col=None, analysis_columns=None):
    """Split `df` into train / test / validation sets for the label column `col`.

    Rows with a missing value in any of `analysis_columns` are dropped first.
    75% of the remaining rows go to training; the held-out 25% is split again
    so that 10% of all rows form the test set and 15% the validation set.
    The split is not stratified. `random_state` and `class_weight` are read
    from the notebook's configuration cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `text_col`, `col` and every column in `analysis_columns`.
    col : str
        Label column; its values are cast to int64.
    text_col : str, optional
        Text column used as X. Defaults to 'Job Description spacy_sentencized'.
    analysis_columns : list of str, optional
        Columns that must be non-null. Defaults to ['Warmth', 'Competence'].

    Returns
    -------
    tuple
        (train, X_train, y_train, test, X_test, y_test, val, X_val, y_val,
        train_class_weights, train_class_weights_ratio, train_class_weights_dict,
        test_class_weights, test_class_weights_ratio, test_class_weights_dict).
        Each ratio is the class-0 weight divided by the class-1 weight.
    """
    if text_col is None:
        text_col = 'Job Description spacy_sentencized'
    if analysis_columns is None:
        analysis_columns = ['Warmth', 'Competence']

    train_ratio = 0.75
    test_ratio = 0.10
    validation_ratio = 0.15
    # Share of all rows held out of training (test + validation together)
    test_split = 1 - train_ratio
    # Share of the held-out rows that ends up in the test set
    validation_split = test_ratio / (test_ratio + validation_ratio)
    # compute_class_weight documents `classes` as an ndarray, not a list
    binary_classes = np.array([0, 1])

    def _to_xy(frame):
        # Text as an array of str, labels as a 1-d int64 array
        X = np.array(list(frame[text_col].astype('str').values))
        y = column_or_1d(frame[col].astype('int64').values.tolist(), warn=True)
        return X, y

    def _class_weight_summary(y):
        # Weights for classes 0 and 1, their ratio (0 over 1) and a label -> weight dict
        weights = compute_class_weight(class_weight=class_weight, classes=binary_classes, y=y)
        return weights, weights[0] / weights[1], dict(zip(np.unique(y), weights))

    # Split
    print('='*20)
    print('Splitting data into training, testing, and validation sets:')
    print(f'Ratios: train_size = {train_ratio}, test size = {test_ratio}, validation size = {validation_ratio}')

    df = df.dropna(subset=analysis_columns, how='any')
    df = df.reset_index(drop=True)

    train, test = train_test_split(
        df, train_size=1 - test_split, test_size=test_split, random_state=random_state
    )

    # Second split of the held-out rows: the `test_size` part becomes the test set
    val, test = train_test_split(
        test, test_size=validation_split, random_state=random_state
    )

    X_train, y_train = _to_xy(train)
    X_test, y_test = _to_xy(test)
    X_val, y_val = _to_xy(val)

    (
        train_class_weights, train_class_weights_ratio, train_class_weights_dict
    ) = _class_weight_summary(y_train)
    (
        test_class_weights, test_class_weights_ratio, test_class_weights_dict
    ) = _class_weight_summary(y_test)

    print('Done splitting data into training, testing, and validation sets.')
    print('='*20)
    print(f'Training set shape: {y_train.shape}')
    print('-'*10)
    print(f'Training set example:\n{X_train[0]}')
    print('~'*10)
    print(f'Testing set shape: {y_test.shape}')
    print('-'*10)
    print(f'Testing set example:\n{X_test[0]}')
    print('~'*10)
    print(f'Validation set shape: {y_val.shape}')
    print('-'*10)
    print(f'Validation set example:\n{X_val[0]}')
    print('~'*10)
    print(f'Training data class weights:\nRatio = {train_class_weights_ratio:.2f} (0 = {train_class_weights[0]:.2f}, 1 = {train_class_weights[1]:.2f})')
    print('-'*10)
    print(f'Testing data class weights:\nRatio = {test_class_weights_ratio:.2f} (0 = {test_class_weights[0]:.2f}, 1 = {test_class_weights[1]:.2f})')
    print('='*20)

    return (
        train, X_train, y_train,
        test, X_test, y_test,
        val, X_val, y_val,
        train_class_weights,
        train_class_weights_ratio,
        train_class_weights_dict,
        test_class_weights,
        test_class_weights_ratio,
        test_class_weights_dict,
    )


In [None]:
# Function to place Xy and CV data in df and save
def save_Xy_search_cv_estimator(
    grid_search, searchcv,
    X_train, y_train, y_train_pred,
    X_test, y_test, y_test_pred, y_test_pred_prob,
    X_val, y_val,
    df_feature_importances, estimator,
    col, vectorizer_name, classifier_name,
    compression=None, save_path=None
):
    """Persist one search run: the search objects, the X/y data and the estimator.

    Written under `save_path`: the unfitted search (`grid_search`), the fitted
    search (`searchcv`), its `cv_results_` as a DataFrame, the train / test /
    validation data as DataFrames and, when given, the feature importances.
    The estimator itself is written under `final_models_save_path`. File names
    are built from `method`, `col`, `vectorizer_name` and `classifier_name`.

    Parameters
    ----------
    grid_search, searchcv, estimator
        Objects dumped with `joblib.dump`; `searchcv` must expose `cv_results_`.
    X_train, y_train, y_train_pred, X_test, y_test, y_test_pred,
    y_test_pred_prob, X_val, y_val
        Values used as DataFrame columns; within each of the three frames they
        must be accepted together by the `pd.DataFrame` constructor.
    df_feature_importances : pandas.DataFrame or None
        Saved only when not None.
    compression : optional
        Forwarded to `joblib.dump(compress=...)`; None means no compression.
    save_path : str, optional
        Path prefix for the search files; defaults to
        f'{final_models_save_path}SearchCV/'.
    """
    if compression is None:
        compression = False
    if save_path is None:
        save_path = f'{final_models_save_path}SearchCV/'

    # Both locations are used as plain string prefixes, so take their directory
    # part and create it up front: a missing folder would otherwise raise only
    # after the (expensive) search has already finished.
    for path_prefix in (save_path, final_models_save_path):
        directory = os.path.dirname(path_prefix)
        if directory:
            os.makedirs(directory, exist_ok=True)

    # The two file-name endings shared by all outputs of this run
    object_suffix = f'{str(col)} - {vectorizer_name} + {classifier_name}.pkl'
    frame_suffix = f'{col}_{vectorizer_name}_{classifier_name}.pkl'

    def _dump(obj, path):
        # Serialize a fitted/unfitted object with joblib
        with open(path, 'wb') as f:
            joblib.dump(obj, f, compress=compression)

    # Save search
    ## Save grid_search
    _dump(grid_search, f'{save_path}{method} Grid Search {object_suffix}')

    ## Save searchcv
    _dump(searchcv, f'{save_path}{method} SearchCV {object_suffix}')

    ## Save searchcv data
    df_cv_results = pd.DataFrame(searchcv.cv_results_)
    df_cv_results.to_pickle(f'{save_path}{method} df_searchcv_results - {frame_suffix}')

    # Save Xy data
    ## Save train data
    df_train_data = pd.DataFrame(
        {
            'X_train': X_train,
            'y_train': y_train,
            'y_train_pred': y_train_pred,
        },
    )
    df_train_data.to_pickle(f'{save_path}{method} df_train_data - {frame_suffix}')

    ## Save test data
    df_test_data = pd.DataFrame(
        {
            'X_test': X_test,
            'y_test': y_test,
            'y_test_pred': y_test_pred,
            'y_test_pred_prob': y_test_pred_prob,
        },
    )
    df_test_data.to_pickle(f'{save_path}{method} df_test_data - {frame_suffix}')

    ## Save val data
    df_val_data = pd.DataFrame(
        {
            'X_val': X_val,
            'y_val': y_val,
        },
    )
    df_val_data.to_pickle(f'{save_path}{method} df_val_data - {frame_suffix}')

    # Save feature importance
    if df_feature_importances is not None:
        df_feature_importances.to_pickle(
            f'{save_path}{method} df_feature_importances - {frame_suffix}'
        )

    # Save estimator
    _dump(estimator, f'{final_models_save_path}{method} Estimator {object_suffix}')


# Training

### READ DATA

In [None]:
# Load the training DataFrame from a pickle under `df_save_dir` and renumber its
# rows from 0. `df_save_dir` is not defined in the cells above — presumably it
# comes from the `setup_module.imports` star import (TODO confirm).
# The file name is spelled 'trainning' here and must match the file on disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_trainning.pkl').reset_index(drop=True)


In [None]:
%%time
# Train every vectorizer x selector x resampler x classifier combination for
# each analysis column: run a halving grid search, derive predictions and
# probabilities from the best pipeline, save everything, and print a summary.
print('#'*40)
print('Starting!')
print('#'*40)

analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'

for col in tqdm.tqdm(analysis_columns):

    print('-'*20)
    print(f'{"="*30} TRAINING {col.upper()} {"="*30}')
    print('-'*20)
    # Each *_pipe value is indexed as [0] = estimator object, [1] = its parameter grid
    print(f'Vectorizers to be used ({len(list(vectorizers_pipe.values()))}):\n{list(vectorizers_pipe.keys())}')
    print(f'Total number of vectorizer parameters = {sum(len(step_and_params[1]) for step_and_params in vectorizers_pipe.values())}')
    print(f'Selectors to be used ({len(list(selectors_pipe.values()))}):\n{list(selectors_pipe.keys())}')
    print(f'Total number of selector parameters = {sum(len(step_and_params[1]) for step_and_params in selectors_pipe.values())}')
    print(f'Resamplers to be used ({len(list(resamplers_pipe.keys()))}):\n{list(resamplers_pipe.keys())}')
    print(f'Total number of resamplers parameters = {sum(len(step_and_params[1]) for step_and_params in resamplers_pipe.values())}')
    print(f'Classifers to be used ({len(list(classifiers_pipe.keys()))}):\n{list(classifiers_pipe.keys())}')
    print(f'Total number of classifers parameters = {sum(len(step_and_params[1]) for step_and_params in classifiers_pipe.values())}')

    # At least one label value must occur more than once
    assert len(df_manual[df_manual[str(col)].map(df_manual[str(col)].value_counts() > 1)]) != 0

    # Split
    (
        train, X_train, y_train,
        test, X_test, y_test,
        val, X_val, y_val,
        train_class_weights,
        train_class_weights_ratio,
        train_class_weights_dict,
        test_class_weights,
        test_class_weights_ratio,
        test_class_weights_dict,
    ) = split_data(
        df_manual, col, text_col, analysis_columns,
    )

    for (
        vectorizer_name, vectorizer_and_params
    ), (
        selector_name, selector_and_params
    ), (
        resampler_name, resampler_and_params
    ), (
        classifier_name, classifier_and_params
    ) in tqdm_product(
        vectorizers_pipe.items(), selectors_pipe.items(), resamplers_pipe.items(), classifiers_pipe.items()
    ):

        # Identify names and params
        vectorizer = vectorizer_and_params[0]
        vectorizer_params = vectorizer_and_params[1]

        selector = selector_and_params[0]
        selector_params = selector_and_params[1]

        resampler = resampler_and_params[0]
        resampler_params = resampler_and_params[1]

        classifier = classifier_and_params[0]
        classifier_params = classifier_and_params[1]

        # Pipeline
        ## Steps (only the Warmth pipeline contains a resampling step)
        if col == 'Warmth':
            steps = [
                (vectorizer_name, vectorizer),
                (selector_name, selector),
                (resampler_name, resampler),
                (classifier_name, classifier)
            ]
        else:
            steps = [
                (vectorizer_name, vectorizer),
                (selector_name, selector),
                (classifier_name, classifier)
            ]

        ## Params
        # NOTE(review): resampler_params is never added to the grid, so the
        # Warmth resampler always runs with its configured settings — confirm
        # whether that is intended.
        param_grid = {
            **vectorizer_params,
            **selector_params,
            **classifier_params,
        }

        ## Pipeline
        pipe = imblearn.pipeline.Pipeline(steps=steps)

        # Search
        print('-'*20)
        print(f'{"="*30} Using GridSearchCV {"="*30}')
        print('-'*20)
        print(f'GridSearchCV with:\nPipe:\n{pipe}\nParams:\n{param_grid}')
        print('+'*30)

        grid_search = HalvingGridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            cv=cv,
            n_jobs=n_jobs,
            return_train_score=True,
            verbose=1,
            scoring=scoring,
            error_score='raise',
            random_state=random_state,
            refit=refit,
        )

        ## Normalize unusual classifiers before fitting
        # Work on per-iteration names so the split produced above is never
        # overwritten and every combination sees the same data.
        X_train_fit, X_test_fit, X_val_fit = X_train, X_test, X_val
        if classifier_name == 'GaussianNB':
            # Densify only inputs that are actually sparse; the arrays returned
            # by split_data are plain text arrays and pass through unchanged.
            X_train_fit, X_test_fit, X_val_fit = (
                X.toarray() if hasattr(X, 'toarray') else X
                for X in (X_train, X_test, X_val)
            )

        # Fit SearchCV
        with joblib.parallel_backend(backend='multiprocessing', n_jobs=n_jobs):
            searchcv = grid_search.fit(X_train_fit, y_train)

            # Reidentify and name best estimator and params
            estimator = searchcv.best_estimator_
            vectorizer = estimator[0]
            vectorizer_params = vectorizer.get_params()
            vectorizer_name = vectorizer.__class__.__name__
            selector = estimator[1]
            selector_params = selector.get_params()
            selector_name = selector.__class__.__name__
            classifier = estimator[-1]
            classifier_params = classifier.get_params()
            classifier_name = classifier.__class__.__name__
            if col == 'Warmth':
                resampler = estimator[-2]
                resampler_params = resampler.get_params()
                resampler_name = resampler.__class__.__name__

            # Normalize unusual classifiers after fitting
            ## Get feature importances if the fitted classifier provides them.
            ## They live on the final step, not on the pipeline object, and X
            ## is left untouched because the pipeline predicts from raw text.
            df_feature_importances = None
            if hasattr(classifier, 'feature_importances_'):
                importances = np.ravel(classifier.feature_importances_)
            elif hasattr(classifier, 'coef_'):
                coefficients = classifier.coef_
                if hasattr(coefficients, 'toarray'):
                    coefficients = coefficients.toarray()
                importances = np.ravel(coefficients)
            else:
                importances = None

            if importances is not None:
                feature_names = None
                if hasattr(vectorizer, 'get_feature_names_out'):
                    feature_names = np.asarray(vectorizer.get_feature_names_out())
                    if hasattr(selector, 'get_support'):
                        # Keep only the features the selector passed on
                        feature_names = feature_names[selector.get_support()]
                if feature_names is None or len(feature_names) != len(importances):
                    # No usable names: fall back to positional feature indices
                    feature_names = np.arange(len(importances))
                df_feature_importances = pd.DataFrame(
                    {
                        'features': feature_names,
                        'feature_importances': importances
                    }
                )
                df_feature_importances = df_feature_importances.sort_values('feature_importances', ascending=False)
                print(df_feature_importances.head(20))
                print(f'Best estimator has feature_importances of shape:\n{estimator}')

            ## For perceptron: calibrate classifier to get prediction probabilities
            # Calibrate only when no probability method exists at all, and keep
            # `searchcv` itself intact: its cv_results_ / best_* attributes are
            # still needed below. The best pipeline is calibrated rather than
            # the search object, so the whole search is not re-run in each fold.
            probability_model = searchcv
            if hasattr(searchcv, 'decision_function') and not any(
                hasattr(searchcv, pred_attr) for pred_attr in ['predict_proba', '_predict_proba_lr']
            ):
                probability_model = CalibratedClassifierCV(
                    estimator, cv=cv, method='sigmoid'
                ).fit(X_train_fit, y_train)

            ## For Sequential classifier: compile for binary classification, optimize with adam and score on recall
            # NOTE(review): the search object has no compile(); only the final
            # step could. `metrics` must be a list of names — list(scoring)
            # would split the string into characters. No refit is done here
            # because the classifier expects vectorized features, not raw
            # text — confirm how this model should be retrained.
            if classifier_name == 'Sequential' and hasattr(classifier, 'compile'):
                classifier.compile(
                    loss='binary_crossentropy', optimizer='adam', metrics=[scoring]
                )

            ## Set prediction probability attribute (reset for every combination)
            if hasattr(probability_model, 'predict_proba'):
                searchcv_predict_attr = probability_model.predict_proba
            elif hasattr(probability_model, '_predict_proba_lr'):
                searchcv_predict_attr = probability_model._predict_proba_lr
            else:
                searchcv_predict_attr = None

            # Get predictions and probabilities
            y_train_pred = estimator.predict(X_train_fit)
            y_test_pred = searchcv.predict(X_test_fit)
            if searchcv_predict_attr is None:
                # No probability estimate available: keep the column, filled with NaN
                y_test_pred_prob = np.full(len(y_test_pred), np.nan)
            else:
                y_test_pred_prob = searchcv_predict_attr(X_test_fit)[:, 1]

            # Save Xy and CV data
            save_Xy_search_cv_estimator(
                grid_search, searchcv,
                X_train_fit, y_train, y_train_pred,
                X_test_fit, y_test, y_test_pred, y_test_pred_prob,
                X_val_fit, y_val,
                df_feature_importances, estimator,
                col, vectorizer_name, classifier_name,
            )

            # Collect results (plain assignments instead of walrus expressions
            # inside f-strings; SDs stay floats rather than being cut to int)
            best_index = searchcv.best_index_
            best_mean_train_score = searchcv.cv_results_["mean_train_score"][best_index]
            best_std_train_score = searchcv.cv_results_["std_train_score"][best_index]
            best_mean_test_score = searchcv.cv_results_["mean_test_score"][best_index]
            best_std_test_score = searchcv.cv_results_["std_test_score"][best_index]
            # NOTE(review): this rebinds the configuration variable `n_splits`
            # to the fitted search's n_splits_; kept as in the original —
            # confirm nothing later relies on the configured value.
            n_splits = searchcv.n_splits_
            best_params = searchcv.best_params_
            train_report = classification_report(y_train, y_train_pred)
            best_train_score = searchcv.best_score_
            best_test_score = searchcv.score(X_test_fit, y_test)

            # Print results
            print('='*20)
            print(
                f'GridSearch - Best mean train score: M = {float(best_mean_train_score):.2f}, SD = {float(best_std_train_score):.2f}\n'
            )
            print(
                f'GridSearch - Best mean test score: M = {float(best_mean_test_score):.2f}, SD = {float(best_std_test_score):.2f}\n'
            )
            print(
                f'Number of splits: {int(n_splits)}\n'
            )
            print(
                f'Best estimator and parameters:\n{estimator}\n')
            print(
                f'Best parameters:\n{best_params}\n'
            )
            print(
                f'Training Classification Report:\n{train_report}\n'
            )
            print('-'*20)
            print(
                f'Best train score: {float(best_train_score):.2f}\n'
            )
            print(
                f'Best test score: {float(best_test_score):.2f}\n'
            )
            print('='*20)

print('#'*40)
print('DONE!')
print('#'*40)
