In [None]:
import os # isort:skip # fmt:skip # noqa # nopep8 
import sys # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # isort:skip # fmt:skip # noqa # nopep8

# Handle to the currently executing module (the notebook namespace).
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the first folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list avoids the IndexError that
# indexing `parents[i]` raises when the working directory is shallower than
# five levels, and `.name` avoids hard-coding the '/' path separator.
for ancestor_dir in list(Path.cwd().parents)[:5]:

    parent_path = ancestor_dir.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor_dir)
        break

# Only extend the import path when a directory was actually found;
# appending None to sys.path would leave a non-string entry on it.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(
        f'Could not find a "{code_dir_name}" directory above {Path.cwd()}; sys.path left unchanged.'
    )
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # isort:skip # fmt:skip # noqa # nopep8
from supervised_estimators_get_pipe import * # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Variables
# Show every warning each time it is triggered instead of only once.
warnings.filterwarnings('always')
# Sklearn variables
method = 'Supervised'
results_save_path = f'{models_save_path}{method} Results/'
searchcv_xy_save_path = f'{results_save_path}SearchCV+Xy/'
# Timestamp taken when this cell is run.
t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no `zero_division` parameter; make_scorer forwards its
    # keyword arguments to the metric, so passing it would raise a TypeError
    # as soon as the scorer is called.
    'accuracy_score': make_scorer(accuracy_score),
}
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Template of metric names, all initialised to NaN.
metrics_dict = {
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    f'{scoring.title()} Best Score': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Device preference: Apple MPS, then CUDA, then CPU.
# `torch.has_mps` is deprecated in favour of `torch.backends.mps.is_built()`,
# which the original condition already checked, so only the latter is kept.
# `getattr` keeps the check safe on torch builds without an MPS backend module.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_built() and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
DetectorFactory.seed = random_state
cores = multiprocessing.cpu_count()
bert_model_name = 'bert-base-uncased'
# Tokenizer and sequence-classification model are loaded from the pretrained
# checkpoint named above; the model is moved to the selected device.
bert_tokenizer = BertTokenizerFast.from_pretrained(
    bert_model_name, strip_accents=True
)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name
).to(device)

# Plotting variables
pp = pprint.PrettyPrinter(indent=4)
# `pandas` is a classmethod on tqdm classes, so it is called on the classes
# directly. Instantiating a bar first (`tqdm.notebook.tqdm()`) only creates
# stray, never-closed progress bars.
tqdm.tqdm.pandas(desc='progress-bar')
tqdm_auto.tqdm.pandas(desc='progress-bar')
tqdm.notebook.tqdm.pandas(desc='progress-bar')
tqdm_auto.notebook_tqdm.pandas(desc='progress-bar')
# pbar = progressbar.ProgressBar(maxval=10)
# The native 'MacOSX' backend can only be loaded on macOS; on other platforms
# the default backend is left in place instead of crashing the cell.
if sys.platform == 'darwin':
    mpl.use('MacOSX')
mpl.style.use(f'{code_dir}/setup_module/apa.mplstyle-main/apa.mplstyle')
# NOTE(review): usetex requires a working LaTeX installation at draw time.
mpl.rcParams['text.usetex'] = True
font = {'family': 'arial', 'weight': 'normal', 'size': 10}
mpl.rc('font', **font)
plt.style.use('tableau-colorblind10')
plt.set_cmap('Blues')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 5000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
# An explicit float_format takes precedence over display.precision for floats.
pd.set_option('display.float_format', '{:.2f}'.format)


# Functions

In [None]:
def get_existing_files(
    results_save_path=results_save_path,
    col_names_list=None,
    vectorizer_names_list=None,
    classifier_names_list=None,
):
    """Collect the names of estimators already saved in ``results_save_path``.

    Saved estimator files are expected to be named
    ``'{method} Estimator - {col} - {vectorizer} + {classifier} (Save_protocol=N).pkl'``
    (the pattern written by ``save_Xy_search_cv_estimator``).

    Only files matching the ``'{method} Estimator - '`` prefix are inspected, so
    other pickles stored in the same directory (e.g. the Xy frames written by
    ``save_Xy``) are not mis-parsed as estimators. Estimator names are built
    per file rather than as a cartesian product of all parsed parts, so a
    combination is only reported when a file for it really exists.

    Parameters
    ----------
    results_save_path : str
        Directory prefix (with trailing separator) searched for estimator files.
    col_names_list, vectorizer_names_list, classifier_names_list : list, optional
        Lists that the parsed names are appended to; new lists are created when
        omitted. Pre-seeded entries are returned but do not create estimator names.

    Returns
    -------
    tuple of four lists
        De-duplicated, sorted column names, vectorizer names, classifier names
        and ``'{col} - {vectorizer} + {classifier}'`` estimator names.
    """
    if col_names_list is None:
        col_names_list = []
    if vectorizer_names_list is None:
        vectorizer_names_list = []
    if classifier_names_list is None:
        classifier_names_list = []

    estimator_names = set()
    estimator_prefix = f'{method} Estimator - '

    print(f'Searching for existing estimators in directory:\n{results_save_path}')

    for estimators_file in glob.glob(f'{results_save_path}{estimator_prefix}*.pkl'):
        # Strip the directory and prefix, then peel the name parts off left to right.
        remainder = estimators_file.split(estimator_prefix)[-1]
        col, dash, remainder = remainder.partition(' - ')
        vectorizer_name, plus, remainder = remainder.partition(' + ')
        classifier_name = remainder.split(' (Save_protocol=')[0]

        # Skip files that do not follow the expected naming scheme.
        if not (dash and plus and col and vectorizer_name and classifier_name):
            continue

        col_names_list.append(col)
        vectorizer_names_list.append(vectorizer_name)
        classifier_names_list.append(classifier_name)
        estimator_names.add(f'{col} - {vectorizer_name} + {classifier_name}')

    return (
        sorted(set(col_names_list)),
        sorted(set(vectorizer_names_list)),
        sorted(set(classifier_names_list)),
        sorted(estimator_names),
    )


In [None]:
def class_weights_print_Xy(
    X_train, y_train,
    X_test, y_test,
):
    """Compute class weights for the train and test labels and print a summary.

    Weights are computed with ``compute_class_weight`` using the module-level
    ``class_weight`` setting. The ratio and the printed summary index the first
    two weights, so at least two classes are expected in each label set.

    Parameters
    ----------
    X_train, X_test : indexable
        Samples; only their lengths and first element are used here.
    y_train, y_test : array-like with a ``shape`` attribute
        Labels for the corresponding samples.

    Returns
    -------
    tuple
        ``(train_class_weights, train_class_weights_ratio, train_class_weights_dict,
        test_class_weights, test_class_weights_ratio, test_class_weights_dict)``.
    """
    # Check for consistent length
    check_consistent_length(X_train, y_train)
    check_consistent_length(X_test, y_test)

    # Get train class weights
    train_classes = np.unique(y_train)
    train_class_weights = compute_class_weight(class_weight = class_weight, classes = train_classes, y = y_train)
    train_class_weights_ratio = train_class_weights[0]/train_class_weights[1]
    train_class_weights_dict = dict(zip(train_classes, train_class_weights))

    # Get test class weights
    # The classes are taken from y_test itself so the weights line up with the
    # keys of the dictionary built below (previously the train classes were
    # passed here while the dictionary was keyed by the test classes).
    test_classes = np.unique(y_test)
    test_class_weights = compute_class_weight(class_weight = class_weight, classes = test_classes, y = y_test)
    test_class_weights_ratio = test_class_weights[0]/test_class_weights[1]
    test_class_weights_dict = dict(zip(test_classes, test_class_weights))

    print('Done splitting data into training and testing sets.')
    print('='*20)
    print(f'Training set shape: {y_train.shape}')
    print('-'*10)
    print(f'Training set example:\n{X_train[0]}')
    print('~'*10)
    print(f'Testing set shape: {y_test.shape}')
    print('-'*10)
    print(f'Testing set example:\n{X_test[0]}')
    print('~'*10)
    print(f'Training data class weights:\nRatio = {train_class_weights_ratio:.2f} (0 = {train_class_weights[0]:.2f}, 1 = {train_class_weights[1]:.2f})')
    print('-'*10)
    print(f'Testing data class weights:\nRatio = {test_class_weights_ratio:.2f} (0 = {test_class_weights[0]:.2f}, 1 = {test_class_weights[1]:.2f})')
    print('='*20)

    return (
        train_class_weights, train_class_weights_ratio, train_class_weights_dict,
        test_class_weights, test_class_weights_ratio, test_class_weights_dict
    )


In [None]:
# Function to place Xy data in dfs and save
def save_Xy(
    X_train, y_train,
    X_test, y_test,
    col,
    results_save_path=results_save_path,
    method=method, searchcv_xy_save_path=searchcv_xy_save_path,
    compression=None, protocol=None, path_suffix=None, data_dict=None
):
    """Pickle the train and test splits as two DataFrames.

    Files are written to
    ``f'{results_save_path}{method} {name}{path_suffix}'`` for every entry of
    ``data_dict`` (which always receives ``'df_train_data'`` and
    ``'df_test_data'``).

    Parameters
    ----------
    X_train, y_train, X_test, y_test : array-like
        Split data; each X/y pair must be usable as columns of one DataFrame.
    col : object
        Analysis column name, used in the default file suffix.
    results_save_path : str
        Path prefix the files are written under.
    method : str
        Method label used in the file names.
    searchcv_xy_save_path : str
        Accepted for signature parity with the other save helpers; not used here.
    compression : optional
        Accepted for signature parity; not used when writing.
    protocol : int, optional
        Pickle protocol; defaults to ``pickle.HIGHEST_PROTOCOL``.
    path_suffix : str, optional
        File name suffix; defaults to ``' - {col} - (Save_protocol={protocol}).pkl'``.
    data_dict : dict, optional
        Extra ``name -> DataFrame`` entries to save alongside the two splits.
    """

    if compression is None:
        compression = False
    if protocol is None:
        protocol = pickle.HIGHEST_PROTOCOL
    if path_suffix is None:
        path_suffix = f' - {str(col)} - (Save_protocol={protocol}).pkl'
    if data_dict is None:
        data_dict = {}

    # Make df_train_data
    df_train_data = pd.DataFrame(
        {
            'X_train': X_train,
            'y_train': y_train,
        },
    )
    # Make df_test_data
    df_test_data = pd.DataFrame(
        {
            'X_test': X_test,
            'y_test': y_test,
        },
    )

    # Assign dfs to variables
    data_dict['df_train_data'] = df_train_data
    data_dict['df_test_data'] = df_test_data

    # Make sure the target directory exists so a fresh checkout does not fail
    # on the first save. dirname() is empty when no directory part is given.
    save_dir = os.path.dirname(results_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Save files
    print('='*20)
    print('Saving Xy...')
    for file_name, file_ in data_dict.items():
        file_.to_pickle(
            f'{results_save_path}{method} {file_name}{path_suffix}', protocol=protocol
        )
    print(f'Done saving Xy!\n{list(data_dict.keys())}')
    print('='*20)


In [None]:
def split_data(df, col, text_col=text_col, analysis_columns=analysis_columns, random_state=random_state):
    """Split ``df`` 75/25 into train and test sets for the label column ``col``.

    Rows with a missing value in any of ``analysis_columns`` are dropped first.
    Texts are taken from ``text_col`` (cast to ``str``) and labels from ``col``
    (cast to ``int64``). Class weights are computed and a summary is printed by
    ``class_weights_print_Xy``.

    Returns
    -------
    tuple
        ``(train, X_train, y_train, test, X_test, y_test,
        train_class_weights, train_class_weights_ratio, train_class_weights_dict,
        test_class_weights, test_class_weights_ratio, test_class_weights_dict)``.
    """
    train_ratio, test_ratio = 0.75, 0.25

    # Split
    print('='*20)
    print('Splitting data into training and testing:')
    print(f'Ratios: train_size = {train_ratio}, test size = {test_ratio}')

    labelled_df = df.dropna(subset=analysis_columns, how='any').reset_index(drop=True)

    train, test = train_test_split(
        labelled_df,
        train_size=train_ratio,
        test_size=1 - train_ratio,
        random_state=random_state,
    )

    def _texts_and_labels(frame):
        # Texts as a numpy array of str, labels as a validated 1-d array of ints.
        texts = np.array(frame[text_col].astype('str').tolist())
        labels = column_or_1d(frame[col].astype('int64').tolist(), warn=True)
        return texts, labels

    X_train, y_train = _texts_and_labels(train)
    X_test, y_test = _texts_and_labels(test)

    # Get class weights and print info
    class_weight_info = class_weights_print_Xy(
        X_train, y_train,
        X_test, y_test,
    )

    return (
        train, X_train, y_train,
        test, X_test, y_test,
        *class_weight_info,
    )


In [None]:
def load_Xy(
    col, results_save_path=results_save_path, searchcv_xy_save_path=searchcv_xy_save_path, method=method, protocol=None,
    path_suffix=None, data_dict=None
):
    """Load previously saved train/test frames for ``col`` and rebuild X/y.

    Looks for files matching ``f'{results_save_path}*{path_suffix}'`` and keeps
    those whose name contains ``'df_'`` but not ``'cv_results'``. The frames
    named ``'df_train_data'`` and ``'df_test_data'`` are required.

    Parameters
    ----------
    col : object
        Analysis column name, used in the default file suffix and in messages.
    results_save_path : str
        Path prefix the files are read from.
    searchcv_xy_save_path : str
        Accepted for signature parity with the save helpers; not used here.
    method : str
        Method label used in the file names.
    protocol : int, optional
        Pickle protocol used in the default suffix; defaults to
        ``pickle.HIGHEST_PROTOCOL``.
    path_suffix : str, optional
        File name suffix; defaults to ``' - {col} - (Save_protocol={protocol}).pkl'``.
    data_dict : dict, optional
        Dictionary the loaded frames are stored in; a new one is created if omitted.

    Returns
    -------
    tuple or None
        ``(X_train, y_train, X_test, y_test, train_class_weights,
        train_class_weights_ratio, train_class_weights_dict, test_class_weights,
        test_class_weights_ratio, test_class_weights_dict)``, or ``None`` when
        the data could not be loaded.
    """
    if protocol is None:
        protocol = pickle.HIGHEST_PROTOCOL
    if path_suffix is None:
        path_suffix = f' - {str(col)} - (Save_protocol={protocol}).pkl'
    if data_dict is None:
        data_dict = {}

    print(f'Loading Xy from previous for {col}...')
    # Read all dfs
    # NOTE: unpickling executes arbitrary code; only load files from a trusted location.
    for file_path in glob.glob(f'{results_save_path}*{path_suffix}'):
        file_name = file_path.split(f'{results_save_path}{method} ')[-1].split(path_suffix)[0]
        if 'df_' in file_name and 'cv_results' not in file_name:
            data_dict[file_name] = pd.read_pickle(file_path)

    try:
        # Train data
        df_train_data = data_dict['df_train_data']
        X_train = df_train_data['X_train'].values
        y_train = df_train_data['y_train'].values
        # Test data
        df_test_data = data_dict['df_test_data']
        X_test = df_test_data['X_test'].values
        y_test = df_test_data['y_test'].values

        print(f'Done loading Xy from previous for {col}!')

        # Get class weights and print info
        (
            train_class_weights, train_class_weights_ratio, train_class_weights_dict,
            test_class_weights, test_class_weights_ratio, test_class_weights_dict
        ) = class_weights_print_Xy(
            X_train, y_train,
            X_test, y_test,
        )

        return (
            X_train, y_train,
            X_test, y_test,
            train_class_weights, train_class_weights_ratio, train_class_weights_dict,
            test_class_weights, test_class_weights_ratio, test_class_weights_dict
        )
    except Exception as error:
        # Best-effort loader: report the reason and signal failure with None.
        print(f'Error loading Xy from previous for {col}!')
        print(f'{type(error).__name__}: {error}')
        return None


In [None]:
# Function to normalize unusual classifiers after fitting
def normalize_after_fitting(estimator, X_train, y_train, X_test, y_test, searchcv, classifier_name=None):
    """Post-process a fitted estimator/search object so it can be scored uniformly.

    Three independent adjustments are applied when applicable:

    1. If ``estimator`` exposes ``feature_importances_`` or ``coef_``, the
       features are reduced with ``SelectFromModel(estimator, prefit=True)`` and
       a table of per-feature importances is built.
    2. If ``searchcv`` has ``decision_function`` but neither ``predict_proba``
       nor ``_predict_proba_lr``, it is wrapped in a sigmoid
       ``CalibratedClassifierCV`` (using the module-level ``cv``) and refitted.
    3. If the classifier is named ``'Sequential'``, ``searchcv`` is compiled for
       binary classification and fitted.

    Parameters
    ----------
    estimator : fitted estimator
    X_train, y_train, X_test, y_test : array-like
    searchcv : fitted search object or classifier
    classifier_name : str, optional
        Name of the classifier. When omitted it is taken from the class name of
        ``estimator`` (of its final step if ``estimator`` has ``steps``).

    Returns
    -------
    tuple
        ``(estimator, X_train, y_train, X_test, y_test, searchcv,
        df_feature_importances)``; the last item is ``None`` when the estimator
        exposes no importances. In the table, ``'features'`` holds the feature
        column indices, since feature names are not available here.
    """
    # Classifiers to normalize = ['GaussianNB', 'DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier', 'XGBClassifier', 'Perceptron', 'Sequential']

    if classifier_name is None:
        steps = getattr(estimator, 'steps', None)
        final_step = steps[-1][1] if steps else estimator
        classifier_name = final_step.__class__.__name__

    # Get feature importance if classifier provides them and use as X
    if hasattr(estimator, 'feature_importances_'):
        importances = np.asarray(estimator.feature_importances_).ravel()
    elif hasattr(estimator, 'coef_'):
        coef = estimator.coef_
        if hasattr(coef, 'toarray'):
            # Sparse coefficient matrices need densifying before numpy maths.
            coef = coef.toarray()
        coef = np.abs(np.asarray(coef))
        # Collapse per-class coefficient rows into a single magnitude per feature.
        importances = coef if coef.ndim == 1 else coef.sum(axis=0)
    else:
        importances = None

    if importances is not None:
        feature_selector = SelectFromModel(estimator, prefit=True)
        X_train = feature_selector.transform(X_train)
        X_test = feature_selector.transform(X_test)
        df_feature_importances = pd.DataFrame(
            {
                'features': np.arange(importances.shape[0]),
                'feature_importances': importances
            }
        )
        df_feature_importances = df_feature_importances.sort_values('feature_importances', ascending=False)
        print(df_feature_importances.head(20))
        print(f'Best estimator has feature_importances of shape:\n{importances.shape}')
    else:
        df_feature_importances = None

    # For perceptron: calibrate classifier to get prediction probabilities
    if not hasattr(searchcv, 'predict_proba') and not hasattr(searchcv, '_predict_proba_lr') and hasattr(searchcv, 'decision_function'):
        searchcv = CalibratedClassifierCV(
            searchcv, cv=cv, method='sigmoid'
        ).fit(X_train, y_train)

    # For Sequential classifier: compile for binary classification, optimize with adam and score on recall
    if classifier_name == 'Sequential':
        # compile() is called for its side effect, so fit() must be called on
        # the model itself rather than chained onto compile()'s return value.
        # The metric name is passed as a one-element list; list(scoring) would
        # split the string into single characters.
        # NOTE(review): confirm the backend accepts the metric name held in `scoring`.
        searchcv.compile(
            loss='binary_crossentropy', optimizer='adam', metrics=[scoring]
        )
        searchcv.fit(X_train, y_train)

    return (
        estimator, X_train, y_train, X_test, y_test, searchcv, df_feature_importances
    )


In [None]:
# Function to place Xy and CV data in df and save
def save_Xy_search_cv_estimator(
    grid_search, searchcv, cv_results,
    train, X_train, y_train, y_train_pred,
    test, X_test, y_test, y_test_pred, y_test_pred_prob,
    df_feature_importances, estimator,
    col, vectorizer_name, classifier_name,
    results_save_path=results_save_path,
    method=method, searchcv_xy_save_path=searchcv_xy_save_path,
    compression=None, protocol=None,
    path_suffix=None, data_dict=None
):
    """Persist the search objects, CV results, predictions and best estimator.

    Every entry of ``data_dict`` is written to
    ``f'{path}{method} {name}{path_suffix}'`` where ``path`` is
    ``results_save_path`` for the ``'Estimator'`` entry and
    ``searchcv_xy_save_path`` for everything else. DataFrames whose name
    contains ``'df_'`` are pickled with pandas; non-DataFrame objects whose name
    does not contain ``'df_'`` are dumped with joblib. Entries matching neither
    rule are not written. ``train`` and ``test`` are accepted but not used.
    """
    compression = False if compression is None else compression
    protocol = pickle.HIGHEST_PROTOCOL if protocol is None else protocol
    if path_suffix is None:
        path_suffix = f' - {str(col)} - {vectorizer_name} + {classifier_name} (Save_protocol={protocol}).pkl'
    data_dict = {} if data_dict is None else data_dict

    # Tabular views of the CV results and of the train/test predictions
    df_cv_results = pd.DataFrame(cv_results)
    df_train_data = pd.DataFrame(
        {'X_train': X_train, 'y_train': y_train, 'y_train_pred': y_train_pred},
    )
    df_test_data = pd.DataFrame(
        {
            'X_test': X_test,
            'y_test': y_test,
            'y_test_pred': y_test_pred,
            'y_test_pred_prob': y_test_pred_prob,
        },
    )

    # Register everything that has to be written (insertion order is the save order)
    data_dict.update(
        {
            'Grid Search': grid_search,
            'SearchCV': searchcv,
            'df_cv_results': df_cv_results,
            'df_train_data': df_train_data,
            'df_test_data': df_test_data,
            'Estimator': estimator,
        }
    )
    if df_feature_importances is not None:
        data_dict['df_feature_importances'] = df_feature_importances

    # Save files
    print('='*20)
    print('Saving Xy, CV data, and estimator...')
    for file_name, file_ in data_dict.items():
        path = results_save_path if file_name == 'Estimator' else searchcv_xy_save_path
        is_frame = isinstance(file_, pd.DataFrame)
        named_as_frame = 'df_' in file_name
        if not is_frame and not named_as_frame:
            # Arbitrary Python objects (search objects, estimator) go through joblib
            with open(f'{path}{method} {file_name}{path_suffix}', 'wb') as f:
                joblib.dump(file_, f, compress=compression, protocol=protocol)
        elif is_frame and named_as_frame:
            file_.to_pickle(
                f'{path}{method} {file_name}{path_suffix}', protocol=protocol
            )
    print(f'Done saving Xy, CV data, and estimator!\n{list(data_dict.keys())}')
    print('='*20)


In [None]:
# Assert that all classifiers were used
def assert_all_classifers_used(
    results_save_path=results_save_path, method=method, classifiers_pipe=classifiers_pipe, used_classifiers=None,
):
    """Check that a saved estimator exists for every classifier in ``classifiers_pipe``.

    Classifier names are parsed from files matching
    ``f'{results_save_path}{method} Estimator - *.pkl'`` (the text between
    ``' + '`` and ``' (Save_protocol='``) and compared, as a set, with the keys
    of ``classifiers_pipe``.

    Parameters
    ----------
    results_save_path : str
        Path prefix searched for estimator files.
    method : str
        Method label used in the file names.
    classifiers_pipe : dict
        Mapping whose keys are the expected classifier names.
    used_classifiers : list, optional
        List the parsed names are appended to; a new list is created if omitted.

    Raises
    ------
    AssertionError
        If the set of parsed classifier names differs from the expected set.
    """

    # The former `estimators_list` default was read before assignment (raising
    # UnboundLocalError on every call) and was never used, so it is gone.
    if used_classifiers is None:
        used_classifiers = []

    for estimator_path in glob.glob(f'{results_save_path}{method} Estimator - *.pkl'):
        classifier_name = estimator_path.split(f'{results_save_path}{method} ')[1].split(' + ')[1].split(' (Save_protocol=')[0]
        used_classifiers.append(classifier_name)

    assert set(classifiers_pipe.keys()) == set(used_classifiers), 'Not all classifiers were used!'


# Training

### READ DATA

In [None]:
# Load the manually labelled training frame and renumber its rows from zero.
# NOTE: unpickling executes arbitrary code; only read this file from a trusted location.
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_trainning.pkl')
df_manual = df_manual.reset_index(drop=True)
# Guard against a truncated or stale file: the full dataset has 5978 rows.
assert len(df_manual) == 5978, (
    f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH 5978 BUT IS OF LENGTH {len(df_manual)}'
)


In [None]:
%%time
# Train one HalvingGridSearchCV per (column, vectorizer, selector, resampler,
# classifier) combination, skipping combinations whose estimator file already
# exists, and save the fitted objects and data after every fit.
print('#'*40)
print('Starting!')
print('#'*40)

text_col = 'Job Description spacy_sentencized'

# Get existing estimators
col_names_list, vectorizer_names_list, classifier_names_list, estimator_names_list = get_existing_files()

for col in tqdm.tqdm(analysis_columns):

    print('-'*20)
    print(f'{"="*30} TRAINING DATASET OF LENGTH {len(df_manual)} ON {col.upper()} {"="*30}')
    print('-'*20)
    print(
        f'Vectorizers to be used ({len(list(vectorizers_pipe.values()))}):\n{list(vectorizers_pipe.keys())}'
    )
    print(
        f'Total number of vectorizer parameters = {sum([len(list(vectorizers_pipe.values())[i][1]) for i in range(len(vectorizers_pipe))])}'
    )
    print(
        f'Selectors to be used ({len(list(selectors_pipe.values()))}):\n{list(selectors_pipe.keys())}'
    )
    print(
        f'Total number of selector parameters = {sum([len(list(selectors_pipe.values())[i][1]) for i in range(len(selectors_pipe))])}'
    )
    print(
        f'Resamplers to be used ({len(list(resamplers_pipe.keys()))}):\n{list(resamplers_pipe.keys())}'
    )
    print(
        f'Total number of resamplers parameters = {sum([len(list(resamplers_pipe.values())[i][1]) for i in range(len(resamplers_pipe))])}'
    )
    print(
        f'Classifers to be used ({len(list(classifiers_pipe.keys()))}):\n{list(classifiers_pipe.keys())}'
    )
    print(
        f'Total number of classifers parameters = {sum([len(list(classifiers_pipe.values())[i][1]) for i in range(len(classifiers_pipe))])}'
    )

    assert len(df_manual[df_manual[str(col)].map(df_manual[str(col)].value_counts() > 1)]) != 0, f'Dataframe has no {col} values!'

    # Split
    (
        train, X_train, y_train,
        test, X_test, y_test,
        train_class_weights,
        train_class_weights_ratio,
        train_class_weights_dict,
        test_class_weights,
        test_class_weights_ratio,
        test_class_weights_dict
    ) = split_data(
        df_manual, col, text_col, analysis_columns,
    )

    for (
        vectorizer_name, vectorizer_and_params
    ), (
        selector_name, selector_and_params
    ), (
        resampler_name, resampler_and_params
    ), (
        classifier_name, classifier_and_params
    ) in tqdm_product(
        vectorizers_pipe.items(), selectors_pipe.items(), resamplers_pipe.items(), classifiers_pipe.items()
    ):

        if f'{col} - {vectorizer_name} + {classifier_name}' in estimator_names_list:
            print('-'*20)
            print(
                f'Already trained {col} - {vectorizer_name} + {classifier_name}'
            )
            print('-'*20)
            # Load previous Xy
            # load_Xy only needs the column name: its second and third
            # positional parameters are save paths, not estimator names.
            # It returns None when nothing could be loaded, in which case the
            # current split is kept.
            loaded_Xy = load_Xy(col)
            if loaded_Xy is not None:
                (
                    X_train, y_train,
                    X_test, y_test,
                    train_class_weights, train_class_weights_ratio, train_class_weights_dict,
                    test_class_weights, test_class_weights_ratio, test_class_weights_dict
                ) = loaded_Xy
            continue

        # Identify names and params
        vectorizer = vectorizer_and_params[0]
        vectorizer_params = vectorizer_and_params[-1]

        selector = selector_and_params[0]
        selector_params = selector_and_params[-1]

        resampler = resampler_and_params[0]
        resampler_params = resampler_and_params[-1]

        classifier = classifier_and_params[0]
        classifier_params = classifier_and_params[-1]

        # Pipeline
        ## Steps
        # The resampling step is only part of the pipeline for the 'Warmth' column.
        if col == 'Warmth':
            steps = [
                (vectorizer_name, vectorizer),
                (selector_name, selector),
                (resampler_name, resampler),
                (classifier_name, classifier)
            ]
        else:
            steps = [
                (vectorizer_name, vectorizer),
                (selector_name, selector),
                (classifier_name, classifier)
            ]

        ## Params
        # NOTE(review): resampler_params are not part of the grid, and saved file
        # names do not include the selector/resampler, so fits that differ only
        # in those steps overwrite each other's files - confirm this is intended.
        param_grid = {
            **vectorizer_params,
            **selector_params,
            **classifier_params,
        }

        ## Pipeline
        pipe = imblearn.pipeline.Pipeline(steps=steps)

        # Search
        print('-'*20)
        print(f'{"="*30} Using GridSearchCV {"="*30}')
        print('-'*20)
        print(f'GridSearchCV with:\nPipe:\n{pipe}\nParams:\n{param_grid}')
        print('+'*30)

        grid_search = HalvingGridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            cv=cv,
            n_jobs=n_jobs,
            return_train_score=True,
            verbose=1,
            error_score='raise',
            refit=refit,
            random_state=random_state,
            scoring=scorers['recall_score'],
        )

        ## Normalize unusual classifiers before fitting
        # Only sparse matrices have `todense`; the raw text arrays produced by
        # split_data do not, so the conversion is applied only when available.
        # NOTE(review): inside the pipeline GaussianNB presumably still receives
        # the vectorizer's sparse output - a densifying step may be needed.
        if classifier_name == 'GaussianNB':
            if hasattr(X_train, 'todense'):
                X_train = X_train.todense()
            if hasattr(X_test, 'todense'):
                X_test = X_test.todense()

        # Fit SearchCV
        with joblib.parallel_backend(backend='loky', n_jobs=n_jobs):
            print('Fitting GridSearchCV')
            searchcv = grid_search.fit(X_train, y_train)

            # Reidentify and name best estimator and params
            estimator = searchcv.best_estimator_
            cv_results = searchcv.cv_results_
            vectorizer = estimator[0]
            vectorizer_params = vectorizer.get_params()
            vectorizer_name = vectorizer.__class__.__name__
            selector = estimator[1]
            selector_params = selector.get_params()
            selector_name = selector.__class__.__name__
            classifier = estimator[-1]
            classifier_params = classifier.get_params()
            classifier_name = classifier.__class__.__name__
            if col == 'Warmth':
                resampler = estimator[-2]
                resampler_params = resampler.get_params()
                resampler_name = resampler.__class__.__name__

            # Normalize unusual classifiers after fitting
            (
                estimator, X_train, y_train, X_test, y_test, searchcv, df_feature_importances
            ) = normalize_after_fitting(
                estimator, X_train, y_train, X_test, y_test, searchcv
            )

            # Set prediction probability attribute
            # Reset on every iteration so a probability method from a previous
            # fit can never be reused for the current one.
            if hasattr(searchcv, 'predict_proba'):
                searchcv_predict_attr = searchcv.predict_proba
            elif hasattr(searchcv, '_predict_proba_lr'):
                searchcv_predict_attr = searchcv._predict_proba_lr
            else:
                searchcv_predict_attr = None

            # Get predictions and probabilities
            y_train_pred = estimator.predict(X_train)
            y_test_pred = searchcv.predict(X_test)
            if searchcv_predict_attr is not None:
                y_test_pred_prob = searchcv_predict_attr(X_test)[:, 1]
            else:
                # No probability estimates available for this classifier.
                y_test_pred_prob = np.nan

            # Save Xy data
            save_Xy(
                X_train, y_train,
                X_test, y_test,
                col,
            )
            # Save Xy and CV data
            save_Xy_search_cv_estimator(
                grid_search, searchcv, cv_results,
                train, X_train, y_train, y_train_pred,
                test, X_test, y_test, y_test_pred, y_test_pred_prob,
                df_feature_importances, estimator,
                col, vectorizer_name, classifier_name,
            )

# Assert that all classifiers were used
assert_all_classifers_used()
print('#'*40)
print('DONE!')
print('#'*40)
