In [None]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory, then make it importable by adding it to sys.path.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the nearest five ancestors are examined (the working directory itself is
# not). Slicing a list of the ancestors avoids the IndexError that indexing
# `parents[i]` raises when the working directory has fewer than five ancestors.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every platform, unlike splitting on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

if code_dir is None:
    # Never put None on sys.path; report the problem so the failing imports
    # in the following cell are easy to diagnose.
    print(
        f'WARNING: no parent directory containing "{code_dir_name}" '
        f'(and not "{unwanted_subdir_name}") found above {Path.cwd()}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Skip the append when the entry is already present so re-running the cell
    # does not accumulate duplicates.
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Variables
method = 'Supervised'
# The save locations are read back from small text files. Surrounding
# whitespace (e.g. a trailing newline) is stripped so it cannot end up inside
# the paths built from these values.
with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
    results_save_path = f.read().strip()
with open(f'{data_dir}{method}_done_xy_save_path.txt', 'r') as f:
    done_xy_save_path = f.read().strip()
t = time.time()

# Cross-validation and model-selection settings
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
# Primary metric; its title-cased name is also used in the metrics_dict keys below.
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no `zero_division` parameter; forwarding it through
    # make_scorer raises a TypeError as soon as the scorer is called.
    'accuracy_score': make_scorer(accuracy_score),
}
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Template of metric names, every value initialised to NaN.
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()

# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU.
# `torch.backends.mps.is_built()` replaces the deprecated `torch.has_mps` flag.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')

# Set random seed for Python, NumPy and PyTorch
random_state = 42
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
cores = multiprocessing.cpu_count()

# Plotting variables
pp = pprint.PrettyPrinter(indent=4)

# Register `progress_apply` on pandas objects for each tqdm flavour in use.
tqdm.tqdm.pandas(desc='progress-bar')
tqdm_auto.tqdm.pandas(desc='progress-bar')
tqdm_auto.notebook_tqdm().pandas(desc='progress-bar')

# Matplotlib: project APA style sheet first, then local overrides.
mpl.style.use(f'{code_dir}/setup_module/apa.mplstyle-main/apa.mplstyle')
mpl.rcParams.update({'text.usetex': False})
font = dict(family='arial', weight='normal', size=10)
mpl.rc('font', **font)
plt.style.use('tableau-colorblind10')
plt.set_cmap('Blues')

# Pandas display options, applied in the listed order.
for _option, _value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 5000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
    'display.float_format': '{:.2f}'.format,
}.items():
    pd.set_option(_option, _value)


# Classifying

### READ DATA

In [None]:
# Expected number of rows, stored as a plain-text integer.
with open(f'{data_dir}df_jobs_len.txt', 'r') as len_file:
    df_jobs_len = int(len_file.read())

# Load the pickled dataframe and check that its length matches the stored row count.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_training.pkl')
assert len(df_jobs) == df_jobs_len, (
    f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} '
    f'BUT IS OF LENGTH {len(df_jobs)}'
)
print(f'Dataframe loaded with shape: {df_jobs.shape}')


In [None]:
%%time
print('#'*40)
print('Starting!')
print('#'*40)

protocol = pickle.HIGHEST_PROTOCOL
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
classified_columns = ['Warmth_Probability', 'Competence_Probability']
final_estimators_dict = {
    'Warmth': {
        'vectorizer_name': 'TfidfVectorizer',
        'classifier_name': 'LogisticRegression',
    },
    'Competence': {
        'vectorizer_name': 'TfidfVectorizer',
        'classifier_name': 'LogisticRegression',
    },
}

for col in tqdm.tqdm(analysis_columns):
    print('-'*20)
    final_estimators_dict[col]['path_suffix'] = path_suffix = f' - {col} - {(vectorizer_name := final_estimators_dict[col]["vectorizer_name"])} + {(classifier_name := final_estimators_dict[col]["classifier_name"])} (Save_protocol={protocol})'

    if classifier_name in list(classifiers_pipe.keys()):
        method = 'Supervised'
        print('-'*20)
        print('Loading Supervised Estimator.')
            with open(
                f'{results_save_path}{method} Fitted Estimator {path_suffix}.pkl', 'rb'
            ) as f:
                estimator = joblib.load(f)
            print('Done loading Fitted Estimator!')

            print('-'*20)
            print('Classifying data.')
            X = np.array(list(df_jobs[text_col].astype('str').values))
            df_jobs[col] = estimator.predict(X)
            if hasattr(estimator, 'predict_proba'):
                # Get the the whole of the last column, which is the  probability of 1, and flatten to list
                df_jobs[f'{col}_Probability'] = estimator.predict_proba(X)[:, -1]

            print(f'Done classifying data using {classifier_name} for {col}!')
            print('-'*20)

    elif classifier_name in list(transformers_pipe.keys()):
        method = 'Transformers'
        print('-'*20)
        print('Loading Transformer Estimator.')
        model = transformers_pipe[classifier_name]['model']
        estimator = model.from_pretrained(f'{results_save_path}{method} Fitted Estimator {path_suffix}')
        print('Done loading Fitted Estimator!')

        print('-'*20)
        print('Classifying data.')
        y_pred_logits, y_labels, metrics_dict = estimator.predict(test_dataset)
        df_jobs[col] = metrics_dict.pop('test_y_pred')
        df_jobs[f'{col}_Probability'] = metrics_dict.pop('test_y_pred_prob')[:, -1]
        metrics_dict = clean_metrics_dict(test_metrics_dict, list(test_metrics_dict.keys())[0].split('_')[0])

        print(f'Done classifying data using {classifier_name} for {col}!')
        print('-'*20)


## Inspect classified data

In [None]:
# Keep only rows that have both predicted labels and both predicted probabilities.
df_jobs = df_jobs.dropna(
    subset=[
        'Warmth',
        'Competence',
        'Warmth_Probability',
        'Competence_Probability',
    ],
)


In [None]:
# Print a summary of df_jobs: columns, dtypes and non-null counts.
df_jobs.info()


In [None]:
# Display descriptive statistics for df_jobs.
df_jobs.describe()


In [None]:
# Summarise the predicted label columns via the project helper.
# NOTE(review): ivs_all receives analysis_columns wrapped in another list
# (a list of lists) — confirm get_df_info expects that nesting.
get_df_info(df_jobs, ivs_all=[analysis_columns])


In [None]:
# Summarise the predicted probability columns via the project helper.
# NOTE(review): ivs_all receives classified_columns wrapped in another list
# (a list of lists) — confirm get_df_info expects that nesting.
get_df_info(df_jobs, ivs_all=[classified_columns])


### Plot classified data


In [None]:
# Count plots of the classified warmth and competence labels, one panel per column
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel, label_col in zip(ax, ('Warmth', 'Competence')):
    sns.countplot(x=label_col, data=df_jobs, ax=panel, palette='colorblind')
plt.show()


In [None]:
# Box plots of the predicted probabilities grouped by predicted label, one panel per column
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel, label_col in zip(ax, ('Warmth', 'Competence')):
    sns.boxplot(
        x=label_col,
        y=f'{label_col}_Probability',
        data=df_jobs,
        ax=panel,
        palette='colorblind',
    )
plt.show()


In [None]:
# Specification curve analysis
# NOTE(review): dvs_prob, ivs_perc and controls are not defined in any cell visible
# in this notebook — presumably they come from the star imports; confirm.
print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs_prob}\nINDEPENDENT VARIABLES = {ivs_perc}\nCONTROLS = {controls}')
# The dataframe is `df_jobs`; the previous `dj_jobs` was a typo that raised NameError.
sc = specy.SpecificationCurve(df=df_jobs, y_endog=dvs_prob, x_exog=ivs_perc, controls=controls)
sc.fit(estimator=sm.OLS)
sc.plot(show_plot=True)


### Save dataframe


In [None]:
# Saving is currently disabled: the lines below are commented out, so running
# this cell writes nothing. Uncomment them to persist the classified dataframe.
# assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)
