In [None]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Handle to the current module object (not used further in this cell).
mod = sys.modules[__name__]

# Locate the project's code directory among the nearest ancestors of the
# working directory and make it importable.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Inspect at most the five closest ancestors. Slicing a materialized list
# (instead of indexing ``parents[i]``) avoids an IndexError when the working
# directory sits fewer than five levels below the filesystem root.
for ancestor in list(Path.cwd().parents)[:5]:

    # ``Path.name`` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

if code_dir is None:
    # Never put ``None`` on sys.path; report the problem where it originates
    # so the later project imports do not fail with an unrelated-looking error.
    print(
        f'WARNING: no ancestor directory containing "{code_dir_name}" '
        f'(and not "{unwanted_subdir_name}") found above {Path.cwd()}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Guarded so that re-running the cell does not append duplicates.
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Saved-path lookups
# Two text files under `data_dir` each hold one path string; their contents
# are taken verbatim (nothing is stripped).
method = 'Transformers'
results_save_path = Path(f'{data_dir}{method}_results_save_path.txt').read_text()
done_xy_save_path = Path(f'{data_dir}{method}_done_xy_save_path.txt').read_text()

# Wall-clock reference taken when this configuration cell runs.
t = time.time()

# Cross-validation configuration
n_jobs = -1
n_splits, n_repeats = 10, 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Scoring configuration
scoring = 'recall'
scores = ['recall', 'accuracy', 'f1', 'roc_auc', 'explained_variance', 'matthews_corrcoef']
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)

# Column names
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'

# Template of metric names, every entry starting out as NaN. Names that
# depend on the primary scorer embed its title-cased name.
scoring_title = scoring.title()
metric_names = [
    'Mean Cross Validation Train Score',
    f'Mean Cross Validation Train - {scoring_title}',
    f'Mean Explained Train Variance - {scoring_title}',
    'Mean Cross Validation Test Score',
    f'Mean Cross Validation Test - {scoring_title}',
    f'Mean Explained Test Variance - {scoring_title}',
    'Explained Variance',
    'Accuracy',
    'Balanced Accuracy',
    'Precision',
    'Recall',
    'F1-score',
    'Matthews Correlation Coefficient',
    'Fowlkes–Mallows Index',
    'R2 Score',
    'ROC',
    'AUC',
    f'{scoring_title} Best Threshold',
    f'{scoring_title} Best Score',
    'Log Loss/Cross Entropy',
    'Cohen’s Kappa',
    'Geometric Mean',
    'Classification Report',
    'Imbalanced Classification Report',
    'Confusion Matrix',
    'Normalized Confusion Matrix',
]
metrics_dict = dict.fromkeys(metric_names, np.nan)

# Set random seed
# Seed Python, NumPy and PyTorch once, using the `random_state` chosen in the
# configuration above (a single source of truth instead of re-assigning it).
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
cores = multiprocessing.cpu_count()

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()

# Device preference: MPS, then CUDA, then CPU.
# `torch.has_mps` is deprecated; `torch.backends.mps.is_built()` is its
# documented replacement and was already part of the original condition.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')

# Seeds a freshly created generator for the selected device; the generator
# object itself is not kept.
torch.Generator(device_name).manual_seed(random_state)
accelerator = Accelerator()
torch.autograd.set_detect_anomaly(True)
# NOTE(review): this only reads the variable and discards the result, so it
# has no effect. If the intent was to configure tokenizer parallelism, it
# needs an assignment (os.environ['TOKENIZERS_PARALLELISM'] = ...) — confirm
# the desired value before changing it.
os.environ.get('TOKENIZERS_PARALLELISM')
# Argument names that are left out of `training_args_dict_for_best_trial`.
best_trial_args = ['num_train_epochs', 'learning_rate']

# Keyword arguments later unpacked into `TrainingArguments(**training_args_dict)`.
training_args_dict = dict(
    seed=random_state,
    resume_from_checkpoint=True,
    overwrite_output_dir=True,
    logging_steps=500,
    evaluation_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    # metric_for_best_model='recall',
    # torch_compile=bool(transformers.file_utils.is_torch_available()),
    use_mps_device=bool(device_name == 'mps' and torch.backends.mps.is_available()),
    optim='adamw_torch',
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    warmup_steps=100,
    weight_decay=0.01,
    # The two arguments below are the ones named in `best_trial_args`
    # (used by hyperparameter search).
    num_train_epochs=3,
    learning_rate=5e-5,
)

# Same arguments minus the ones named in `best_trial_args`.
training_args_dict_for_best_trial = {
    name: training_args_dict[name]
    for name in training_args_dict
    if name not in best_trial_args
}


# Functions


In [None]:
class ToDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Parameters
    ----------
    encodings : mapping
        Maps field names to indexable values. It must contain an
        ``'input_ids'`` entry (used for the length), and ``val[idx]`` must
        support ``.clone()`` for every value (e.g. tensors).
    labels : indexable
        One label per sample; entries may be plain numbers or tensors.
    target_device : torch.device or str, optional
        Device every returned tensor is placed on. When omitted, the
        notebook-level ``device`` variable is looked up at access time,
        which is the original behaviour.
    """

    def __init__(self, encodings, labels, target_device=None):
        self.encodings = encodings
        self.labels = labels
        self.target_device = target_device

    def _resolve_device(self):
        """Return the explicit target device, falling back to the global ``device``."""
        return device if self.target_device is None else self.target_device

    def __getitem__(self, idx):
        """Return a dict with a detached copy of each encoding field plus ``'labels'``."""
        target = self._resolve_device()
        item = {key: val[idx].clone().detach().to(target) for key, val in self.encodings.items()}
        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # Copy an existing tensor with clone().detach(); wrapping it in
            # torch.tensor() makes PyTorch emit a copy-construct UserWarning.
            item['labels'] = label.clone().detach().to(target)
        else:
            # torch.tensor() already creates a fresh tensor with no autograd
            # history, so no extra clone()/detach() is needed.
            item['labels'] = torch.tensor(label, device=target)
        return item

    def __len__(self):
        """Number of samples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])


# Classifying

### READ DATA

In [None]:
# Expected row count, stored as text in a side file under `data_dir`.
df_jobs_len = int(Path(f'{data_dir}df_jobs_len.txt').read_text())

# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')

# Guard against a truncated or stale dataframe before doing any work on it.
loaded_len = len(df_jobs)
assert loaded_len == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {loaded_len}'
print(f'Dataframe loaded with shape: {df_jobs.shape}')


In [None]:
%%time
print('#'*40)
print('Starting!')
print('#'*40)

method = 'Transformers'
protocol = pickle.HIGHEST_PROTOCOL
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
classified_columns = ['Warmth_Probability', 'Competence_Probability']
# Per-column identifiers: `classifier_name` selects the model class from
# `transformers_pipe`, and both names are embedded in the saved-estimator path.
final_estimators_dict = {
    'Warmth': {
        'vectorizer_name': 'TfidfVectorizer',
        'classifier_name': 'LogisticRegression',
    },
    'Competence': {
        'vectorizer_name': 'TfidfVectorizer',
        'classifier_name': 'LogisticRegression',
    },
}

# Tokenize the corpus once. The call depends only on `df_jobs[text_col]` and
# the tokenizer, not on the column being classified, so doing it inside the
# loop repeated the same work for every column.
encodings = tokenizer(
    df_jobs[text_col].astype('str').values.tolist(),
    truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
).to(device)

for col in tqdm.tqdm(analysis_columns):
    print('-'*20)
    vectorizer_name = final_estimators_dict[col]['vectorizer_name']
    classifier_name = final_estimators_dict[col]['classifier_name']
    # Suffix identifying the saved estimator for this column; it is also
    # stored back into the dict.
    path_suffix = f' - {col} - {vectorizer_name} + {classifier_name} (Save_protocol={protocol})'
    final_estimators_dict[col]['path_suffix'] = path_suffix

    print('-'*20)
    print('Loading Transformer Estimator.')
    model = transformers_pipe[classifier_name]['model']
    estimator = model.from_pretrained(f'{results_save_path}{method} Fitted Estimator {path_suffix}')
    print('Done loading Transformer Estimator!')
    # Accelerate model
    (
        estimator, tokenizer
    ) = accelerator.prepare(
        estimator, tokenizer
    )
    # model.eval()

    # Get predictions
    print(f'Getting prediction results for {col}.')
    # From here on `estimator` refers to the Trainer wrapping the loaded model.
    estimator = Trainer(
        model=estimator,
        args=TrainingArguments(**training_args_dict),
        tokenizer=tokenizer,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics_y_pred_prob,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        # data_collator=transformers.DataCollatorWithPadding(tokenizer),
    )
    # NOTE(review): this optimizer is created but never handed to the Trainer.
    optimizer = torch.optim.AdamW(
        estimator.model.parameters(),
        lr=3e-5,
    )

    print('-'*20)
    print('Classifying data.')
    # FIXME(review): `test_dataset` is not assigned anywhere in this notebook's
    # visible cells. It presumably has to be built from `encodings` (e.g. with
    # ToDataset, as a previously commented-out line suggested), which needs a
    # label source that is not visible here — confirm before running.
    y_pred_logits, y_labels, metrics_dict = estimator.predict(test_dataset)
    df_jobs[col] = metrics_dict.pop('test_y_pred')
    df_jobs[f'{col}_Probability'] = metrics_dict.pop('test_y_pred_prob')[:, -1]
    # The original referenced an undefined `test_metrics_dict` here; the metrics
    # just returned by predict() are in `metrics_dict`. The second argument is
    # the text before the first '_' of the first remaining key.
    metrics_dict = clean_metrics_dict(metrics_dict, list(metrics_dict.keys())[0].split('_')[0])

    print(f'Done classifying data for {col}!')
    print('-'*20)


## Inspect classified data

In [None]:
# Display the metrics left over from the classification loop. `metrics_dict`
# is reassigned for every analysis column, so only the last column's metrics
# are shown here.
metrics_dict

In [None]:
# Drop rows that have a missing value in any column listed in `dvs_all`.
# `dvs_all` is not defined in this notebook — presumably it comes from the
# star imports; confirm.
df_jobs = df_jobs.dropna(subset=dvs_all)


In [None]:
# Column dtypes, non-null counts and memory usage after the NaN filtering.
df_jobs.info()


In [None]:
# Summary statistics of the dataframe's columns.
df_jobs.describe()


In [None]:
# Summarize the predicted label columns with the project helper `get_df_info`.
# NOTE(review): `analysis_columns` is already a list, so `ivs_all` receives a
# nested list here — confirm that `get_df_info` expects this.
get_df_info(df_jobs, ivs_all=[analysis_columns])


In [None]:
# Summarize the predicted probability columns with the project helper `get_df_info`.
# NOTE(review): `classified_columns` is already a list, so `ivs_all` receives a
# nested list here — confirm that `get_df_info` expects this.
get_df_info(df_jobs, ivs_all=[classified_columns])


### Plot classified data


In [None]:
# Count plots of the classified warmth and competence labels, one panel each
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel, label_col in zip(ax, ('Warmth', 'Competence')):
    sns.countplot(x=label_col, data=df_jobs, ax=panel, palette='colorblind')
plt.show()


In [None]:
# Box plots of the predicted probability per predicted label, one panel per trait
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
panel_columns = (
    ('Warmth', 'Warmth_Probability'),
    ('Competence', 'Competence_Probability'),
)
for panel, (label_col, prob_col) in zip(ax, panel_columns):
    sns.boxplot(x=label_col, y=prob_col, data=df_jobs, ax=panel, palette='colorblind')
plt.show()


In [None]:
# Specification curve analysis
# `dvs_prob`, `ivs_perc` and `controls` are not defined in this notebook —
# presumably they come from the star imports; confirm.
print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs_prob}\nINDEPENDENT VARIABLES = {ivs_perc}\nCONTROLS = {controls}')
# The original passed `dj_jobs`, a name that is never defined (NameError);
# the classified dataframe is `df_jobs`.
sc = specy.SpecificationCurve(df=df_jobs, y_endog=dvs_prob, x_exog=ivs_perc, controls=controls)
sc.fit(estimator=sm.OLS)
sc.plot(show_plot=True)


### Save dataframe


In [None]:
# # assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)
