In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Handle on the module object this cell runs in (kept for any later cell that reads it).
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project code directory: either the working directory itself or
# one of its nearest ancestors whose name contains `code_dir_name` (and does
# not contain `unwanted_subdir_name`), then make it importable.
_cwd = Path.cwd()
if code_dir_name in _cwd.name:
    code_dir = str(_cwd)
else:
    # Look at no more than five ancestors. Slicing (instead of indexing
    # parents[i]) avoids an IndexError when the path is fewer than five levels deep.
    for _parent in list(_cwd.parents)[:5]:
        if code_dir_name in _parent.name and unwanted_subdir_name not in _parent.name:
            code_dir = str(_parent)
            break

if code_dir is None:
    # Never push None onto sys.path; report the problem instead.
    print(
        f'WARNING: no directory containing "{code_dir_name}" found at or above {_cwd}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Guard against duplicate entries when the cell is re-run.
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Variables
t = time.time()  # wall-clock start of the run
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
# Repeated stratified K-fold splitter built from the settings above.
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no `zero_division` parameter; forwarding it made the
    # scorer raise a TypeError as soon as it was called.
    'accuracy_score': make_scorer(accuracy_score),
}
protocol = pickle.HIGHEST_PROTOCOL
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
classified_columns = ['Warmth_Probability', 'Competence_Probability']
# Template of metric names, every value initialised to NaN.
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Brier Score': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Pick the compute device: MPS first, then CUDA, then CPU.
# `torch.has_mps` is deprecated; the torch.backends.mps checks are sufficient.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed
random_state = 42
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.Generator(device_name).manual_seed(random_state)
cores = multiprocessing.cpu_count()
accelerator = Accelerator()
torch.autograd.set_detect_anomaly(True)
# NOTE(review): these three lookups discard their result, so they have no
# effect — confirm whether the variables were meant to be set here.
os.environ.get('TOKENIZERS_PARALLELISM')
os.environ.get('PYTORCH_MPS_HIGH_WATERMARK_RATIO')
os.environ.get('TRANSFORMERS_CACHE')
# Secrets come from the environment; a missing variable raises KeyError.
openai_token = os.environ['OPENAI_API_KEY']
huggingface_token = os.environ['HUGGINGFACE_API_KEY']
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4'
quantization_config_dict = {
    'load_in_8bit': True,
    'llm_int8_skip_modules': ['lm_head'],
}
hyperparameter_tuning = True


# Functions


In [None]:
def load_classified_df(df, df_name, df_len, done_dfs_name, df_save_dir):
    """Read a classified dataframe checkpoint from disk and check its length.

    Args:
        df: Ignored; the name is rebound to the frame read from disk.
        df_name: Base name of the dataframe (e.g. ``'df_jobs'``).
        df_len: Number of rows the loaded frame must have.
        done_dfs_name: Checkpoint suffix that follows ``df_name`` in the file name.
        df_save_dir: Directory prefix (including trailing separator) of the pickle.

    Returns:
        The dataframe read from ``{df_save_dir}{df_name}_{done_dfs_name}.pkl``.

    Raises:
        AssertionError: If the loaded frame does not have ``df_len`` rows.
    """
    print(f'Loading {df_name}_{done_dfs_name}...')
    pickle_path = f'{df_save_dir}{df_name}_{done_dfs_name}.pkl'
    df = pd.read_pickle(pickle_path)
    assert len(df) == df_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_len} BUT IS OF LENGTH {len(df)}'
    print(f'Dataframe {df_name}_{done_dfs_name} loaded with shape: {df.shape}')

    return df


In [None]:
def extract_prediction(text):
    """Classify one text with the global ``classifier`` and return label and score.

    Args:
        text: The text passed to ``classifier``.

    Returns:
        A two-element ``pd.Series``: the part of the first prediction's label
        after ``'LABEL_'`` and that prediction's score. When ``classifier``
        returns nothing, both elements are ``None``. Both branches return a
        Series so that ``Series.apply`` always expands into the same two columns.
    """
    if pred := classifier(text):
        top_prediction = pred[0]
        # NOTE(review): 'score' is whatever the pipeline reports for the returned
        # label; confirm it is the value expected in the '<col>_Probability' columns.
        return pd.Series([top_prediction['label'].split('LABEL_')[1], top_prediction['score']])
    return pd.Series([None, None])


In [None]:
class ToDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to an indexable container of
            per-sample values; must contain ``'input_ids'``.
        labels: Optional indexable, mutable container with one label per sample.
            May be omitted for inference-only use, in which case items carry
            no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert non-tensor entries to long tensors and store them back so the
        # conversion happens only once per sample.
        for key, val in self.encodings.items():
            if not torch.is_tensor(val[idx]):
                self.encodings[key][idx] = torch.tensor(val[idx], dtype=torch.long, device=device)
        item = {key: val[idx].to(device) for key, val in self.encodings.items()}
        # Labels are optional: only attach them when they were supplied.
        if self.labels is not None:
            if not torch.is_tensor(self.labels[idx]):
                self.labels[idx] = torch.tensor(self.labels[idx], dtype=torch.long, device=device)
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


In [None]:
class ImbTrainer(Trainer):
    """Trainer that switches to a weighted BCE loss while batch metrics are poor.

    Class weights are derived from ``train_dataset.labels`` at construction.
    When no training dataset (or no ``labels`` attribute) is available, e.g.
    when the trainer is built only to run ``predict``, all weights are ``None``
    and the calibration loss is unweighted.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights, self.weight, self.pos_weight = self._calculate_class_weights(self.train_dataset)

    def _calculate_class_weights(self, dataset):
        """Return ``(class_weights, weight, pos_weight)`` computed from ``dataset.labels``.

        Returns ``(None, None, None)`` when ``dataset`` is ``None`` or has no labels.
        """
        labels = getattr(dataset, 'labels', None)
        if labels is None:
            return None, None, None

        # Count the number of samples in each class; minlength=2 keeps index 1
        # valid even when only one class occurs in the data.
        class_counts = torch.bincount(torch.tensor(labels, device=device), minlength=2)

        # Calculate the inverse frequency of each class with Laplace smoothing
        inv_frequencies = (len(dataset) + 1) / (class_counts + 1)
        class_weights = inv_frequencies / torch.sum(inv_frequencies)

        # Calculate weight and pos_weight (the small epsilon avoids division by zero)
        num_negative = class_counts[0].item()
        num_positive = class_counts[1].item()
        weight_neg = num_positive / (num_negative + 1e-5)
        weight_pos = num_negative / (num_positive + 1e-5)
        weight = torch.tensor([weight_neg, weight_pos], device=device)
        pos_weight = torch.tensor([weight_pos], device=device)

        return class_weights, weight, pos_weight

    def _calculate_calibration_loss(self, logits, labels):
        """BCE-with-logits loss between the logits and one-hot encoded labels."""
        weight = self.weight.to(device) if self.weight is not None else None
        return nn.BCEWithLogitsLoss(
            weight=weight
        )(
            logits.to(device),
            torch.nn.functional.one_hot(labels, logits.size(-1)).long().float().to(device)
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the base loss, or the calibration loss when the batch scores poorly.

        The outputs of the parent's forward pass are reused, so the model runs
        once per batch. Extra keyword arguments are forwarded to the parent.
        """
        loss, outputs = super().compute_loss(model, inputs, return_outputs=True, **kwargs)
        labels = inputs.get('labels')

        if labels is not None:
            labels = labels.to(device)
            logits = outputs.get('logits').to(device)
            y_pred = torch.argmax(logits, dim=-1).to(device)

            # Get accuracy, recall and precision for this batch
            accuracy = binary_accuracy(y_pred, labels).to(device)
            recall = binary_recall(y_pred, labels).to(device)
            precision = binary_precision(y_pred, labels).to(device)

            # Only replace the loss when all three metrics are below 0.7
            if accuracy < 0.7 and recall < 0.7 and precision < 0.7:
                loss = self._calculate_calibration_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [None]:
# Function to get y_pred and y_pred_prob
def preprocess_logits_for_metrics_from_logits(y_pred_logits, print_enabled=None):
    """Turn raw logits into predicted classes and last-class probabilities.

    Args:
        y_pred_logits: Two-dimensional logits, as a torch tensor or anything
            ``torch.tensor`` / ``numpy.asarray`` accepts.
        print_enabled: Whether to print progress messages; ``None`` means ``True``.

    Returns:
        Tuple ``(y_pred_array, y_pred, y_pred_prob_array, y_pred_prob)``:
        argmax over the last axis, the same values as a flat list, the softmax
        over the last axis, and the last softmax column as a flat list.

    Raises:
        AssertionError: If the argmax of the probabilities differs from ``y_pred_array``.
    """
    if print_enabled is None:
        print_enabled = True

    # Always bind the tensor name. Previously a tensor input left it undefined
    # and the resulting NameError silently pushed the code onto the numpy path.
    if torch.is_tensor(y_pred_logits):
        y_pred_logits_tensor = y_pred_logits
    else:
        try:
            y_pred_logits_tensor = torch.tensor(y_pred_logits, device=device)
        except Exception:
            # Conversion failed (e.g. dtype unsupported on the device): use numpy/scipy.
            y_pred_logits_tensor = None
    if y_pred_logits_tensor is not None and not torch.is_floating_point(y_pred_logits_tensor):
        # softmax requires a floating-point tensor
        y_pred_logits_tensor = y_pred_logits_tensor.float()
    use_torch = y_pred_logits_tensor is not None

    # Get y_pred
    if print_enabled:
        print('-'*20)
        print('Getting y_pred through argmax of y_pred_logits...')
    if use_torch:
        y_pred_array = torch.argmax(y_pred_logits_tensor, dim=-1)
        if print_enabled: print('Using torch.argmax.')
    else:
        y_pred_array = np.asarray(y_pred_logits).argmax(axis=-1)
        if print_enabled: print('Using np.argmax.')
    if print_enabled:
        print(f'y_pred_array shape: {y_pred_array.shape}')
        print('-'*20)
        print('Flattening y_pred...')
    y_pred = y_pred_array.flatten().tolist()
    if print_enabled:
        print(f'y_pred length: {len(y_pred)}')
        print('-'*20)

    # Get y_pred_prob
    if print_enabled:
        print('-'*20)
        print('Getting y_pred_prob through softmax of y_pred_logits...')
    if use_torch:
        y_pred_prob_array = torch.nn.functional.softmax(y_pred_logits_tensor, dim=-1)
        if print_enabled: print('Using torch.nn.functional.softmax.')
    else:
        y_pred_prob_array = scipy.special.softmax(np.asarray(y_pred_logits), axis=-1)
        if print_enabled: print('Using scipy.special.softmax.')
    # from: https://discuss.huggingface.co/t/different-results-predicting-from-trainer-and-model/12922
    assert bool((y_pred_prob_array.argmax(-1) == y_pred_array).all()), 'Argmax of y_pred_prob_array does not match y_pred_array.'
    # Last column = probability of the last class (class 1 for binary problems)
    y_pred_prob = y_pred_prob_array[:, -1].flatten().tolist()
    if print_enabled:
        print(f'y_pred_prob shape: {y_pred_prob_array.shape}')
        print('-'*20)
        print('Flattening y_pred_prob and extracting probabilities of 1...')
        print(f'y_pred_prob length: {len(y_pred_prob)}')
        print('-'*20)

    return (
        y_pred_array, y_pred, y_pred_prob_array, y_pred_prob
    )


In [None]:
def prob_confirmatory_tests(y_pred, y_pred_prob):
    """Print confirmatory tests relating predictions to their probabilities.

    Runs, in order: Levene's test followed by an independent-samples t-test of
    ``y_pred_prob`` against ``y_pred``; a Logit regression of ``y_pred`` on
    ``y_pred_prob``; and an OLS regression of ``y_pred_prob`` on ``y_pred``.
    Results are printed, nothing is returned. A failure in either regression
    is reported and does not stop the remaining output.

    Args:
        y_pred: One-dimensional array-like of predictions.
        y_pred_prob: One-dimensional array-like of matching probabilities.
    """
    # Plain arrays keep the positional `params[0]` / `bse[0]` lookups valid
    # even when pandas Series are passed in.
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    # Confirmatory Regression
    print('+'*20)
    print('Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob')
    print('-'*20)
    print('T-Test y_pred_prob ~ y_pred:')
    levene = scipy.stats.levene(y_pred_prob, y_pred)
    # Levene's null hypothesis is equal variances, so variances may be treated
    # as equal only when the test is NOT significant.
    equal_var_levene = levene.pvalue >= 0.05
    print(scipy.stats.ttest_ind(y_pred_prob, y_pred, equal_var=equal_var_levene))

    print('\n')
    print('-'*20)
    print('Logit y_pred ~ y_pred_prob:')
    try:
        logit_model = sm.Logit(endog=y_pred, exog=y_pred_prob)
        logit_results = logit_model.fit()
        std_coef = logit_results.params[0] / np.std(y_pred_prob)
        std_err = logit_results.bse[0]
        log_likelihood = logit_results.llf
        print(logit_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
        print(f'Log Likelihood: {log_likelihood}')
    except Exception as e:
        # Best effort: report what went wrong and continue with the OLS model.
        print(f'{type(e).__name__}: {e}')

    print('-'*20)
    print('\n')
    print('-'*20)
    print('OLS y_pred_prob ~ y_pred:')
    try:
        ols_model = sm.OLS(endog=y_pred_prob, exog=y_pred)
        ols_results = ols_model.fit()
        std_coef = ols_results.params[0] / np.std(y_pred)
        std_err = ols_results.bse[0]
        print(ols_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
    except Exception as e:
        print(f'{type(e).__name__}: {e}')

    print('-'*20)
    print('+'*20)
    print('\n')


# Classifying

### READ DATA

In [None]:
# # ATTN: IF THIS IS THE FIRST TIME YOU ARE CLASSIFYING JOBS, UNCOMMENT AND RUN THIS CODE
# with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
#     df_jobs_len = int(f.read())
# df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
# assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
# print(f'Dataframe df_jobs loaded with shape: {df_jobs.shape}')
# with open(f'{data_dir}df_manual_len.txt', 'r') as f:
#     df_manual_len = int(f.read())
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
# assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
# print(f'Dataframe df_manual loaded with shape: {df_manual.shape}')


In [None]:
# Expected row counts, read from the length files in data_dir.
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

with open(f'{data_dir}df_manual_len.txt', 'r') as f:
    df_manual_len = int(f.read())

# Checkpoint names in the order they are tried, mapped to whether the
# Competence columns must be present (True) or absent (False) in df_jobs.
_competence_expected = {
    'classified': True,
    'classified_Warmth_Competence': True,
    'classified_Warmth': False,
}
_warmth_cols = ('Warmth', 'Warmth_Probability')
_competence_cols = ('Competence', 'Competence_Probability')

for done_dfs_name, _needs_competence in _competence_expected.items():
    _jobs_pkl = f'{df_save_dir}df_jobs_{done_dfs_name}.pkl'
    _manual_pkl = f'{df_save_dir}df_manual_{done_dfs_name}.pkl'

    # Both pickles must exist and be non-empty for this checkpoint to count.
    if not all(os.path.exists(p) and os.path.getsize(p) > 0 for p in (_jobs_pkl, _manual_pkl)):
        continue

    df_jobs = pd.read_pickle(_jobs_pkl)
    assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    df_manual = pd.read_pickle(_manual_pkl)
    assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'

    _has_warmth = all(c in df_jobs.columns for c in _warmth_cols)
    if _needs_competence:
        _competence_ok = all(c in df_jobs.columns for c in _competence_cols)
    else:
        _competence_ok = all(c not in df_jobs.columns for c in _competence_cols)

    # The first checkpoint whose columns match its name is used.
    if _has_warmth and _competence_ok:
        df_jobs = load_classified_df(df_jobs, 'df_jobs', df_jobs_len, done_dfs_name, df_save_dir)
        df_manual = load_classified_df(df_manual, 'df_manual', df_manual_len, done_dfs_name, df_save_dir)
        break

else:
    # No usable checkpoint: fall back to the unclassified inputs.
    print('Loading df_jobs_for_classification...')
    df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
    assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    print(f'Dataframe df_jobs_for_classification loaded with shape: {df_jobs.shape}')
    df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
    assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
    print(f'Dataframe df_manual_for_training loaded with shape: {df_manual.shape}')


In [None]:
%%time
print('#'*40)
print('Starting!')
print('#'*40)

# Suffix of the checkpoint file names; grows by '_<col>' after each classified column.
done_cols = ''

# Classifier and vectorizer used for each analysis column.
final_estimators_dict = {
    'Warmth': {
        'classifier_name': 'BertForSequenceClassification',
        'vectorizer_name': ''.join(transformers_pipe["BertForSequenceClassification"]["model_name"].split('-')).upper(),
    },
    'Competence': {
        'classifier_name': 'BertForSequenceClassification',
        'vectorizer_name': ''.join(transformers_pipe["BertForSequenceClassification"]["model_name"].split('-')).upper(),
    },
}

for col in tqdm.tqdm(analysis_columns):
    # Classify only columns that are missing from both dataframes.
    if col not in df_jobs.columns and f'{col}_predicted' not in df_manual.columns:
        print('-'*20)
        vectorizer_name = final_estimators_dict[col]["vectorizer_name"]
        classifier_name = final_estimators_dict[col]["classifier_name"]
        path_suffix = f' - {col} - {vectorizer_name} + {classifier_name} (Save_protocol={protocol})'
        final_estimators_dict[col]['path_suffix'] = path_suffix

        if classifier_name in list(classifiers_pipe.keys()):
            method = 'Supervised'
            with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
                results_save_path = f.read()
            print('-'*20)
            print(f'Using {classifier_name} from {method} pipeline.')
            print('Loading Supervised Estimator.')
            # NOTE(review): path_suffix already starts with a space, so this file
            # name has two spaces after 'Estimator' while the Transformers path
            # below has one — confirm against the saved file names.
            # joblib.load unpickles the file; only load estimators from trusted sources.
            with open(
                f'{results_save_path}{method} Fitted Estimator {path_suffix}.pkl', 'rb'
            ) as f:
                estimator = joblib.load(f)
            print('Done loading Supervised Estimator!')

            print('-'*20)
            print('Classifying data.')
            # df_jobs
            print('Classifying df_jobs.')
            X = np.array(list(df_jobs[text_col].astype('str').values))
            df_jobs[col] = estimator.predict(X)
            if hasattr(estimator, 'predict_proba'):
                # Last column of predict_proba, i.e. the probability of 1
                df_jobs[f'{col}_Probability'] = estimator.predict_proba(X)[:, -1]
            # df_manual
            print('Classifying df_manual to generate instrumental variables.')
            X_instrument = np.array(list(df_manual[text_col].astype('str').values))
            df_manual[f'{col}_predicted'] = estimator.predict(X_instrument)
            if hasattr(estimator, 'predict_proba'):
                # Last column of predict_proba, i.e. the probability of 1
                df_manual[f'{col}_Probability_predicted'] = estimator.predict_proba(X_instrument)[:, -1]

            print(f'Done classifying data using {classifier_name} for {col}!')
            print('-'*20)

        elif classifier_name in list(transformers_pipe.keys()):
            method = 'Transformers'
            with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
                results_save_path = f.read().strip('\n')
            with open(f'{data_dir}{method}_done_xy_save_path.txt', 'r') as f:
                done_xy_save_path = f.read().strip('\n')
            with open(f'{done_xy_save_path}{method} training_args_dict - {col} - {vectorizer_name} + {classifier_name}.json', 'r') as f:
                training_args_dict = json.load(f)
            print('-'*20)
            print(f'Using {classifier_name} from {method} pipeline.')
            model = transformers_pipe[classifier_name]['model']
            tokenizer = transformers_pipe[classifier_name]['tokenizer']
            config = transformers_pipe[classifier_name]['config']

            print(f'Loading Fitted Transformer {classifier_name} from pretrained.')
            estimator_dir = f'{results_save_path}{method} Fitted Estimator{path_suffix}.model'
            fitted_estimator = model.from_pretrained(estimator_dir, trust_remote_code=True)
            if hasattr(fitted_estimator, 'to'):
                fitted_estimator = fitted_estimator.to(device)
            tokenizer = tokenizer.from_pretrained(estimator_dir, trust_remote_code=True)
            config = config.from_pretrained(f'{estimator_dir}/config.json', trust_remote_code=True)
            print(f'Done loading Fitted Transformer {classifier_name} from pretrained!')

            try:
                print('Using transformers pipeline.')
                # Get predictions
                # Accelerate model
                (
                    fitted_estimator, tokenizer
                ) = accelerator.prepare(
                    fitted_estimator, tokenizer
                )
                # extract_prediction reads this module-level `classifier`.
                classifier = transformers.pipeline(
                    model=fitted_estimator, tokenizer=tokenizer, function_to_apply='softmax', device=device, framework='pt', task='text-classification', return_all_scores=False
                )
                # df_jobs
                print('Classifying df_jobs.')
                df_jobs[[col, f'{col}_Probability']] = df_jobs[text_col].astype(str).progress_apply(extract_prediction)
                df_jobs[col] = df_jobs[col].astype(int)
                df_jobs[f'{col}_Probability'] = df_jobs[f'{col}_Probability'].astype(float)
                # df_manual
                print('Classifying df_manual to generate instrumental variables.')
                df_manual[[f'{col}_predicted', f'{col}_Probability_predicted']] = df_manual[text_col].astype(str).progress_apply(extract_prediction)
                df_manual[f'{col}_predicted'] = df_manual[f'{col}_predicted'].astype(int)
                df_manual[f'{col}_Probability_predicted'] = df_manual[f'{col}_Probability_predicted'].astype(float)

            except Exception as e:
                # Any pipeline failure falls back to batch prediction with a Trainer.
                print(f'Transformers pipeline caused {type(e).__name__}. Using Trainer instead.')
                # Tokenize df_jobs
                X = df_jobs[text_col].astype('str').values.tolist()
                encodings = tokenizer(
                    X, truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
                ).to(device)
                # There are no labels at inference time. Placeholder labels are
                # passed so the dataset can be built; the labels and loss that
                # predict() returns for them are not used.
                dataset = ToDataset(encodings, [0] * len(X))
                # Tokenize df_manual
                X_instrument = df_manual[text_col].astype('str').values.tolist()
                encodings_instrument = tokenizer(
                    X_instrument, truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
                ).to(device)
                dataset_instrument = ToDataset(encodings_instrument, [0] * len(X_instrument))
                # Accelerate model
                (
                    fitted_estimator, tokenizer, dataset, dataset_instrument
                ) = accelerator.prepare(
                    fitted_estimator, tokenizer, dataset, dataset_instrument
                )

                print(f'Getting estimator for {col}.')
                # Plain Trainer for inference: ImbTrainer derives class weights
                # from a training dataset, and none exists here.
                estimator = Trainer(
                    model=fitted_estimator,
                    tokenizer=tokenizer,
                    args=TrainingArguments(**training_args_dict),
                )
                if estimator.place_model_on_device:
                    estimator.model.to(device)

                # df_jobs
                print('-'*20)
                print(f'Classifying data using {classifier_name} for {col}.')
                print('Classifying df_jobs.')
                (y_pred_logits, y_labels, metrics) = estimator.predict(dataset)
                y_pred_array, y_pred, y_pred_prob_array, y_pred_prob = preprocess_logits_for_metrics_from_logits(y_pred_logits)

                # Assign to dataframe
                df_jobs[col] = y_pred
                df_jobs[f'{col}_Probability'] = y_pred_prob
                df_jobs[col] = df_jobs[col].astype(int)
                df_jobs[f'{col}_Probability'] = df_jobs[f'{col}_Probability'].astype(float)

                # df_manual
                print('Classifying df_manual to generate instrumental variables.')
                (y_pred_logits_instrument, y_labels_instrument, metrics_instrument) = estimator.predict(dataset_instrument)
                y_pred_array_instrument, y_pred_instrument, y_pred_prob_array_instrument, y_pred_prob_instrument = preprocess_logits_for_metrics_from_logits(y_pred_logits_instrument)

                # Assign to dataframe
                df_manual[f'{col}_predicted'] = y_pred_instrument
                df_manual[f'{col}_Probability_predicted'] = y_pred_prob_instrument
                df_manual[f'{col}_predicted'] = df_manual[f'{col}_predicted'].astype(int)
                df_manual[f'{col}_Probability_predicted'] = df_manual[f'{col}_Probability_predicted'].astype(float)

            print(f'Done classifying data using {classifier_name} for {col}!')
            print('-'*20)

        # Checkpoint both dataframes after every classified column.
        done_cols += f'_{col}'
        assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
        # df_jobs
        df_jobs.to_pickle(f'{df_save_dir}df_jobs_classified{done_cols}.pkl')
        df_jobs.to_csv(f'{df_save_dir}df_jobs_classified{done_cols}.csv', index=False)
        # df_manual
        df_manual.to_pickle(f'{df_save_dir}df_manual_classified{done_cols}.pkl')
        df_manual.to_csv(f'{df_save_dir}df_manual_classified{done_cols}.csv', index=False)
    else:
        print('-'*20)
        print(f'Column {col} already exists in dataframe. Skipping.')
        print('-'*20)
    # Confirmatory Regression
    print('='*20)
    print(f'Confirmatory test for df_jobs {col}')
    if f'{col}_Probability' in df_jobs.columns:
        prob_confirmatory_tests(df_jobs[col].values, df_jobs[f'{col}_Probability'].values)
    else:
        # Estimators without predict_proba produce no probability column.
        print(f'Column {col}_Probability not found in df_jobs. Skipping confirmatory test.')
    print(f'Confirmatory test for df_manual {col}')
    # NOTE(review): this compares the manual label with the predicted label,
    # not a prediction with its probability as for df_jobs — confirm intent.
    prob_confirmatory_tests(df_manual[col], df_manual[f'{col}_predicted'].values)
    print('='*20)



## Inspect classified data

In [None]:
# Save the fully classified df_jobs as pickle and CSV.
# The type is checked before len() so a wrong object fails on the type check.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_classified.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_classified.csv', index=False)



In [None]:
# Reload the classified df_jobs from disk and verify it against the recorded row count.
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

_jobs_classified_path = f'{df_save_dir}df_jobs_classified.pkl'
df_jobs = pd.read_pickle(_jobs_classified_path)
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'


In [None]:
# Save the classified df_manual as pickle and CSV.
# The type is checked before len() so a wrong object fails on the type check.
assert isinstance(df_manual, pd.DataFrame) and len(df_manual) > 0, f'ERROR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_classified.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_classified.csv', index=False)


In [None]:
# Reload the classified df_manual from disk and verify it against the recorded row count.
with open(f'{data_dir}df_manual_len.txt', 'r') as f:
    df_manual_len = int(f.read())

_manual_classified_path = f'{df_save_dir}df_manual_classified.pkl'
df_manual = pd.read_pickle(_manual_classified_path)
assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'


In [None]:
df_jobs.head()


In [None]:
df_jobs.info()


In [None]:
df_jobs.describe()


In [None]:
df_jobs[['Job ID', 'Job Description spacy_sentencized']].info()


In [None]:
df_jobs[['Job ID', 'Job Description spacy_sentencized']].head()


In [None]:
df_manual.head()


In [None]:
df_manual.info()


In [None]:
df_manual[['Job ID', 'Job Description spacy_sentencized']].info()


In [None]:
df_manual[['Job ID', 'Job Description spacy_sentencized']].head()


## Merge df_jobs with df_manual

In [None]:
# Key columns used to merge df_jobs with df_manual and to de-duplicate rows.
merge_on_cols_list = [
    'Job ID',
    'Job Description spacy_sentencized',
    'Search Keyword',
    'Platform',
    'Job Title',
    'Company Name',
    'Location',
]
# Descriptive job columns; they are excluded from the '_actual' back-fill and
# dropped from df_jobs further down.
fill_on_cols_list = [
    'Job Description',
    'Industry',
    'Rating',
    'Employment Type',
    'Company URL',
    'Job URL',
    'Job Age',
    'Job Age Number',
    'Collection Date',
    'Data Row',
    'Tracking ID',
    'Job Date',
    'Type of ownership',
    'Language',
    'Job Description_num_words',
    'Job Description_num_unique_words',
    'Job Description_num_chars',
    'Job Description_num_chars_no_whitespact_and_punt',
    'Job Description_num_punctuations',
]


In [None]:
# Outer-join df_manual onto df_jobs using the key columns. Overlapping
# non-key columns coming from df_manual receive the '_actual' suffix.
# Only the first row for each key combination is kept.
# Note: re-running this cell merges df_manual into the already merged frame.
df_jobs = (
    df_jobs
    .merge(df_manual, how='outer', on=merge_on_cols_list, suffixes=('', '_actual'))
    .drop_duplicates(subset=merge_on_cols_list)
    .reset_index(drop=True)
)


In [None]:
df_jobs.info()


In [None]:
df_jobs.isna().sum()


In [None]:
def fill_contradicting_cols(df_jobs):
    """Reconcile each dependent variable and its probability with its fill column.

    For every name in the global ``dvs``, ``<dv>`` is reconciled with
    ``<dv>_actual`` and ``<dv>_Probability`` with ``<dv>_Probability_predicted``:
    missing main values are filled from the fill column, and wherever the fill
    column has a value that differs from the main column the fill value wins.

    Args:
        df_jobs: Dataframe containing all four columns per dependent variable.
            It is modified in place.

    Returns:
        The same dataframe object.

    Raises:
        AssertionError: If a main column still contains missing values afterwards.
    """
    for dv in dvs:
        column_pairs = (
            (dv, f'{dv}_actual'),
            (f'{dv}_Probability', f'{dv}_Probability_predicted'),
        )
        for main_col, fill_col in column_pairs:
            print(f'Filling {main_col} with values from {fill_col}')
            df_jobs[main_col] = df_jobs[main_col].fillna(df_jobs[fill_col])

            # Rows to overwrite: the fill column has a differing value, or the
            # main column is still missing.
            fill_present = df_jobs[fill_col].notna()
            disagrees = df_jobs[main_col] != df_jobs[fill_col]
            still_missing = df_jobs[main_col].isna()
            mask = (disagrees & fill_present) | still_missing

            df_jobs.loc[mask, main_col] = df_jobs.loc[mask, fill_col]
            assert df_jobs[main_col].isna().sum() == 0, f'Missing values found in {main_col} column'

    return df_jobs


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].head()


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].tail()


In [None]:
df_jobs = fill_contradicting_cols(df_jobs)


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].head()


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].tail()


In [None]:
# Fill remaining NaN values from the matching '<col>_actual' column (from df_manual).
# Derived columns, dependent variables, merge keys and fill_on_cols_list
# columns are left alone.
_excluded_cols = merge_on_cols_list + fill_on_cols_list
for col in df_jobs.columns:
    is_derived = any(tag in col for tag in ('_actual', '_predicted', '_Probability'))
    if (
        not is_derived
        and col not in dvs
        and col not in _excluded_cols
        # Columns that exist only in df_jobs have no '_actual' counterpart;
        # without this check the lookup below raised a KeyError.
        and f'{col}_actual' in df_jobs.columns
        and df_jobs[col].isna().sum() != 0
    ):
        df_jobs[col] = df_jobs[col].fillna(df_jobs[f'{col}_actual'])


In [None]:
df_jobs.info()


In [None]:
df_jobs.info()


In [None]:
df_jobs.isna().sum()


In [None]:
# Drop every '_actual' helper column except the two manual label columns,
# and every column listed in fill_on_cols_list.
_kept_actual_cols = ['Warmth_actual', 'Competence_actual']
_cols_to_drop = [
    col
    for col in df_jobs.columns
    if ('_actual' in col and col not in _kept_actual_cols) or col in fill_on_cols_list
]
df_jobs = df_jobs.drop(columns=_cols_to_drop).reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
df_jobs.isna().sum()


## Clean df_jobs

In [None]:
%%time
# Convert Job ID and Sentence to str
str_cols = [
    'Job ID',
    'Job Description spacy_sentencized',
]

for col in str_cols:
    as_text = df_jobs[col].astype(str, errors='ignore')
    # Remove square brackets and surrounding whitespace.
    unbracketed = as_text.progress_apply(lambda x: x.strip().replace('[', '').replace(']', ''))
    trimmed = unbracketed.apply(str.strip)
    # NOTE(review): non-ASCII characters are dropped before NFKD normalisation
    # runs, so the normalisation has nothing left to act on — confirm the
    # intended order.
    df_jobs[col] = trimmed.apply(
        lambda x: unicodedata.normalize('NFKD', x.encode('ascii', 'ignore').decode('utf-8', 'ignore'))
    )
    all_str = all(df_jobs[col].progress_apply(lambda x: isinstance(x, str)))
    if all_str:
        print(f'{col} converted to str.')
    else:
        print(f'{col} NOT converted to str.')


In [None]:
# Convert Warmth and Competence to int
# Warmth 1 = 1741, Competence 1 = 3043
int_cols = [
    'Warmth',
    'Competence',
    'Warmth_actual',
    'Competence_actual',
    'Warmth_predicted',
    'Competence_predicted',
]

for col in int_cols:
    # errors='ignore' leaves the column unchanged when the cast fails;
    # the message below reports whether it succeeded.
    df_jobs[col] = df_jobs[col].astype(np.int64, errors='ignore')
    all_int = all(df_jobs[col].progress_apply(lambda x: isinstance(x, int)))
    if all_int:
        print(f'{col} converted to int.')
    else:
        print(f'{col} NOT converted to int.')
    print(f'{col} value counts:\n{df_jobs[col].value_counts()}')


In [None]:
# Convert the Warmth and Competence probability columns to float
float_cols = [
    'Warmth_Probability',
    'Competence_Probability',
    'Warmth_Probability_predicted',
    'Competence_Probability_predicted',
]

for col in float_cols:
    # errors='ignore' leaves the column unchanged when the cast fails;
    # the message below reports whether it succeeded.
    df_jobs[col] = df_jobs[col].astype(np.float64, errors='ignore')
    print(f'{col} converted to float.' if all(df_jobs[col].progress_apply(lambda x: isinstance(x, float))) else f'{col} NOT converted to float.')


In [None]:
df_jobs.info()


In [None]:
# Drop rows with missing values in any column other than the '_actual' and
# '_predicted' helper columns.
_required_cols = [
    col
    for col in df_jobs.columns
    if not ('_actual' in col or '_predicted' in col)
]
df_jobs = df_jobs.dropna(subset=_required_cols)


In [None]:
df_jobs.info()


In [None]:
# Keep only the first row for each combination of the merge key columns.
df_jobs = df_jobs.drop_duplicates(subset=merge_on_cols_list)


In [None]:
df_jobs.info()


In [None]:
df_jobs.isna().sum()


In [None]:
df_jobs.describe()


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted',
    ]
].head()


In [None]:
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted',
    ]
].describe()



In [None]:
# Save the cleaned df_jobs for the analysis notebooks (pickle and CSV).
# The type is checked before len() so a wrong object fails on the type check.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)


In [None]:
# Record the row count of df_jobs in a text file next to the data.
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'w') as f:
    print(len(df_jobs), end='', file=f)


In [None]:
get_df_info(
    df_jobs,
    ivs_all=analysis_columns + [f'{col}_actual' for col in analysis_columns] + [f'{col}_predicted' for col in analysis_columns]
)


In [None]:
get_df_info(
    df_jobs,
    ivs_all=classified_columns + [f'{col}_predicted' for col in classified_columns ]
)


In [None]:
get_df_info(
    df_jobs,
    ivs_all=dvs_all + [f'{col}_actual' for col in dvs_all if '_Probability' not in col] + [f'{col}_predicted' for col in dvs_all]
)


### Save dataframe


In [None]:
# Final save of df_jobs for analysis (pickle and CSV).
# The type is checked before len() so a wrong object fails on the type check.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)


In [None]:
# Record the row count of df_jobs in a text file next to the data.
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'w') as f:
    print(len(df_jobs), end='', file=f)


In [None]:
# Save df_manual for analysis (pickle and CSV).
# The type is checked before len() so a wrong object fails on the type check.
assert isinstance(df_manual, pd.DataFrame) and len(df_manual) > 0, f'ERROR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_for_analysis.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_for_analysis.csv', index=False)


In [None]:
# Record the row count of df_manual in a text file next to the data.
print(f'Saving classified df_manual length {len(df_manual)} to txt file.')
with open(f'{data_dir}df_manual_for_analysis_len.txt', 'w') as f:
    print(len(df_manual), end='', file=f)
