In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project code directory so that its packages become importable.
# The working directory itself is used when its own name contains `code_dir_name`;
# otherwise up to five ancestors are inspected, nearest first.
cwd = Path.cwd()
if code_dir_name in cwd.name:
    code_dir = str(cwd)
else:
    # Slicing (instead of indexing parents[0..4]) avoids an IndexError when the
    # working directory has fewer than five ancestors. `Path.name` replaces the
    # manual split on '/' so the check does not depend on the path separator.
    for parent in list(cwd.parents)[:5]:
        if (code_dir_name in parent.name) and (unwanted_subdir_name not in parent.name):
            code_dir = str(parent)
            break

# Only register a directory that was actually found; never append None, and do
# not add the same entry again when the cell is re-run.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

Using MPS


<Figure size 640x480 with 0 Axes>

### Set variables

In [3]:
# Variables
# Shared configuration for the notebook: timing, cross-validation setup,
# scoring names, column names and the template of metric names.
t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no `zero_division` parameter; forwarding it through
    # make_scorer makes the scorer raise TypeError as soon as it is called.
    'accuracy_score': make_scorer(accuracy_score),
}
protocol = pickle.HIGHEST_PROTOCOL
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
classified_columns = ['Warmth_Probability', 'Competence_Probability']
# Template of metric names, every value initialised to NaN.
# Keys that embed `scoring` follow whatever scorer name is configured above.
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Brier Score': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Pick the compute device in order of preference: Apple MPS, CUDA, CPU.
# The deprecated `torch.has_mps` flag is dropped; the `torch.backends.mps`
# checks below are the supported way to ask the same question.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed (reuses `random_state` from the Variables section of this cell)
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
# NOTE(review): this generator is seeded and then discarded, so it does not
# influence later sampling unless it is stored and passed on explicitly.
torch.Generator(device_name).manual_seed(random_state)
cores = multiprocessing.cpu_count()
accelerator = Accelerator()
# NOTE(review): PyTorch documents anomaly detection as a debugging aid that
# slows execution - confirm it should stay enabled for full runs.
torch.autograd.set_detect_anomaly(True)
# NOTE(review): the three lookups below discard their return values, so they
# neither set nor validate anything. Kept as-is; confirm what was intended.
os.environ.get('TOKENIZERS_PARALLELISM')
os.environ.get('PYTORCH_MPS_HIGH_WATERMARK_RATIO')
os.environ.get('TRANSFORMERS_CACHE')
# Credentials are read from the environment; a missing variable raises KeyError.
openai_token = os.environ['OPENAI_API_KEY']
huggingface_token = os.environ['HUGGINGFACE_API_KEY']
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4'
quantization_config_dict = {
    'load_in_8bit': True,
    'llm_int8_skip_modules': ['lm_head'],
}
hyperparameter_tuning = True


Using MPS


# Functions


In [4]:
def load_classified_df(
    df, df_name, df_len, done_dfs_name, df_save_dir
):
    """Read a pickled dataframe from disk and verify its row count.

    Args:
        df: Ignored; the name is rebound to the frame read from disk.
        df_name: Prefix of the pickle file name (e.g. ``'df_jobs'``).
        df_len: Number of rows the loaded frame must have.
        done_dfs_name: Suffix of the pickle file name, joined to ``df_name`` with ``_``.
        df_save_dir: Directory prefix, concatenated as-is in front of the file name.

    Returns:
        The dataframe read from ``{df_save_dir}{df_name}_{done_dfs_name}.pkl``.

    Raises:
        AssertionError: If the loaded frame does not have ``df_len`` rows.
    """
    file_stem = f'{df_name}_{done_dfs_name}'
    print(f'Loading {file_stem}...')

    df = pd.read_pickle(f'{df_save_dir}{file_stem}.pkl')
    n_rows = len(df)
    assert n_rows == df_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_len} BUT IS OF LENGTH {n_rows}'

    print(f'Dataframe {file_stem} loaded with shape: {df.shape}')
    return df


In [5]:
def extract_prediction(text):
    """Classify one text with the module-level ``classifier`` and return label and score.

    Args:
        text: The text handed to ``classifier``.

    Returns:
        A two-element ``pd.Series``: the part of the top prediction's label that
        follows ``'LABEL_'`` (a string) and that prediction's score. When the
        classifier returns nothing, both elements are ``None``. Returning a
        Series on both paths keeps the result of ``Series.apply`` a two-column
        frame instead of mixing Series and tuples.
    """
    # `classifier` is a global created elsewhere in the notebook before this is called.
    pred = classifier(text)
    if not pred:
        return pd.Series([None, None])

    top = pred[0]
    return pd.Series([top['label'].split('LABEL_')[1], top['score']])


In [6]:
class ToDataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of tokenizer encodings.

    Each item is a dict with the same keys as ``encodings``; every value is a
    detached copy of row ``idx`` moved to the module-level ``device``.
    """

    def __init__(self, encodings):
        # Mapping from field name to an indexable tensor-like value.
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the 'input_ids' entry.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            # clone + detach so the returned tensor is independent of the stored encodings
            item[field] = values[idx].clone().detach().to(device)
        return item


In [7]:
class ImbTrainer(Trainer):
    """Trainer subclass with inverse-frequency class weights and a combined loss helper.

    On construction the class weights are derived from ``self.train_dataset``
    (when it exposes ``labels``). ``compute_total_loss`` blends a class-weighted
    cross-entropy with a calibration term.

    NOTE(review): the combined loss lives in ``compute_total_loss``; this class
    does not override any other loss hook, so whether it is used during training
    depends on code outside this class - confirm.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # None when no labelled training set is attached (e.g. inference-only use).
        self.class_weights = self._calculate_class_weights(self.train_dataset)
        self.class_weighted_loss = None
        self.calibration_loss = None

    def _calculate_class_weights(self, dataset):
        """Return normalised inverse class frequencies, or ``None`` if unavailable.

        Args:
            dataset: Object exposing an iterable ``labels`` attribute of class
                indices, or ``None``.

        Returns:
            A 1-D tensor of length ``self.model.config.num_labels`` summing to 1,
            with weight 0 for classes that never occur; ``None`` when ``dataset``
            has no ``labels`` or the labels are empty.
        """
        labels = getattr(dataset, 'labels', None)
        if labels is None:
            # Previously this raised AttributeError when the trainer was built
            # without a train_dataset.
            return None

        # Count the number of samples in each class
        class_counts = torch.zeros(self.model.config.num_labels)
        for label in labels:
            class_counts[label] += 1

        # Inverse frequency per class; absent classes get 0 instead of inf so
        # the normalisation below cannot produce NaN.
        inv_frequencies = torch.where(
            class_counts > 0, 1 / class_counts, torch.zeros_like(class_counts)
        )

        # Normalize the inverse frequencies so that they sum up to 1
        sum_inv_frequencies = torch.sum(inv_frequencies)
        if sum_inv_frequencies == 0:
            return None
        return inv_frequencies / sum_inv_frequencies

    def _calculate_calibration_loss(self, logits, labels):
        """Return the calibration term of the combined loss.

        Args:
            logits: Raw model outputs; the class dimension is the last one.
            labels: Either class indices (one per row of ``logits``) or targets
                with the same shape as ``logits``.

        Returns:
            A scalar tensor: binary cross-entropy on the logits when ``labels``
            has the same shape as ``logits``, otherwise the mean negative
            log-likelihood of the labelled class.
        """
        if labels.shape == logits.shape:
            # BCE-with-logits applies the sigmoid itself, so it must be given the
            # raw logits rather than softmax probabilities.
            return torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels.to(logits.dtype)
            )

        # Calibration loss (Negative Log-Likelihood); log_softmax is the
        # numerically stable form of log(softmax(logits)).
        log_probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
        nll_loss = -log_probabilities.gather(dim=1, index=labels.view(-1, 1))
        return nll_loss.mean()

    def compute_total_loss(self, model, inputs, return_outputs=False):
        """Compute the blend of class-weighted cross-entropy and calibration loss.

        Args:
            model: Callable invoked as ``model(**inputs)``; its result must
                expose ``logits``.
            inputs: Mapping of model inputs. The ``'labels'`` entry is popped
                (the mapping is mutated) before the model is called.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        # Calculate the class_weight loss (cross-entropy loss). Weights follow
        # the logits' device and dtype; with no weights this is plain
        # cross-entropy.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)
        class_weighted_loss = torch.nn.CrossEntropyLoss(weight=weight)(logits, labels)

        # Calculate the calibration loss
        calibration_loss = self._calculate_calibration_loss(logits, labels)

        # You can adjust the trade-off between class-weighted and calibration loss using a hyperparameter
        trade_off = 0.5  # Adjust this value as needed
        total_loss = trade_off * class_weighted_loss + (1 - trade_off) * calibration_loss

        return (total_loss, outputs) if return_outputs else total_loss


In [8]:
# Function to get y_pred and y_pred_prob
def preprocess_logits_for_metrics_from_logits(y_pred_logits):
    """Convert raw logits into hard predictions and last-class probabilities.

    Progress is printed along the way. The torch implementation is tried first;
    if it raises, the numpy / scipy equivalents are used instead.

    Args:
        y_pred_logits: Array-like of logits with the class scores on the last
            axis. The probability extraction indexes ``[:, -1]``, so a 2-D
            input is required.

    Returns:
        A 4-tuple ``(y_pred_array, y_pred, y_pred_prob_array, y_pred_prob)``:
        the argmax over the last axis as an array, the same values as a flat
        list, the softmax over the last axis as an array, and the last column
        of that softmax as a flat list.

    Raises:
        AssertionError: If the argmax of the probabilities disagrees with the
            argmax of the logits.
    """
    # HACK
    # Get y_pred
    print('-'*20)
    # The tensor is built on the CPU and inside a guard: building it on an
    # accelerator outside the try block could fail (e.g. unsupported dtype)
    # before either fallback below had a chance to run.
    try:
        y_pred_logits_tensor = torch.as_tensor(y_pred_logits).detach().cpu()
    except Exception:
        y_pred_logits_tensor = None

    print('Getting y_pred through argmax of y_pred_logits...')
    try:
        y_pred_array = torch.argmax(y_pred_logits_tensor, dim=-1).numpy()
        print('Using torch.argmax.')
    except Exception:
        y_pred_array = np.asarray(y_pred_logits).argmax(axis=-1)
        print('Using np.argmax.')
    print(f'y_pred_array shape: {y_pred_array.shape}')
    print('-'*20)
    print('Flattening y_pred...')
    y_pred = y_pred_array.flatten().tolist()
    print(f'y_pred length: {len(y_pred)}')
    print('-'*20)

    # Get y_pred_prob
    print('-'*20)
    print('Getting y_pred_prob through softmax of y_pred_logits...')
    try:
        y_pred_prob_array = torch.nn.functional.softmax(y_pred_logits_tensor, dim=-1).numpy()
        print('Using torch.nn.functional.softmax.')
    except Exception:
        y_pred_prob_array = scipy.special.softmax(np.asarray(y_pred_logits), axis=-1)
        print('Using scipy.special.softmax.')
    # from: https://discuss.huggingface.co/t/different-results-predicting-from-trainer-and-model/12922
    assert np.array_equal(y_pred_prob_array.argmax(axis=-1), y_pred_array), 'Argmax of y_pred_prob_array does not match y_pred_array.'
    print(f'y_pred_prob shape: {y_pred_prob_array.shape}')
    print('-'*20)
    print('Flattening y_pred_prob and extracting probabilities of 1...')
    # Last column = probability of the last class (class 1 for binary labels).
    y_pred_prob = y_pred_prob_array[:, -1].flatten().tolist()
    print(f'y_pred_prob length: {len(y_pred_prob)}')
    print('-'*20)

    return (
        y_pred_array, y_pred, y_pred_prob_array, y_pred_prob
    )


In [9]:
def prob_confirmatory_tests(y_pred, y_pred_prob):
    """Print confirmatory statistics relating predicted labels to predicted probabilities.

    Three checks are printed in order: an independent-samples t-test of
    ``y_pred_prob`` against ``y_pred``, a Logit regression of ``y_pred`` on
    ``y_pred_prob``, and an OLS regression of ``y_pred_prob`` on ``y_pred``.
    If a regression raises, only the exception's class name is printed and the
    function carries on.

    Args:
        y_pred: Array-like of predicted labels.
        y_pred_prob: Array-like of predicted probabilities, same length as ``y_pred``.

    Returns:
        None. All results are written to stdout.
    """
    # Confirmatory Regression
    print('+'*20)
    print('Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob')
    print('-'*20)
    print('T-Test y_pred_prob ~ y_pred:')
    levene = scipy.stats.levene(y_pred_prob, y_pred)
    # Levene's null hypothesis is that the variances are equal, so equal
    # variances may only be assumed when the test is NOT significant. A
    # significant result selects Welch's t-test (equal_var=False).
    equal_var_levene = bool(levene.pvalue >= 0.05)
    print(scipy.stats.ttest_ind(y_pred_prob, y_pred, equal_var=equal_var_levene))

    print('\n')
    print('-'*20)
    print('Logit y_pred ~ y_pred_prob:')
    try:
        # No constant is added to exog, so the model is fitted without an intercept.
        logit_model = sm.Logit(endog=y_pred, exog=y_pred_prob)
        logit_results = logit_model.fit()
        std_coef = logit_results.params[0] / np.std(y_pred_prob)
        std_err = logit_results.bse[0]
        log_likelihood = logit_results.llf
        print(logit_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
        print(f'Log Likelihood: {log_likelihood}')
    except Exception as e:
        # Best effort: report the failure type and continue with the next check.
        print(type(e).__name__)

    print('-'*20)
    print('\n')
    print('-'*20)
    print('OLS y_pred_prob ~ y_pred:')
    try:
        # As above, fitted without an intercept.
        ols_model = sm.OLS(endog=y_pred_prob, exog=y_pred)
        ols_results = ols_model.fit()
        std_coef = ols_results.params[0] / np.std(y_pred)
        std_err = ols_results.bse[0]
        print(ols_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
    except Exception as e:
        print(type(e).__name__)

    print('-'*20)
    print('+'*20)
    print('\n')


# Classifying

### READ DATA

In [10]:
# # ATTN: IF THIS IS THE FIRST TIME YOU ARE CLASSIFYING JOBS, UNCOMMENT AND RUN THIS CODE
# with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
#     df_jobs_len = int(f.read())
# df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
# assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
# print(f'Dataframe df_jobs loaded with shape: {df_jobs.shape}')
# with open(f'{data_dir}df_manual_len.txt', 'r') as f:
#     df_manual_len = int(f.read())
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
# assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
# print(f'Dataframe df_manual loaded with shape: {df_manual.shape}')


In [11]:
# Expected row counts, recorded on disk by an earlier step.
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

with open(f'{data_dir}df_manual_len.txt', 'r') as f:
    df_manual_len = int(f.read())

# For every checkpoint name: the columns df_jobs must contain and the columns
# it must not contain for that checkpoint to be accepted. Checkpoints are tried
# in this order and the first acceptable one wins.
warmth_cols = ['Warmth', 'Warmth_Probability']
competence_cols = ['Competence', 'Competence_Probability']
checkpoint_requirements = {
    'classified': (warmth_cols + competence_cols, []),
    'classified_Warmth_Competence': (warmth_cols + competence_cols, []),
    'classified_Warmth': (warmth_cols, competence_cols),
}

for done_dfs_name, (required_cols, absent_cols) in checkpoint_requirements.items():
    jobs_path = f'{df_save_dir}df_jobs_{done_dfs_name}.pkl'
    manual_path = f'{df_save_dir}df_manual_{done_dfs_name}.pkl'

    # Both pickles must exist and be non-empty before they are read.
    if not all(os.path.exists(path) and os.path.getsize(path) > 0 for path in (jobs_path, manual_path)):
        continue

    df_jobs = pd.read_pickle(jobs_path)
    assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    df_manual = pd.read_pickle(manual_path)
    assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'

    has_required = all(column in df_jobs.columns for column in required_cols)
    has_forbidden = any(column in df_jobs.columns for column in absent_cols)
    if has_required and not has_forbidden:
        # The frames were just read and length-checked above, so they are
        # reported directly instead of being unpickled a second time.
        for df_name, loaded_df in (('df_jobs', df_jobs), ('df_manual', df_manual)):
            print(f'Loading {df_name}_{done_dfs_name}...')
            print(f'Dataframe {df_name}_{done_dfs_name} loaded with shape: {loaded_df.shape}')
        break

else:
    # No acceptable checkpoint was found: start from the unclassified inputs.
    print('Loading df_jobs_for_classification...')
    df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
    assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    print(f'Dataframe df_jobs_for_classification loaded with shape: {df_jobs.shape}')
    df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
    assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
    print(f'Dataframe df_manual_for_training loaded with shape: {df_manual.shape}')


Loading df_jobs_classified...
Dataframe df_jobs_classified loaded with shape: (307154, 92)
Loading df_manual_classified...
Dataframe df_manual_classified loaded with shape: (5947, 76)


In [12]:
%%time
# Banner marking the start of the classification run.
print('#'*40, 'Starting!', '#'*40, sep='\n')

# Suffix accumulated from the names of the columns classified so far.
done_cols = ''
# Vectorizer / classifier pair used for each analysis column. Every column gets
# its own dict because the classification loop adds per-column entries to it.
final_estimators_dict = {
    target: {
        'vectorizer_name': 'BERTBASEUNCASED',
        'classifier_name': 'BertForSequenceClassification',
    }
    for target in ('Warmth', 'Competence')
}

# Classify every analysis column that still needs it, using either a fitted
# supervised estimator or a fine-tuned transformer, and checkpoint both frames
# to disk after each column.
for col in tqdm.tqdm(analysis_columns):
    # NOTE(review): this only classifies when df_manual ALREADY has
    # f'{col}_predicted'; on a first run that column does not exist yet, so the
    # column would be skipped. Confirm whether `not in` / `or` was intended.
    if col not in df_jobs.columns and f'{col}_predicted' in df_manual.columns:
        print('-'*20)
        final_estimators_dict[col]['path_suffix'] = path_suffix = f' - {col} - {(vectorizer_name := final_estimators_dict[col]["vectorizer_name"])} + {(classifier_name := final_estimators_dict[col]["classifier_name"])} (Save_protocol={protocol})'

        if classifier_name in list(classifiers_pipe.keys()):
            method = 'Supervised'
            with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
                # Strip the trailing newline, as the Transformers branch does;
                # otherwise it ends up inside the estimator path built below.
                results_save_path = f.read().strip('\n')
            print('-'*20)
            print(f'Using {classifier_name} from {method} pipeline.')
            print('Loading Supervised Estimator.')
            # NOTE(review): `path_suffix` already starts with a space, so this
            # name contains 'Estimator  - ' (two spaces) while the Transformers
            # branch uses one. Confirm against the name used when saving.
            with open(
                f'{results_save_path}{method} Fitted Estimator {path_suffix}.pkl', 'rb'
            ) as f:
                estimator = joblib.load(f)
            print('Done loading Supervised Estimator!')

            print('-'*20)
            print('Classifying data.')
            # df_jobs
            print('Classifying df_jobs.')
            X = np.array(list(df_jobs[text_col].astype('str').values))
            df_jobs[col] = estimator.predict(X)
            if hasattr(estimator, 'predict_proba'):
                # Get the the whole of the last column, which is the  probability of 1, and flatten to list
                df_jobs[f'{col}_Probability'] = estimator.predict_proba(X)[:, -1]
            # df_manual
            print('Classifying df_manual to generate instrumental variables.')
            X_instrument = np.array(list(df_manual[text_col].astype('str').values))
            df_manual[f'{col}_predicted'] = estimator.predict(X_instrument)
            if hasattr(estimator, 'predict_proba'):
                # Get the the whole of the last column, which is the  probability of 1, and flatten to list
                df_manual[f'{col}_Probability_predicted'] = estimator.predict_proba(X_instrument)[:, -1]

            print(f'Done classifying data using {classifier_name} for {col}!')
            print('-'*20)

        elif classifier_name in list(transformers_pipe.keys()):
            method = 'Transformers'
            with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
                results_save_path = f.read().strip('\n')
            with open(f'{data_dir}{method}_done_xy_save_path.txt', 'r') as f:
                done_xy_save_path = f.read().strip('\n')
            with open(f'{done_xy_save_path}{method} training_args_dict - {col} - {vectorizer_name} + {classifier_name}.json', 'r') as f:
                training_args_dict = json.load(f)
            print('-'*20)
            print(f'Using {classifier_name} from {method} pipeline.')
            model = transformers_pipe[classifier_name]['model']
            tokenizer = transformers_pipe[classifier_name]['tokenizer']
            config = transformers_pipe[classifier_name]['config']

            print(f'Loading Fitted Transformer {classifier_name} from pretrained.')
            estimator_dir = f'{results_save_path}{method} Fitted Estimator{path_suffix}.model'
            fitted_estimator = model.from_pretrained(estimator_dir, trust_remote_code=True)
            if hasattr(fitted_estimator, 'to'):
                fitted_estimator = fitted_estimator.to(device)
            tokenizer = tokenizer.from_pretrained(estimator_dir, trust_remote_code=True)
            config = config.from_pretrained(f'{estimator_dir}/config.json', trust_remote_code=True)
            print(f'Done loading Fitted Transformer {classifier_name} from pretrained!')

            try:
                print('Using transformers pipeline.')
                # Get predictions
                # Accelerate model
                (
                    fitted_estimator, tokenizer
                ) = accelerator.prepare(
                    fitted_estimator, tokenizer
                )
                classifier = transformers.pipeline(
                    model=fitted_estimator, tokenizer=tokenizer, function_to_apply='softmax', device=device, framework='pt', task='text-classification', return_all_scores=False
                )
                # Series.apply forwards extra keyword arguments to the applied
                # function, so the former `result_type='expand'` reached
                # extract_prediction and raised TypeError on every call. A
                # function returning a Series already yields a two-column frame.
                # df_jobs
                print('Classifying df_jobs.')
                df_jobs[[col, f'{col}_Probability']] = df_jobs[text_col].astype(str).progress_apply(extract_prediction)
                # df_manual
                print('Classifying df_manual to generate instrumental variables.')
                df_manual[[f'{col}_predicted', f'{col}_Probability_predicted']] = df_manual[text_col].astype(str).progress_apply(extract_prediction)

            except Exception as e:
                # Deliberate best effort: any pipeline failure falls back to
                # batch prediction through the Trainer API.
                print(f'Transformers pipeline caused {type(e).__name__}. Using Trainer instead.')
                # Tokenize df_jobs
                X = df_jobs[text_col].astype('str').values.tolist()
                encodings = tokenizer(
                    X, truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor, add_special_tokens=True
                ).to(device)
                dataset = ToDataset(encodings)
                # Tokenize df_manual
                X_instrument = df_manual[text_col].astype('str').values.tolist()
                encodings_instrument = tokenizer(
                    X_instrument, truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor, add_special_tokens=True
                ).to(device)
                dataset_instrument = ToDataset(encodings_instrument)
                # Accelerate model
                (
                    fitted_estimator, tokenizer, dataset, dataset_instrument
                ) = accelerator.prepare(
                    fitted_estimator, tokenizer, dataset, dataset_instrument
                )

                print(f'Getting estimator for {col}.')
                estimator = ImbTrainer(
                    model=fitted_estimator,
                    tokenizer=tokenizer,
                    args=TrainingArguments(**training_args_dict),
                )
                if estimator.place_model_on_device:
                    estimator.model.to(device)

                # df_jobs
                print('-'*20)
                print(f'Classifying data using {classifier_name} for {col}.')
                print('Classifying df_jobs.')
                (y_pred_logits, y_labels, metrics) = estimator.predict(dataset)
                y_pred_array, y_pred, y_pred_prob_array, y_pred_prob = preprocess_logits_for_metrics_from_logits(y_pred_logits)

                # Assign to dataframe
                df_jobs[col] = y_pred
                df_jobs[f'{col}_Probability'] = y_pred_prob

                # df_manual
                print('Classifying df_manual to generate instrumental variables.')
                (y_pred_logits_instrument, y_labels_instrument, metrics_instrument) = estimator.predict(dataset_instrument)
                y_pred_array_instrument, y_pred_instrument, y_pred_prob_array_instrument, y_pred_prob_instrument = preprocess_logits_for_metrics_from_logits(y_pred_logits_instrument)

                # Assign to dataframe
                df_manual[f'{col}_predicted'] = y_pred_instrument
                df_manual[f'{col}_Probability_predicted'] = y_pred_prob_instrument

            # Confirmatory Regression
            prob_confirmatory_tests(df_jobs[col].values, df_jobs[f'{col}_Probability'].values)
            # NOTE(review): for df_manual this compares the existing `col`
            # values with the PREDICTED probabilities, whereas the df_jobs call
            # compares predictions with their own probabilities. Confirm
            # whether f'{col}_predicted' was meant here.
            prob_confirmatory_tests(df_manual[col].values, df_manual[f'{col}_Probability_predicted'].values)

            print(f'Done classifying data using {classifier_name} for {col}!')
            print('-'*20)

        # Checkpoint both frames under a name listing the columns done so far.
        done_cols += f'_{col}'
        assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
        # df_jobs
        df_jobs.to_pickle(f'{df_save_dir}df_jobs_classified{done_cols}.pkl')
        df_jobs.to_csv(f'{df_save_dir}df_jobs_classified{done_cols}.csv', index=False)
        # df_manual
        df_manual.to_pickle(f'{df_save_dir}df_manual_classified{done_cols}.pkl')
        df_manual.to_csv(f'{df_save_dir}df_manual_classified{done_cols}.csv', index=False)
    else:
        print('-'*20)
        print(f'Column {col} already exists in dataframe. Skipping.')
        print('-'*20)



########################################
Starting!
########################################


100%|██████████| 2/2 [00:00<00:00, 10420.63it/s]

--------------------
Column Warmth already exists in dataframe. Skipping.
--------------------
--------------------
Column Competence already exists in dataframe. Skipping.
--------------------
CPU times: user 1.89 ms, sys: 1.92 ms, total: 3.81 ms
Wall time: 3.01 ms





## Inspect classified data

In [13]:
# Guard against writing an empty or non-DataFrame object, then persist the
# classified job sentences as both pickle and CSV.
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(path=f'{df_save_dir}df_jobs_classified.pkl')
df_jobs.to_csv(path_or_buf=f'{df_save_dir}df_jobs_classified.csv', index=False)



In [14]:
# Expected number of rows for df_jobs, as recorded on disk.
with open(f'{data_dir}df_jobs_len.txt', mode='r') as f:
    df_jobs_len = int(f.read())

# Reload the saved classified frame and confirm that no rows were lost.
df_jobs = pd.read_pickle(filepath_or_buffer=f'{df_save_dir}df_jobs_classified.pkl')
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'


In [15]:
# Guard against writing an empty or non-DataFrame object, then persist the
# manually labelled sentences (with their predictions) as both pickle and CSV.
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(path=f'{df_save_dir}df_manual_classified.pkl')
df_manual.to_csv(path_or_buf=f'{df_save_dir}df_manual_classified.csv', index=False)


In [16]:
# Expected number of rows for df_manual, as recorded on disk.
with open(f'{data_dir}df_manual_len.txt', mode='r') as f:
    df_manual_len = int(f.read())

# Reload the saved classified frame and confirm that no rows were lost.
df_manual = pd.read_pickle(filepath_or_buffer=f'{df_save_dir}df_manual_classified.pkl')
assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'


In [17]:
# Show the first rows of the reloaded df_jobs as the cell output.
df_jobs.head()


Unnamed: 0,Search Keyword,Platform,Job ID,Job Title,Company Name,Location,Job Description,Rating,Employment Type,Company URL,Job URL,Job Age,Job Age Number,Collection Date,Data Row,Tracking ID,Industry,Job Date,Type of ownership,Language,Dutch Requirement in Job Ad,English Requirement in Job Ad,Dutch Requirement in Job Ad_No,Dutch Requirement in Job Ad_Yes,English Requirement in Job Ad_No,English Requirement in Job Ad_Yes,Sector Code,Sector,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Gender,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Age,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Interaction_Female_Older_% per Sector,Interaction_Female_Younger_% per Sector,Interaction_Male_Older_% per Sector,Interaction_Male_Younger_% per Sector,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description spacy_sentencized,Job Description spacy_sentencized_num_words,Job Description spacy_sentencized_num_unique_words,Job Description spacy_sentencized_num_chars,Job Description spacy_sentencized_num_chars_no_whitespact_and_punt,Job Description spacy_sentencized_num_punctuations,Job Description_num_words,Job Description_num_unique_words,Job Description_num_chars,Job Description_num_chars_no_whitespact_and_punt,Job Description_num_punctuations,Job Description spacy_sentencized_lower,Dutch Requirement in Sentence,English Requirement in Sentence,Dutch Requirement in Sentence_No,Dutch Requirement in Sentence_Yes,English Requirement in Sentence_No,English Requirement in Sentence_Yes,Job Description spacy_tokenized,Job Description spacy_sentencized_cleaned,Job 
Description nltk_tokenized,Job Description gensim_tokenized,Job Description bert_tokenized,Warmth,Warmth_Probability,Competence,Competence_Probability
0,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,1,0,1,0,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1470.63,2843.37,1938.77,3748.49,1,0,1,0,About Our Client,3,3,16,14,0,558,320,3876,3240,23,about our client,No,No,1,0,1,0,"[about, our, client]",about our client,[client],[client],"[about, our, client]",0,0.02,0,0.01
1,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,1,0,1,0,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1470.63,2843.37,1938.77,3748.49,1,0,1,0,The Global KYC organisation is part of ING's C...,10,10,56,45,1,558,320,3876,3240,23,the global kyc organisation is part of ing's c...,No,No,1,0,1,0,"[the, global, kyc, organisation, is, part, of,...",the global kyc organisation is part of ing 's ...,"[global, kyc, organisation, part, ing, 's, coo...","[global, kyc, organis, ing, coo, domain]","[the, global, ky, ##c, organisation, is, part,...",0,0.01,0,0.1
2,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,1,0,1,0,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1470.63,2843.37,1938.77,3748.49,1,0,1,0,Its purpose is Enabling people and organisatio...,20,19,131,111,1,558,320,3876,3240,23,its purpose is enabling people and organisatio...,No,No,1,0,1,0,"[its, purpose, is, enabling, people, and, orga...",its purpose is enabling people and organisatio...,"[purpose, enabling, people, organisations, use...","[purpos, enabl, peopl, organis, us, bank, serv...","[its, purpose, is, enabling, people, and, orga...",1,0.75,1,0.56
3,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,1,0,1,0,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1470.63,2843.37,1938.77,3748.49,1,0,1,0,Our Global KYC organisation is a first line of...,34,31,239,203,1,558,320,3876,3240,23,our global kyc organisation is a first line of...,No,No,1,0,1,0,"[our, global, kyc, organisation, is, a, first,...",our global kyc organisation is a first line of...,"[global, kyc, organisation, first, line, defen...","[global, kyc, organis, line, defenc, depart, p...","[our, global, ky, ##c, organisation, is, a, fi...",0,0.07,1,0.88
4,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,1,0,1,0,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1470.63,2843.37,1938.77,3748.49,1,0,1,0,In our Global KYC organisation you will be wor...,18,18,128,109,1,558,320,3876,3240,23,in our global kyc organisation you will be wor...,No,No,1,0,1,0,"[in, our, global, kyc, organisation, you, will...",in our global kyc organisation you will be wor...,"[global, kyc, organisation, working, many, col...","[global, kyc, organis, work, colleagu, differ,...","[in, our, global, ky, ##c, organisation, you, ...",1,0.89,0,0.08


In [18]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 92 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      307154 non-null  object  
 1   Platform                                                            307154 non-null  object  
 2   Job ID                                                              307154 non-null  object  
 3   Job Title                                                           307154 non-null  object  
 4   Company Name                                                        307149 non-null  object  
 5   Location                                                            307154 non-null  object  
 6   Job Description                                                     307154 non-null  object  
 7 

In [19]:
df_jobs.describe()


Unnamed: 0,Rating,Data Row,Dutch Requirement in Job Ad_No,Dutch Requirement in Job Ad_Yes,English Requirement in Job Ad_No,English Requirement in Job Ad_Yes,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Interaction_Female_Older_% per Sector,Interaction_Female_Younger_% per Sector,Interaction_Male_Older_% per Sector,Interaction_Male_Younger_% per Sector,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description spacy_sentencized_num_words,Job Description spacy_sentencized_num_unique_words,Job Description spacy_sentencized_num_chars,Job Description spacy_sentencized_num_chars_no_whitespact_and_punt,Job Description spacy_sentencized_num_punctuations,Job Description_num_words,Job Description_num_unique_words,Job Description_num_chars,Job Description_num_chars_no_whitespact_and_punt,Job Description_num_punctuations,Dutch Requirement in Sentence_No,Dutch Requirement in Sentence_Yes,English Requirement in Sentence_No,English Requirement in Sentence_Yes,Warmth,Warmth_Probability,Competence,Competence_Probability
count,165831.0,141182.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0
mean,-0.38,359.68,0.97,0.03,0.94,0.06,6.81,656.79,45.36,5.47,2.6,712.05,54.6,5.37,2.82,528.39,40.86,4.97,2.09,840.04,59.04,5.74,3.32,1369.08,5.42,0.25,0.37,0.38,0.64,0.2,0.15,1.11,0.95,1846.01,2681.76,2238.91,3219.13,0.62,0.46,0.45,0.08,17.67,15.81,114.11,95.86,0.31,613.29,316.26,4089.23,3387.65,21.97,0.97,0.03,0.94,0.06,0.31,0.31,0.52,0.47
std,1.63,284.66,0.17,0.17,0.23,0.23,2.88,1047.82,19.49,8.72,4.15,1235.51,19.52,9.32,4.89,796.79,10.12,7.49,3.15,1437.93,9.98,9.83,5.69,2216.35,8.77,0.44,0.48,0.49,0.48,0.4,0.36,0.78,0.6,974.24,1138.22,1110.8,1207.73,0.64,0.5,0.5,0.28,16.45,12.77,107.75,90.37,0.53,524.43,147.74,3623.96,3021.22,34.37,0.17,0.17,0.23,0.23,0.46,0.38,0.5,0.35
min,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,7.0,12.5,0.06,0.03,21.0,15.63,0.16,0.08,15.0,18.94,0.14,0.06,13.0,44.44,0.09,0.05,29.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,595.24,662.2,721.04,839.94,0.0,0.0,0.0,0.0,3.0,2.0,6.0,1.0,0.0,4.0,4.0,31.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.0,111.0,1.0,0.0,1.0,0.0,4.0,87.0,27.59,0.72,0.34,189.0,34.87,1.43,0.75,95.0,32.76,0.89,0.38,195.0,51.18,1.33,0.77,290.0,1.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1092.89,1854.93,1627.74,1936.98,0.0,0.0,0.0,0.0,7.0,7.0,45.0,38.0,0.0,353.0,213.0,2289.0,1891.0,0.0,1.0,0.0,1.0,0.0,0.0,0.02,0.0,0.07
50%,-1.0,291.0,1.0,0.0,1.0,0.0,7.0,226.0,43.13,1.88,0.89,210.0,56.87,1.58,0.83,205.0,41.67,1.93,0.81,288.0,56.82,1.97,1.14,398.0,1.57,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1366.56,2843.37,1938.77,3571.64,1.0,0.0,0.0,0.0,13.0,12.0,84.0,70.0,0.0,536.0,311.0,3557.0,2930.0,14.0,1.0,0.0,1.0,0.0,0.0,0.07,1.0,0.54
75%,-1.0,578.0,1.0,0.0,1.0,0.0,9.0,416.0,65.13,3.46,1.65,557.0,72.41,4.2,2.2,661.0,48.82,6.22,2.62,708.0,67.24,4.84,2.8,1399.0,5.54,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,2746.21,3460.74,2372.18,3939.36,1.0,1.0,1.0,0.0,22.0,20.0,144.0,120.0,1.0,751.0,408.0,4973.0,4125.0,35.0,1.0,0.0,1.0,0.0,1.0,0.77,1.0,0.82
max,5.0,1000.0,1.0,1.0,1.0,1.0,11.0,3970.0,84.3,33.04,15.71,4510.0,87.5,34.04,17.85,2844.0,58.33,26.75,11.25,5228.0,80.81,35.73,20.69,7931.0,31.39,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3888.45,4529.66,4699.07,4869.2,2.0,1.0,1.0,1.0,349.0,209.0,2496.0,2142.0,11.0,10385.0,1107.0,66310.0,55654.0,631.0,1.0,1.0,1.0,1.0,1.0,0.96,1.0,0.94


In [20]:
df_jobs[['Job ID', 'Job Description spacy_sentencized']].info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 2 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Job ID                             307154 non-null  object
 1   Job Description spacy_sentencized  307154 non-null  object
dtypes: object(2)
memory usage: 7.0+ MB


In [21]:
df_jobs[['Job ID', 'Job Description spacy_sentencized']].head()


Unnamed: 0,Job ID,Job Description spacy_sentencized
0,pj_da9f2c12243d7031,About Our Client
1,pj_da9f2c12243d7031,The Global KYC organisation is part of ING's C...
2,pj_da9f2c12243d7031,Its purpose is Enabling people and organisatio...
3,pj_da9f2c12243d7031,Our Global KYC organisation is a first line of...
4,pj_da9f2c12243d7031,In our Global KYC organisation you will be wor...


In [22]:
df_manual.head()


Unnamed: 0,Job ID,Job Description spacy_sentencized,Warmth,Competence,Dutch Requirement in Sentence,English Requirement in Sentence,Dutch Requirement in Sentence_No,Dutch Requirement in Sentence_Yes,English Requirement in Sentence_No,English Requirement in Sentence_Yes,Search Keyword,Platform,Job Title,Company Name,Location,Industry,Dutch Requirement in Job Ad,English Requirement in Job Ad,Dutch Requirement in Job Ad_No,Dutch Requirement in Job Ad_Yes,English Requirement in Job Ad_No,English Requirement in Job Ad_Yes,Sector Code,Sector,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Gender,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Age,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Interaction_Female_Older_% per Sector,Interaction_Female_Younger_% per Sector,Interaction_Male_Older_% per Sector,Interaction_Male_Younger_% per Sector,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description spacy_sentencized_lower,Job Description spacy_tokenized,Job Description spacy_sentencized_cleaned,Job Description spacy_sentencized_num_words,Job Description spacy_sentencized_num_unique_words,Job Description spacy_sentencized_num_chars,Job Description spacy_sentencized_num_chars_no_whitespact_and_punt,Job Description spacy_sentencized_num_punctuations,Job Description nltk_tokenized,Job Description gensim_tokenized,Job Description bert_tokenized,Warmth_predicted,Warmth_Probability_predicted,Competence_predicted,Competence_Probability_predicted
0,3768944208,Were Happeo a diverse team of passionate prob...,1,1,No,No,1,0,1,0,social work activity,Glassdoor,Inbound Marketer,Happeo,Amsterdam,Ondernemingssoftware en netwerkoplossingen,No,No,1,0,1,0,Q,Health and social work activities,11.0,1208.0,84.3,10.05,4.78,224.0,15.63,1.69,0.89,Female,661.0,46.13,6.22,2.62,770.0,53.73,5.26,3.05,Mixed Age,1433.0,5.67,1,0,0,1,0,0,0,1,3888.45,4529.66,721.04,839.94,2,0,0,1,were happeo a diverse team of passionate prob...,"[were, happeo, a, diverse, team, of, passionat...",were happeo a diverse team of passionate probl...,22,21,153,128,1,"[happeo, diverse, team, passionate, problem-so...","[happeo, divers, team, passion, problem, solve...","[were, ha, ##ppe, ##o, a, diverse, team, of, p...",1,0.94,1,0.78
1,3768944208,Work closely with our Sales and Product leader...,1,1,No,No,1,0,1,0,social work activity,Glassdoor,Inbound Marketer,Happeo,Amsterdam,Ondernemingssoftware en netwerkoplossingen,No,No,1,0,1,0,Q,Health and social work activities,11.0,1208.0,84.3,10.05,4.78,224.0,15.63,1.69,0.89,Female,661.0,46.13,6.22,2.62,770.0,53.73,5.26,3.05,Mixed Age,1433.0,5.67,1,0,0,1,0,0,0,1,3888.45,4529.66,721.04,839.94,2,0,0,1,work closely with our sales and product leader...,"[work, closely, with, our, sales, and, product...",work closely with our sales and product leader...,23,21,140,116,0,"[work, closely, sales, product, leadership, pi...","[work, close, sale, product, leadership, pinpo...","[work, closely, with, our, sales, and, product...",1,0.89,1,0.86
2,3768944208,Assist Marketing & Product to position our pro...,1,1,No,No,1,0,1,0,social work activity,Glassdoor,Inbound Marketer,Happeo,Amsterdam,Ondernemingssoftware en netwerkoplossingen,No,No,1,0,1,0,Q,Health and social work activities,11.0,1208.0,84.3,10.05,4.78,224.0,15.63,1.69,0.89,Female,661.0,46.13,6.22,2.62,770.0,53.73,5.26,3.05,Mixed Age,1433.0,5.67,1,0,0,1,0,0,0,1,3888.45,4529.66,721.04,839.94,2,0,0,1,assist marketing & product to position our pro...,"[assist, marketing, product, to, position, our...",assist marketing product to position our produ...,17,16,112,93,0,"[assist, marketing, product, position, product...","[assist, market, product, posit, product, impr...","[assist, marketing, &, product, to, position, ...",1,0.51,1,0.89
3,3768944208,Youre not scared of a real scale-up environmen...,1,1,No,No,1,0,1,0,social work activity,Glassdoor,Inbound Marketer,Happeo,Amsterdam,Ondernemingssoftware en netwerkoplossingen,No,No,1,0,1,0,Q,Health and social work activities,11.0,1208.0,84.3,10.05,4.78,224.0,15.63,1.69,0.89,Female,661.0,46.13,6.22,2.62,770.0,53.73,5.26,3.05,Mixed Age,1433.0,5.67,1,0,0,1,0,0,0,1,3888.45,4529.66,721.04,839.94,2,0,0,1,youre not scared of a real scale-up environmen...,"[you, re, not, scared, of, a, real, scale, up,...",you re not scared of a real scale up environme...,21,18,118,95,0,"[youre, scared, real, scale-up, environment, f...","[your, scare, real, scale, environ, initi, ide...","[your, ##e, not, scared, of, a, real, scale, -...",1,0.79,1,0.8
4,3768944208,You will be joining a company with highly skil...,1,1,No,No,1,0,1,0,social work activity,Glassdoor,Inbound Marketer,Happeo,Amsterdam,Ondernemingssoftware en netwerkoplossingen,No,No,1,0,1,0,Q,Health and social work activities,11.0,1208.0,84.3,10.05,4.78,224.0,15.63,1.69,0.89,Female,661.0,46.13,6.22,2.62,770.0,53.73,5.26,3.05,Mixed Age,1433.0,5.67,1,0,0,1,0,0,0,1,3888.45,4529.66,721.04,839.94,2,0,0,1,you will be joining a company with highly skil...,"[you, will, be, joining, a, company, with, hig...",you will be joining a company with highly skil...,37,32,220,179,2,"[joining, company, highly, skilled, smart, div...","[join, compani, highli, skill, smart, divers, ...","[you, will, be, joining, a, company, with, hig...",1,0.92,1,0.81


In [23]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5947 entries, 0 to 5954
Data columns (total 76 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   Job ID                                                              5947 non-null   object 
 1   Job Description spacy_sentencized                                   5947 non-null   object 
 2   Warmth                                                              5947 non-null   int64  
 3   Competence                                                          5947 non-null   int64  
 4   Dutch Requirement in Sentence                                       5947 non-null   object 
 5   English Requirement in Sentence                                     5947 non-null   object 
 6   Dutch Requirement in Sentence_No                                    5947 non-null   int64  
 7   Dutch Requirement in

In [24]:
df_manual[['Job ID', 'Job Description spacy_sentencized']].info()


<class 'pandas.core.frame.DataFrame'>
Index: 5947 entries, 0 to 5954
Data columns (total 2 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Job ID                             5947 non-null   object
 1   Job Description spacy_sentencized  5947 non-null   object
dtypes: object(2)
memory usage: 139.4+ KB


In [25]:
df_manual[['Job ID', 'Job Description spacy_sentencized']].head()


Unnamed: 0,Job ID,Job Description spacy_sentencized
0,3768944208,Were Happeo a diverse team of passionate prob...
1,3768944208,Work closely with our Sales and Product leader...
2,3768944208,Assist Marketing & Product to position our pro...
3,3768944208,Youre not scared of a real scale-up environmen...
4,3768944208,You will be joining a company with highly skil...


## Merge df_jobs with df_manual

In [26]:
# Columns that identify one sentence of one job ad; used below as the join key
# of the merge and as the duplicate-detection subset.
merge_on_cols_list = [
    'Job ID',
    'Job Description spacy_sentencized',
    'Search Keyword',
    'Platform',
    'Job Title',
    'Company Name',
    'Location',
]
# Columns that are skipped by the `_actual` back-fill loop and dropped afterwards.
fill_on_cols_list = [
    'Job Description',
    'Industry',
    'Rating',
    'Employment Type',
    'Company URL',
    'Job URL',
    'Job Age',
    'Job Age Number',
    'Collection Date',
    'Data Row',
    'Tracking ID',
    'Job Date',
    'Type of ownership',
    'Language',
    'Job Description_num_words',
    'Job Description_num_unique_words',
    'Job Description_num_chars',
    'Job Description_num_chars_no_whitespact_and_punt',
    'Job Description_num_punctuations',
]


In [27]:
# Outer-join df_jobs with df_manual on the key columns. Non-key columns present
# in both frames keep their name for the df_jobs side and get an `_actual`
# suffix for the df_manual side. Rows duplicated on the key are then removed
# and the index is rebuilt.
# NOTE: this cell reassigns df_jobs from itself, so re-running it merges the
# already-merged frame again.
df_jobs = (
    df_jobs
    .merge(df_manual, how='outer', on=merge_on_cols_list, suffixes=('', '_actual'))
    .drop_duplicates(subset=merge_on_cols_list)
    .reset_index(drop=True)
)


In [28]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309446 entries, 0 to 309445
Columns: 161 entries, Search Keyword to Competence_Probability_predicted
dtypes: category(2), float64(113), object(46)
memory usage: 376.0+ MB


In [29]:
df_jobs.isna().sum()


Search Keyword                                                                    0
Platform                                                                          0
Job ID                                                                            0
Job Title                                                                         0
Company Name                                                                      5
Location                                                                          0
Job Description                                                                2292
Rating                                                                       143615
Employment Type                                                               28394
Company URL                                                                    8885
Job URL                                                                        2292
Job Age                                                                     

In [30]:
def fill_contradicting_cols(df_jobs, cols=None):
    """Let the "fill" columns take precedence over the corresponding main columns.

    For every dependent variable ``col`` two column pairs are reconciled:

    * ``col`` receives the values of ``{col}_actual``
    * ``{col}_Probability`` receives the values of ``{col}_Probability_predicted``

    Wherever the fill column holds a value, that value replaces the one in the
    main column (whether the main value was missing or merely different).
    Wherever the fill column is missing, the main column is left untouched.

    Parameters
    ----------
    df_jobs : pandas.DataFrame
        Frame containing the main and fill columns for each dependent
        variable. It is modified in place.
    cols : iterable of str, optional
        Names of the dependent variables. When omitted, the notebook-level
        ``dvs`` is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df_jobs`` object that was passed in.

    Raises
    ------
    AssertionError
        If a main column still contains missing values after filling.
    """
    if cols is None:
        # Fall back to the notebook global so existing single-argument calls keep working.
        cols = dvs

    for col in cols:
        fill_contradicting_dict = {
            col: f'{col}_actual',
            f'{col}_Probability': f'{col}_Probability_predicted'
        }
        for main_col, fill_col in fill_contradicting_dict.items():
            print(f'Filling {main_col} with values from {fill_col}')
            fill_values = df_jobs[fill_col]
            # One pass covers both "main is missing" and "main disagrees with fill":
            # every row where the fill column has a value takes that value.
            df_jobs[main_col] = df_jobs[main_col].mask(fill_values.notna(), fill_values)
            assert df_jobs[main_col].isna().sum() == 0, f'Missing values found in {main_col} column'

    return df_jobs


In [31]:
# First rows of the label/probability columns and their `_actual` / `_predicted`
# counterparts, displayed before fill_contradicting_cols is applied.
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].head()


Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
0,0.0,0.02,,,,0.0,0.01,,,
1,0.0,0.01,,,,0.0,0.1,,,
2,1.0,0.75,,,,1.0,0.56,,,
3,0.0,0.07,,,,1.0,0.88,,,
4,1.0,0.89,,,,0.0,0.08,,,


In [32]:
# Last rows of the same columns, displayed before fill_contradicting_cols is applied.
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].tail()


Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
309441,,,0.0,0.0,0.01,,,0.0,0.0,0.02
309442,,,0.0,0.0,0.01,,,0.0,0.0,0.01
309443,,,0.0,0.0,0.01,,,0.0,0.0,0.01
309444,,,0.0,0.0,0.13,,,0.0,0.0,0.01
309445,,,0.0,0.0,0.01,,,0.0,0.0,0.01


In [33]:
# Reconcile the main label/probability columns with their fill columns;
# the function prints one line per column pair it processes.
df_jobs = fill_contradicting_cols(df_jobs)


Filling Warmth with values from Warmth_actual
Filling Warmth_Probability with values from Warmth_Probability_predicted
Filling Competence with values from Competence_actual
Filling Competence_Probability with values from Competence_Probability_predicted


In [34]:
# First rows of the label/probability columns, displayed after fill_contradicting_cols has run.
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].head()


Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
0,0.0,0.02,,,,0.0,0.01,,,
1,0.0,0.01,,,,0.0,0.1,,,
2,1.0,0.75,,,,1.0,0.56,,,
3,0.0,0.07,,,,1.0,0.88,,,
4,1.0,0.89,,,,0.0,0.08,,,


In [35]:
# Last rows of the label/probability columns, displayed after fill_contradicting_cols has run.
df_jobs[
    [
        'Warmth', 'Warmth_Probability', 'Warmth_actual', 'Warmth_predicted', 'Warmth_Probability_predicted',
        'Competence', 'Competence_Probability', 'Competence_actual', 'Competence_predicted', 'Competence_Probability_predicted'
    ]
].tail()


Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
309441,0.0,0.01,0.0,0.0,0.01,0.0,0.02,0.0,0.0,0.02
309442,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01
309443,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01
309444,0.0,0.13,0.0,0.0,0.13,0.0,0.01,0.0,0.0,0.01
309445,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01


In [36]:
# Fill the remaining NaN values of the shared columns from their `_actual`
# counterpart (the df_manual side of the merge).
# Skipped: helper columns (`_actual`, `_predicted`, `_Probability`), the
# dependent variables in `dvs`, the merge keys and everything in fill_on_cols_list.
for col in df_jobs.columns:
    if (
        '_actual' not in col
        and '_predicted' not in col
        and '_Probability' not in col
        and col not in dvs
        and col not in merge_on_cols_list + fill_on_cols_list
        # Guard: a column without an `_actual` partner has nothing to fill from
        # and would otherwise raise a KeyError.
        and f'{col}_actual' in df_jobs.columns
        and df_jobs[col].isna().any()
    ):
        df_jobs[col] = df_jobs[col].fillna(df_jobs[f'{col}_actual'])


In [37]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309446 entries, 0 to 309445
Columns: 161 entries, Search Keyword to Competence_Probability_predicted
dtypes: category(2), float64(113), object(46)
memory usage: 376.0+ MB


In [38]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309446 entries, 0 to 309445
Columns: 161 entries, Search Keyword to Competence_Probability_predicted
dtypes: category(2), float64(113), object(46)
memory usage: 376.0+ MB


In [39]:
df_jobs.isna().sum()


Search Keyword                                                                    0
Platform                                                                          0
Job ID                                                                            0
Job Title                                                                         0
Company Name                                                                      5
Location                                                                          0
Job Description                                                                2292
Rating                                                                       143615
Employment Type                                                               28394
Company URL                                                                    8885
Job URL                                                                        2292
Job Age                                                                     

In [40]:
# Remove every column listed in fill_on_cols_list, plus every `_actual` helper
# column except the two manual label columns, then rebuild the index.
df_jobs = df_jobs.drop(
    columns=[
        name
        for name in df_jobs.columns
        if name in fill_on_cols_list
        or ('_actual' in name and name not in ('Warmth_actual', 'Competence_actual'))
    ]
).reset_index(drop=True)


In [41]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309446 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309446 non-null  object  
 1   Platform                                                            309446 non-null  object  
 2   Job ID                                                              309446 non-null  object  
 3   Job Title                                                           309446 non-null  object  
 4   Company Name                                                        309441 non-null  object  
 5   Location                                                            309446 non-null  object  
 6   Dutch Requirement in Job Ad                                         309446 non-null  object 

In [42]:
df_jobs.isna().sum()


Search Keyword                                                             0
Platform                                                                   0
Job ID                                                                     0
Job Title                                                                  0
Company Name                                                               5
Location                                                                   0
Dutch Requirement in Job Ad                                                0
English Requirement in Job Ad                                              0
Dutch Requirement in Job Ad_No                                             0
Dutch Requirement in Job Ad_Yes                                            0
English Requirement in Job Ad_No                                           0
English Requirement in Job Ad_Yes                                          0
Sector Code                                                                0

## Clean df_jobs

In [43]:
%%time
# Conver Job ID and Sentence to str
str_cols = [
    'Job ID',
    'Job Description spacy_sentencized',
]

for col in str_cols:
    df_jobs[col] = df_jobs[col].astype(str, errors='ignore').progress_apply(lambda x: x.strip().replace('[', '').replace(']', ''))
    df_jobs[col] = df_jobs[col].apply(lambda x: x.strip())
    df_jobs[col] = df_jobs[col].apply(lambda x: unicodedata.normalize('NFKD', x.encode('ascii', 'ignore').decode('utf-8', 'ignore')))
    print(f'{col} converted to str.' if all(df_jobs[col].progress_apply(lambda x: isinstance(x, str))) else f'{col} NOT converted to str.')


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Job ID converted to str.


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Job Description spacy_sentencized converted to str.
CPU times: user 1.4 s, sys: 64.7 ms, total: 1.46 s
Wall time: 1.7 s


In [44]:
# Convert the Warmth and Competence label columns to int where possible
# NOTE(review): the figures originally noted here (Warmth 1 = 1741, Competence 1 = 3043)
# do not match the value counts printed by the saved run of this cell; confirm
# what they refer to.
int_cols = [
    'Warmth',
    'Competence',
    'Warmth_actual',
    'Competence_actual',
    'Warmth_predicted',
    'Competence_predicted',
]

for col in int_cols:
    if df_jobs[col].isna().any():
        # NaN cannot be stored in an int64 column, so the cast could never
        # succeed; report it explicitly instead of attempting the cast with
        # errors='ignore' and scanning every element to find out it failed.
        print(f'{col} NOT converted to int.')
    else:
        df_jobs[col] = df_jobs[col].astype(np.int64)
        print(f'{col} converted to int.')
    print(f'{col} value counts:\n{df_jobs[col].value_counts()}')


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Warmth converted to int.
Warmth value counts:
Warmth
0    213883
1     95563
Name: count, dtype: int64


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Competence converted to int.
Competence value counts:
Competence
1    159600
0    149846
Name: count, dtype: int64


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Warmth_actual NOT converted to int.
Warmth_actual value counts:
Warmth_actual
0.00    4332
1.00    1615
Name: count, dtype: int64


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Competence_actual NOT converted to int.
Competence_actual value counts:
Competence_actual
0.00    3180
1.00    2767
Name: count, dtype: int64


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Warmth_predicted NOT converted to int.
Warmth_predicted value counts:
Warmth_predicted
0.00    4120
1.00    1827
Name: count, dtype: int64


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Competence_predicted NOT converted to int.
Competence_predicted value counts:
Competence_predicted
0.00    3052
1.00    2895
Name: count, dtype: int64


In [45]:
# Convert the Warmth and Competence probability columns to float
float_cols = [
    'Warmth_Probability',
    'Competence_Probability',
    'Warmth_Probability_predicted',
    'Competence_Probability_predicted',
]

for col in float_cols:
    df_jobs[col] = df_jobs[col].astype(np.float64, errors='ignore')
    # The failure branch previously said "NOT converted to int", copied from the int cell.
    print(f'{col} converted to float.' if all(df_jobs[col].progress_apply(lambda x: isinstance(x, float))) else f'{col} NOT converted to float.')


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Warmth_Probability converted to float.


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Competence_Probability converted to float.


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Warmth_Probability_predicted converted to float.


progress-bar:   0%|          | 0/309446 [00:00<?, ?it/s]

Competence_Probability_predicted converted to float.


In [46]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309446 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309446 non-null  object  
 1   Platform                                                            309446 non-null  object  
 2   Job ID                                                              309446 non-null  object  
 3   Job Title                                                           309446 non-null  object  
 4   Company Name                                                        309441 non-null  object  
 5   Location                                                            309446 non-null  object  
 6   Dutch Requirement in Job Ad                                         309446 non-null  object 

In [47]:
# Drop rows that have a missing value in any column other than the
# `_actual` / `_predicted` helper columns.
df_jobs = df_jobs.dropna(
    subset=[
        name
        for name in df_jobs.columns
        if not ('_actual' in name or '_predicted' in name)
    ]
)


In [48]:
# Remove rows that repeat the same combination of merge-key columns.
df_jobs = df_jobs.drop_duplicates(subset=merge_on_cols_list)


In [49]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Index: 309438 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309438 non-null  object  
 1   Platform                                                            309438 non-null  object  
 2   Job ID                                                              309438 non-null  object  
 3   Job Title                                                           309438 non-null  object  
 4   Company Name                                                        309438 non-null  object  
 5   Location                                                            309438 non-null  object  
 6   Dutch Requirement in Job Ad                                         309438 non-null  object  
 7 

In [50]:
df_jobs.isna().sum()


Search Keyword                                                             0
Platform                                                                   0
Job ID                                                                     0
Job Title                                                                  0
Company Name                                                               0
Location                                                                   0
Dutch Requirement in Job Ad                                                0
English Requirement in Job Ad                                              0
Dutch Requirement in Job Ad_No                                             0
Dutch Requirement in Job Ad_Yes                                            0
English Requirement in Job Ad_No                                           0
English Requirement in Job Ad_Yes                                          0
Sector Code                                                                0

In [51]:
df_jobs.describe()


Unnamed: 0,Dutch Requirement in Job Ad_No,Dutch Requirement in Job Ad_Yes,English Requirement in Job Ad_No,English Requirement in Job Ad_Yes,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Interaction_Female_Older_% per Sector,Interaction_Female_Younger_% per Sector,Interaction_Male_Older_% per Sector,Interaction_Male_Younger_% per Sector,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description spacy_sentencized_num_words,Job Description spacy_sentencized_num_unique_words,Job Description spacy_sentencized_num_chars,Job Description spacy_sentencized_num_chars_no_whitespact_and_punt,Job Description spacy_sentencized_num_punctuations,Dutch Requirement in Sentence_No,Dutch Requirement in Sentence_Yes,English Requirement in Sentence_No,English Requirement in Sentence_Yes,Warmth,Warmth_Probability,Competence,Competence_Probability,Warmth_actual,Competence_actual,Warmth_predicted,Warmth_Probability_predicted,Competence_predicted,Competence_Probability_predicted
count,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,309438.0,5944.0,5944.0,5944.0,5944.0,5944.0,5944.0
mean,0.97,0.03,0.94,0.06,6.81,656.01,45.36,5.46,2.6,710.98,54.59,5.37,2.81,527.64,40.84,4.96,2.09,838.94,59.06,5.73,3.32,1367.24,5.41,0.25,0.37,0.38,0.64,0.2,0.16,1.11,0.95,1845.17,2682.77,2237.44,3220.32,0.63,0.46,0.46,0.08,17.63,15.78,113.92,95.69,0.31,0.97,0.03,0.94,0.06,0.31,0.31,0.52,0.47,0.27,0.47,0.31,0.31,0.49,0.44
std,0.17,0.17,0.23,0.23,2.88,1046.01,19.47,8.7,4.14,1232.31,19.5,9.3,4.88,795.31,10.11,7.48,3.15,1434.3,9.98,9.8,5.68,2211.18,8.75,0.44,0.48,0.49,0.48,0.4,0.36,0.78,0.6,973.3,1136.55,1109.26,1206.5,0.63,0.5,0.5,0.28,16.43,12.76,107.64,90.27,0.54,0.17,0.17,0.23,0.23,0.46,0.38,0.5,0.35,0.44,0.5,0.46,0.39,0.5,0.38
min,0.0,0.0,0.0,0.0,1.0,7.0,12.5,0.06,0.03,21.0,15.63,0.16,0.08,15.0,18.94,0.14,0.06,13.0,44.44,0.09,0.05,29.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,595.24,662.2,721.04,839.94,0.0,0.0,0.0,0.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,4.0,87.0,27.59,0.72,0.34,189.0,34.87,1.43,0.75,95.0,32.76,0.89,0.38,195.0,51.18,1.33,0.77,290.0,1.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1092.89,1854.93,1627.74,1936.98,0.0,0.0,0.0,0.0,7.0,7.0,45.0,38.0,0.0,1.0,0.0,1.0,0.0,0.0,0.02,0.0,0.07,0.0,0.0,0.0,0.01,0.0,0.03
50%,1.0,0.0,1.0,0.0,7.0,226.0,43.13,1.88,0.89,210.0,56.87,1.58,0.83,205.0,41.67,1.93,0.81,288.0,56.82,1.97,1.14,398.0,1.57,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1366.56,2843.37,1938.77,3571.64,1.0,0.0,0.0,0.0,13.0,12.0,83.0,70.0,0.0,1.0,0.0,1.0,0.0,0.0,0.07,1.0,0.54,0.0,0.0,0.0,0.05,0.0,0.44
75%,1.0,0.0,1.0,0.0,9.0,416.0,65.13,3.46,1.65,557.0,72.41,4.2,2.2,661.0,48.82,6.22,2.62,708.0,67.24,4.84,2.8,1399.0,5.54,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,2746.21,3460.74,2372.18,3939.36,1.0,1.0,1.0,0.0,22.0,20.0,143.0,120.0,1.0,1.0,0.0,1.0,0.0,1.0,0.77,1.0,0.82,1.0,1.0,1.0,0.84,1.0,0.84
max,1.0,1.0,1.0,1.0,11.0,3970.0,84.3,33.04,15.71,4510.0,87.5,34.04,17.85,2844.0,58.33,26.75,11.25,5228.0,80.81,35.73,20.69,7931.0,31.39,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3888.45,4529.66,4699.07,4869.2,2.0,1.0,1.0,1.0,349.0,209.0,2496.0,2142.0,11.0,1.0,1.0,1.0,1.0,1.0,0.96,1.0,0.94,1.0,1.0,1.0,0.96,1.0,0.94


In [52]:
# Preview the first rows of the Warmth / Competence columns: for each of the two
# dimensions, the label, its probability, and the `_actual` / `_predicted` variants.
# The column names are generated as prefix x suffix, Warmth first, in display order.
df_jobs[
    [
        f'{dimension}{suffix}'
        for dimension in ('Warmth', 'Competence')
        for suffix in ('', '_Probability', '_actual', '_predicted', '_Probability_predicted')
    ]
].head()


Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
0,0,0.02,,,,0,0.01,,,
1,0,0.01,,,,0,0.1,,,
2,1,0.75,,,,1,0.56,,,
3,0,0.07,,,,1,0.88,,,
4,1,0.89,,,,0,0.08,,,


In [53]:
# Summary statistics restricted to the Warmth / Competence columns: for each of the
# two dimensions, the label, its probability, and the `_actual` / `_predicted` variants.
# The column names are generated as prefix x suffix, Warmth first, in display order.
df_jobs[
    [
        f'{dimension}{suffix}'
        for dimension in ('Warmth', 'Competence')
        for suffix in ('', '_Probability', '_actual', '_predicted', '_Probability_predicted')
    ]
].describe()



Unnamed: 0,Warmth,Warmth_Probability,Warmth_actual,Warmth_predicted,Warmth_Probability_predicted,Competence,Competence_Probability,Competence_actual,Competence_predicted,Competence_Probability_predicted
count,309438.0,309438.0,5944.0,5944.0,5944.0,309438.0,309438.0,5944.0,5944.0,5944.0
mean,0.31,0.31,0.27,0.31,0.31,0.52,0.47,0.47,0.49,0.44
std,0.46,0.38,0.44,0.46,0.39,0.5,0.35,0.5,0.5,0.38
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.02,0.0,0.0,0.01,0.0,0.07,0.0,0.0,0.03
50%,0.0,0.07,0.0,0.0,0.05,1.0,0.54,0.0,0.0,0.44
75%,1.0,0.77,1.0,1.0,0.84,1.0,0.82,1.0,1.0,0.84
max,1.0,0.96,1.0,1.0,0.96,1.0,0.94,1.0,1.0,0.94


In [54]:
# Validate before persisting. The type is checked first so that a non-DataFrame
# fails with a clear assertion instead of a TypeError raised by len(); only then
# is the frame required to be non-empty.
assert isinstance(df_jobs, pd.DataFrame), f'ERROR: df_jobs is {type(df_jobs).__name__}, expected a pandas DataFrame'
assert len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# Write the same frame in two formats under df_save_dir. The CSV is written
# without the index (index=False); the pickle stores the frame as-is.
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_correction.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_correction.csv', index=False)


In [55]:
# Announce the row count, then persist that same number as plain text in data_dir
# (the file holds only the integer, no trailing newline).
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_correction_len.txt', mode='w') as f:
    f.write(f'{len(df_jobs)}')


Saving classified df_jobs length 309438 to txt file.


In [56]:
# Inspect df_jobs for the analysis columns together with their `_actual` and
# `_predicted` counterparts. Order: all bare names, then all `_actual`, then all
# `_predicted`.
# NOTE(review): presumably get_df_info reports on the columns named in ivs_all —
# confirm against its definition.
get_df_info(
    df_jobs,
    ivs_all=[
        f'{col}{suffix}'
        for suffix in ('', '_actual', '_predicted')
        for col in analysis_columns
    ],
)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 309438 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309438 non-null  object  
 1   Platform                                                            309438 non-null  object  
 2   Job ID                                                              309438 non-null  object  
 3   Job Title                                                           309438 non-null  object  
 4   Company Name                                                        309438 non-null  object  
 5   Location                                                            309438 non-null  object  
 6   Dutch Requirement in Job Ad                                         309438 non-null  o

In [57]:
# Inspect df_jobs for the classified (probability) columns and their `_predicted`
# counterparts. Order: all bare names first, then all `_predicted` names.
# NOTE(review): presumably get_df_info reports on the columns named in ivs_all —
# confirm against its definition.
get_df_info(
    df_jobs,
    ivs_all=[
        f'{col}{suffix}'
        for suffix in ('', '_predicted')
        for col in classified_columns
    ],
)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 309438 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309438 non-null  object  
 1   Platform                                                            309438 non-null  object  
 2   Job ID                                                              309438 non-null  object  
 3   Job Title                                                           309438 non-null  object  
 4   Company Name                                                        309438 non-null  object  
 5   Location                                                            309438 non-null  object  
 6   Dutch Requirement in Job Ad                                         309438 non-null  o

In [58]:
# Inspect df_jobs for every dependent variable plus its derived columns.
# Only names without '_Probability' get an `_actual` counterpart; every name gets
# a `_predicted` counterpart.
# NOTE(review): assumes dvs_all is a list of column-name strings — confirm where
# it is defined.
get_df_info(
    df_jobs,
    ivs_all=[
        *dvs_all,
        *(f'{col}_actual' for col in dvs_all if '_Probability' not in col),
        *(f'{col}_predicted' for col in dvs_all),
    ],
)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 309438 entries, 0 to 309445
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      309438 non-null  object  
 1   Platform                                                            309438 non-null  object  
 2   Job ID                                                              309438 non-null  object  
 3   Job Title                                                           309438 non-null  object  
 4   Company Name                                                        309438 non-null  object  
 5   Location                                                            309438 non-null  object  
 6   Dutch Requirement in Job Ad                                         309438 non-null  o

### Save dataframe


In [59]:
# NOTE(review): an identical save of df_jobs runs in an earlier cell and only
# get_df_info calls sit between the two, so this re-save looks redundant unless
# get_df_info mutates df_jobs — confirm before removing either one.
#
# Validate before persisting. The type is checked first so that a non-DataFrame
# fails with a clear assertion instead of a TypeError raised by len(); only then
# is the frame required to be non-empty.
assert isinstance(df_jobs, pd.DataFrame), f'ERROR: df_jobs is {type(df_jobs).__name__}, expected a pandas DataFrame'
assert len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# Write the same frame in two formats under df_save_dir. The CSV is written
# without the index (index=False); the pickle stores the frame as-is.
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_correction.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_correction.csv', index=False)


In [60]:
# Announce the row count, then persist that same number as plain text in data_dir
# (the file holds only the integer, no trailing newline).
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_correction_len.txt', mode='w') as f:
    f.write(f'{len(df_jobs)}')


Saving classified df_jobs length 309438 to txt file.


In [61]:
# Validate before persisting. The type is checked first so that a non-DataFrame
# fails with a clear assertion instead of a TypeError raised by len(); only then
# is the frame required to be non-empty.
assert isinstance(df_manual, pd.DataFrame), f'ERROR: df_manual is {type(df_manual).__name__}, expected a pandas DataFrame'
assert len(df_manual) > 0, f'ERROR: LENGTH OF DF = {len(df_manual)}'
# Write the same frame in two formats under df_save_dir. The CSV is written
# without the index (index=False); the pickle stores the frame as-is.
df_manual.to_pickle(f'{df_save_dir}df_manual_for_correction.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_for_correction.csv', index=False)


In [62]:
# Announce the row count, then persist that same number as plain text in data_dir
# (the file holds only the integer, no trailing newline).
print(f'Saving classified df_manual length {len(df_manual)} to txt file.')
with open(f'{data_dir}df_manual_for_correction_len.txt', mode='w') as f:
    f.write(f'{len(df_manual)}')


Saving classified df_manual length 5947 to txt file.
