In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project's code directory so that project-local modules can be imported.
# Either the current working directory itself is the code directory, or it is one of
# (at most) the five nearest ancestors whose name contains `code_dir_name` but not
# `unwanted_subdir_name`.
cwd = Path.cwd()
if code_dir_name in cwd.name:
    code_dir = str(cwd)
else:
    # Slicing the ancestor list avoids an IndexError when cwd has fewer than five parents.
    for parent in list(cwd.parents)[:5]:
        if (code_dir_name in parent.name) and (unwanted_subdir_name not in parent.name):
            code_dir = str(parent)
            break

# sys.path entries must be strings: the import system ignores anything else, so a
# Path object or None would silently have no effect. Skip the append when no code
# directory was found, and avoid duplicate entries when the cell is re-run.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

Using MPS


<Figure size 640x480 with 0 Axes>

### Set variables

In [3]:
# Variables
method = 'Supervised'
# The save locations are read from small text files under data_dir.
with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
    results_save_path = f.read()
with open(f'{data_dir}{method}_done_xy_save_path.txt', 'r') as f:
    done_xy_save_path = f.read()
t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no zero_division parameter; forwarding it would raise a
    # TypeError as soon as the scorer is called.
    'accuracy_score': make_scorer(accuracy_score),
}
protocol = pickle.HIGHEST_PROTOCOL
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
classified_columns = ['Warmth_Probability', 'Competence_Probability']
# Template of metric names, all initialised to NaN.
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Brier Score': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Prefer Apple MPS, then CUDA, then CPU. The deprecated torch.has_mps flag is not
# needed: torch.backends.mps.is_built() reports the same information.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed (random_state is defined once, above) for every RNG in use.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
# NOTE(review): the seeded generator returned here is discarded, so this line does
# not affect later sampling unless the generator is stored and passed on explicitly.
torch.Generator(device_name).manual_seed(random_state)
cores = multiprocessing.cpu_count()
accelerator = Accelerator()
torch.autograd.set_detect_anomaly(True)
# NOTE(review): this only reads TOKENIZERS_PARALLELISM (and displays it as the cell's
# value); it does not set it. Confirm whether an assignment was intended.
os.environ.get('TOKENIZERS_PARALLELISM')


Using MPS


# Functions


In [None]:
def load_classified_df(done_dfs_name, df_jobs_len, df_save_dir):
    """Read a pickled dataframe and verify that it has the expected number of rows.

    Parameters
    ----------
    done_dfs_name : str
        File name (without the ``.pkl`` extension) of the pickle to read.
    df_jobs_len : int
        Number of rows the loaded dataframe must have.
    df_save_dir : str
        Directory prefix; it is concatenated directly with ``done_dfs_name``,
        so it must already end with a path separator.

    Returns
    -------
    The object returned by ``pd.read_pickle`` for that file.

    Raises
    ------
    AssertionError
        If the loaded object's length differs from ``df_jobs_len``.
    """
    print(f'Loading {done_dfs_name}...')

    pickle_path = f'{df_save_dir}{done_dfs_name}.pkl'
    loaded_df = pd.read_pickle(pickle_path)

    n_rows = len(loaded_df)
    assert n_rows == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {n_rows}'

    print(f'Dataframe {done_dfs_name} loaded with shape: {loaded_df.shape}')
    return loaded_df


In [4]:
class ToDataset(torch.utils.data.Dataset):
    """Dataset wrapper around a mapping of tokenizer encodings.

    ``encodings`` is stored as given. Each item is a dict with the same keys,
    where every value is the ``idx``-th entry of the corresponding encoding,
    cloned, detached and moved to the notebook-level ``device``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the 'input_ids' entry.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {}
        for name, values in self.encodings.items():
            item[name] = values[idx].clone().detach().to(device)
        return item


In [5]:
class ImbTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss for imbalanced labels.

    Class weights are the normalised inverse class frequencies of the training
    dataset, which must expose a ``labels`` iterable of class indices. When no
    training dataset is given (or it has no labels), ``class_weights`` is
    ``None`` and the loss is an unweighted cross-entropy.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A trainer built only for prediction has no train_dataset to count labels from.
        if self.train_dataset is not None:
            self.class_weights = self._calculate_class_weights(self.train_dataset)
        else:
            self.class_weights = None

    def _calculate_class_weights(self, dataset):
        """Return inverse-frequency class weights that sum to 1, or None if there are no labels."""
        # Count the number of samples in each class
        class_counts = torch.zeros(self.model.config.num_labels)
        for label in dataset.labels:
            class_counts[label] += 1

        # Calculate the inverse frequency of each class. A class with no samples
        # would give 1/0 = inf and turn every normalised weight into NaN or 0, so
        # absent classes get a weight of 0 instead.
        absent = class_counts == 0
        inv_frequencies = 1 / class_counts
        inv_frequencies[absent] = 0

        # Normalize the inverse frequencies so that they sum up to 1
        sum_inv_frequencies = torch.sum(inv_frequencies)
        if sum_inv_frequencies == 0:
            return None
        return inv_frequencies / sum_inv_frequencies

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        ``num_items_in_batch`` is accepted because newer Trainer versions pass it
        to ``compute_loss``; it is not used here.
        Returns ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weights must live on the same device (and have the same dtype) as the logits.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [6]:
# Function to get y_pred and y_pred_prob
def preprocess_logits_for_metrics_from_logits(y_pred_logits):
    """Turn classifier logits into predicted labels and class probabilities.

    Parameters
    ----------
    y_pred_logits : array-like
        Logits with the classes on the last axis.

    Returns
    -------
    tuple
        ``(y_pred_array, y_pred, y_pred_prob_array, y_pred_prob)`` where
        ``y_pred_array`` is the argmax over the last axis, ``y_pred`` is that
        array flattened to a list, ``y_pred_prob_array`` is the softmax over the
        last axis, and ``y_pred_prob`` is the list of probabilities in the last
        column (the probability of class 1 for binary logits).

    Raises
    ------
    AssertionError
        If the argmax of the probabilities disagrees with the argmax of the logits.
    """

    # Get y_pred
    print('-'*20)
    print('Getting y_pred through argmax of y_pred_logits...')
    try:
        # The tensor is created inside the try block (and left on its default
        # device) so that a failed conversion falls through to the numpy path
        # instead of crashing; the result is moved to numpy right away anyway.
        y_pred_array = torch.argmax(torch.tensor(y_pred_logits), dim=-1).cpu().numpy()
        print('Using torch.argmax.')
    except Exception:
        y_pred_array = y_pred_logits.argmax(axis=-1)
        print('Using np.argmax.')
    print(f'y_pred_array shape: {y_pred_array.shape}')
    print('-'*20)
    print('Flattening y_pred...')
    y_pred = y_pred_array.flatten().tolist()
    print(f'y_pred length: {len(y_pred)}')
    print('-'*20)

    # Get y_pred_prob
    print('-'*20)
    print('Getting y_pred_prob through softmax of y_pred_logits...')
    try:
        y_pred_prob_array = torch.nn.functional.softmax(torch.tensor(y_pred_logits), dim=-1).cpu().numpy()
        print('Using torch.nn.functional.softmax.')
    except Exception:
        y_pred_prob_array = scipy.special.softmax(y_pred_logits, axis=-1)
        print('Using scipy.special.softmax.')
    # from: https://discuss.huggingface.co/t/different-results-predicting-from-trainer-and-model/12922
    # Vectorised comparison: the builtin all() would loop over every element in Python.
    assert (y_pred_prob_array.argmax(axis=-1) == y_pred_array).all(), 'Argmax of y_pred_prob_array does not match y_pred_array.'
    print(f'y_pred_prob shape: {y_pred_prob_array.shape}')
    print('-'*20)
    print('Flattening y_pred_prob and extracting probabilities of 1...')
    y_pred_prob = y_pred_prob_array[:, -1].flatten().tolist()
    print(f'y_pred_prob length: {len(y_pred_prob)}')
    print('-'*20)

    return (
        y_pred_array, y_pred, y_pred_prob_array, y_pred_prob
    )


In [None]:
def prob_confirmatory_tests(y_pred, y_pred_prob):
    """Print sanity checks relating predicted labels to predicted probabilities.

    Runs, and prints the results of, three checks:
    an independent-samples t-test of ``y_pred_prob`` against ``y_pred``,
    a logit regression of ``y_pred`` on ``y_pred_prob``, and
    an OLS regression of ``y_pred_prob`` on ``y_pred``.
    A regression that fails with ``np.linalg.LinAlgError`` is reported and skipped.

    Parameters
    ----------
    y_pred : sequence
        Predicted class labels.
    y_pred_prob : sequence
        Predicted probabilities, in the same order as ``y_pred``.

    Returns
    -------
    None
    """

    # Confirmatory Regression
    print('+'*20)
    print('Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob')
    print('-'*20)
    print('T-Test y_pred_prob ~ y_pred:')
    levene = scipy.stats.levene(y_pred_prob, y_pred)
    # Levene's null hypothesis is that the variances are equal, so equal variances
    # may only be assumed when the test is NOT significant.
    equal_var_levene = levene.pvalue >= 0.05
    print(scipy.stats.ttest_ind(y_pred_prob, y_pred, equal_var=equal_var_levene))


    print('\n')
    print('-'*20)
    print('Logit y_pred ~ y_pred_prob:')
    try:
        logit_model = sm.Logit(endog=y_pred, exog=y_pred_prob)
        logit_results = logit_model.fit()
        # Coefficient divided by the standard deviation of the regressor.
        std_coef = logit_results.params[0] / np.std(y_pred_prob)
        std_err = logit_results.bse[0]
        log_likelihood = logit_results.llf
        print(logit_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
        print(f'Log Likelihood: {log_likelihood}')
    except np.linalg.LinAlgError:
        print('Logit LinAlgError: Singular matrix. Skipping confirmatory tests.')

    print('-'*20)
    print('\n')
    print('-'*20)
    print('OLS y_pred_prob ~ y_pred:')
    try:
        ols_model = sm.OLS(endog=y_pred_prob, exog=y_pred)
        ols_results = ols_model.fit()
        std_coef = ols_results.params[0] / np.std(y_pred)
        std_err = ols_results.bse[0]
        print(ols_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
    except np.linalg.LinAlgError:
        print('OLS LinAlgError: Singular matrix. Skipping confirmatory tests.')

    print('-'*20)
    print('+'*20)
    print('\n')


# Classifying

### READ DATA

In [7]:
# # ATTN: IF THIS IS THE FIRST TIME YOU ARE CLASSIFYING JOBS, UNCOMMENT AND RUN THIS CODE
# with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
#     df_jobs_len = int(f.read())
# df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
# assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'


In [8]:
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

# Checkpoints, most complete first. Each maps to the columns it must contain and
# the columns it must not contain to be accepted.
both_dimensions = ['Warmth', 'Warmth_Probability', 'Competence', 'Competence_Probability']
checkpoint_columns = {
    'df_jobs_classified': (both_dimensions, []),
    'df_jobs_classified_Warmth_Competence': (both_dimensions, []),
    'df_jobs_classified_Warmth': (
        ['Warmth', 'Warmth_Probability'],
        ['Competence', 'Competence_Probability'],
    ),
}

for done_dfs_name, (required_cols, forbidden_cols) in checkpoint_columns.items():
    checkpoint_path = f'{df_save_dir}{done_dfs_name}.pkl'
    if not (os.path.exists(checkpoint_path) and os.path.getsize(checkpoint_path) > 0):
        continue

    # Read each checkpoint once; load_classified_df also checks the row count.
    candidate = load_classified_df(done_dfs_name, df_jobs_len, df_save_dir)
    has_required = all(column in candidate.columns for column in required_cols)
    has_forbidden = any(column in candidate.columns for column in forbidden_cols)
    if has_required and not has_forbidden:
        df_jobs = candidate
        break
    print(f'{done_dfs_name} does not have the expected columns. Skipping.')

else:
    # for/else: reached only when no checkpoint was accepted, so the unclassified
    # data is read exactly once.
    print('Loading df_jobs_for_classification...')
    df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_classification.pkl')
    assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    print(f'Dataframe df_jobs_for_classification loaded with shape: {df_jobs.shape}')


In [9]:
%%time
print('#'*40)
print('Starting!')
print('#'*40)

# Suffix appended to the checkpoint file name after each classified column.
done_cols = ''
# Vectorizer/classifier pair used for each analysis column.
final_estimators_dict = {
    'Warmth': {
        'vectorizer_name': 'BERTBASEUNCASED',
        'classifier_name': 'BertForSequenceClassification',
    },
    'Competence': {
        'vectorizer_name': 'BERTBASEUNCASED',
        'classifier_name': 'BertForSequenceClassification',
    },
}

for col in tqdm.tqdm(analysis_columns):
    if col in df_jobs.columns:
        print('-'*20)
        print(f'Column {col} already exists in dataframe. Skipping.')
        print('-'*20)
        continue

    print('-'*20)
    vectorizer_name = final_estimators_dict[col]['vectorizer_name']
    classifier_name = final_estimators_dict[col]['classifier_name']
    path_suffix = f' - {col} - {vectorizer_name} + {classifier_name} (Save_protocol={protocol})'
    final_estimators_dict[col]['path_suffix'] = path_suffix

    if classifier_name in list(classifiers_pipe.keys()):
        method = 'Supervised'
        with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
            results_save_path = f.read()
        print('-'*20)
        print(f'Using {classifier_name} from {method} pipeline.')
        print('Loading Supervised Estimator.')
        # NOTE(review): this file name has a space before path_suffix (which itself
        # starts with ' - '), unlike the Transformers branch below. Confirm it
        # matches the name used when the estimator was saved.
        with open(
            f'{results_save_path}{method} Fitted Estimator {path_suffix}.pkl', 'rb'
        ) as f:
            estimator = joblib.load(f)
        print('Done loading Supervised Estimator!')

        print('-'*20)
        print('Classifying data.')
        X = np.array(list(df_jobs[text_col].astype('str').values))
        df_jobs[col] = estimator.predict(X)
        # NOTE(review): without predict_proba no '{col}_Probability' column is
        # created, although later cells expect it.
        if hasattr(estimator, 'predict_proba'):
            # Get the whole of the last column, which is the probability of 1
            df_jobs[f'{col}_Probability'] = estimator.predict_proba(X)[:, -1]

        print(f'Done classifying data using {classifier_name} for {col}!')
        print('-'*20)

    elif classifier_name in list(transformers_pipe.keys()):
        method = 'Transformers'
        with open(f'{data_dir}{method}_results_save_path.txt', 'r') as f:
            results_save_path = f.read()
        with open(f'{data_dir}{method}_done_xy_save_path.txt', 'r') as f:
            done_xy_save_path = f.read()
        with open(f'{done_xy_save_path}{method} training_args_dict - {col} - {vectorizer_name} + {classifier_name}.json', 'r') as f:
            training_args_dict = json.load(f)
        print('-'*20)
        print(f'Using {classifier_name} from {method} pipeline.')
        model = transformers_pipe[classifier_name]['model']
        tokenizer = transformers_pipe[classifier_name]['tokenizer']
        config = transformers_pipe[classifier_name]['config']

        print(f'Loading Fitted Transformer {classifier_name} from pretrained.')
        estimator_dir = f'{results_save_path}{method} Fitted Estimator{path_suffix}.model'
        fitted_estimator = model.from_pretrained(estimator_dir)
        if hasattr(fitted_estimator, 'to'):
            fitted_estimator = fitted_estimator.to(device)
        tokenizer = tokenizer.from_pretrained(estimator_dir)
        config = config.from_pretrained(f'{estimator_dir}/config.json')
        print(f'Done loading Fitted Transformer {classifier_name} from pretrained!')

        # Tokenize
        X = df_jobs[text_col].astype('str').values.tolist()
        encodings = tokenizer(
            X, truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
        ).to(device)
        dataset = ToDataset(encodings)
        # Accelerate model
        (
            fitted_estimator, tokenizer, dataset
        ) = accelerator.prepare(
            fitted_estimator, tokenizer, dataset
        )

        # Get predictions
        # classifier = transformers.pipeline(model=fitted_estimator, tokenizer=tokenizer, function_to_apply='softmax', device=device, framework='pt', task='text-classification', return_all_scores=False)
        # df_jobs[col] = df_jobs[text_col].astype(str).apply(lambda x: [pred['label'].split('LABEL_')[1] for pred in classifier(x)][0])
        # df_jobs[f'{col}_Probability'] = df_jobs[text_col].astype(str).apply(lambda x: [pred['score'] for pred in classifier(x)][0])

        print(f'Getting estimator for {col}.')
        estimator = Trainer(
            model=fitted_estimator,
            tokenizer=tokenizer,
            args=TrainingArguments(**training_args_dict),
        )
        if estimator.place_model_on_device:
            estimator.model.to(device)

        print('-'*20)
        print(f'Classifying data using {classifier_name} for {col}.')
        (y_pred_logits, y_labels, metrics) = estimator.predict(dataset)
        y_pred_array, y_pred, y_pred_prob_array, y_pred_prob = preprocess_logits_for_metrics_from_logits(y_pred_logits)

        # Confirmatory Regression
        prob_confirmatory_tests(y_pred, y_pred_prob)

        # Assign to dataframe
        df_jobs[col] = y_pred
        df_jobs[f'{col}_Probability'] = y_pred_prob

        print(f'Done classifying data using {classifier_name} for {col}!')
        print('-'*20)

    else:
        # Nothing was classified, so do not record the column as done and do not
        # write an unclassified frame under a "classified" file name.
        print('-'*20)
        print(f'{classifier_name} was not found in classifiers_pipe or transformers_pipe. Skipping {col}.')
        print('-'*20)
        continue

    # Save a checkpoint after each column; the file name lists the columns done so far.
    done_cols += f'_{col}'
    assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
    df_jobs.to_pickle(f'{df_save_dir}df_jobs_classified{done_cols}.pkl')
    df_jobs.to_csv(f'{df_save_dir}df_jobs_classified{done_cols}.csv', index=False)



########################################
Starting!
########################################


  0%|          | 0/2 [00:00<?, ?it/s]

--------------------
--------------------
Using BertForSequenceClassification from Transformers pipeline.
Loading Fitted Transformer BertForSequenceClassification from pretrained.
Done loading Fitted Transformer BertForSequenceClassification from pretrained!


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Getting estimator for Warmth.
--------------------
Classifying data using BertForSequenceClassification for Warmth.


  0%|          | 0/15358 [00:00<?, ?it/s]

--------------------
Getting y_pred through argmax of y_pred_logits...
Using torch.argmax.
y_pred_array shape: (307154,)
--------------------
Flattening y_pred...
y_pred length: 307154
--------------------
--------------------
Getting y_pred_prob through softmax of y_pred_logits...
Using torch.nn.functional.softmax.
y_pred_prob shape: (307154, 2)
--------------------
Flattening y_pred_prob and extracting probabilities of 1...
y_pred length: 307154
--------------------
Done classifying data using BertForSequenceClassification for Warmth!
--------------------


 50%|█████     | 1/2 [6:59:26<6:59:26, 25166.76s/it]

--------------------
--------------------
Using BertForSequenceClassification from Transformers pipeline.
Loading Fitted Transformer BertForSequenceClassification from pretrained.
Done loading Fitted Transformer BertForSequenceClassification from pretrained!
Getting estimator for Competence.


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


--------------------
Classifying data using BertForSequenceClassification for Competence.


  0%|          | 0/15358 [00:00<?, ?it/s]

--------------------
Getting y_pred through argmax of y_pred_logits...
Using torch.argmax.
y_pred_array shape: (307154,)
--------------------
Flattening y_pred...
y_pred length: 307154
--------------------
--------------------
Getting y_pred_prob through softmax of y_pred_logits...
Using torch.nn.functional.softmax.
y_pred_prob shape: (307154, 2)
--------------------
Flattening y_pred_prob and extracting probabilities of 1...
y_pred length: 307154
--------------------
Done classifying data using BertForSequenceClassification for Competence!
--------------------


100%|██████████| 2/2 [13:55:45<00:00, 25072.71s/it] 

CPU times: user 1h 3min 18s, sys: 45min 3s, total: 1h 48min 21s
Wall time: 13h 55min 45s





## Inspect classified data

In [10]:
# Save the classified dataframe as both a pickle and a CSV, after checking it is a
# non-empty DataFrame.
classified_stem = f'{df_save_dir}df_jobs_classified'
has_rows = len(df_jobs) > 0
assert has_rows and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(classified_stem + '.pkl')
df_jobs.to_csv(classified_stem + '.csv', index=False)


In [11]:
# Reload the classified dataframe from disk and check that no rows are missing.
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    expected_len_text = f.read()
df_jobs_len = int(expected_len_text)

classified_pickle_path = f'{df_save_dir}df_jobs_classified.pkl'
df_jobs = pd.read_pickle(classified_pickle_path)
loaded_len = len(df_jobs)
assert loaded_len == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {loaded_len}'


In [12]:
# Summarise the reloaded dataframe: index range, columns, non-null counts and dtypes.
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  object  
 9 

In [13]:
# Preview the first rows, including the new Warmth/Competence columns.
df_jobs.head()


Unnamed: 0,Search Keyword,Platform,Job ID,Job Title,Company Name,Location,Job Description,Rating,Employment Type,Company URL,Job URL,Job Age,Job Age Number,Collection Date,Data Row,Tracking ID,Industry,Job Date,Type of ownership,Language,Dutch Requirement in Job Ad,English Requirement in Job Ad,Dutch Requirement in Job Ad_No,Dutch Requirement in Job Ad_Yes,English Requirement in Job Ad_No,English Requirement in Job Ad_Yes,Sector Code,Sector,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Gender,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Age,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description spacy_sentencized,Job Description num_words,Job Description num_unique_words,Job Description num_chars,Job Description num_chars_no_whitespact_and_punt,Job Description num_punctuations,Job Description spacy_sentencized_lower,Dutch Requirement in Sentence,English Requirement in Sentence,Dutch Requirement in Sentence_No,Dutch Requirement in Sentence_Yes,English Requirement in Sentence_No,English Requirement in Sentence_Yes,Job Description spacy_tokenized,Job Description spacy_sentencized_cleaned,Job Description nltk_tokenized,Job Description gensim_tokenized,Job Description bert_tokenized,Warmth,Warmth_Probability,Competence,Competence_Probability
0,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,True,False,True,False,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1,0,1,0,About Our Client,3,3,16,14,0,about our client,No,No,True,False,True,False,"[about, our, client]",about our client,[client],[client],"[about, our, client]",0,0.02,0,0.01
1,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,True,False,True,False,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1,0,1,0,The Global KYC organisation is part of ING's C...,10,10,56,45,1,the global kyc organisation is part of ing's c...,No,No,True,False,True,False,"[the, global, kyc, organisation, is, part, of,...",the global kyc organisation is part of ing 's ...,"[global, kyc, organisation, part, ing, 's, coo...","[global, kyc, organis, ing, coo, domain]","[the, global, ky, ##c, organisation, is, part,...",0,0.01,0,0.1
2,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,True,False,True,False,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1,0,1,0,Its purpose is Enabling people and organisatio...,20,19,131,111,1,its purpose is enabling people and organisatio...,No,No,True,False,True,False,"[its, purpose, is, enabling, people, and, orga...",its purpose is enabling people and organisatio...,"[purpose, enabling, people, organisations, use...","[purpos, enabl, peopl, organis, us, bank, serv...","[its, purpose, is, enabling, people, and, orga...",1,0.75,1,0.56
3,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,True,False,True,False,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1,0,1,0,Our Global KYC organisation is a first line of...,34,31,239,203,1,our global kyc organisation is a first line of...,No,No,True,False,True,False,"[our, global, kyc, organisation, is, a, first,...",our global kyc organisation is a first line of...,"[global, kyc, organisation, first, line, defen...","[global, kyc, organis, line, defenc, depart, p...","[our, global, ky, ##c, organisation, is, a, fi...",0,0.07,1,0.88
4,wholesale,Indeed,pj_da9f2c12243d7031,Transaction Monitoring Expert,Michael Page,Amsterdam,About Our Client\nThe Global KYC organisation ...,-1.0,-1,https://indeed.nl/rc/clk?jk=da9f2c12243d7031&f...,https://nl.indeed.com/vacature-bekijken/pagead...,2 dagen geleden,2 dagen geleden,2021-01-24,,,,,,en,No,No,True,False,True,False,G,Commercial services,11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39,0,0,1,1,0,0,1,1,1,0,1,0,In our Global KYC organisation you will be wor...,18,18,128,109,1,in our global kyc organisation you will be wor...,No,No,True,False,True,False,"[in, our, global, kyc, organisation, you, will...",in our global kyc organisation you will be wor...,"[global, kyc, organisation, working, many, col...","[global, kyc, organis, work, colleagu, differ,...","[in, our, global, ky, ##c, organisation, you, ...",1,0.89,0,0.08


In [14]:
# Check non-null counts and dtypes of the two columns used later as the de-duplication key.
df_jobs[['Job ID', 'Job Description spacy_sentencized']].info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 2 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Job ID                             307154 non-null  object
 1   Job Description spacy_sentencized  307154 non-null  object
dtypes: object(2)
memory usage: 7.0+ MB


In [15]:
# Preview the job identifier next to its sentence.
df_jobs[['Job ID', 'Job Description spacy_sentencized']].head()


Unnamed: 0,Job ID,Job Description spacy_sentencized
0,pj_da9f2c12243d7031,About Our Client
1,pj_da9f2c12243d7031,The Global KYC organisation is part of ING's C...
2,pj_da9f2c12243d7031,Its purpose is Enabling people and organisatio...
3,pj_da9f2c12243d7031,Our Global KYC organisation is a first line of...
4,pj_da9f2c12243d7031,In our Global KYC organisation you will be wor...


In [16]:
%%time
# Convert Job ID and the job description columns to cleaned str
str_cols = [
    'Job ID',
    'Job Description',
    'Job Description spacy_sentencized',
]

for col in str_cols:
    # One pass per column: remove square brackets, fold to ASCII and strip whitespace.
    # NFKD normalisation must run BEFORE the ASCII encode: it splits accented
    # characters into a base letter plus combining marks, so only the marks are
    # dropped. Normalising after the encode is a no-op and deletes the whole letter.
    df_jobs[col] = df_jobs[col].astype(str, errors='ignore').progress_apply(
        lambda x: unicodedata.normalize('NFKD', x.replace('[', '').replace(']', ''))
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .strip()
    )
    print(f'{col} converted to str.' if all(df_jobs[col].progress_apply(lambda x: isinstance(x, str))) else f'{col} NOT converted to str.')


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Job ID converted to str.


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Job Description converted to str.


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Job Description spacy_sentencized converted to str.
CPU times: user 4.03 s, sys: 882 ms, total: 4.91 s
Wall time: 7.32 s


In [17]:
# Cast the predicted Warmth and Competence labels to int64, then report the class balance.
# NOTE(review): the original comment quoted "Warmth 1 = 1741, Competence 1 = 3043";
# those counts do not match the value counts in this cell's saved output.
int_cols = ['Warmth', 'Competence']

for col in int_cols:
    df_jobs[col] = df_jobs[col].astype(np.int64, errors='ignore')

    # Element-wise check that every value is an int.
    every_value_is_int = all(df_jobs[col].progress_apply(lambda value: isinstance(value, int)))
    if every_value_is_int:
        print(f'{col} converted to int.')
    else:
        print(f'{col} NOT converted to int.')

    print(f'{col} value counts:\n{df_jobs[col].value_counts()}')


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Warmth converted to int.
Warmth value counts:
Warmth
0    211878
1     95276
Name: count, dtype: int64


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Competence converted to int.
Competence value counts:
Competence
1    158797
0    148357
Name: count, dtype: int64


In [18]:
# Convert Warmth and Competence probabilities to float
float_cols = [
    'Warmth_Probability',
    'Competence_Probability',
]

for col in float_cols:
    df_jobs[col] = df_jobs[col].astype(np.float64, errors='ignore')
    # The failure message now names the right target type (it said "int" before).
    print(f'{col} converted to float.' if all(df_jobs[col].progress_apply(lambda x: isinstance(x, float))) else f'{col} NOT converted to float.')


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Warmth_Probability converted to float.


progress-bar:   0%|          | 0/307154 [00:00<?, ?it/s]

Competence_Probability converted to float.


In [19]:
# Print column dtypes and non-null counts to inspect the frame after the
# str / int / float conversion cells.
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  object  
 9 

In [20]:
# Discard any row that is missing the job ID, the sentence text, or either
# label / probability pair; all other columns are allowed to contain NaN.
df_jobs = df_jobs.dropna(
    how='any',
    subset=[
        'Job ID',
        'Job Description spacy_sentencized',
        'Warmth',
        'Warmth_Probability',
        'Competence',
        'Competence_Probability',
    ],
)


In [21]:
# Collapse repeated (job, sentence) pairs, keeping the first occurrence.
df_jobs = df_jobs.drop_duplicates(
    subset=[
        'Job ID',
        'Job Description spacy_sentencized',
    ],
    keep='first',
)


In [22]:
# Re-check row count, dtypes and non-null counts after the dropna and
# drop_duplicates cells.
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  object  
 9 

In [23]:
# Summary statistics (count, mean, std, quartiles, min/max) for df_jobs.
df_jobs.describe()


Unnamed: 0,Rating,Data Row,Keywords Count,Gender_Female_n,Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Gender_Male_n,Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Age_Older_n,Age_Older_% per Sector,Age_Older_% per Social Category,Age_Older_% per Workforce,Age_Younger_n,Age_Younger_% per Sector,Age_Younger_% per Social Category,Age_Younger_% per Workforce,Sector_n,% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed,Age_Mixed,Age_Older,Age_Younger,Gender_Num,Age_Num,Platform_Num,Platform_LinkedIn,Platform_Indeed,Platform_Glassdoor,Job Description num_words,Job Description num_unique_words,Job Description num_chars,Job Description num_chars_no_whitespact_and_punt,Job Description num_punctuations,Warmth,Warmth_Probability,Competence,Competence_Probability
count,165831.0,141182.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0,307154.0
mean,-0.38,359.68,6.81,656.79,45.36,5.47,2.6,712.05,54.6,5.37,2.82,528.39,40.86,4.97,2.09,840.04,59.04,5.74,3.32,1369.08,5.42,0.25,0.37,0.38,0.64,0.2,0.15,1.11,0.95,0.62,0.46,0.45,0.08,17.67,15.81,114.11,95.86,0.31,0.31,0.31,0.52,0.47
std,1.63,284.66,2.88,1047.82,19.49,8.72,4.15,1235.51,19.52,9.32,4.89,796.79,10.12,7.49,3.15,1437.93,9.98,9.83,5.69,2216.35,8.77,0.44,0.48,0.49,0.48,0.4,0.36,0.78,0.6,0.64,0.5,0.5,0.28,16.45,12.77,107.75,90.37,0.53,0.46,0.38,0.5,0.35
min,-1.0,1.0,1.0,7.0,12.5,0.06,0.03,21.0,15.63,0.16,0.08,15.0,18.94,0.14,0.06,13.0,44.44,0.09,0.05,29.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,-1.0,111.0,4.0,87.0,27.59,0.72,0.34,189.0,34.87,1.43,0.75,95.0,32.76,0.89,0.38,195.0,51.18,1.33,0.77,290.0,1.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,7.0,45.0,38.0,0.0,0.0,0.02,0.0,0.07
50%,-1.0,291.0,7.0,226.0,43.13,1.88,0.89,210.0,56.87,1.58,0.83,205.0,41.67,1.93,0.81,288.0,56.82,1.97,1.14,398.0,1.57,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,13.0,12.0,84.0,70.0,0.0,0.0,0.07,1.0,0.54
75%,-1.0,578.0,9.0,416.0,65.13,3.46,1.65,557.0,72.41,4.2,2.2,661.0,48.82,6.22,2.62,708.0,67.24,4.84,2.8,1399.0,5.54,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,22.0,20.0,144.0,120.0,1.0,1.0,0.77,1.0,0.82
max,5.0,1000.0,11.0,3970.0,84.3,33.04,15.71,4510.0,87.5,34.04,17.85,2844.0,58.33,26.75,11.25,5228.0,80.81,35.73,20.69,7931.0,31.39,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,349.0,209.0,2496.0,2142.0,11.0,1.0,0.96,1.0,0.94


In [24]:
# Preview the first rows of the two labels alongside their probabilities.
df_jobs[['Warmth', 'Warmth_Probability', 'Competence', 'Competence_Probability']].head()


Unnamed: 0,Warmth,Warmth_Probability,Competence,Competence_Probability
0,0,0.02,0,0.01
1,0,0.01,0,0.1
2,1,0.75,1,0.56
3,0,0.07,1,0.88
4,1,0.89,0,0.08


In [25]:
# Summary statistics restricted to the two labels and their probabilities.
df_jobs[['Warmth', 'Warmth_Probability', 'Competence', 'Competence_Probability']].describe()


Unnamed: 0,Warmth,Warmth_Probability,Competence,Competence_Probability
count,307154.0,307154.0,307154.0,307154.0
mean,0.31,0.31,0.52,0.47
std,0.46,0.38,0.5,0.35
min,0.0,0.0,0.0,0.0
25%,0.0,0.02,0.0,0.07
50%,0.0,0.07,1.0,0.54
75%,1.0,0.77,1.0,0.82
max,1.0,0.96,1.0,0.94


In [26]:
# Sanity-check df_jobs, then persist it as both pickle and CSV.
# The type is asserted first and separately: previously len() ran before the
# isinstance check (and again inside the message), so a non-DataFrame could
# raise TypeError from len() instead of the intended AssertionError.
assert isinstance(df_jobs, pd.DataFrame), f'ERROR: df_jobs is {type(df_jobs).__name__}, expected a DataFrame'
assert len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# index=False: the CSV copy drops the row index, whereas the pickle keeps it.
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)


In [27]:
# Store the row count of df_jobs in a small text file under data_dir.
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_analysis_len.txt', mode='w') as len_file:
    len_file.write(f'{len(df_jobs)}')


Saving classified df_jobs length 307154 to txt file.


In [28]:
# Print the project's get_df_info report for df_jobs, passing the
# analysis_columns list (Warmth, Competence) as ivs_all.
# NOTE(review): get_df_info is defined outside this notebook; what it does
# with ivs_all is not visible here — confirm.
get_df_info(df_jobs, ivs_all=analysis_columns)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  o

In [29]:
# Same get_df_info report, this time passing classified_columns
# (Warmth_Probability, Competence_Probability) as ivs_all.
get_df_info(df_jobs, ivs_all=classified_columns)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  o

In [30]:
# Same get_df_info report, passing dvs_all as the column list.
# NOTE(review): dvs_all is handed to the parameter named ivs_all; presumably
# intentional reuse of the helper for the dependent variables — confirm.
get_df_info(df_jobs, ivs_all=dvs_all)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 307154 entries, 0 to 408598
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype   
---  ------                                            --------------   -----   
 0   Search Keyword                                    307154 non-null  object  
 1   Platform                                          307154 non-null  object  
 2   Job ID                                            307154 non-null  object  
 3   Job Title                                         307154 non-null  object  
 4   Company Name                                      307149 non-null  object  
 5   Location                                          307154 non-null  object  
 6   Job Description                                   307154 non-null  object  
 7   Rating                                            165831 non-null  float64 
 8   Employment Type                                   281052 non-null  o

### Save dataframe


In [31]:
# Sanity-check df_jobs, then persist it as both pickle and CSV.
# NOTE(review): an earlier cell in this notebook already writes df_jobs to
# these same two paths; presumably one of the two saves is redundant — confirm.
# The type is asserted first and separately: previously len() ran before the
# isinstance check (and again inside the message), so a non-DataFrame could
# raise TypeError from len() instead of the intended AssertionError.
assert isinstance(df_jobs, pd.DataFrame), f'ERROR: df_jobs is {type(df_jobs).__name__}, expected a DataFrame'
assert len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# index=False: the CSV copy drops the row index, whereas the pickle keeps it.
df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)


In [32]:
# Store the final row count of df_jobs in a small text file under data_dir.
print(f'Saving classified df_jobs length {len(df_jobs)} to txt file.')
with open(f'{data_dir}df_jobs_for_analysis_len.txt', mode='w') as len_file:
    len_file.write(f'{len(df_jobs)}')


Saving classified df_jobs length 307154 to txt file.
