# ATTN: This script should be run AFTER spacy sentence splitting is completed.


# Use spacy to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_SENTENCIZED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [4]:
def get_word_num_and_frequency(row, text_col):

    row['Job Description num_words'] = len(str(row[f'{text_col}']).split())
    row['Job Description num_unique_words'] = len(set(str(row[f'{text_col}']).split()))
    row['Job Description num_chars'] = len(str(row[f'{text_col}']))
    row['Job Description num_punctuations'] = len([c for c in str(row[f'{text_col}']) if c in string.punctuation])

    return row


In [5]:
# Funtion to print df gender and age info
def df_gender_age_info(df, ivs_all=None):
    if ivs_all is None:
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        try:
            print('='*20)
            print(f'{iv}:')
            print('-'*20)
            print(f'{iv} Counts:\n{df[f"{iv}"].value_counts()}')
            print('-'*20)
            print(f'{iv} Percentages:\n{df[f"{iv}"].value_counts(normalize=True).mul(100).round(1).astype(float)}')
            with contextlib.suppress(Exception):
                print('-'*20)
                print(f'{iv} Mean: {df[f"{iv}"].mean().round(2).astype(float)}')
                print('-'*20)
                print(f'{iv} Standard Deviation: {df[f"{iv}"].std().round(2).astype(float)}')
        except Exception:
            print(f'{iv} not available.')

    print('\n')


In [6]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_sentencized.pkl').reset_index(drop=True)


In [7]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 57 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL                 

In [8]:
# Job Ad info
df_gender_age_info(df_jobs)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 57 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL      

In [9]:
# Explode df so that every row is one sentence
df_jobs = df_jobs.explode('Job Description spacy_sentencized', ignore_index=True).reset_index(drop=True)


In [10]:
# 194820
len(df_jobs)


194820

In [11]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 57 columns):
 #   Column                                          Non-Null Count   Dtype   
---  ------                                          --------------   -----   
 0   Search Keyword                                  194820 non-null  object  
 1   Platform                                        194820 non-null  object  
 2   Job ID                                          194820 non-null  object  
 3   Job Title                                       194820 non-null  object  
 4   Company Name                                    194816 non-null  object  
 5   Location                                        194820 non-null  object  
 6   Job Description                                 194820 non-null  object  
 7   Rating                                          170120 non-null  float64 
 8   Employment Type                                 163313 non-null  object  
 9   Company URL    

In [12]:
df_jobs['Job Description spacy_sentencized_lower'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda job_sentence: job_sentence.strip().lower()
)


In [13]:
df_jobs[['Job Description spacy_sentencized', 'Job Description spacy_sentencized_lower']].head()


Unnamed: 0,Job Description spacy_sentencized,Job Description spacy_sentencized_lower
0,About Our Client,about our client
1,The Global KYC organisation is part of ING's C...,the global kyc organisation is part of ing's c...
2,Its purpose is Enabling people and organisatio...,its purpose is enabling people and organisatio...
3,Our Global KYC organisation is a first line of...,our global kyc organisation is a first line of...
4,In our Global KYC organisation you will be wor...,in our global kyc organisation you will be wor...


In [14]:
%%time
# Spacy tokenize
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)

df_jobs['Job Description spacy_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        str(token.text.strip().lower())
        for token in nlp.tokenizer(job_sentence)
        if len(token) != 0
        and not token.is_space
        and not token.is_stop
        and not token.is_punct
        and not token.is_bracket
        and not token.like_email
        and not token.text in custom_punct_chars
    ]
)

assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy.csv', index=False)


CPU times: user 37.2 s, sys: 1.56 s, total: 38.8 s
Wall time: 41.7 s


In [15]:
df_jobs['Job Description spacy_sentencized_cleaned'] = df_jobs['Job Description spacy_tokenized'].str.join(' ')


In [16]:
%%time
# Get sentence word frequencies
df_jobs = df_jobs.apply(
    lambda row: get_word_num_and_frequency(
        row=row, text_col='Job Description spacy_sentencized'
    ), 
    axis='columns',
    
)


In [17]:
# Job Sentence info
df_gender_age_info(df_jobs)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 64 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL    

In [18]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy.csv', index=False)



# Use NLTK to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [19]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [22]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tokenized_spacy.pkl').reset_index(drop=True)


In [23]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 64 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [24]:
%%time
# Tokenize with NLTK
# stop_words = set(stopwords.words('english'))
# punctuations = list(string.punctuation)
# lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

df_jobs['Job Description nltk_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        str(token.strip().lower()) 
        for token in word_tokenize(job_sentence) 
        if len(token) != 0 
        and token != '...' 
        and not token.lower() in set(stopwords.words('english')) 
        and not token.lower() in list(string.punctuation) 
    ]
)

assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk.csv', index=False)


CPU times: user 7min 12s, sys: 2min 26s, total: 9min 38s
Wall time: 11min 8s


In [25]:
df_jobs['Job Description nltk_tokenized'].head()


0                                             [client]
1    [global, kyc, organisation, part, ing, 's, coo...
2    [purpose, enabling, people, organisations, use...
3    [global, kyc, organisation, first, line, defen...
4    [global, kyc, organisation, working, many, col...
Name: Job Description nltk_tokenized, dtype: object

In [26]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 65 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [27]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk.csv', index=False)


# Use gensim to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [28]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [31]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk.pkl').reset_index(drop=True)


In [32]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 65 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [33]:
%%time
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'
df_jobs['Job Description gensim_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda sentence: preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
)

assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim.csv', index=False)


CPU times: user 55.4 s, sys: 3.02 s, total: 58.4 s
Wall time: 1min 2s


In [34]:
df_jobs['Job Description gensim_tokenized'].head()


0                                             [client]
1             [global, kyc, organis, ing, coo, domain]
2    [purpos, enabl, peopl, organis, us, bank, serv...
3    [global, kyc, organis, line, defenc, depart, p...
4    [global, kyc, organis, work, colleagu, differ,...
Name: Job Description gensim_tokenized, dtype: object

In [35]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 66 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [36]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl')
 df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim.csv', index=False)


# Use BERT to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *


Using MPS


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

0it [00:00, ?it/s]

In [3]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl').reset_index(drop=True)


In [4]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 66 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [5]:
%%time
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
device = torch.device('mps') if torch.has_mps and torch.backends.mps.is_built() and torch.backends.mps.is_available() else torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertForSequenceClassification.from_pretrained(bert_model_name).to(device)

df_jobs['Job Description bert_encodings'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda sentence: bert_tokenizer(
        str(sentence), truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
    ).to(device)
)

df_jobs['Job Description bert_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda sentence: bert_tokenizer.tokenize(str(sentence))
)

df_jobs['Job Description bert_tokenized_to_id'] = df_jobs['Job Description bert_tokenized'].apply(
    lambda sentence: bert_tokenizer.convert_tokens_to_ids(str(sentence))
)

if len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame):
    df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.pkl')

    df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.csv', index=False)
else:
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')


Token indices sequence length is longer than the specified maximum sequence length for this model (1539 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 15min 39s, sys: 21min 1s, total: 36min 41s
Wall time: 1h 11min 29s


In [6]:
df_jobs['Job Description bert_encodings'].head()


0    [input_ids, token_type_ids, attention_mask]
1    [input_ids, token_type_ids, attention_mask]
2    [input_ids, token_type_ids, attention_mask]
3    [input_ids, token_type_ids, attention_mask]
4    [input_ids, token_type_ids, attention_mask]
Name: Job Description bert_encodings, dtype: object

In [7]:
df_jobs['Job Description bert_tokenized'].head()


0                                 [about, our, client]
1    [the, global, ky, ##c, organisation, is, part,...
2    [its, purpose, is, enabling, people, and, orga...
3    [our, global, ky, ##c, organisation, is, a, fi...
4    [in, our, global, ky, ##c, organisation, you, ...
Name: Job Description bert_tokenized, dtype: object

In [8]:
df_jobs['Job Description bert_tokenized_to_id'].head()

0    100
1    100
2    100
3    100
4    100
Name: Job Description bert_tokenized_to_id, dtype: int64

In [9]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.csv', index=False)
