# ATTN: This script should be run AFTER spacy sentence splitting is completed.


# Use spacy to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_SENTENCIZED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through at most the five nearest ancestors of the working directory
# (nearest first) for a folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list means a shallow working
# directory (fewer than five ancestors) no longer raises IndexError, and
# `.name` compares only the last path component on any platform.
for ancestor in list(Path.cwd().parents)[:5]:

    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [2]:
# `code_dir` must already hold the project code directory at this point.
# Stop with an actionable message instead of the opaque TypeError that
# Path(None) would raise.
if code_dir is None:
    raise FileNotFoundError(
        f'code_dir is not set: no project code directory was found above {Path.cwd()}. '
        'Set code_dir manually before running this cell.'
    )

# MAIN DIR: parent of the code directory, with a trailing slash
main_dir = f'{Path(code_dir).parents[0]}/'

# code_dir: normalise to exactly one trailing slash so that re-running this
# cell does not keep stacking separators onto the path
code_dir = f"{str(code_dir).rstrip('/')}/"

# Make the code directory importable; skip the append when it is already
# listed so repeated runs do not duplicate the entry
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = [
    'Indeed',
    'Glassdoor',
    'LinkedIn',
]

# columns
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age',
    'Language', 'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male', '% Older', '% Younger',
]

# Frequently used column names and the working language code
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

# NOTE(review): mixes a plain code, a stringified list and a real list;
# presumably these are the accepted language labels - confirm against the consumer.
languages = [
    "en",
    "['nl', 'en']",
    ['en', 'nl'],
]

# Column names grouped under `str_cols` (presumably the text-typed columns - TODO confirm)
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Placeholder values (presumably treated as missing data - TODO confirm)
nan_list = [
    None, 'None',
    '', ' ',
    [],
    -1, '-1',
    0, '0',
    'nan', np.nan, 'Nan',
]

# Regex alternatives, in order: runs of newlines; two or more commas; two or
# more pipes; runs of newline/carriage-return characters; the zero-width
# position after "<lowercase letter>." that is followed by optional whitespace
# and an uppercase letter; the zero-width position before one or more colons
# followed by an uppercase letter.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [3]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import unicodedata
import pandas as pd
import numpy as np
import googletrans
from googletrans import Translator
random.seed(42)

# Set up Spacy
import spacy
from spacy.symbols import NORM, ORTH, LEMMA, POS

nlp = spacy.load('en_core_web_sm')

# Set up NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.tag import pos_tag, pos_tag_sents

nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

nltk.download('words', download_dir = nltk_path)
nltk.download('stopwords', download_dir = nltk_path)
nltk.download('punkt', download_dir = nltk_path)
nltk.download('averaged_perceptron_tagger', download_dir = nltk_path)

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Set up Gensim
from gensim.utils import save_as_line_sentence, simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

# Set up Bert
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline, BertTokenizer, BertForPreTraining, BertConfig, BertModel
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertModel.from_pretrained(bert_model_name)


[nltk_data] Downloading package words to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package averaged_perceptron_

In [4]:
def get_word_num_and_frequency(row, text_col, col_prefix='Job Description'):
    """Add basic length statistics for one text field to ``row``.

    The value stored under ``text_col`` is converted with ``str()`` once and
    four counts are written back onto ``row``:

    * ``'<col_prefix> num_words'`` - number of whitespace-separated tokens
    * ``'<col_prefix> num_unique_words'`` - number of distinct tokens
      (case-sensitive, punctuation attached to a token is kept)
    * ``'<col_prefix> num_chars'`` - length of the string
    * ``'<col_prefix> num_punctuations'`` - number of characters that appear
      in ``string.punctuation``

    Parameters
    ----------
    row
        Any object supporting ``row[key]`` lookup and assignment, e.g. the
        row handed over by ``DataFrame.apply(..., axis='columns')`` or a dict.
    text_col
        Key of the text field to measure.
    col_prefix
        Prefix of the four output keys. Defaults to ``'Job Description'``,
        which reproduces the previously hard-coded column names.

    Returns
    -------
    The same ``row`` object, modified in place.
    """
    # Convert once; non-string values (e.g. None, NaN) are measured via their str() form.
    text = str(row[f'{text_col}'])
    words = text.split()

    row[f'{col_prefix} num_words'] = len(words)
    row[f'{col_prefix} num_unique_words'] = len(set(words))
    row[f'{col_prefix} num_chars'] = len(text)
    row[f'{col_prefix} num_punctuations'] = sum(char in string.punctuation for char in text)

    return row


In [5]:
# Function to print df gender and age info
def df_gender_age_info(
    df,
    ivs_all=(
        'Gender',
        'Gender_Num',
        'Gender_Female',
        'Gender_Mixed',
        'Gender_Male',
        'Age',
        'Age_Num',
        'Age_Older',
        'Age_Mixed',
        'Age_Younger',
    ),
):
    """Print a summary of ``df`` followed by per-column distribution statistics.

    ``df.info()`` is printed first. Then, for every name in ``ivs_all``, the
    value counts and percentage shares of that column are printed; when the
    column supports numeric reductions, its mean and standard deviation
    (rounded to two decimals) are printed as well.

    Parameters
    ----------
    df
        DataFrame to describe.
    ivs_all
        Iterable of column names to report on. A name that is not a column of
        ``df`` (or whose values cannot be counted) is reported as
        ``'<name> not available.'`` and skipped.

    Returns
    -------
    None. All output goes to standard output.
    """
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        print('=' * 20)
        print(f'{iv}:')
        print('-' * 20)

        # Compute before printing so a failure never leaves half a report behind.
        try:
            column = df[f'{iv}']
            counts = column.value_counts()
            percentages = column.value_counts(normalize=True).mul(100).round(1).astype(float)
        except (KeyError, TypeError):
            # KeyError: no such column; TypeError: values that cannot be counted.
            print(f'{iv} not available.')
            continue

        print(f'{iv} Counts:\n{counts}')
        print('-' * 20)
        print(f'{iv} Percentages:\n{percentages}')

        # Mean/std only make sense for numeric columns; label columns raise
        # on these reductions and are skipped without further output.
        try:
            mean = float(round(column.mean(), 2))
            std = float(round(column.std(), 2))
        except (TypeError, ValueError):
            continue

        print('-' * 20)
        print(f'{iv} Mean: {mean}')
        print('-' * 20)
        print(f'{iv} Standard Deviation: {std}')

    print('\n')


In [6]:
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_sentencized.pkl').reset_index(drop=True)


In [7]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 57 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL                 

In [8]:
# Job Ad info
df_gender_age_info(df_jobs)


DF INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 57 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL      

In [9]:
# Explode df so that every row is one sentence
df_jobs = df_jobs.explode('Job Description spacy_sentencized', ignore_index=True).reset_index(drop=True)


In [10]:
# 194820
len(df_jobs)


194820

In [11]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 57 columns):
 #   Column                                          Non-Null Count   Dtype   
---  ------                                          --------------   -----   
 0   Search Keyword                                  194820 non-null  object  
 1   Platform                                        194820 non-null  object  
 2   Job ID                                          194820 non-null  object  
 3   Job Title                                       194820 non-null  object  
 4   Company Name                                    194816 non-null  object  
 5   Location                                        194820 non-null  object  
 6   Job Description                                 194820 non-null  object  
 7   Rating                                          170120 non-null  float64 
 8   Employment Type                                 163313 non-null  object  
 9   Company URL    

In [11]:
def _strip_and_lower(job_sentence):
    """Return the sentence without surrounding whitespace, in lower case."""
    stripped = job_sentence.strip()
    return stripped.lower()


# Lower-cased copy of every sentence, kept next to the original column
df_jobs['Job Description spacy_sentencized_lower'] = (
    df_jobs['Job Description spacy_sentencized'].apply(_strip_and_lower)
)


In [12]:
df_jobs[['Job Description spacy_sentencized', 'Job Description spacy_sentencized_lower']].head()


Unnamed: 0,Job Description spacy_sentencized,Job Description spacy_sentencized_lower
0,About Our Client,about our client
1,The Global KYC organisation is part of ING's C...,the global kyc organisation is part of ing's c...
2,Its purpose is Enabling people and organisatio...,its purpose is enabling people and organisatio...
3,Our Global KYC organisation is a first line of...,our global kyc organisation is a first line of...
4,In our Global KYC organisation you will be wor...,in our global kyc organisation you will be wor...


In [13]:
# Spacy tokenize
# NOTE(review): despite the .txt extension this file is read with pickle.
# Unpickling can execute arbitrary code, so only load it from a trusted location.
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)


def _spacy_tokenize_sentence(job_sentence):
    """Tokenize one sentence with the spaCy tokenizer and keep the useful tokens.

    Empty tokens, whitespace, stop words, punctuation, brackets, e-mail-like
    tokens and anything contained in ``custom_punct_chars`` are dropped; the
    remaining token texts are stripped and lower-cased.
    """
    kept_tokens = []
    for token in nlp.tokenizer(job_sentence):
        if len(token) == 0:
            continue
        if token.is_space or token.is_stop or token.is_punct or token.is_bracket or token.like_email:
            continue
        if token.text in custom_punct_chars:
            continue
        kept_tokens.append(str(token.text.strip().lower()))
    return kept_tokens


df_jobs['Job Description spacy_tokenized'] = (
    df_jobs['Job Description spacy_sentencized'].apply(_spacy_tokenize_sentence)
)

# Save a checkpoint (pickle + CSV) unless the frame is empty or not a DataFrame
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy.csv', index=False)


In [14]:
df_jobs['Job Description spacy_sentencized_cleaned'] = df_jobs['Job Description spacy_tokenized'].str.join(' ')


In [15]:
# Get sentence word frequencies
# Row-wise apply; the keyword argument is forwarded to the helper for every row.
df_jobs = df_jobs.apply(
    get_word_num_and_frequency,
    axis='columns',
    text_col='Job Description spacy_sentencized',
)


In [16]:
# Job Sentence info
df_gender_age_info(df_jobs)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 64 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL    

In [17]:
# Save the frame as pickle and CSV; an empty frame or a non-DataFrame object
# is reported instead of being written.
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy.csv', index=False)


# Use NLTK to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [18]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through at most the five nearest ancestors of the working directory
# (nearest first) for a folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list means a shallow working
# directory (fewer than five ancestors) no longer raises IndexError, and
# `.name` compares only the last path component on any platform.
for ancestor in list(Path.cwd().parents)[:5]:

    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [19]:
# `code_dir` must already hold the project code directory at this point.
# Stop with an actionable message instead of the opaque TypeError that
# Path(None) would raise.
if code_dir is None:
    raise FileNotFoundError(
        f'code_dir is not set: no project code directory was found above {Path.cwd()}. '
        'Set code_dir manually before running this cell.'
    )

# MAIN DIR: parent of the code directory, with a trailing slash
main_dir = f'{Path(code_dir).parents[0]}/'

# code_dir: normalise to exactly one trailing slash so that re-running this
# cell does not keep stacking separators onto the path
code_dir = f"{str(code_dir).rstrip('/')}/"

# Make the code directory importable; skip the append when it is already
# listed so repeated runs do not duplicate the entry
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = [
    'Indeed',
    'Glassdoor',
    'LinkedIn',
]

# columns
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age',
    'Language', 'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male', '% Older', '% Younger',
]

# Frequently used column names and the working language code
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

# NOTE(review): mixes a plain code, a stringified list and a real list;
# presumably these are the accepted language labels - confirm against the consumer.
languages = [
    "en",
    "['nl', 'en']",
    ['en', 'nl'],
]

# Column names grouped under `str_cols` (presumably the text-typed columns - TODO confirm)
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Placeholder values (presumably treated as missing data - TODO confirm)
nan_list = [
    None, 'None',
    '', ' ',
    [],
    -1, '-1',
    0, '0',
    'nan', np.nan, 'Nan',
]

# Regex alternatives, in order: runs of newlines; two or more commas; two or
# more pipes; runs of newline/carriage-return characters; the zero-width
# position after "<lowercase letter>." that is followed by optional whitespace
# and an uppercase letter; the zero-width position before one or more colons
# followed by an uppercase letter.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [20]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import unicodedata
import pandas as pd
import numpy as np
import googletrans
from googletrans import Translator
random.seed(42)

# Set up Spacy
import spacy
from spacy.symbols import NORM, ORTH, LEMMA, POS

nlp = spacy.load('en_core_web_sm')

# Set up NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.tag import pos_tag, pos_tag_sents

nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

nltk.download('words', download_dir = nltk_path)
nltk.download('stopwords', download_dir = nltk_path)
nltk.download('punkt', download_dir = nltk_path)
nltk.download('averaged_perceptron_tagger', download_dir = nltk_path)

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Set up Gensim
from gensim.utils import save_as_line_sentence, simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

# Set up Bert
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline, BertTokenizer, BertForPreTraining, BertConfig, BertModel
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertModel.from_pretrained(bert_model_name)


[nltk_data] Downloading package words to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package averaged_perceptron_

In [21]:
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_tokenized_spacy.pkl').reset_index(drop=True)


In [22]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 64 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [23]:
# Tokenize with NLTK
# Build the lookup sets once. Rebuilding the stop-word set and the punctuation
# list for every single token repeats the same work for each token of every
# sentence and dominates the runtime of this cell.
english_stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

df_jobs['Job Description nltk_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        token.strip().lower()
        for token in word_tokenize(job_sentence)
        if token  # skip empty tokens
        and token != '...'
        and token.lower() not in english_stop_words
        and token.lower() not in punctuation_chars  # single punctuation characters only
    ]
)

# Save a checkpoint (pickle + CSV) when there is something to save
if len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame):
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk.pkl')

    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk.csv', index=False)
else:
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')


In [24]:
df_jobs['Job Description nltk_tokenized'].head()


0                                             [client]
1    [global, kyc, organisation, part, ing, 's, coo...
2    [purpose, enabling, people, organisations, use...
3    [global, kyc, organisation, first, line, defen...
4    [global, kyc, organisation, working, many, col...
Name: Job Description nltk_tokenized, dtype: object

In [25]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 65 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [26]:
# Save the frame as pickle and CSV; an empty frame or a non-DataFrame object
# is reported instead of being written.
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk.csv', index=False)


# Use gensim to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [27]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through at most the five nearest ancestors of the working directory
# (nearest first) for a folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list means a shallow working
# directory (fewer than five ancestors) no longer raises IndexError, and
# `.name` compares only the last path component on any platform.
for ancestor in list(Path.cwd().parents)[:5]:

    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [28]:
# `code_dir` must already hold the project code directory at this point.
# Stop with an actionable message instead of the opaque TypeError that
# Path(None) would raise.
if code_dir is None:
    raise FileNotFoundError(
        f'code_dir is not set: no project code directory was found above {Path.cwd()}. '
        'Set code_dir manually before running this cell.'
    )

# MAIN DIR: parent of the code directory, with a trailing slash
main_dir = f'{Path(code_dir).parents[0]}/'

# code_dir: normalise to exactly one trailing slash so that re-running this
# cell does not keep stacking separators onto the path
code_dir = f"{str(code_dir).rstrip('/')}/"

# Make the code directory importable; skip the append when it is already
# listed so repeated runs do not duplicate the entry
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = [
    'Indeed',
    'Glassdoor',
    'LinkedIn',
]

# columns
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age',
    'Language', 'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male', '% Older', '% Younger',
]

# Frequently used column names and the working language code
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

# NOTE(review): mixes a plain code, a stringified list and a real list;
# presumably these are the accepted language labels - confirm against the consumer.
languages = [
    "en",
    "['nl', 'en']",
    ['en', 'nl'],
]

# Column names grouped under `str_cols` (presumably the text-typed columns - TODO confirm)
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Placeholder values (presumably treated as missing data - TODO confirm)
nan_list = [
    None, 'None',
    '', ' ',
    [],
    -1, '-1',
    0, '0',
    'nan', np.nan, 'Nan',
]

# Regex alternatives, in order: runs of newlines; two or more commas; two or
# more pipes; runs of newline/carriage-return characters; the zero-width
# position after "<lowercase letter>." that is followed by optional whitespace
# and an uppercase letter; the zero-width position before one or more colons
# followed by an uppercase letter.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [29]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import unicodedata
import pandas as pd
import numpy as np
import googletrans
from googletrans import Translator
random.seed(42)

# Set up Spacy
import spacy
from spacy.symbols import NORM, ORTH, LEMMA, POS

nlp = spacy.load('en_core_web_sm')

# Set up NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.tag import pos_tag, pos_tag_sents

nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

nltk.download('words', download_dir = nltk_path)
nltk.download('stopwords', download_dir = nltk_path)
nltk.download('punkt', download_dir = nltk_path)
nltk.download('averaged_perceptron_tagger', download_dir = nltk_path)

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Set up Gensim
from gensim.utils import save_as_line_sentence, simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

# Set up Bert
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline, BertTokenizer, BertForPreTraining, BertConfig, BertModel
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertModel.from_pretrained(bert_model_name)


[nltk_data] Downloading package words to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package averaged_perceptron_

In [30]:
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk.pkl').reset_index(drop=True)


In [31]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 65 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [32]:
# Tokenize with gensim.
# `pattern` contains lookaheads for uppercase letters ("...[A-Z]"), so it has
# to be applied to the text in its original case; lower-casing first would
# make those alternatives unmatchable. The result is lower-cased afterwards.
compiled_pattern = re.compile(pattern)

df_jobs['Job Description gensim_tokenized'] = df_jobs['Job Description spacy_sentencized'].apply(
    lambda sentence: preprocess_string(compiled_pattern.sub(' ', sentence.strip()).lower())
)

# Save a checkpoint (pickle + CSV) when there is something to save
if len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame):
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl')

    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim.csv', index=False)
else:
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')


In [33]:
df_jobs['Job Description gensim_tokenized'].head()


0                                             [client]
1             [global, kyc, organis, ing, coo, domain]
2    [purpos, enabl, peopl, organis, us, bank, serv...
3    [global, kyc, organis, line, defenc, depart, p...
4    [global, kyc, organis, work, colleagu, differ,...
Name: Job Description gensim_tokenized, dtype: object

In [34]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 66 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [35]:
# Save the frame as pickle and CSV; an empty frame or a non-DataFrame object
# is reported instead of being written.
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim.csv', index=False)


# Use BERT to tokenize sentences


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [36]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through at most the five nearest ancestors of the working directory
# (nearest first) for a folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list means a shallow working
# directory (fewer than five ancestors) no longer raises IndexError, and
# `.name` compares only the last path component on any platform.
for ancestor in list(Path.cwd().parents)[:5]:

    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [37]:
# `code_dir` must already hold the project code directory at this point.
# Stop with an actionable message instead of the opaque TypeError that
# Path(None) would raise.
if code_dir is None:
    raise FileNotFoundError(
        f'code_dir is not set: no project code directory was found above {Path.cwd()}. '
        'Set code_dir manually before running this cell.'
    )

# MAIN DIR: parent of the code directory, with a trailing slash
main_dir = f'{Path(code_dir).parents[0]}/'

# code_dir: normalise to exactly one trailing slash so that re-running this
# cell does not keep stacking separators onto the path
code_dir = f"{str(code_dir).rstrip('/')}/"

# Make the code directory importable; skip the append when it is already
# listed so repeated runs do not duplicate the entry
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = [
    'Indeed',
    'Glassdoor',
    'LinkedIn',
]

# columns
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age',
    'Language', 'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male', '% Older', '% Younger',
]

# Frequently used column names and the working language code
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

# NOTE(review): mixes a plain code, a stringified list and a real list;
# presumably these are the accepted language labels - confirm against the consumer.
languages = [
    "en",
    "['nl', 'en']",
    ['en', 'nl'],
]

# Column names grouped under `str_cols` (presumably the text-typed columns - TODO confirm)
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Placeholder values (presumably treated as missing data - TODO confirm)
nan_list = [
    None, 'None',
    '', ' ',
    [],
    -1, '-1',
    0, '0',
    'nan', np.nan, 'Nan',
]

# Regex alternatives, in order: runs of newlines; two or more commas; two or
# more pipes; runs of newline/carriage-return characters; the zero-width
# position after "<lowercase letter>." that is followed by optional whitespace
# and an uppercase letter; the zero-width position before one or more colons
# followed by an uppercase letter.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [38]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import unicodedata
import pandas as pd
import numpy as np
import googletrans
from googletrans import Translator
random.seed(42)

# Set up Spacy
import spacy
from spacy.symbols import NORM, ORTH, LEMMA, POS

nlp = spacy.load('en_core_web_sm')

# Set up NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.tag import pos_tag, pos_tag_sents

nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

nltk.download('words', download_dir = nltk_path)
nltk.download('stopwords', download_dir = nltk_path)
nltk.download('punkt', download_dir = nltk_path)
nltk.download('averaged_perceptron_tagger', download_dir = nltk_path)

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Set up Gensim
from gensim.utils import save_as_line_sentence, simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

# Set up Bert
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline, BertTokenizer, BertForPreTraining, BertConfig, BertModel
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertModel.from_pretrained(bert_model_name)


[nltk_data] Downloading package words to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nyxinsane/Documents/Work - UvA/Automating
[nltk_data]     Equity/Study 1/Study1_Code/data/Language
[nltk_data]     Models/nltk...
[nltk_data]   Package averaged_perceptron_

In [39]:
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim.pkl').reset_index(drop=True)


In [40]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 66 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Search Keyword                                  194820 non-null  object 
 1   Platform                                        194820 non-null  object 
 2   Job ID                                          194820 non-null  object 
 3   Job Title                                       194820 non-null  object 
 4   Company Name                                    194816 non-null  object 
 5   Location                                        194820 non-null  object 
 6   Job Description                                 194820 non-null  object 
 7   Rating                                          170120 non-null  float64
 8   Employment Type                                 163313 non-null  object 
 9   Company URL               

In [41]:
def _bert_tokenize_sentence(sentence):
    """Return the tokens `bert_tokenizer` produces for the str() form of `sentence`."""
    sentence_text = str(sentence)
    return bert_tokenizer.tokenize(sentence_text)


# One list of BERT tokens per sentence
df_jobs['Job Description bert_tokenized'] = (
    df_jobs['Job Description spacy_sentencized'].apply(_bert_tokenize_sentence)
)

# Save a checkpoint (pickle + CSV) unless the frame is empty or not a DataFrame
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.csv', index=False)


In [42]:
df_jobs['Job Description bert_tokenized'].head()


0                                 [about, our, client]
1    [the, global, ky, ##c, organisation, is, part,...
2    [its, purpose, is, enabling, people, and, orga...
3    [our, global, ky, ##c, organisation, is, a, fi...
4    [in, our, global, ky, ##c, organisation, you, ...
Name: Job Description bert_tokenized, dtype: object

In [43]:
# Save the frame as pickle and CSV; an empty frame or a non-DataFrame object
# is reported instead of being written.
if len(df_jobs) <= 0 or not isinstance(df_jobs, pd.DataFrame):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.csv', index=False)
