# ATTN: This script should be run AFTER all POS tagging, lemmatization, and stemming (spacy, nltk, and BERT) have completed.


# Use spacy, NLTK, and gensim to create bigrams and trigrams


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY_NLTK_BERT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path

# Handle to this module object (kept for code that sets attributes dynamically).
mod = sys.modules[__name__]

# Result of the search below; stays None when no suitable ancestor is found.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look at up to five ancestors of the current working directory, nearest
# first, and pick the first one whose folder name contains `code_dir_name`
# but not `unwanted_subdir_name`.
# Slicing the ancestor list (instead of indexing parents[0..4]) avoids an
# IndexError when the working directory is fewer than five levels deep.
for _ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the last path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = _ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(_ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [None]:
# Fail early with an actionable message instead of the opaque TypeError that
# Path(None) would raise when the code directory was never resolved.
if code_dir is None:
    raise FileNotFoundError(
        'code_dir is not set: the code directory could not be located. '
        'Please set the correct directory paths before running this cell.'
    )

# MAIN DIR (parent of the code directory, with a trailing slash)
# `.parent` is used rather than `parents[0]` so a root path does not raise.
main_dir = f'{Path(code_dir).parent}/'

# code_dir
# Only add the trailing slash / sys.path entry once so that re-running this
# cell does not produce 'path//' or duplicate import paths.
if not code_dir.endswith('/'):
    code_dir = f'{code_dir}/'
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash; sub-paths are joined with '/')
llm_path = f'{data_dir}Language Models'

# numpy is needed for `np.nan` in `nan_list` below. It is imported here
# because the notebook's main import cell only runs after this one, which
# would otherwise make this cell fail with a NameError.
import numpy as np

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# Names of frequently used dataframe columns.
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'

# Language code(s).
# NOTE(review): `languages` mixes a plain code, the string form of a list and
# an actual list -- presumably to match differently serialised values; confirm.
language: str = 'en'
languages = ["en", "['nl', 'en']", ['en', 'nl']]

# Columns that hold text values.
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Values to be treated as missing.
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']

# Regex matching newline/carriage-return runs, repeated commas or pipes, the
# zero-width position between "<lowercase>." and a following capital letter,
# and the position before colon(s) followed by a capital letter.
# NOTE(review): presumably used as a sentence-splitting pattern; not used in
# the visible part of this file.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import unicodedata
import pandas as pd
import numpy as np
import googletrans
from googletrans import Translator
# Seed Python's `random` module with a fixed value so runs are repeatable.
random.seed(42)

# Set up Spacy
import spacy
from spacy.symbols import NORM, ORTH, LEMMA, POS
from spacy.matcher import Matcher

# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is the module-level
# pipeline used by `spacy_matches_ngrams` and to build the Matcher vocab.
nlp = spacy.load('en_core_web_sm')

# Set up NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.tag import pos_tag, pos_tag_sents

# NLTK data lives in an 'nltk' folder under the language-models directory;
# register that folder so NLTK searches it when loading resources.
nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

# Fetch the resources this notebook relies on into that same folder.
for _resource in ('words', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource, download_dir=nltk_path)

# Shared text-processing helpers built from the downloaded data.
stop_words = {*stopwords.words('english')}
punctuations = [*string.punctuation]
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Set up Gensim
from gensim.utils import save_as_line_sentence, simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

# Set up Bert
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline, BertTokenizer, BertForPreTraining, BertConfig, BertModel
# Name of the pretrained BERT checkpoint used for both tokenizer and model.
bert_model_name = 'bert-base-uncased'
# Tokenizer for that checkpoint, created with accent stripping enabled.
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, strip_accents = True)
# Base BERT model loaded from the same checkpoint name.
bert_model = BertModel.from_pretrained(bert_model_name)


In [None]:
def spacy_matches_ngrams(sentence, matcher, gram_type):
    """Return the n-grams in *sentence* that match *matcher* patterns.

    The sentence is parsed with the module-level ``nlp`` pipeline and run
    through ``matcher``. A match is kept only when the part of its rule name
    before the first underscore equals ``gram_type`` (e.g. the rule
    ``'bigram_patterns'`` is selected by ``gram_type='bigram'``).

    Args:
        sentence: Text to search. ``None`` and NaN are treated as missing.
        matcher: A spaCy ``Matcher`` built on ``nlp.vocab``.
        gram_type: Rule-name prefix selecting which matches to keep.

    Returns:
        A list with the text of every selected match, in the order the
        matcher reports them, or ``None`` when the sentence is missing or
        nothing matched.
    """
    # Missing values (None / float NaN) cannot be parsed by the pipeline.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return None

    doc = nlp(sentence)

    # Collect every qualifying match; returning from inside the loop would
    # silently drop all matches after the first one.
    ngrams = [
        doc[start:end].text
        for match_id, start, end in matcher(doc)
        if nlp.vocab.strings[match_id].split('_')[0] == gram_type
    ]

    # Keep the historical "no match -> None" contract for callers.
    return ngrams or None


In [None]:
# Load the dataframe saved as 'df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl'
# in the data directory and renumber its rows from 0.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl').reset_index(drop=True)


In [None]:
# Spacy Bigrams
# Each rule is a sequence of two part-of-speech tags that must appear
# next to each other.
bigram_rules = [
    ['NOUN', 'VERB'],
    ['VERB', 'NOUN'],
    ['ADJ', 'NOUN'],
    ['ADJ', 'PROPN'],
    # more rules here...
]

# Turn every rule into a Matcher pattern: one {'POS': tag} dict per token.
bigram_patterns = [
    [{'POS': pos_name} for pos_name in rule]
    for rule in bigram_rules
]

# A fresh matcher holding only the bigram patterns.
matcher = Matcher(nlp.vocab)
matcher.add('bigram_patterns', bigram_patterns)

# Run the matcher over each row's sentence and store the result.
df_jobs['Job Description 2grams_spacy'] = df_jobs['Job Description spacy_sentencized'].apply(
    spacy_matches_ngrams,
    args=(matcher, 'bigram'),
)


In [None]:
# Spacy Trigrams
# Each rule is a sequence of three part-of-speech tags that must appear
# consecutively.
trigram_rules = [
    ['VERB', 'ADJ', 'NOUN'],
    ['NOUN', 'VERB', 'ADV'],
    ['NOUN', 'ADP', 'NOUN'],
    # more rules here...
]

# Turn every rule into a Matcher pattern: one {'POS': tag} dict per token.
trigram_patterns = [
    [{'POS': pos_name} for pos_name in rule]
    for rule in trigram_rules
]

# A fresh matcher holding only the trigram patterns.
matcher = Matcher(nlp.vocab)
matcher.add('trigram_patterns', trigram_patterns)

# Run the matcher over each row's sentence and store the result.
df_jobs['Job Description 3grams_spacy'] = df_jobs['Job Description spacy_sentencized'].apply(
    spacy_matches_ngrams,
    args=(matcher, 'trigram'),
)


In [None]:
# NLTK bi and trigrams



In [None]:
# Gensim bi and trigrams
# These names are not imported in the notebook's import cell, so bring them
# into scope here; without this the cell fails with a NameError.
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS, Phraser, Phrases

# Materialise the token lists once so the same sequence is used for both
# training the phrase model and applying it.
tokenized_sentences = df_jobs['Job Description spacy_tokenized'].tolist()

# Bigrams
# min_count=1 / threshold=1 make phrase detection very permissive.
bigram = Phraser(Phrases(tokenized_sentences, connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
# Keep the bigram-merged sentences: the trigram model is trained on them.
bigram_sentences = list(bigram[tokenized_sentences])
# NOTE(review): column name spelling ('2garms') is kept as-is because other
# files may already read it -- consider renaming to '2grams' everywhere.
df_jobs['Job Description gensim_2garms'] = bigram_sentences

# Trigrams
# Running phrase detection again on bigram-merged tokens lets a merged
# bigram combine with a neighbouring token.
trigram = Phraser(Phrases(bigram_sentences, connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
df_jobs['Job Description gensim_3garms'] = list(trigram[bigram_sentences])


In [None]:
# Persist the dataframe (pickle + CSV) only when it is a non-empty DataFrame.
# The type is checked first so that len() is never called on an object that
# may not support it.
if isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0:
    df_jobs.to_pickle(f'{data_dir}df_jobs_ngrams_spacy_nltk_gensim.pkl')

    df_jobs.to_csv(f'{data_dir}df_jobs_ngrams_spacy_nltk_gensim.csv', index=False)
elif isinstance(df_jobs, pd.DataFrame):
    # Nothing to save: report instead of writing empty output files.
    print(f'ERROR: LENGTH OF DF = {len(df_jobs)}')
else:
    # Best-effort report; keeps the notebook running as the original did.
    print(f'ERROR: df_jobs is not a DataFrame (got {type(df_jobs).__name__})')
