# ATTN: Run this script only AFTER all POS tagging, lemmatization, and stemming (spaCy and NLTK) have completed.
# If BERT POS tagging was also done, change which pickle file is loaded below (see the section headers).


# Use spacy to create bi and trigrams


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY_NLTK
### IF BERT POS TAGGING WAS DONE, SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY_NLTK_BERT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the project code folder: the first one whose own name contains
# `code_dir_name` but not `unwanted_subdir_name`. Slicing the ancestor list
# avoids an IndexError when the working directory is fewer than five levels
# deep, and `Path.name` avoids splitting on a hard-coded '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    if code_dir_name in ancestor.name and unwanted_subdir_name not in ancestor.name:
        code_dir = str(ancestor)
        break

# Only extend the import path when a folder was found; appending None would
# just leave a useless entry on sys.path.
if code_dir is not None:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [None]:
def spacy_make_ngrams(sentence, matcher, gram_type):
    """Return the unique, lower-cased n-grams of one type found in a sentence.

    Args:
        sentence: Text passed to the module-level spaCy pipeline ``nlp``.
        matcher: Callable applied to the parsed doc; it must yield
            ``(match_id, start, end)`` triples.
        gram_type: Rule-set prefix to keep. A match is kept when the part of
            its rule name before the first ``'_'`` equals this value
            (e.g. ``'bigram'`` for the rule ``'bigram_patterns'``).

    Returns:
        list[str]: De-duplicated matched spans in lower case. The order is
        whatever iterating the intermediate set yields.
    """
    doc = nlp(sentence)

    # Each match is judged on its own rule name. The previous nested loop kept
    # a span whenever *any* match in the sentence had the requested type, and
    # did quadratic work to do so.
    unique_ngrams = {
        doc[start:end].text.lower()
        for match_id, start, end in matcher(doc)
        if nlp.vocab.strings[match_id].split('_')[0] == gram_type
    }

    return list(unique_ngrams)


In [None]:
# Load the tagged / lemmatized / stemmed job-ad frame and renumber its rows
# from zero, discarding whatever index was pickled.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
%%time
# Unigrams need no extraction: the spaCy token list is reused as the 1-gram
# list, and the tuple column wraps every token in a tuple.
spacy_tokens = df_jobs['Job Description spacy_tokenized']
df_jobs['Job Description spacy_1grams_original_list'] = spacy_tokens
df_jobs['Job Description spacy_1grams'] = spacy_tokens.apply(
    lambda tokens: [tuple(token.split()) for token in tokens]
)


In [6]:
%%time
# Spacy bi and trigrams
matcher = Matcher(nlp.vocab)

# POS-tag sequences that count as a bigram / trigram.
bigram_rules = [
    ['NOUN', 'VERB'],
    ['VERB', 'NOUN'],
    ['ADJ', 'NOUN'],
    ['ADJ', 'PROPN'],
    # more rules here...
]

trigram_rules = [
    ['VERB', 'ADJ', 'NOUN'],
    ['NOUN', 'VERB', 'ADV'],
    ['NOUN', 'ADP', 'NOUN'],
    # more rules here...
]

# Matcher patterns built from the POS rules: one {'POS': tag} dict per token.
patterns_dict = {
    'bigram_patterns': [[{'POS': i} for i in j] for j in bigram_rules],
    'trigram_patterns': [[{'POS': i} for i in j] for j in trigram_rules],
}

ngram_dict = {
    'bigram': 2,
    'trigram': 3,
}

for ngram_name, ngram_num in ngram_dict.items():

    # Only one rule set is registered at a time; it is removed again at the
    # end of the iteration.
    pattern_key = f'{ngram_name}_patterns'
    matcher.add(pattern_key, patterns_dict[pattern_key])

    # Parse every sentence once and derive both output columns from the same
    # result, instead of running the spaCy pipeline twice per sentence.
    matched_ngrams = df_jobs['Job Description spacy_sentencized'].apply(
        lambda sentence: spacy_make_ngrams(sentence, matcher, ngram_name)
    )

    df_jobs[f'Job Description spacy_{str(ngram_num)}grams_original_list'] = matched_ngrams.apply(
        lambda ngrams: ['_'.join(ngram_.split()) for ngram_ in ngrams]
    )

    df_jobs[f'Job Description spacy_{str(ngram_num)}grams'] = matched_ngrams.apply(
        lambda ngrams: [tuple(ngram_.split()) for ngram_ in ngrams]
    )

    # Rewrite each space-separated n-gram in the lower-cased sentence as its
    # underscore-joined form. Only the search pattern is regex-escaped; the
    # replacement is a substitution template, so passing it through re.escape
    # would insert literal backslashes before punctuation such as '-' or '.'.
    # Backslashes are the only characters that need protecting there.
    df_jobs[f'Job Description spacy_{str(ngram_num)}grams_in_sent'] = df_jobs['Job Description spacy_sentencized'].str.lower().replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
            for ngrams_list in df_jobs[f'Job Description spacy_{str(ngram_num)}grams_original_list']
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )

    # Checkpoint after every n-gram size. The save after the last iteration
    # already holds the final frame, so no extra save follows the loop.
    if len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame):
        df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy.pkl')

        df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy.csv', index=False)
    else:
        print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')

    if pattern_key in matcher:
        matcher.remove(pattern_key)
    assert pattern_key not in matcher



In [None]:
%%time
# Spacy Allgrams
# Row-wise list concatenation of the uni-, bi- and trigram columns.
df_jobs['Job Description spacy_123grams_original_list'] = (
    df_jobs['Job Description spacy_tokenized']
    + df_jobs['Job Description spacy_2grams_original_list']
    + df_jobs['Job Description spacy_3grams_original_list']
)
df_jobs['Job Description spacy_123grams'] = (
    df_jobs['Job Description spacy_1grams']
    + df_jobs['Job Description spacy_2grams']
    + df_jobs['Job Description spacy_3grams']
)

# Rewrite each space-separated n-gram in the lower-cased sentence as its
# underscore-joined form. Only the search pattern is regex-escaped; the
# replacement is a substitution template, so passing it through re.escape
# would insert literal backslashes before punctuation such as '-' or '.'.
# NOTE(review): replacements appear to run in dict order, so an earlier bigram
# substitution may stop an overlapping trigram from matching — confirm intent.
df_jobs['Job Description spacy_123grams_in_sent'] = df_jobs['Job Description spacy_sentencized'].str.lower().replace(
    regex={
        re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
        for ngrams_list in df_jobs['Job Description spacy_123grams_original_list']
        for ngram_ in ngrams_list
        if '_' in ngram_
    }
)


In [None]:
# Refuse to persist an empty or non-DataFrame object, then write the spaCy
# n-gram frame as both pickle and CSV.
frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy.csv', index=False)


# Use NLTK to create bi and trigrams


### START HERE IF SOURCING FROM DF_JOBS_NGRAMS_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the project code folder: the first one whose own name contains
# `code_dir_name` but not `unwanted_subdir_name`. Slicing the ancestor list
# avoids an IndexError when the working directory is fewer than five levels
# deep, and `Path.name` avoids splitting on a hard-coded '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    if code_dir_name in ancestor.name and unwanted_subdir_name not in ancestor.name:
        code_dir = str(ancestor)
        break

# Only extend the import path when a folder was found; appending None would
# just leave a useless entry on sys.path.
if code_dir is not None:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [None]:
# Load the frame saved by the spaCy n-gram stage and renumber its rows from
# zero, discarding whatever index was pickled.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_ngrams_spacy.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
# Unigrams need no extraction: the NLTK token list is reused as the 1-gram
# list, and the tuple column wraps every token in a tuple.
nltk_tokens = df_jobs['Job Description nltk_tokenized']
df_jobs['Job Description nltk_1grams_original_list'] = nltk_tokens
df_jobs['Job Description nltk_1grams'] = nltk_tokens.apply(
    lambda tokens: [tuple(token.split()) for token in tokens]
)


In [None]:
%%time
# NLTK bi and trigrams
ngram_dict = {
    'bigram': 2,
    'trigram': 3
}

for ngram_name, ngram_num in ngram_dict.items():

    # Build the n-gram tuples once and derive the underscore-joined strings
    # from them, rather than running nltk.ngrams twice per row.
    ngram_tuples = df_jobs['Job Description nltk_tokenized'].apply(
        lambda tokens: list(nltk.ngrams(tokens, ngram_num))
    )

    df_jobs[f'Job Description nltk_{str(ngram_num)}grams_original_list'] = ngram_tuples.apply(
        lambda ngrams: ['_'.join(ngram_) for ngram_ in ngrams]
    )

    df_jobs[f'Job Description nltk_{str(ngram_num)}grams'] = ngram_tuples

    # Rewrite each space-separated n-gram in the lower-cased sentence as its
    # underscore-joined form. Only the search pattern is regex-escaped; the
    # replacement is a substitution template, so passing it through re.escape
    # would insert literal backslashes before punctuation such as '-' or '.'.
    # NOTE(review): the sentence text comes from the spaCy sentencized column
    # even in this NLTK section — confirm that is intended.
    df_jobs[f'Job Description nltk_{str(ngram_num)}grams_in_sent'] = df_jobs['Job Description spacy_sentencized'].str.lower().replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
            for ngrams_list in df_jobs[f'Job Description nltk_{str(ngram_num)}grams_original_list']
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )

# Persist once both n-gram sizes are done.
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy_nltk.csv', index=False)


In [None]:
%%time
# NLTK Allgrams
# Row-wise list concatenation of the uni-, bi- and trigram columns.
df_jobs['Job Description nltk_123grams_original_list'] = (
    df_jobs['Job Description nltk_tokenized']
    + df_jobs['Job Description nltk_2grams_original_list']
    + df_jobs['Job Description nltk_3grams_original_list']
)
df_jobs['Job Description nltk_123grams'] = (
    df_jobs['Job Description nltk_1grams']
    + df_jobs['Job Description nltk_2grams']
    + df_jobs['Job Description nltk_3grams']
)

# Rewrite each space-separated n-gram in the lower-cased sentence as its
# underscore-joined form. Only the search pattern is regex-escaped; the
# replacement is a substitution template, so passing it through re.escape
# would insert literal backslashes before punctuation such as '-' or '.'.
# NOTE(review): replacements appear to run in dict order, so an earlier bigram
# substitution may stop an overlapping trigram from matching — confirm intent.
df_jobs['Job Description nltk_123grams_in_sent'] = (
    df_jobs['Job Description spacy_sentencized']
    .str.lower()
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
            for ngrams_list in df_jobs[
                'Job Description nltk_123grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)



In [None]:
# Refuse to persist an empty or non-DataFrame object, then write the
# spaCy + NLTK n-gram frame as both pickle and CSV.
frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy_nltk.csv', index=False)


# Use Gensim to create bi and trigrams


### START HERE IF SOURCING FROM DF_JOBS_NGRAMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the project code folder: the first one whose own name contains
# `code_dir_name` but not `unwanted_subdir_name`. Slicing the ancestor list
# avoids an IndexError when the working directory is fewer than five levels
# deep, and `Path.name` avoids splitting on a hard-coded '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    if code_dir_name in ancestor.name and unwanted_subdir_name not in ancestor.name:
        code_dir = str(ancestor)
        break

# Only extend the import path when a folder was found; appending None would
# just leave a useless entry on sys.path.
if code_dir is not None:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [None]:
# Load the frame saved by the NLTK n-gram stage and renumber its rows from
# zero, discarding whatever index was pickled.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
%%time
# Unigrams need no extraction: the gensim token list is reused as the 1-gram
# list, and the tuple column wraps every token in a tuple.
gensim_tokens = df_jobs['Job Description gensim_tokenized']
df_jobs['Job Description gensim_1grams_original_list'] = gensim_tokens
df_jobs['Job Description gensim_1grams'] = gensim_tokens.apply(
    lambda tokens: [tuple(token.split()) for token in tokens]
)


In [None]:
%%time
# Gensim bi and trigrams
# Separators replaced by a space before gensim preprocessing.
# NOTE(review): the sentences are lower-cased before this pattern is applied,
# so the two look-aheads that require an upper-case letter look like they can
# never match — confirm.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'

# Normalise every sentence once; the bigram and trigram "in_sent" columns both
# start from this text. The input is already lower-cased by .str.lower(), so
# no second .lower() is applied inside the lambda.
gensim_sentences = df_jobs['Job Description spacy_sentencized'].str.lower().apply(
    lambda sentence: ' '.join(preprocess_string(re.sub(pattern, ' ', sentence.strip())))
)

# Gensim Bigrams
bigram = Phraser(Phrases(df_jobs['Job Description gensim_tokenized'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
df_jobs['Job Description gensim_2grams_original_list_all'] = bigram[df_jobs['Job Description gensim_tokenized']]
# Keep only the entries that contain an underscore. The regex is a raw string
# because '\_' is an invalid escape sequence in a normal string literal.
df_jobs['Job Description gensim_2grams_original_list'] = df_jobs['Job Description gensim_2grams_original_list_all'].apply(
    lambda ngrams_list: [
        ngram_
        for ngram_ in ngrams_list
        if len(re.findall(r'[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
    ]
)
df_jobs['Job Description gensim_2grams'] = df_jobs['Job Description gensim_2grams_original_list'].apply(
    lambda ngrams: [
        tuple(ngram.split('_'))
        for ngram in ngrams
        if '_' in ngram
    ]
)
# Rewrite each space-separated n-gram as its underscore-joined form. Only the
# search pattern is regex-escaped; the replacement is a substitution template,
# so passing it through re.escape would insert literal backslashes.
df_jobs['Job Description gensim_2grams_in_sent'] = gensim_sentences.replace(
    regex={
        re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
        for ngrams_list in df_jobs['Job Description gensim_2grams_original_list']
        for ngram_ in ngrams_list
        if '_' in ngram_
    }
)

# Checkpoint after the bigram pass.
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.csv', index=False)

# Gensim Trigrams: a second Phrases pass over the bigram-merged token lists.
trigram = Phraser(Phrases(df_jobs['Job Description gensim_2grams_original_list_all'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
df_jobs['Job Description gensim_3grams_original_list_all'] = trigram[df_jobs['Job Description gensim_2grams_original_list_all']]
# Keep only the entries with two underscores separated by letters only.
df_jobs['Job Description gensim_3grams_original_list'] = df_jobs['Job Description gensim_3grams_original_list_all'].apply(
    lambda ngrams_list: [
        ngram_
        for ngram_ in ngrams_list
        if len(re.findall(r'[a-zA-Z]*\_[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
    ]
)
df_jobs['Job Description gensim_3grams'] = df_jobs['Job Description gensim_3grams_original_list'].apply(
    lambda ngrams: [
        tuple(ngram.split('_'))
        for ngram in ngrams
        if '_' in ngram
    ]
)
df_jobs['Job Description gensim_3grams_in_sent'] = gensim_sentences.replace(
    regex={
        re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
        for ngrams_list in df_jobs['Job Description gensim_3grams_original_list']
        for ngram_ in ngrams_list
        if '_' in ngram_
    }
)

# Checkpoint after the trigram pass.
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.csv', index=False)


In [None]:
%%time
# Gensim Allgrams
# Row-wise list concatenation of the uni-, bi- and trigram columns.
df_jobs['Job Description gensim_123grams_original_list'] = (
    df_jobs['Job Description gensim_tokenized']
    + df_jobs['Job Description gensim_2grams_original_list']
    + df_jobs['Job Description gensim_3grams_original_list']
)
df_jobs['Job Description gensim_123grams'] = (
    df_jobs['Job Description gensim_1grams']
    + df_jobs['Job Description gensim_2grams']
    + df_jobs['Job Description gensim_3grams']
)

# Normalise each lower-cased sentence with gensim's preprocess_string, then
# rewrite every space-separated n-gram as its underscore-joined form. Only the
# search pattern is regex-escaped; the replacement is a substitution template,
# so passing it through re.escape would insert literal backslashes. The input
# is already lower-cased by .str.lower(), so no second .lower() is applied.
# NOTE(review): replacements appear to run in dict order, so an earlier bigram
# substitution may stop an overlapping trigram from matching — confirm intent.
df_jobs['Job Description gensim_123grams_in_sent'] = (
    df_jobs['Job Description spacy_sentencized']
    .str.lower()
    .apply(
        lambda sentence: ' '.join(
            preprocess_string(re.sub(pattern, ' ', sentence.strip()))
        )
    )
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): ngram_.replace('\\', '\\\\')
            for ngrams_list in df_jobs[
                'Job Description gensim_123grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)


In [None]:
# Refuse to persist an empty or non-DataFrame object, then write the
# spaCy + NLTK + gensim n-gram frame as both pickle and CSV.
frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.csv', index=False)


# Create word frequencies for uni, bi, and trigrams


### START HERE IF SOURCING FROM DF_JOBS_NGRAMS_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the project code folder: the first one whose own name contains
# `code_dir_name` but not `unwanted_subdir_name`. Slicing the ancestor list
# avoids an IndexError when the working directory is fewer than five levels
# deep, and `Path.name` avoids splitting on a hard-coded '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    if code_dir_name in ancestor.name and unwanted_subdir_name not in ancestor.name:
        code_dir = str(ancestor)
        break

# Only extend the import path when a folder was found; appending None would
# just leave a useless entry on sys.path.
if code_dir is not None:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [None]:
def get_abs_frequency(row, text_col, ngram_num, embedding_library):
    """Attach absolute n-gram frequency statistics to one row.

    Reads the list stored under
    ``'Job Description {embedding_library}_{ngram_num}grams_original_list'``
    and writes three stringified dicts back onto the row, each ordered by
    descending count:

    * ``..._abs_word_freq``     -- occurrences of each n-gram
    * ``..._abs_word_perc``     -- each count divided by the total count
    * ``..._abs_word_perc_cum`` -- running sum of those shares

    Args:
        row: Mapping-like row (e.g. a pandas Series) supporting item get/set.
        text_col: Unused; kept so existing callers keep working.
        ngram_num: N-gram size label used in the column names.
        embedding_library: Library label used in the column names.

    Returns:
        The same ``row`` object. It is returned unchanged (no new keys) when
        the n-gram list is empty.
    """
    # Local import so the block does not rely on the star-import providing it.
    from collections import Counter

    abs_word_freq = Counter(
        row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']
    )

    # Nothing to count: leave the row untouched.
    if not abs_word_freq:
        return row

    # Build the table once from the final counts. The earlier version rebuilt
    # it after every token, which was quadratic and produced the same result.
    abs_wtd_df = (
        pd.DataFrame.from_dict(abs_word_freq, orient='index')
        .rename(columns={0: 'abs_word_freq'})
        .sort_values(by=['abs_word_freq'], ascending=False)
    )
    abs_wtd_df.insert(1, 'abs_word_perc', value=abs_wtd_df['abs_word_freq'] / abs_wtd_df['abs_word_freq'].sum())
    abs_wtd_df.insert(2, 'abs_word_perc_cum', abs_wtd_df['abs_word_perc'].cumsum())

    row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_freq'] = str(abs_wtd_df['abs_word_freq'].to_dict())
    row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc'] = str(abs_wtd_df['abs_word_perc'].to_dict())
    row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc_cum'] = str(abs_wtd_df['abs_word_perc_cum'].to_dict())

    return row


In [None]:
# Load the frame saved by the gensim n-gram stage and renumber its rows from
# zero, discarding whatever index was pickled.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_ngrams_spacy_nltk_gensim.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
%%time
ngrams_list = [1, 2, 3, 123]
embedding_libraries_list = ['spacy', 'nltk', 'gensim']

# Add the frequency columns for every library / n-gram size pair (libraries
# outermost), checkpointing the frame to disk after each pair.
for embedding_library in embedding_libraries_list:
    for ngram_num in ngrams_list:
        df_jobs = df_jobs.apply(
            lambda row: get_abs_frequency(
                row=row,
                text_col='Job Description spacy_tokenized',
                ngram_num=ngram_num,
                embedding_library=embedding_library,
            ),
            axis='columns',
        )

        frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
        assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
        df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_frequency.pkl')
        df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_frequency.csv', index=False)


In [None]:
# Refuse to persist an empty or non-DataFrame object, then write the
# frequency-annotated frame as both pickle and CSV.
frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_frequency.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_frequency.csv', index=False)


# Create BoW dictionary, corpus, and tfidf matrix for uni, bi, and trigrams


### START HERE IF SOURCING FROM DF_JOBS_NGRAMS_FREQUENCY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the project code folder: the first one whose own name contains
# `code_dir_name` but not `unwanted_subdir_name`. Slicing the ancestor list
# avoids an IndexError when the working directory is fewer than five levels
# deep, and `Path.name` avoids splitting on a hard-coded '/'.
for ancestor in list(Path.cwd().parents)[:5]:
    if code_dir_name in ancestor.name and unwanted_subdir_name not in ancestor.name:
        code_dir = str(ancestor)
        break

# Only extend the import path when a folder was found; appending None would
# just leave a useless entry on sys.path.
if code_dir is not None:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *


In [None]:
def get_corpus_and_dictionary(row, ngram_num, embedding_library):
    """Build a per-row dictionary, bag-of-words corpus and TF-IDF model.

    The n-gram list stored under
    ``'Job Description {embedding_library}_{ngram_num}grams_original_list'``
    is treated as a one-document corpus. Four objects are written back onto
    the row: the ``Dictionary``, the single-document BoW corpus, the
    ``TfidfModel`` fitted on it, and the model applied to that corpus.

    NOTE(review): the TF-IDF model is fitted on one document only, so the IDF
    term presumably carries no information — confirm this is intended.

    Args:
        row: Row (e.g. a pandas Series) supporting item get/set.
        ngram_num: N-gram size label used in the column names.
        embedding_library: Library label used in the column names.

    Returns:
        The same ``row`` with the four new entries set.
    """
    document = row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']

    vocabulary = Dictionary([document])
    bow_documents = [vocabulary.doc2bow(document)]
    tfidf_model = TfidfModel(bow_documents, smartirs='ntc')
    weighted_documents = [tfidf_model[bow] for bow in bow_documents]

    new_entries = {
        f'Job Description {embedding_library}_{ngram_num}grams_dictionary': vocabulary,
        f'Job Description {embedding_library}_{ngram_num}grams_BoW_corpus': bow_documents,
        f'Job Description {embedding_library}_{ngram_num}grams_tfidf': tfidf_model,
        f'Job Description {embedding_library}_{ngram_num}grams_tfidf_matrix': weighted_documents,
    }
    for column_name, value in new_entries.items():
        row[column_name] = value

    return row


In [None]:
# Load the frame saved by the frequency stage and renumber its rows from
# zero, discarding whatever index was pickled.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_ngrams_frequency.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
%%time
ngrams_list = [1, 2, 3, 123]
embedding_libraries_list = ['spacy', 'nltk', 'gensim']

# Add the dictionary / BoW / TF-IDF columns for every library / n-gram size
# pair (libraries outermost), checkpointing the frame to disk after each pair.
# NOTE(review): the pickle goes to the *_frequency.pkl name while the CSV uses
# *_BoW.csv, so the pickle this section loads is overwritten — confirm intent.
for embedding_library in embedding_libraries_list:
    for ngram_num in ngrams_list:
        df_jobs = df_jobs.apply(
            lambda row: get_corpus_and_dictionary(
                row=row,
                ngram_num=ngram_num,
                embedding_library=embedding_library,
            ),
            axis='columns',
        )

        frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
        assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
        df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_frequency.pkl')
        df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_BoW.csv', index=False)


In [None]:
# Refuse to persist an empty or non-DataFrame object, then write the frame
# with the BoW / TF-IDF columns as both pickle and CSV.
# NOTE(review): the pickle goes to the *_frequency.pkl name while the CSV uses
# *_BoW.csv, so the frequency-stage pickle is overwritten — confirm intent.
frame_is_usable = len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)
assert frame_is_usable, f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_ngrams_frequency.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_ngrams_BoW.csv', index=False)
