# ATTN: This script should be run AFTER all tokenization (spacy, nltk, gensim, and BERT) has completed.


# Use spacy to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK_GENSIM_BERT

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up from the current working directory (at most five ancestors) and pick
# the first ancestor whose folder name contains `code_dir_name` but does not
# contain `unwanted_subdir_name`.
# Slicing a list of the ancestors never raises when the working directory is
# fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the last path component regardless of the OS path separator
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Only extend the import path when a directory was actually found; a `None`
# entry in sys.path is never a usable search location.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

<Figure size 640x480 with 0 Axes>

In [3]:
# Read the pickled jobs frame, then renumber its rows from 0 and discard the
# index that was stored with it.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tokenized_spacy_nltk_gensim_bert.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [4]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194820 entries, 0 to 194819
Data columns (total 68 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   Search Keyword                                    194820 non-null  object 
 1   Platform                                          194820 non-null  object 
 2   Job ID                                            194820 non-null  object 
 3   Job Title                                         194820 non-null  object 
 4   Company Name                                      194816 non-null  object 
 5   Location                                          194820 non-null  object 
 6   Job Description                                   194820 non-null  object 
 7   Rating                                            170120 non-null  float64
 8   Employment Type                                   163313 non-null  object 
 9   Comp

In [5]:
%%time
# Load custom punctuation characters
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a punctuations.txt that this project wrote itself.
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)


def _spacy_tags_lemmas_stems(job_sentence):
    """Run spaCy once on `job_sentence` and derive tags, lemmas, and stems.

    Returns a 3-tuple of lists:
        0. (stripped lower-cased token text, `token.tag_`) for every token;
        1. stripped lower-cased `token.lemma_` for every kept token;
        2. `stemmer.stem` of the stripped lower-cased text of every kept token.

    A token is kept when it is non-empty, is not a stop word, is not
    punctuation, and its text is not in `custom_punct_chars`.
    """
    # Parsing is the expensive step, so do it once and reuse the result for
    # all three outputs.
    doc = nlp(job_sentence)

    kept_tokens = [
        token
        for token in doc
        if len(token) != 0 and not token.is_stop and not token.is_punct and token.text not in custom_punct_chars
    ]

    token_tags = [(token.text.strip().lower(), token.tag_) for token in doc]
    lemmas = [token.lemma_.strip().lower() for token in kept_tokens]
    stems = [stemmer.stem(token.text.strip().lower()) for token in kept_tokens]

    return token_tags, lemmas, stems


# One pass over the sentences; each element is the (tags, lemmas, stems) tuple
spacy_features = df_jobs['Job Description spacy_sentencized'].progress_apply(
    _spacy_tags_lemmas_stems
)

# POS tagging
df_jobs['Job Description spacy_token_tags'] = spacy_features.map(
    lambda features: features[0]
)

# Lemmatization
df_jobs['Job Description spacy_lemmas'] = spacy_features.map(
    lambda features: features[1]
)

# Stemming
df_jobs['Job Description spacy_stems'] = spacy_features.map(
    lambda features: features[2]
)

# Refuse to overwrite the saved files with an empty frame
assert len(df_jobs) > 0 and isinstance(
    df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')
df_jobs.to_csv(
    f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.csv', index=False)


progress-bar:   0%|          | 0/194820 [00:00<?, ?it/s]

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes)
df_jobs.info()


In [None]:
# Preview the first rows of the three spaCy-derived columns
df_jobs[
    [
        'Job Description spacy_token_tags',
        'Job Description spacy_lemmas',
        'Job Description spacy_stems'
    ]
].head()


In [None]:
# Check that df_jobs is a non-empty DataFrame, then write it to df_save_dir
# as both a pickle and a CSV (the CSV omits the index).
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.csv', index=False)


# Use NLTK to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up from the current working directory (at most five ancestors) and pick
# the first ancestor whose folder name contains `code_dir_name` but does not
# contain `unwanted_subdir_name`.
# Slicing a list of the ancestors never raises when the working directory is
# fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the last path component regardless of the OS path separator
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Only extend the import path when a directory was actually found; a `None`
# entry in sys.path is never a usable search location.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
from functools import lru_cache


@lru_cache(maxsize=2 ** 17)
def get_wordnet_pos(token):
    """Map a token's NLTK POS tag to the WordNet POS constant lemmatize() accepts.

    The token is tagged on its own (a one-element list passed to
    `nltk.pos_tag`), and the first letter of the resulting tag selects the
    WordNet category: J -> ADJ, N -> NOUN, V -> VERB, R -> ADV. Any other
    tag, including an empty one, falls back to `wordnet.NOUN`.

    Results are memoized (bounded LRU cache) because the answer depends only
    on the token string and the same tokens are looked up repeatedly.
    `token` must therefore be hashable.
    """
    # `[:1]` rather than `[0]` so an empty tag yields '' instead of raising
    tag = nltk.pos_tag([token])[0][1][:1].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [None]:
# Read the pickled jobs frame, then renumber its rows from 0 and discard the
# index that was stored with it.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
df_jobs.info()


In [None]:
%%time
def _normalize_token(token):
    """Strip and lower-case `token`, then fold it to ASCII.

    The text is NFKD-normalized and any character that cannot be encoded as
    ASCII is dropped.
    """
    return unicodedata.normalize('NFKD', str(token.strip().lower())).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')


def _nltk_lemma(token):
    """Lemmatize the normalized form of `token` with its WordNet POS.

    The lemmatizer receives the same normalized string that the POS lookup
    and the stemmer below receive, so lemmas, POS, and stems all describe
    the same form of the token.
    """
    normalized = _normalize_token(token)
    return lemmatizer.lemmatize(normalized, get_wordnet_pos(normalized))


# POS tagging (each element of the column is a list of tokens)
df_jobs['Job Description nltk_token_tags'] = df_jobs['Job Description spacy_tokenized'].progress_apply(
    pos_tag
)

# Lemmatization
df_jobs['Job Description nltk_lemmas'] = df_jobs['Job Description spacy_tokenized'].progress_apply(
    lambda tokens: [_nltk_lemma(token) for token in tokens]
)

# Stemming
df_jobs['Job Description nltk_stems'] = df_jobs['Job Description spacy_tokenized'].progress_apply(
    lambda tokens: [stemmer.stem(_normalize_token(token)) for token in tokens]
)

# Refuse to overwrite the saved files with an empty frame
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.csv', index=False)


In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes)
df_jobs.info()


In [None]:
# Preview the first rows of the three NLTK-derived columns
df_jobs[
    [
        'Job Description nltk_token_tags',
        'Job Description nltk_lemmas',
        'Job Description nltk_stems'
    ]
].head()


In [None]:
# Check that df_jobs is a non-empty DataFrame, then write it to df_save_dir
# as both a pickle and a CSV (the CSV omits the index).
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.csv', index=False)


# Use BERT to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY_NLTK

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl').reset_index(drop=True)


In [None]:
# %%time
# max_length = 512
# returned_tensor = 'pt'
# cpu_counts = torch.multiprocessing.cpu_count()
# device = torch.device('mps') if torch.has_mps and torch.backends.mps.is_built() and torch.backends.mps.is_available(
# ) else torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device_name = str(device.type)
# print(f'Using {device_name.upper()}')
# bert_model_name = 'bert-base-uncased'
# bert_tokenizer = BertTokenizerFast.from_pretrained(
#     bert_model_name, strip_accents=True)
# bert_model = BertForSequenceClassification.from_pretrained(
#     bert_model_name).to(device)
# bert_pos_model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
# bert_pos_model = AutoModelForTokenClassification.from_pretrained(bert_pos_model_name).to(device)
# bert_pos_tagger = TokenClassificationPipeline(model=bert_pos_model, tokenizer=bert_tokenizer, device=device)

# df_jobs['Job Description bert_token_tags_with_scores'] = df_jobs['Job Description spacy_sentencized'].progress_apply(
#     lambda sentence: [
#         (bert_pos_tag['word'], bert_pos_tag['entity'], bert_pos_tag['score'])
#         for i in range(len(sentence.split()))
#         for bert_pos_tag in bert_pos_tagger(sentence)
#     ]
# ).to(device)

# assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)

# df_jobs['Job Description bert_token_tags'] = df_jobs['Job Description bert_token_tags_with_scores'].progress_apply(
#     lambda tag_list: [
#         [(tag_list[i][0], tag_list[i][1])]
#         for tag_tuple in tag_list
#         for i in range(len(tag_list))
#     ]
# )

# assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)



In [None]:
# assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)
