# ATTN: This script should be run AFTER all tokenization (spacy, nltk, gensim, and BERT) has completed.


# Use spacy to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM DF_JOBS_TOKENIZED_SPACY_NLTK_GENSIM_BERT

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Locate the project's code directory and put it on sys.path so that the
# project-local `setup_module` package can be imported by the next cell.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

current_dir = Path.cwd()

if code_dir_name in current_dir.name:
    # The working directory itself is the code directory.
    code_dir = str(current_dir)
else:
    # Look at most five levels up. Slicing the materialized list of parents
    # (instead of indexing parents[0..4]) avoids an IndexError when the
    # working directory has fewer than five ancestors.
    for parent_dir in list(current_dir.parents)[:5]:
        if (code_dir_name in parent_dir.name) and (unwanted_subdir_name not in parent_dir.name):
            code_dir = str(parent_dir)
            break

if code_dir is None:
    # Never put None on sys.path; report the problem instead.
    print(f'WARNING: no "{code_dir_name}" directory found at or above {current_dir}')
elif code_dir not in sys.path:
    # This setup cell is repeated in the notebook; do not add duplicates.
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# Load the input DataFrame for the spaCy tagging / lemmatization / stemming step.
# NOTE(review): the section header mentions DF_JOBS_TOKENIZED_SPACY_NLTK_GENSIM_BERT,
# but the file read here is df_jobs_sentiment_spacy_nltk.pkl - confirm which is intended.
# Pickle files can execute code when loaded; only read files produced by this project.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_sentiment_spacy_nltk.pkl')


In [None]:
# Print a summary of the loaded DataFrame to check it before processing.
df_jobs.info()


In [None]:
%%time
# Load custom punctuation characters.
# NOTE(review): the file has a .txt extension but is read with pickle; pickle can
# execute code on load, so only read files produced by this project.
# custom_punct_chars is not referenced again in this cell.
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)


def _is_content_token(token):
    """Return True when a spaCy token should contribute a lemma and a stem.

    Empty tokens, literal '...', whitespace, punctuation, quotes, brackets and
    e-mail-like tokens are rejected.
    """
    return (
        bool(token.text)
        and token.text != '...'
        and not (
            token.is_space
            or token.is_punct
            or token.is_quote
            or token.is_bracket
            or token.like_email
        )
    )


def _spacy_tags_lemmas_stems(job_sentence):
    """Parse ``job_sentence`` once and return ``(token_tags, lemmas, stems)``.

    token_tags covers every token as (lowercased text, fine-grained tag);
    lemmas and stems only cover tokens accepted by ``_is_content_token``.
    """
    doc = nlp(job_sentence)
    token_tags = [(token.text.strip().lower(), token.tag_) for token in doc]
    content_tokens = [token for token in doc if _is_content_token(token)]
    lemmas = [token.lemma_.strip().lower() for token in content_tokens]
    stems = [stemmer.stem(token.text.strip().lower()) for token in content_tokens]
    return token_tags, lemmas, stems


# Run the spaCy pipeline a single time per text (it is the expensive part) and
# derive all three outputs from that one parse.
spacy_features = df_jobs['Job Description spacy_sentencized'].progress_apply(_spacy_tags_lemmas_stems)

# POS tagging
df_jobs['Job Description spacy_token_tags'] = spacy_features.map(lambda features: features[0])

# Lemmatization
df_jobs['Job Description spacy_lemmas'] = spacy_features.map(lambda features: features[1])

# Stemming
df_jobs['Job Description spacy_stems'] = spacy_features.map(lambda features: features[2])

# Validate before saving; the type is checked first so len() only runs on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')
df_jobs.to_csv(
    f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.csv', index=False)


In [None]:
# Print a summary of the DataFrame after the spaCy columns were added.
df_jobs.info()


In [None]:
# Preview the first rows of the three spaCy-derived columns.
df_jobs.loc[
    :,
    [
        'Job Description spacy_token_tags',
        'Job Description spacy_lemmas',
        'Job Description spacy_stems',
    ],
].head()


In [None]:
# Validate before overwriting the saved files; the type is checked first so
# len() only runs on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# Save both a pickle and a CSV copy of the spaCy results.
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.csv', index=False)


# Use NLTK to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Locate the project's code directory and put it on sys.path so that the
# project-local `setup_module` package can be imported by the next cell.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

current_dir = Path.cwd()

if code_dir_name in current_dir.name:
    # The working directory itself is the code directory.
    code_dir = str(current_dir)
else:
    # Look at most five levels up. Slicing the materialized list of parents
    # (instead of indexing parents[0..4]) avoids an IndexError when the
    # working directory has fewer than five ancestors.
    for parent_dir in list(current_dir.parents)[:5]:
        if (code_dir_name in parent_dir.name) and (unwanted_subdir_name not in parent_dir.name):
            code_dir = str(parent_dir)
            break

if code_dir is None:
    # Never put None on sys.path; report the problem instead.
    print(f'WARNING: no "{code_dir_name}" directory found at or above {current_dir}')
elif code_dir not in sys.path:
    # This setup cell is repeated in the notebook; do not add duplicates.
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
def get_wordnet_pos(token):
    """Map a token's NLTK POS tag to the WordNet POS constant lemmatize() accepts.

    The token is tagged on its own (as a one-element list), and the first
    letter of the resulting tag selects the WordNet category:
    J -> adjective, N -> noun, V -> verb, R -> adverb.

    Args:
        token: The word to tag.

    Returns:
        One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV.
        wordnet.NOUN is returned for any other tag and for an empty token.
    """
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Callers ASCII-fold tokens before calling this, which can leave an empty
    # string. There is nothing to tag then (and the tagger may raise on empty
    # input depending on the NLTK version), so use the default directly.
    if not token:
        return wordnet.NOUN

    tag = nltk.pos_tag([token])[0][1][0].upper()

    return tag_dict.get(tag, wordnet.NOUN)


In [None]:
# Load the DataFrame saved by the spaCy section as input for the NLTK step.
# Pickle files can execute code when loaded; only read files produced by this project.
df_jobs = pd.read_pickle(
    f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy.pkl')


In [None]:
# Print a summary of the loaded DataFrame to check it before processing.
df_jobs.info()


In [None]:
%%time
def _normalize_token(token):
    """Strip and lowercase ``token``, then drop anything that is not ASCII.

    NFKD decomposition splits accented characters so the base letter survives
    the ASCII encoding; characters with no ASCII form are removed.
    """
    return unicodedata.normalize('NFKD', token.strip().lower()).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')


def _nltk_lemmas(tokens):
    """Return the WordNet lemma of every normalized token in ``tokens``."""
    normalized_tokens = [_normalize_token(token) for token in tokens]
    # Lemmatize the same normalized form that is used for the POS lookup (and
    # for the stems below); previously the raw token was lemmatized while only
    # the POS lookup saw the normalized text.
    return [
        lemmatizer.lemmatize(normalized, get_wordnet_pos(normalized))
        for normalized in normalized_tokens
    ]


def _nltk_stems(tokens):
    """Return the stem of every normalized token in ``tokens``."""
    return [stemmer.stem(_normalize_token(token)) for token in tokens]


# POS tagging (each cell of the source column is a list of tokens)
df_jobs['Job Description nltk_token_tags'] = df_jobs['Job Description spacy_tokenized'].progress_apply(
    lambda tokens: pos_tag(tokens)
)

# Lemmatization
df_jobs['Job Description nltk_lemmas'] = df_jobs['Job Description spacy_tokenized'].progress_apply(_nltk_lemmas)

# Stemming
df_jobs['Job Description nltk_stems'] = df_jobs['Job Description spacy_tokenized'].progress_apply(_nltk_stems)

# Validate before saving; the type is checked first so len() only runs on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.csv', index=False)


In [None]:
# Print a summary of the DataFrame after the NLTK columns were added.
df_jobs.info()


In [None]:
# Preview the first rows of the three NLTK-derived columns.
df_jobs.loc[
    :,
    [
        'Job Description nltk_token_tags',
        'Job Description nltk_lemmas',
        'Job Description nltk_stems',
    ],
].head()


In [None]:
# Validate before overwriting the saved files; the type is checked first so
# len() only runs on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# Save both a pickle and a CSV copy of the spaCy + NLTK results.
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.csv', index=False)


In [None]:
# Record the current number of rows in a small text file under data_dir.
length_file_path = f'{data_dir}df_jobs_len.txt'
print(f'Saving df_jobs length {len(df_jobs)} to txt file.')
with open(length_file_path, 'w') as length_file:
    length_file.write(str(len(df_jobs)))


# Use BERT to create Parts-Of-Speech (POS) tags (lemmas and stems are not produced in this section)


### START HERE IF SOURCING FROM DF_JOBS_TAGS_LEMMAS_STEMS_SPACY_NLTK

### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Locate the project's code directory and put it on sys.path so that the
# project-local `setup_module` package can be imported by the next cell.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

current_dir = Path.cwd()

if code_dir_name in current_dir.name:
    # The working directory itself is the code directory.
    code_dir = str(current_dir)
else:
    # Look at most five levels up. Slicing the materialized list of parents
    # (instead of indexing parents[0..4]) avoids an IndexError when the
    # working directory has fewer than five ancestors.
    for parent_dir in list(current_dir.parents)[:5]:
        if (code_dir_name in parent_dir.name) and (unwanted_subdir_name not in parent_dir.name):
            code_dir = str(parent_dir)
            break

if code_dir is None:
    # Never put None on sys.path; report the problem instead.
    print(f'WARNING: no "{code_dir_name}" directory found at or above {current_dir}')
elif code_dir not in sys.path:
    # This setup cell is repeated in the notebook; do not add duplicates.
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
import evaluate
from accelerate import Accelerator

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertConfig,
    BertForPreTraining,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    BertTokenizerFast,
    BitsAndBytesConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    EarlyStoppingCallback,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Model,
    GPT2TokenizerFast,
    GPTJConfig,
    GPTJForSequenceClassification,
    GPTJModel,
    GPTNeoXConfig,
    GPTNeoXForSequenceClassification,
    GPTNeoXTokenizerFast,
    LlamaConfig,
    LlamaForSequenceClassification,
    LlamaTokenizer,
    LlamaTokenizerFast,
    MegatronBertForSequenceClassification,
    OpenAIGPTConfig,
    OpenAIGPTForSequenceClassification,
    OpenAIGPTTokenizerFast,
    TextClassificationPipeline,
    TFGPTJForSequenceClassification,
    TFGPTJModel,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
    pipeline,
)
from transformers.integrations import (
    TensorBoardCallback,
    is_optuna_available,
    is_ray_available,
)
# Create an Accelerate `Accelerator` instance.
# NOTE(review): `accelerator` is not referenced in the cells that follow - confirm it is still needed.
accelerator = Accelerator()


In [None]:
# Load the DataFrame saved by the NLTK section as input for the BERT step.
# Pickle files can execute code when loaded; only read files produced by this project.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk.pkl')


In [None]:
%%time
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()

# Pick the compute device: Apple MPS first, then CUDA, then CPU.
# torch.backends.mps.is_built() replaces the deprecated torch.has_mps flag.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')

# General-purpose BERT tokenizer and model.
# NOTE(review): trust_remote_code=True lets code from the model repository run
# locally - confirm it is really needed for these models.
# NOTE(review): bert_model is not used by the POS tagging below - confirm it is still needed.
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(
    bert_model_name, strip_accents=True, trust_remote_code=True)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name, trust_remote_code=True).to(device)

# POS tagger. The tokenizer must come from the same checkpoint as the model:
# the POS model is multilingual and cased, so ids produced by the uncased
# English tokenizer above would not match its vocabulary.
bert_pos_model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
bert_pos_tokenizer = AutoTokenizer.from_pretrained(bert_pos_model_name)
bert_pos_model = AutoModelForTokenClassification.from_pretrained(bert_pos_model_name, trust_remote_code=True).to(device)
bert_pos_tagger = TokenClassificationPipeline(model=bert_pos_model, tokenizer=bert_pos_tokenizer, device=device)

# Tag each text once and keep (word piece, tag, score) for every prediction.
# The result is a pandas Series, which has no .to(device); only the model and
# pipeline live on the device.
# NOTE(review): inputs longer than the model's maximum sequence length are not
# truncated here - confirm the texts are short enough.
df_jobs['Job Description bert_token_tags_with_scores'] = df_jobs['Job Description spacy_sentencized'].progress_apply(
    lambda sentence: [
        (bert_pos_tag['word'], bert_pos_tag['entity'], bert_pos_tag['score'])
        for bert_pos_tag in bert_pos_tagger(sentence)
    ]
)

# Intermediate save so the tagging does not have to be repeated if a later step fails.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)

# Drop the scores: one (word piece, tag) tuple per prediction, in the same
# list-of-tuples layout as the spaCy and NLTK token-tag columns.
df_jobs['Job Description bert_token_tags'] = df_jobs['Job Description bert_token_tags_with_scores'].progress_apply(
    lambda tag_list: [(word, entity) for word, entity, _score in tag_list]
)

assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)



In [None]:
# Validate before overwriting the saved files; the type is checked first so
# len() only runs on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
# Save both a pickle and a CSV copy of the spaCy + NLTK + BERT results.
df_jobs.to_pickle(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)
