# ATTN: This script uses Google translate to detect job description language. Google translate will limit requests and take a very long time. Only run this script if redoing language detection.

## Read from scraped data

In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


#### Read paths

In [None]:
# De-duplicate the matches and sort them: the iteration order of a set of strings can
# differ between interpreter runs, which would change the row order of the dataframe
# concatenated from these files (and therefore which duplicate a later
# `drop_duplicates(keep='first')` retains).
glob_paths = sorted(set(glob.glob(f'{scraped_data}Coding Material/*Folder/*/Job ID -*- Codebook (Automating Equity).xlsx')))


In [None]:
# 244 xlsx files
len(glob_paths)


#### Use paths to open files, fix keywords, and drop unneeded columns

In [None]:
%%time
# columns
cols=['Sector', 
      'Sector Code', 
      'Gender', 
      'Age', 
      'Language', 
      'Dutch Requirement', 
      'English Requirement', 
      'Gender_Female', 
      'Gender_Mixed', 
      'Gender_Male', 
      'Age_Older', 
      'Age_Mixed', 
      'Age_Younger', 
      'Gender_Num', 
      'Age_Num', 
      '% Female', 
      '% Male', 
      '% Older', 
      '% Younger']

# Fix list catches the path of every workbook that could not be read
fix_list = []

# Appended data catches all the fixed and cleaned dfs
appended_data = []

for glob_path in glob_paths:

    try:
        df_temp = pd.read_excel(glob_path)
    except ValueError:
        # Record the unreadable workbook and skip it. Without `continue` the code below
        # would run on the previous iteration's `df_temp` (appending that file twice),
        # or fail with NameError if the very first workbook is the unreadable one.
        fix_list.append(glob_path)
        continue

    # Only keep non-empty sheets
    if isinstance(df_temp, pd.DataFrame) and len(df_temp) > 0:
        # Drop the sector / demographic columns listed in `cols`
        df_temp = df_temp.drop(columns=cols, errors='ignore')

        # Drop spreadsheet artefact columns ('Unnamed: 0', 'index', 'level_0', ...);
        # `case=False` already makes the match case-insensitive.
        artefact_cols = df_temp.columns[
            df_temp.columns.str.contains('unnamed|index|level', regex=True, case=False)
        ]
        df_temp = df_temp.drop(columns=artefact_cols, errors='ignore')

        appended_data.append(df_temp)

# Concatenate list of dfs into one large df_manual
df_manual = pd.concat(appended_data, axis='index')

# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw.csv')


In [None]:
# If we couldn't fix some keywords, we add them to list fix_list and write to file
if len(fix_list) != 0:
    print('Some keywords to fix!')
    with open(f'{data_dir}fix_list.txt', 'w') as f:
        json.dump(fix_list, f)


In [None]:
# List of dfs, len = 244
len(appended_data)


In [None]:
# Concatenate list of dfs into one large df_manual
df_manual = pd.concat(appended_data, axis='index')


In [None]:
# len = 12400
len(df_manual)


In [None]:
# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw.csv')


## Drop duplicated and missing data

### START HERE IF SOURCING FROM df_manual_RAW
### PLEASE SET CORRECT DIRECTORY PATHS BELOW

In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw.pkl')


In [None]:
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# len = 12400
len(df_manual)


In [None]:
df_manual.info()


In [None]:
# Clean columns
df_manual.columns = df_manual.columns.to_series().progress_apply(lambda x: str(x).strip())

In [None]:
# Remove columns 'Task_Mentioned', 'Task_Warmth', 'Task_Competence'
df_manual = df_manual.drop(
    columns=['Task_Mentioned', 'Task_Warmth', 'Task_Competence'],
    axis='columns',
    errors='ignore'
)

In [None]:
df_manual.info()

In [None]:
# Missing values: Sentence = 4, Warmth = 2, Competence = 0
df_manual.isna().sum()

In [None]:
# Drop NA
df_manual = df_manual.dropna(axis='index', how='all')
df_manual = df_manual.dropna(axis='columns', how='all')
df_manual = df_manual.dropna(
    subset = ['Sentence', 'Warmth', 'Competence'],
)

In [None]:
# No na values
df_manual.isna().sum()

In [None]:
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual.columns

In [None]:
# Convert Warmth and Competence to int
# Warmth 1 = 2826, Competence 1 = 5064
int_cols = [
    'Warmth',
    'Competence',
]

for col in int_cols:
    df_manual[col] = df_manual[col].astype(np.int64, errors='ignore')
    print(f'{col} converted to int.' if all(df_manual[col].progress_apply(lambda x: isinstance(x, int))) else f'{col} NOT converted to int.')
    print(f'{col} value counts:\n{df_manual[col].value_counts()}')


In [None]:
df_manual.info()


In [None]:
%%time
# Convert Job ID and Sentence to str
str_cols = [
    'Job ID',
    'Sentence',
]

for col in str_cols:
    df_manual[col] = df_manual[col].astype(str, errors='ignore').progress_apply(lambda x: x.strip().replace('[', '').replace(']', ''))
    print(f'{col} converted to str.' if all(df_manual[col].progress_apply(lambda x: isinstance(x, str))) else f'{col} NOT converted to str.')


In [None]:
# len = 12394
len(df_manual)

In [None]:
df_manual.info()

In [None]:
# Rename Sentence to 'Job Description spacy_sentencized'
df_manual = df_manual.rename(
    columns = {
        'Sentence': 'Job Description spacy_sentencized'
    },
    errors='ignore'
)

In [None]:
df_manual.columns


In [None]:
# Drop NA
df_manual = df_manual.dropna(axis='index', how='all')
df_manual = df_manual.dropna(axis='columns', how='all')
df_manual = df_manual.dropna(
    subset = ['Job Description spacy_sentencized', 'Warmth', 'Competence'],
)


In [None]:
# len = 12394
len(df_manual)


In [None]:
# len = 133
len(df_manual.groupby(['Job ID'])['Job ID'].unique())


In [None]:
# Drop duplicates on subset of 'Job ID' and 'Job Description spacy_sentencized' (the renamed 'Sentence' column)
random.seed(random_state)
np.random.seed(random_state)
df_manual = df_manual.drop_duplicates(subset=['Job ID', 'Job Description spacy_sentencized'], keep='first')


In [None]:
# len = 6400
len(df_manual)


In [None]:
# Remove any rows with missing 'Job ID'
df_manual = df_manual.drop(
    df_manual[
        (df_manual['Job ID'].isin(nan_list)) | 
        (df_manual['Job ID'].isnull()) | 
        (df_manual['Job ID'].isna())
    ].index, 
    axis='index',
)


In [None]:
# len = 6400
len(df_manual)


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_dropped.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_dropped.csv')


## Add English and Dutch language requirement columns

### START HERE IF SOURCING FROM df_manual_RAW_DROPPED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw_dropped.pkl')#


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# 6400
len(df_manual)


In [None]:
df_manual.info()


In [None]:
%%time
# Add language requirement column
# Use regex to find language requirement
# Each alternative spells its first letter as a two-case character class, e.g. `[Rr]equired`
# matches both "Required" and "required". (The previous `[Re]quired` matched "Required" and
# "equired", so the lower-case "required" was never detected.)
dutch_requirement_pattern = r'[Ll]anguage: [Dd]utch|[Dd]utch [Pp]referred|[Dd]utch [Rr]equired|[Dd]utch [Ll]anguage|[Pp]roficient in [Dd]utch|[Ss]peak [Dd]utch|[Kk]now [Dd]utch'
english_requirement_pattern = r'[Ll]anguage: [Ee]nglish|[Ee]nglish [Pp]referred|[Ee]nglish [Rr]equired|[Ee]nglish [Ll]anguage|[Pp]roficient in [Ee]nglish|[Ss]peak [Ee]nglish|[Kk]now [Ee]nglish'

lang_requirements = {
    'Dutch Requirement': dutch_requirement_pattern, 'English Requirement': english_requirement_pattern
}

for lang_req, lang_req_pattern in lang_requirements.items():
    # Recompute the column from scratch if it already exists
    if lang_req in df_manual.columns:
        df_manual = df_manual.drop(columns=[lang_req])

    # `na=False`: a missing sentence yields NaN from `str.contains`, which `np.where`
    # would otherwise treat as truthy and label 'Yes'.
    df_manual[lang_req] = np.where(
        df_manual['Job Description spacy_sentencized'].str.contains(lang_req_pattern, regex=True, na=False),
        'Yes',
        'No',
    )

    # Build the ordered categorical directly with both labels declared.
    # `.cat.reorder_categories(['No', 'Yes'])` raises ValueError whenever only one of
    # the two labels actually occurs in the data, so it is not used here.
    df_manual[lang_req] = pd.Categorical(df_manual[lang_req], categories=['No', 'Yes'], ordered=True)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# Pickle and CSV share one base name (the CSV was previously written as
# 'df_manual_raw_english_requirement.csv', leaving a stray, differently named file).
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_language_requirement.csv')


In [None]:
# Yes = 7
df_manual['Dutch Requirement'].value_counts()


In [None]:
# Yes = 8
df_manual['English Requirement'].value_counts()

In [None]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_language_requirement.csv')


## Add data from Sectors dataframe (see CBS directory under scrapped_data directory) and Categorical data


### START HERE IF SOURCING FROM df_manual_RAW_LANGUAGE_REQUIREMENT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
def get_df_info(df, ivs_all=None):
    if ivs_all is None:
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        try:
            print('='*20)
            print(f'{iv}:')
            print('-'*20)
            if len(df[iv].value_counts()) > 5:
                print(f'{iv} Counts:\n{df[iv].value_counts()}')
                print('-'*20)
                print(f'{iv} Percentages:\n{df[iv].value_counts(normalize=True).mul(100).round(1).astype(float)}')
                print('-'*20)
            min_val = df[iv].min()
            max_val = df[iv].max()
            if min_val not in [0, 1]:
                print(f'Min {iv} value: {min_val}')
            if max_val not in [1, 3]:
                print(f'Max {iv} value: {max_val}')
            with contextlib.suppress(Exception):
                print('-'*20)
                print(f'{iv} Mean: {df[iv].mean().round(2).astype(float)}')
                print('-'*20)
                print(f'{iv} Standard Deviation: {df[iv].std().round(2).astype(float)}')
        except Exception:
            print(f'{iv} not available.')

    print('\n')


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl')


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual.info()


In [None]:
df_manual['Job ID'] = df_manual['Job ID'].progress_apply(lambda x: str(x).lower().strip())


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl')


In [None]:
df_jobs.info()

In [None]:
df_jobs['Job ID'] = df_jobs['Job ID'].progress_apply(lambda x: str(x).lower().strip())


In [None]:
df_jobs.columns


In [None]:
df_jobs = df_jobs.drop(
    columns = [
        'Job Description', 'Rating', 'Employment Type',
        'Company URL', 'Job URL', 'Job Age', 'Job Age Number',
        'Collection Date', 'Data Row', 'Tracking ID', 'Job Date',
        'Type of ownership', 'Language', 'Dutch Requirement', 'English Requirement', 
    ],
    errors='ignore'
)

In [None]:
df_jobs.columns


In [None]:
# Add sector and categorical data from df_jobs
# The inner join keeps only sentences whose 'Job ID' occurs in both frames, so rows of
# df_manual without a matching 'Job ID' in df_jobs are dropped by this step.
df_manual = df_manual.merge(df_jobs, on='Job ID', how='inner')


In [None]:
# len = 5978
len(df_manual)


In [None]:
df_manual.info()


In [None]:
df_manual.head()

#### Check if there is any missing sector data in the merged dataframe

In [None]:
df_manual['Sector'].isna().sum()

In [None]:
if df_manual['Sector'].isna().sum() != 0:
    print('Some search keywords did not match a sector. Fixing')
    print(set(df_manual['Search Keyword'].loc[df_manual['Sector'].isna()].to_list()))
    print(len(df_manual['Search Keyword'].loc[df_manual['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))
    df_manual = fix_keywords(df_manual)
    print(set(df_manual['Search Keyword'].loc[df_manual['Sector'].isna()].to_list()))
    print(len(df_manual['Search Keyword'].loc[df_manual['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))


In [None]:
# Manual Job Ad info, len = 117
get_df_info(df_manual.groupby(['Job ID']).first())


In [None]:
# Manual Job Sentence info
get_df_info(df_manual)


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
if df_manual['Sector'].isna().sum() == 0:
    assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
    df_manual.to_pickle(f'{df_save_dir}df_manual_including_sector_genage_data.pkl')
    df_manual.to_csv(f'{df_save_dir}df_manual_including_sector_genage_data.csv')
else:
    print(f"MISSING SECTOR DATA: COUNT {df_manual['Sector'].isna().sum()}")

# ATTN: This script should be run AFTER spacy sentence splitting is completed.


## Use spacy to tokenize sentences


### START HERE IF SOURCING FROM df_manual_INCLUDING_SECTOR_GENAGE_DATA
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# Cache of unpickled punctuation collections, keyed by file path, so the file is
# read once instead of once per dataframe row.
_CUSTOM_PUNCT_CACHE = {}


def _load_custom_punct_chars(path):
    """Return the punctuation collection pickled at *path*, reading the file only once."""
    if path not in _CUSTOM_PUNCT_CACHE:
        # NOTE: pickle executes arbitrary code on load - only use with trusted,
        # project-generated files.
        with open(path, 'rb') as f:
            _CUSTOM_PUNCT_CACHE[path] = pickle.load(f)
    return _CUSTOM_PUNCT_CACHE[path]


def get_word_num_and_frequency(row, text_col, custom_punct_chars=None):
    """Add word, character and punctuation counts for ``row[text_col]`` to *row*.

    Args:
        row: Mapping-like record (e.g. a dataframe row) that supports item
            access and item assignment.
        text_col: Key of the text to measure; the value is passed through ``str``.
        custom_punct_chars: Optional collection of extra punctuation characters.
            When ``None`` it is loaded (once) from ``f'{data_dir}punctuations.txt'``.

    Returns:
        The same *row*, with these keys set:
        ``'Job Description num_words'``, ``'Job Description num_unique_words'``,
        ``'Job Description num_chars'``,
        ``'Job Description num_chars_no_whitespact_and_punt'`` (characters that are
        neither whitespace nor punctuation) and
        ``'Job Description num_punctuations'``.
    """
    if custom_punct_chars is None:
        custom_punct_chars = _load_custom_punct_chars(f'{data_dir}punctuations.txt')

    text = str(row[text_col])
    words = text.split()

    # A character is punctuation if it is in EITHER collection; this is the exact
    # complement of the "not in custom and not in string.punctuation" filter used
    # for the non-punctuation count.
    punct_flags = [
        c in custom_punct_chars or c in string.punctuation
        for c in text
    ]

    row['Job Description num_words'] = len(words)
    row['Job Description num_unique_words'] = len(set(words))
    row['Job Description num_chars'] = len(text)
    # Whitespace characters are filtered out here; translating them to '' inside a
    # list still left one (empty) element per whitespace character to be counted.
    row['Job Description num_chars_no_whitespact_and_punt'] = sum(
        1
        for c, is_punct in zip(text, punct_flags)
        if not is_punct and c not in string.whitespace
    )
    row['Job Description num_punctuations'] = sum(punct_flags)

    return row


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_including_sector_genage_data.pkl')


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual['Job Description spacy_sentencized_lower'] = df_manual['Job Description spacy_sentencized'].progress_apply(
    lambda job_sentence: job_sentence.strip().lower()
)


In [None]:
df_manual[['Job Description spacy_sentencized', 'Job Description spacy_sentencized_lower']].head()


In [None]:
%%time
# Spacy tokenize
# Load the project's pickled collection of extra punctuation characters.
# NOTE: pickle executes arbitrary code on load - only use with trusted project files.
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)

# For every sentence, run only the tokenizer of `nlp` (presumably the spaCy pipeline
# provided by setup_module.imports - confirm) and keep the stripped, lower-cased text
# of each token that passes all of the filters below.
df_manual['Job Description spacy_tokenized'] = df_manual[
    'Job Description spacy_sentencized'
].progress_apply(
    lambda job_sentence: [
        str(token.text.strip().lower())
        for token in nlp.tokenizer(job_sentence)
        # Skip empty tokens, whitespace, stop words, punctuation, brackets, e-mail-like
        # tokens, and anything listed in the custom punctuation collection.
        if len(token) != 0
        and not token.is_space
        and not token.is_stop
        and not token.is_punct
        and not token.is_bracket
        and not token.like_email
        and token.text not in custom_punct_chars
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy.csv')


In [None]:
df_manual['Job Description spacy_sentencized_cleaned'] = df_manual['Job Description spacy_tokenized'].str.join(' ')


In [None]:
%%time
# Get sentence word frequencies
df_manual = df_manual.progress_apply(
    lambda row: get_word_num_and_frequency(
        row=row, text_col='Job Description spacy_sentencized'
    ), 
    axis='columns',
)


In [None]:
df_manual[
    [
        'Job Description spacy_sentencized',
        'Job Description num_words', 'Job Description num_unique_words',
        'Job Description num_chars', 'Job Description num_chars_no_whitespact_and_punt'
    ]
].head()


In [None]:
df_manual.columns


In [None]:
get_df_info(df_manual)

In [None]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy.csv')


## Use NLTK to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl')


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual.info()


In [None]:
%%time
# Tokenize with NLTK
# lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

# Build both lookup sets once, before the apply: previously the stop-word set and the
# punctuation list were rebuilt for every single token of every sentence.
nltk_stop_words = set(stopwords.words('english'))
nltk_punctuations = set(string.punctuation)

# Keep the stripped, lower-cased form of every token that is non-empty, is not an
# ellipsis, is not an English stop word and is not a single punctuation character.
df_manual['Job Description nltk_tokenized'] = df_manual['Job Description spacy_sentencized'].progress_apply(
    lambda job_sentence: [
        str(token.strip().lower())
        for token in word_tokenize(job_sentence)
        if len(token) != 0
        and token != '...'
        and token.lower() not in nltk_stop_words
        and token.lower() not in nltk_punctuations
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk.csv')


In [None]:
df_manual['Job Description nltk_tokenized'].head()


In [None]:
df_manual.info()


In [None]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk.csv')


## Use gensim to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl')


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual.info()


In [None]:
%%time
# For every sentence: strip and lower-case it, replace each match of `pattern` with a
# space, then pass the result to `preprocess_string`.
# NOTE(review): `pattern` and `preprocess_string` are not defined in this notebook -
# presumably they come from setup_module.imports (gensim's preprocess_string?); confirm.
df_manual['Job Description gensim_tokenized'] = df_manual['Job Description spacy_sentencized'].progress_apply(
    lambda sentence: preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.csv')


In [None]:
df_manual['Job Description gensim_tokenized'].head()


In [None]:
df_manual.info()


In [None]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.csv')


## Use BERT to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory: the first ancestor (at most five levels up) whose folder name
# contains `code_dir_name` but not `unwanted_subdir_name` is used.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Slice a materialised list of ancestors: indexing `parents[i]` directly raises
# IndexError when the working directory is fewer than five levels deep.
for ancestor in list(Path.cwd().parents)[:5]:

    # `.name` is the final path component on every OS, unlike splitting on '/'.
    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):
        code_dir = str(ancestor)
        break

# Never put None on sys.path when no matching ancestor exists.
if code_dir is not None:
    sys.path.append(code_dir)
else:
    print(f'WARNING: no parent directory containing {code_dir_name!r} found; sys.path left unchanged.')
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl')


In [None]:
# Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
df_manual.info()


In [None]:
%%time
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
# `torch.backends.mps` is queried directly; the module-level `torch.has_mps`
# attribute used previously is deprecated.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')

bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(bert_model_name, strip_accents = True)
# NOTE(review): the model is loaded and moved to `device` here but is not used by the
# tokenization below - presumably later cells rely on it; confirm before removing.
bert_model = BertForSequenceClassification.from_pretrained(bert_model_name).to(device)

# Store the tokenizer's token strings for every sentence.
df_manual['Job Description bert_tokenized'] = df_manual['Job Description spacy_sentencized'].progress_apply(
    lambda sentence: bert_tokenizer.tokenize(str(sentence))
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.csv')


In [None]:
df_manual['Job Description bert_tokenized'].head()


In [None]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_for_trainning.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_for_trainning.csv')


In [None]:
print(f'Saving df_manual length {len(df_manual)} to txt file.')
with open(f'{data_dir}df_manual_len.txt', 'w') as f:
    f.write(str(len(df_manual)))


### Specification Curve Check

In [None]:
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Report which variable sets go into the analysis before running it
print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs}\nINDEPENDENT VARIABLES = {ivs_perc}\nCONTROLS = {controls}')

# Build the specification curve over df_manual from the dependent variables,
# independent variables, and controls reported above
sc = specy.SpecificationCurve(
    df=df_manual,
    y_endog=dvs,
    x_exog=ivs_perc,
    controls=controls,
)

# Fit using statsmodels' Logit as the estimator, then display the plot
sc.fit(estimator=sm.Logit)
sc.plot(show_plot=True)


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.csv')


# ATTN: This script should be run AFTER all tokenization (spacy, nltk, gensim, and BERT) is completed.


## Use spacy to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK_GENSIM_BERT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl')


In [None]:
## Warmth 1 = 1595 (24.90%), Competence 1 = 2836 (44.30%)
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# df_manual.info()


In [None]:
# %%time
# # Load custom punctuation characters
# with open(f'{data_dir}punctuations.txt', 'rb') as f:
#     custom_punct_chars = pickle.load(f)

# # POS tagging
# df_manual['Job Description spacy_token_tags'] = df_manual[
#     'Job Description spacy_sentencized'
# ].progress_apply(
#     lambda job_sentence: [
#         (token.text.strip().lower(), token.tag_) for token in nlp(job_sentence)
#     ]
# )

# # Lemmatization
# df_manual['Job Description spacy_lemmas'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#     lambda job_sentence: [
#         token.lemma_.strip().lower()
#         for token in nlp(job_sentence)
#         if len(token) != 0 and not token.is_stop and not token.is_punct and token.text not in custom_punct_chars
#     ]
# )

# # Stemming
# df_manual['Job Description spacy_stems'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#     lambda job_sentence: [
#         stemmer.stem(token.text.strip().lower())
#         for token in nlp(job_sentence)
#         if len(token) != 0 and not token.is_stop and not token.is_punct and token.text not in custom_punct_chars
#     ]
# )

# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.csv')


In [None]:
# df_manual.info()


In [None]:
# df_manual[
#     [
#         'Job Description spacy_token_tags',
#         'Job Description spacy_lemmas',
#         'Job Description spacy_stems'
#     ]
# ].head()


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.csv')


## Use NLTK to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# def get_wordnet_pos(token):
#     """Map POS tag to first character lemmatize() accepts"""
#     tag = nltk.pos_tag([token])[0][1][0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# df_manual.info()


In [None]:
# %%time
# # POS tagging
# df_manual['Job Description nltk_token_tags'] = df_manual['Job Description spacy_tokenized'].progress_apply(
#     lambda token: pos_tag(token)
# )

# # Lemmatization
# df_manual['Job Description nltk_lemmas'] = df_manual['Job Description spacy_tokenized'].progress_apply(
#     lambda tokens: [
#         lemmatizer.lemmatize(
#             token, get_wordnet_pos(
#                 unicodedata.normalize('NFKD', str(token.strip().lower())).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#             )
#         )
#         for token in tokens
#     ]
# )

# # Stemming
# df_manual['Job Description nltk_stems'] = df_manual['Job Description spacy_tokenized'].progress_apply(
#     lambda tokens: [
#         stemmer.stem(
#             unicodedata.normalize('NFKD', str(token.strip().lower())).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#         )
#         for token in tokens
#     ]
# )

# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.csv')


In [None]:
# df_manual.info()


In [None]:
# df_manual[['Job Description nltk_token_tags', 'Job Description nltk_lemmas', 'Job Description nltk_stems']].head()


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.csv')


## Use BERT to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# bert_pos_model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
# bert_pos_model = AutoModelForTokenClassification.from_pretrained(bert_pos_model_name).to(device)
# bert_pos_tagger = TokenClassificationPipeline(model=bert_pos_model, tokenizer=bert_tokenizer).to(device)

# df_manual['Job Description bert_token_tags_with_scores'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#     lambda sentence: [
#         (bert_pos_tag['word'], bert_pos_tag['entity'], bert_pos_tag['score'])
#         for i in range(len(sentence.split()))
#         for bert_pos_tag in bert_pos_tagger(sentence)
#     ]
# )

# df_manual['Job Description bert_token_tags'] = df_manual['Job Description bert_token_tags_with_scores'].progress_apply(
#     lambda tag_list: [
#         [(tag_list[i][0], tag_list[i][1])]
#         for tag_tuple in tag_list
#         for i in range(len(tag_list))
#     ]
# )


# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.csv')


In [None]:
# df_manual['Job Description bert_token_tags'].head()

In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.csv')


# ATTN: This script should be run AFTER all POS tagging, lemmatization, and stemming (spacy and nltk) are completed. If BERT POS tagging was also done, change which pkl file is loaded below.


## Use spacy to create bi and trigrams


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os
# import sys
# import importlib
# from pathlib import Path
# import numpy as np

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# def spacy_make_ngrams(sentence, matcher, gram_type):

#     doc = nlp(sentence)
#     matches = matcher(doc)
#     matches_list = []

#     for idx in range(len(matches)):
#         for match_id, start, end in matches:
#             if nlp.vocab.strings[match_id].split('_')[0] == gram_type:
#                 match = doc[matches[idx][1]: matches[idx][2]].text
#                 matches_list.append(match.lower())

#     return list(set(matches_list))


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# df_manual['Job Description spacy_1grams_original_list'] = df_manual['Job Description spacy_tokenized']
# df_manual['Job Description spacy_1grams'] = df_manual['Job Description spacy_tokenized'].progress_apply(
#     lambda tokens: [
#         tuple(token.split())
#         for token in tokens
#     ]
# )


In [None]:
# %%time
# # Spacy bi and trigrams
# matcher = Matcher(nlp.vocab)

# bigram_rules = [
#     ['NOUN', 'VERB'],
#     ['VERB', 'NOUN'],
#     ['ADJ', 'NOUN'],
#     ['ADJ', 'PROPN'],
#     # more rules here...
# ]

# trigram_rules = [
#     ['VERB', 'ADJ', 'NOUN'],
#     ['NOUN', 'VERB', 'ADV'],
#     ['NOUN', 'ADP', 'NOUN'],
#     # more rules here...
# ]

# patters_dict = {
#     'bigram_patterns': [[{'POS': i} for i in j] for j in bigram_rules],
#     'trigram_patterns': [[{'POS': i} for i in j] for j in trigram_rules],
# }

# ngram_dict = {
#     'bigram': 2,
#     'trigram': 3,
# }

# for ngram_name, ngram_num in tqdm.tqdm(ngram_dict.items()):

#     matcher.add(f'{ngram_name}_patterns', patters_dict[f'{ngram_name}_patterns'])

#     df_manual[f'Job Description spacy_{str(ngram_num)}grams_original_list'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#         lambda sentence: 
#             [
#                 '_'.join(ngram_.split())
#                 for ngram_ in spacy_make_ngrams(sentence, matcher, ngram_name)
#             ]
#     )

#     df_manual[f'Job Description spacy_{str(ngram_num)}grams'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#         lambda sentence: 
#             [
#                 tuple(ngram_.split())
#                 for ngram_ in spacy_make_ngrams(sentence, matcher, ngram_name)
#             ]
#     )

#     df_manual[f'Job Description spacy_{str(ngram_num)}grams_in_sent'] = df_manual['Job Description spacy_sentencized'].str.lower().replace(
#         regex = {
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[f'Job Description spacy_{str(ngram_num)}grams_original_list']
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )

#     if f'{ngram_name}_patterns' in matcher:
#         matcher.remove(f'{ngram_name}_patterns')
#     assert f'{ngram_name}_patterns' not in matcher


In [None]:
# %%time
# # Spacy Allgrams
# df_manual['Job Description spacy_123grams_original_list'] = df_manual['Job Description spacy_tokenized'] + df_manual['Job Description spacy_2grams_original_list'] + df_manual['Job Description spacy_3grams_original_list']
# df_manual['Job Description spacy_123grams'] = df_manual['Job Description spacy_1grams'] + df_manual['Job Description spacy_2grams'] + df_manual['Job Description spacy_3grams']
# df_manual['Job Description spacy_123grams_in_sent'] = (
#     df_manual['Job Description spacy_sentencized']
#     .str.lower()
#     .replace(
#         regex={
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[
#                 'Job Description spacy_123grams_original_list'
#             ]
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )
# )


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy.csv')


## Use NLTK to create bi and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# df_manual['Job Description nltk_1grams_original_list'] = df_manual['Job Description nltk_tokenized']
# df_manual['Job Description nltk_1grams'] = df_manual['Job Description nltk_tokenized'].progress_apply(
#     lambda tokens: [
#         tuple(token.split())
#         for token in tokens
#     ]
# )


In [None]:
# %%time
# # NLTK bi and trigrams
# ngram_dict = {
#     'bigram': 2,
#     'trigram': 3
# }

# for ngram_name, ngram_num in tqdm.tqdm(ngram_dict.items()):

#     df_manual[f'Job Description nltk_{str(ngram_num)}grams_original_list'] = df_manual['Job Description nltk_tokenized'].progress_apply(
#         lambda tokens:
#             list(
#                 '_'.join(ngram_list)
#                 for ngram_list in nltk.ngrams(tokens, ngram_num)
#             )
#     )

#     df_manual[f'Job Description nltk_{str(ngram_num)}grams'] = df_manual['Job Description nltk_tokenized'].progress_apply(
#         lambda tokens: list(nltk.ngrams(tokens, ngram_num))
#     )

#     df_manual[f'Job Description nltk_{str(ngram_num)}grams_in_sent'] = df_manual['Job Description spacy_sentencized'].str.lower().replace(
#         regex = {
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[f'Job Description nltk_{str(ngram_num)}grams_original_list']
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )


In [None]:
# %%time
# # NLTK Allgrams
# df_manual['Job Description nltk_123grams_original_list'] = (
#     df_manual['Job Description nltk_tokenized']
#     + df_manual['Job Description nltk_2grams_original_list']
#     + df_manual['Job Description nltk_3grams_original_list']
# )
# df_manual['Job Description nltk_123grams'] = (
#     df_manual['Job Description nltk_1grams']
#     + df_manual['Job Description nltk_2grams']
#     + df_manual['Job Description nltk_3grams']
# )
# df_manual['Job Description nltk_123grams_in_sent'] = (
#     df_manual['Job Description spacy_sentencized']
#     .str.lower()
#     .replace(
#         regex={
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[
#                 'Job Description nltk_123grams_original_list'
#             ]
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )
# )


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy_nltk.csv')


## Use Gensim to create bi and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# df_manual['Job Description gensim_1grams_original_list'] = df_manual['Job Description gensim_tokenized']
# df_manual['Job Description gensim_1grams'] = df_manual['Job Description gensim_tokenized'].progress_apply(
#     lambda tokens: [
#         tuple(token.split())
#         for token in tokens
#     ]
# )


In [None]:
# %%time
# # Gensim bi and trigrams
# pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'

# # Gensim Bigrams
# bigram = Phraser(Phrases(df_manual['Job Description gensim_tokenized'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
# df_manual['Job Description gensim_2grams_original_list_all'] = bigram[df_manual['Job Description gensim_tokenized']]
# df_manual['Job Description gensim_2grams_original_list'] = df_manual['Job Description gensim_2grams_original_list_all'].progress_apply(
#     lambda ngrams_list: [
#         ngram_
#         for ngram_ in ngrams_list
#         if len(re.findall('[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
#     ]
# )
# df_manual['Job Description gensim_2grams'] = df_manual['Job Description gensim_2grams_original_list'].progress_apply(
#     lambda ngrams: [
#         tuple(ngram.split('_'))
#         for ngram in ngrams
#         if '_' in ngram
#     ]
# )
# df_manual['Job Description gensim_2grams_in_sent'] = (
#     df_manual['Job Description spacy_sentencized']
#     .str.lower()
#     .progress_apply(
#         lambda sentence: ' '.join(
#             preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
#         )
#     )
#     .replace(
#         regex={
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[
#                 'Job Description gensim_2grams_original_list'
#             ]
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )
# )

# # Gensim Trigrams
# trigram = Phraser(Phrases(df_manual['Job Description gensim_2grams_original_list_all'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
# df_manual['Job Description gensim_3grams_original_list_all'] = trigram[df_manual['Job Description gensim_2grams_original_list_all']]
# df_manual['Job Description gensim_3grams_original_list'] = df_manual['Job Description gensim_3grams_original_list_all'].progress_apply(
#     lambda ngrams_list: [
#         ngram_
#         for ngram_ in ngrams_list
#         if len(re.findall('[a-zA-Z]*\_[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
#     ]
# )
# df_manual['Job Description gensim_3grams'] = df_manual['Job Description gensim_3grams_original_list'].progress_apply(
#     lambda ngrams: [
#         tuple(ngram.split('_'))
#         for ngram in ngrams
#         if '_' in ngram
#     ]
# )
# df_manual['Job Description gensim_3grams_in_sent'] = (
#     df_manual['Job Description spacy_sentencized']
#     .str.lower()
#     .progress_apply(
#         lambda sentence: ' '.join(
#             preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
#         )
#     )
#     .replace(
#         regex={
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[
#                 'Job Description gensim_3grams_original_list'
#             ]
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )
# )


In [None]:
# %%time
# # Gensim Allgrams
# pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'

# df_manual['Job Description gensim_123grams_original_list'] = (
#     df_manual['Job Description gensim_tokenized']
#     + df_manual['Job Description gensim_2grams_original_list']
#     + df_manual['Job Description gensim_3grams_original_list']
# )
# df_manual['Job Description gensim_123grams'] = (
#     df_manual['Job Description gensim_1grams']
#     + df_manual['Job Description gensim_2grams']
#     + df_manual['Job Description gensim_3grams']
# )
# df_manual['Job Description gensim_123grams_in_sent'] = (
#     df_manual['Job Description spacy_sentencized']
#     .str.lower()
#     .progress_apply(
#         lambda sentence: ' '.join(
#             preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
#         )
#     )
#     .replace(
#         regex={
#             re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
#             for ngrams_list in df_manual[
#                 'Job Description gensim_123grams_original_list'
#             ]
#             for ngram_ in ngrams_list
#             if '_' in ngram_
#         }
#     )
# )


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.csv')


## Create word frequencies for uni, bi, and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# def get_abs_frequency(row, text_col, ngram_num, embedding_library):

#     abs_word_freq = defaultdict(int)
#     for word in row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']:
#         abs_word_freq[word] += 1

#         abs_wtd_df = (
#             pd.DataFrame.from_dict(abs_word_freq, orient='index')
#             .rename(columns={0: 'abs_word_freq'})
#             .sort_values(by=['abs_word_freq'], ascending=False)
#             )
#         abs_wtd_df.insert(1, 'abs_word_perc', value=abs_wtd_df['abs_word_freq'] / abs_wtd_df['abs_word_freq'].sum())
#         abs_wtd_df.insert(2, 'abs_word_perc_cum', abs_wtd_df['abs_word_perc'].cumsum())

#         row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_freq'] = str(abs_wtd_df['abs_word_freq'].to_dict())
#         row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc'] = str(abs_wtd_df['abs_word_perc'].to_dict())
#         row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc_cum'] = str(abs_wtd_df['abs_word_perc_cum'].to_dict())

#     return row


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# ngrams_list=[1, 2, 3, 123]
# embedding_libraries_list = ['spacy', 'nltk', 'gensim']

# for embedding_library, ngram_num in tqdm_product(embedding_libraries_list, ngrams_list):
#     df_manual[f'Job Description {embedding_library}_{ngram_num}grams_count'] = df_manual[f'Job Description {embedding_library}_{ngram_num}grams'].apply(lambda x: len(x))
#     df_manual = df_manual.progress_apply(lambda row: get_abs_frequency(row=row, text_col='Job Description spacy_tokenized', ngram_num=ngram_num, embedding_library=embedding_library), axis='columns')


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_frequency.csv')


## Create BoW dictionary, corpus, and tfidf matrix for uni, bi, and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_FREQUENCY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# def get_corpus_and_dictionary(row, ngram_num, embedding_library):

#     ngrams_original_list = row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']
#     dictionary = Dictionary([ngrams_original_list])
#     BoW_corpus = [dictionary.doc2bow(ngrams_original_list)]
#     tfidf = TfidfModel(BoW_corpus, smartirs='ntc')
#     tfidf_matrix = [tfidf[doc] for doc in BoW_corpus]

#     row[f'Job Description {embedding_library}_{ngram_num}grams_dictionary'] = dictionary
#     row[f'Job Description {embedding_library}_{ngram_num}grams_BoW_corpus'] = BoW_corpus
#     row[f'Job Description {embedding_library}_{ngram_num}grams_tfidf'] = tfidf
#     row[f'Job Description {embedding_library}_{ngram_num}grams_tfidf_matrix'] = tfidf_matrix

#     return row


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# ngrams_list=[1, 2, 3, 123]
# embedding_libraries_list = ['spacy', 'nltk', 'gensim']
# for embedding_library, ngram_num in tqdm_product(embedding_libraries_list, ngrams_list):
#     df_manual = df_manual.progress_apply(
#         lambda row: get_corpus_and_dictionary(
#             row=row, ngram_num=ngram_num, embedding_library=embedding_library
#         ),
#         axis='columns'
#     )

# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_BoW.csv')


In [None]:
# df_manual.columns


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_BoW.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_BoW.csv')


# ATTN: This script should be run AFTER all bi- and trigrams (spacy, nltk, and gensim) are completed.



## Use spacy and nltk for sentiment scoring


### START HERE IF SOURCING FROM df_manual_NGRAMS_BOW
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_BoW.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# %%time
# # Spacy sentiment
# if 'spacytextblob' not in nlp.pipe_names:
#     nlp.add_pipe('spacytextblob')

# df_manual['Job Description spacy_sentiment'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#     lambda sentence: float(nlp(sentence)._.blob.polarity)
#     if isinstance(sentence, str) else np.nan
# )


In [None]:
# %%time
# # NLTK sentiment
# df_manual['Job Description nltk_sentiment'] = df_manual['Job Description spacy_sentencized'].progress_apply(
#     lambda sentence: float(sentim_analyzer.polarity_scores(sentence)['compound'])
#     if isinstance(sentence, str) else np.nan
# )


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_sentiment_spacy_nltk.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_sentiment_spacy_nltk.csv')


# ATTN: This script should be run AFTER all sentiment scoring (spaCy and NLTK) has completed.


## Word2Vec and FastText embeddings


### START HERE IF SOURCING FROM df_manual_SENTIMENT_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
# import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
# from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [None]:
# from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
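# # NOTE(review): the default `t = time.time()` is evaluated once, when the function is defined, so the elapsed
# # times printed in this function are measured from definition time unless the caller passes `t` explicitly.
# # The two timing messages also appear swapped: the 'Time to train the model' message is printed right after
# # `build_vocab`, and the 'Time to build w2v_vocab' message right after `train`.
# def build_train_word2vec(df, ngram_number, embedding_library, size = 300, words=None, t = time.time(), cores = multiprocessing.cpu_count()):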
#     if words is None:
#         words = [
#             'she',
#             'he',
#             'support',
#             'leader',
#             'management',
#             'team',
#             'business',
#             'customer',
#             'risk',
#             'build',
#             'computer',
#             'programmer',
#         ]
#     sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

#     w2v_model = Word2Vec(
#         sentences=sentences,
#         vector_size=size,
#         min_count=0,
#         window=2,
#         sample=6e-5,
#         alpha=0.03,
#         min_alpha=0.0007,
#         negative=20,
#         workers=cores - 1,
#         sg = 1,
#     )

#     w2v_model.build_vocab(sentences, progress_per=10000)
#     print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')

#     w2v_model.train(
#         sentences,
#         total_examples=w2v_model.corpus_count,
#         epochs=30,
#         report_delay=1,
#     )

#     print(f'Time to build w2v_vocab for {size}: {round((time.time() - t) / 60, 2)} mins')
#     w2v_vocab = list(w2v_model.wv.index_to_key)

#     print(f'Checking words form list of length {len(words)}')
#     print(f'WORDS LIST: {words}')

# #     for word in words:
# #         print(f'Checking word:\n{word.upper()}:')
# #         try:
# # #             print(f'Word2Vec {size}: {w2v_model.wv[word]}')
# #             print(f'Length of {size} model vobal: {len(w2v_vocab)}')
# #             print(f'{size} - Positive most similar to {word}: {w2v_model.wv.most_similar(positive=word, topn=5)}')
# #             print(f'{size} - Negative most similar to {word}: {w2v_model.wv.most_similar(negative=word, topn=5)}')

# #         except KeyError as e:
# #             print(e)

#     return w2v_vocab, w2v_model

# def word2vec_embeddings(sentences, w2v_vocab, w2v_model, size=300):

#     sentences = [word for word in sentences if word in w2v_vocab]

#     return (
#         np.mean(w2v_model.wv[sentences], axis=0)
#         if sentences
#         else np.zeros(size)
#     )



In [None]:
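# # NOTE(review): the default `t = time.time()` is evaluated once, when the function is defined, so the elapsed
# # times printed in this function are measured from definition time unless the caller passes `t` explicitly.
# # The two timing messages also appear swapped: the 'Time to train the model' message is printed right after
# # `build_vocab`, and the 'Time to build vocab' message right after `train`.
# def build_train_fasttext(df, ngram_number, embedding_library, size = 300, words=None, t = time.time(), cores = multiprocessing.cpu_count()):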
#     if words is None:
#         words = [
#             'she',
#             'he',
#             'support',
#             'leader',
#             'management',
#             'team',
#             'business',
#             'customer',
#             'risk',
#             'build',
#             'computer',
#             'programmer',
#         ]
#     sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

#     ft_model = FastText(
#         sentences=sentences,
#         vector_size=size,
#         min_count=0,
#         window=2,
#         sample=6e-5,
#         alpha=0.03,
#         min_alpha=0.0007,
#         negative=20,
#         workers=cores - 1,
#         sg = 1,
#     )

#     ft_model.build_vocab(sentences, progress_per=10000)
#     print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')

#     ft_model.train(
#         sentences,
#         total_examples=ft_model.corpus_count,
#         epochs=30,
#         report_delay=1,
#     )

#     print(f'Time to build vocab for {size}: {round((time.time() - t) / 60, 2)} mins')
#     ft_vocab = list(ft_model.wv.index_to_key)

#     print(f'Checking words form list of length {len(words)}')
#     print(f'WORDS LIST: {words}')

# #     for word in words:
# #         print(f'Checking word:\n{word.upper()}:')
# #         try:
# # #             print(f'FastText {size}: {ft_model_300.wv[word]}')
# #             print(f'Length of {size} model vobal: {len(ft_vocab)}')
# #             print(f'{size} - Positive most similar to {word}: {ft_model.wv.most_similar(positive=word, topn=5)}')
# #             print(f'{size} - Negative most similar to {word}: {ft_model.wv.most_similar(negative=word, topn=5)}')

# #         except KeyError as e:
# #             print(e)

#     return ft_vocab, ft_model

# def fasttext_embeddings(sentences, ft_vocab, ft_model, size=300):

#     sentences = [word for word in sentences if word in ft_vocab]

#     return np.mean(ft_model.wv[sentences], axis=0) if sentences else np.zeros(size)


In [None]:
# def get_glove(glove_file = f'{llm_path}/gensim/glove/glove.840B.300d.txt'):
#     embeddings_index = {}
#     with open(glove_file, 'r', encoding='utf8') as glove:

#         for line in glove:
#             values = line.split()
#             word = values[0]

#             with contextlib.suppress(ValueError):
#                 coefs = np.asarray(values[1:], dtype='float32')
#                 embeddings_index[word] = coefs
#     print(f'Found {len(embeddings_index)} word vectors.')

#     return embeddings_index


In [None]:
# def sent2vec(sentences, embeddings_index=None, external_glove=True, extra_preprocessing_enabled=False):

#     if external_glove is False and embeddings_index is None:
#         embeddings_index= get_glove()

#     if extra_preprocessing_enabled is False:
#         words = sentences

#     elif extra_preprocessing_enabled is True:
#         stop_words = set(sw.words('english'))
#         words = str(sentences).lower()
#         words = word_tokenize(words)
#         words = [w for w in words if (w not in stop_words) and (w.isalpha())]

#     M = []

#     try:
#         for w in words:
#             try:
#                 M.append(embeddings_index[w])
#             except Exception:
#                 continue

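#         M = np.array(M)
#         # NOTE(review): `M` is a NumPy array here, and `ndarray.sum` only accepts None, an int, or a tuple of
#         # ints for `axis`. `axis='index'` is a pandas-style alias and raises TypeError, which the outer
#         # `except Exception` swallows, so this function returns `np.zeros(300)` for every input.
#         # Use `axis=0` when re-enabling this cell.
#         v = M.sum(axis='index')
#         return np.zeros(300) if type(v) != np.ndarray else v / np.sqrt((v ** 2).sum())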

#     except Exception:
#         return np.zeros(300)


In [None]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_sentiment_spacy_nltk.pkl')


In [None]:
# get_df_info(df_manual, ivs_all=['Warmth', 'Competence'])


In [None]:
# embedding_models_dict = {
#     'w2v': [build_train_word2vec, word2vec_embeddings, Word2Vec],
#     'ft': [build_train_fasttext, fasttext_embeddings, FastText],
# }


In [None]:
# %%time
# # Make embeddings
# ngrams_list=[1, 2, 3, 123]
# embedding_libraries_list = ['spacy', 'nltk', 'gensim']

# for embedding_library, ngram_number in tqdm_product(embedding_libraries_list, ngrams_list):
#     print(f'Building {embedding_library}_{ngram_number}grams model and vocabulary.')

#     for embed_model_name, embed_func_list in tqdm.tqdm(embedding_models_dict.items()):

#         build_train_func, embed_func, model_loader = embed_func_list
#         print(f'Building {embed_model_name} from {embed_func.__name__} function.')

#         vocab, model = build_train_func(
#             df=df_manual,
#             ngram_number=ngram_number,
#             embedding_library=embedding_library,
#         )

#         print(f'Getting {embed_model_name} embeddings.')

#         df_manual[
#             f'Job Description {embedding_library}_{ngram_number}grams_mean_{embed_model_name}_embeddings'
#         ] = df_manual[
#             f'Job Description {embedding_library}_{ngram_number}grams_original_list'
#         ].progress_apply(
#             lambda sentences: embed_func(sentences, vocab, model)
#         )
#         model.save(f'{data_dir}embeddings models/{embedding_library}_{ngram_number}grams_{embed_model_name}_model.model')

#     # Sent2Vec
#     print('Getting sent2vec embeddings.')
#     embeddings_index = get_glove()
#     df_manual[f'Job Description {embedding_library}_{ngram_number}grams_sent2vec_embeddings'] = df_manual[f'Job Description {embedding_library}_{ngram_number}grams'].progress_apply(lambda sentences: sent2vec(sentences, embeddings_index=embeddings_index, external_glove=True, extra_preprocessing_enabled=False))
#     print('Done getting sent2vec embeddings.')


In [None]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_for_trainning.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_for_trainning.csv')


In [None]:
# print(f'Saving df_manual length {len(df_manual)} to txt file.')
# with open(f'{data_dir}df_manual_len.txt', 'w') as f:
#     f.write(str(len(df_manual)))
