# ATTN: This script should be run AFTER language detection is completed.

# Drop non-English job descriptions

### START HERE IF SOURCING FROM DF_JOBS_RAW_LANGUAGE_DETECTED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_language_detected.pkl').reset_index(drop=True)


In [None]:
# len = 62577
df_jobs.info()


In [None]:
# nl = 44863, en = 17591, ['en', 'nl'] = 8, ['nl', 'en'] = 9
df_jobs['Language'].value_counts()


In [None]:
%%time
# Ads labelled ['en', 'nl'] are mostly written in English, so they are
# relabelled as plain 'en'; every other label is kept as it is.
mixed_en_nl_label = "['en', 'nl']"
df_jobs['Language'] = df_jobs['Language'].progress_apply(
    lambda lang: lang if lang != mixed_en_nl_label else 'en'
)


In [None]:
# Drop non-English ads: collect the index labels of every row whose language
# is not 'en' and remove those rows.
is_non_english = df_jobs['Language'] != 'en'
non_english_index = df_jobs.loc[is_non_english].index
df_jobs = df_jobs.drop(index=non_english_index)


In [None]:
# 17599
len(df_jobs)


In [None]:
# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
# The type is checked first so len() is only evaluated on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_english_only.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_english_only.csv', index=False)


# Fix abbreviations in job descriptions

### START HERE IF SOURCING FROM DF_JOBS_RAW_ENGLISH_ONLY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_english_only.pkl').reset_index(drop=True)


In [None]:
# len = 17599
df_jobs.info()


In [None]:
# Replace every '/' in the non-null job descriptions with a space; rows with a
# null description are excluded from the apply and stay null after assignment.
has_description = df_jobs['Job Description'].notnull()
df_jobs['Job Description'] = df_jobs.loc[has_description, 'Job Description'].progress_apply(
    lambda text: text.replace('/', ' ') if '/' in text else text
)


In [None]:
# Regex pattern -> expansion for abbreviations found in job descriptions.
# Each pattern is anchored with a word boundary so that only stand-alone
# abbreviations are expanded; without it 'e\.g' also rewrote text inside
# longer tokens such as 'site.google.com'.
abb_dict = {
    r'\bincl\.': 'including',
    # One pattern covers both 'e.g.' and 'e.g'; the boundary after 'g' keeps
    # tokens such as 'e.go' untouched.
    r'\be\.g\b\.?': 'for example',
    r'\betc\.': 'et cetera',
}


In [None]:
%%time
df_jobs['Job Description'] = df_jobs['Job Description'].replace(abb_dict, regex=True)


In [None]:
# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
# The type is checked first so len() is only evaluated on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.csv', index=False)


# Add English and Dutch language requirement columns

### START HERE IF SOURCING FROM DF_JOBS_RAW_FIXED_ABBREVIATIONS
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.pkl').reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
%%time
# Add language requirement columns
# Use regex to find language requirements. '[Rr]equired' replaces the former
# '[Re]quired', a character class that matched 'Rquired'/'equired' and missed
# the lowercase word 'required'.
dutch_requirement_pattern = r'[Ll]anguage: [Dd]utch|[Dd]utch [Pp]referred|[Dd]utch [Rr]equired|[Dd]utch [Ll]anguage|[Pp]roficient in [Dd]utch|[Ss]peak [Dd]utch|[Kk]now [Dd]utch'
english_requirement_pattern = r'[Ll]anguage: [Ee]nglish|[Ee]nglish [Pp]referred|[Ee]nglish [Rr]equired|[Ee]nglish [Ll]anguage|[Pp]roficient in [Ee]nglish|[Ss]peak [Ee]nglish|[Kk]now [Ee]nglish'

lang_requirements = {
    'Dutch Requirement': dutch_requirement_pattern, 'English Requirement': english_requirement_pattern
}

for lang_req, lang_req_pattern in lang_requirements.items():

    # Rebuild the column from scratch when the cell is re-run.
    if lang_req in df_jobs.columns:
        df_jobs = df_jobs.drop(columns=[lang_req])

    # na=False makes missing job descriptions count as "no requirement";
    # otherwise the NaN result is truthy and would be coded as 1.
    has_requirement = df_jobs['Job Description'].str.contains(lang_req_pattern, regex=True, na=False)

    # Build the ordered 0 < 1 categorical directly so that both levels exist
    # even when one of them never occurs in the data.
    df_jobs[lang_req] = pd.Categorical(has_requirement.astype('int64'), categories=[0, 1], ordered=True)

# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl')
# The CSV now shares the pickle's base name (it was previously written to
# 'df_jobs_raw_english_requirement.csv').
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_language_requirement.csv', index=False)


In [None]:
# Yes = 235
df_jobs['Dutch Requirement'].value_counts()

In [None]:
# Yes = 526
df_jobs['English Requirement'].value_counts()


In [None]:
# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
# The type is checked first so len() is only evaluated on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_language_requirement.csv', index=False)


# Add data from Sectors dataframe (see CBS directory under scraped_data directory)


### START HERE IF SOURCING FROM DF_JOBS_RAW_LANGUAGE_REQUIREMENT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl').reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
df_sectors = pd.read_pickle(f'{scraped_data}CBS/Data/Sectors Output from script.pkl').reset_index(drop=True)


In [None]:
df_sectors.columns


In [None]:
# Flatten the column labels: labels containing 'SBI Sector Titles' or
# 'Total Workforce' keep only their last element, all others are joined with
# underscores.
# NOTE(review): assumes each label is a tuple of strings (MultiIndex) - confirm.
flat_columns = []
for col in df_sectors.columns:
    if 'SBI Sector Titles' in col or 'Total Workforce' in col:
        flat_columns.append(col[-1])
    else:
        flat_columns.append('_'.join(col))
df_sectors.columns = flat_columns


In [None]:
# Give the sector columns the names used in the jobs dataframe.
sector_column_names = {
    'Keywords': 'Search Keyword',
    'Code': 'Sector Code',
    'Sector Name': 'Sector',
    'Gender_Sectoral Gender Segregation_Dominant Category': 'Gender',
    'Age_Sectoral Age Segregation_Dominant Category': 'Age',
    'n': 'Sector_n',
}
df_sectors = df_sectors.rename(columns=sector_column_names)

# Strip the ' (<non-word characters>45 years)' suffix from any column name.
age_suffix_stripped = {
    column_name: re.sub(r' \(\W*45 years\)', '', column_name)
    for column_name in df_sectors.columns.tolist()
}
df_sectors = df_sectors.rename(columns=age_suffix_stripped)


In [None]:
df_sectors.columns

In [None]:
# Give every element of each 'Search Keyword' list its own row, then renumber
# the rows from zero.
df_sectors = df_sectors.explode('Search Keyword', ignore_index=True)
df_sectors = df_sectors.reset_index(drop=True)


In [None]:
df_sectors.columns

In [None]:
# len 101
len(df_sectors)


#### Before adding sector data, make sure the keywords are correct so that no sector data is missing after merging

In [None]:
# This is a manually collected dictionary of incorrect/faulty keywords in scraped site data.
# The file is decoded explicitly as UTF-8 (the JSON interchange encoding)
# rather than with the platform's default encoding.
with open(f'{scraped_data}CBS/Data/keyword_trans_dict.txt', encoding='utf-8') as f:
    keyword_trans_dict = json.load(f)

def fix_keywords(df_temp, keyword_map=None):
    """Replace known faulty values in the 'Search Keyword' column.

    Keywords are compared case-insensitively and ignoring surrounding
    whitespace. A matching keyword is replaced by the lowercased, stripped
    correction; keywords that match nothing are left exactly as they were, and
    missing keywords stay missing.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame with a 'Search Keyword' column. It is modified in place.
    keyword_map : dict, optional
        Mapping of faulty keyword -> corrected keyword. Defaults to the
        module-level ``keyword_trans_dict``.

    Returns
    -------
    The same ``df_temp`` object. Anything that is not a non-empty DataFrame is
    returned untouched.
    """
    if not isinstance(df_temp, pd.DataFrame) or len(df_temp) == 0:
        return df_temp

    if keyword_map is None:
        keyword_map = keyword_trans_dict

    translations = {
        str(faulty).lower().strip(): str(corrected).lower().strip()
        for faulty, corrected in keyword_map.items()
    }

    # Work positionally (numpy masks, fresh RangeIndex) so that missing
    # keywords or a non-unique index cannot cause alignment errors.
    has_keyword = df_temp['Search Keyword'].notna().to_numpy()
    normalized = (
        df_temp['Search Keyword'].astype(str).str.lower().str.strip().reset_index(drop=True)
    )
    touched = np.zeros(len(df_temp), dtype=bool)

    # Two passes, like the original "fix, then retry" flow: the second pass
    # resolves corrections whose target is itself a faulty keyword.
    for _pass in range(2):
        for faulty, corrected in translations.items():
            hit = has_keyword & normalized.eq(faulty).to_numpy()
            normalized = normalized.mask(hit, corrected)
            touched |= hit

    if touched.any():
        df_temp['Search Keyword'] = np.where(
            touched,
            normalized.to_numpy(dtype=object),
            df_temp['Search Keyword'].to_numpy(dtype=object),
        )

    # Anything still equal to a faulty keyword that maps to something else
    # could not be resolved (for example a cyclic mapping).
    still_faulty = sorted({
        keyword
        for keyword, present in zip(normalized, has_keyword)
        if present and keyword in translations and translations[keyword] != keyword
    })
    if still_faulty:
        print('Some keywords were not fixed. Please check file unfixed_keywords.txt in data directory.')
        # Dump the plain list of keywords; a DataFrame is not JSON serialisable.
        with open(f'{data_dir}unfixed_keywords.txt', 'w') as f:
            json.dump(still_faulty, f)

    return df_temp

# Fix keywords: when any search keyword is listed as faulty, report how many,
# run fix_keywords, and report the count again.
faulty_keywords = list(keyword_trans_dict.keys())
n_faulty_before = len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(faulty_keywords)])
if n_faulty_before != 0:
    print('Some search keywords did not match a sector. Fixing')
    print(n_faulty_before)
    df_jobs = fix_keywords(df_jobs)
    n_faulty_after = len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(faulty_keywords)])
    print(n_faulty_after)


In [None]:
# Attach the sector columns to every job ad via its search keyword. The left
# merge keeps all job ads; ads whose keyword is absent from df_sectors get NaN
# in the sector columns.
# NOTE(review): assumes each 'Search Keyword' occurs once in df_sectors;
# duplicates would multiply job rows - confirm.
df_jobs = df_jobs.merge(df_sectors, on='Search Keyword', how='left')


In [None]:
# 17599
len(df_jobs)


In [None]:
df_jobs.info()


#### Check if there is any missing sector data in the merged dataframe

In [None]:
df_jobs['Sector'].isna().sum()

In [None]:
if df_jobs['Sector'].isna().sum() != 0:
    print('Some search keywords did not match a sector. Fixing')
    print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))
    df_jobs = fix_keywords(df_jobs)

    # Correcting the keywords does not fill the sector columns that were
    # merged in earlier, so drop those columns and merge again on the
    # corrected keywords.
    sector_columns = [
        col for col in df_sectors.columns
        if col != 'Search Keyword' and col in df_jobs.columns
    ]
    df_jobs = df_jobs.drop(columns=sector_columns).merge(df_sectors, on='Search Keyword', how='left')

    print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))


In [None]:
df_jobs['Gender'].value_counts()


In [None]:
df_jobs['Age'].value_counts()

In [None]:
# Save only when every job ad received sector data; otherwise report how many
# rows are still missing it.
if df_jobs['Sector'].isna().sum() == 0:
    # The type is checked first so len() is only evaluated on a DataFrame.
    assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
    df_jobs.to_pickle(f'{df_save_dir}df_jobs_including_sector_data.pkl')
    df_jobs.to_csv(f'{df_save_dir}df_jobs_including_sector_data.csv', index=False)

else:
    print(f"MISSING SECTOR DATA: COUNT {df_jobs['Sector'].isna().sum()}")

# Add categorical data


### START HERE IF SOURCING FROM DF_JOBS_INCLUDING_SECTOR_DATA
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# Function to order categories
def categorize_df_gender_age(df, gender_order=None, age_order=None, ivs=None):
    """Turn the 'Gender' / 'Age' columns into ordered categoricals.

    For every handled variable the column is replaced by an ordered
    categorical using the given level order, and an int64 column
    ``<variable>_Num`` holding the category codes is added (-1 for missing
    values).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``ivs``; modified in place.
    gender_order : list, optional
        Level order for 'Gender'. Defaults to Female < Mixed Gender < Male.
    age_order : list, optional
        Level order for 'Age'. Defaults to Older < Mixed Age < Younger.
    ivs : list, optional
        Variables to process. Defaults to ['Gender', 'Age'].

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if gender_order is None:
        gender_order = ['Female', 'Mixed Gender', 'Male']
    if age_order is None:
        age_order = ['Older', 'Mixed Age', 'Younger']
    if ivs is None:
        ivs = ['Gender', 'Age']

    orders = {'Gender': gender_order, 'Age': age_order}

    # Arrange Categories
    for iv in ivs:
        order = orders.get(iv)
        if order is None:
            # Only 'Gender' and 'Age' have an order; skip anything else
            # instead of reusing a stale order or hitting an unbound name.
            print(f'No category order defined for {iv}; skipping.')
            continue

        # Values outside the requested order are reported and the column is
        # left untouched, as before.
        unexpected = set(df[iv].dropna().unique()) - set(order)
        if unexpected:
            print(f'{iv} contains values not in {order}: {sorted(map(str, unexpected))}')
            continue

        # Building the categorical directly also works when a level is absent
        # from the data, which reorder_categories rejects.
        df[iv] = pd.Categorical(df[iv], categories=order, ordered=True)
        df[f'{iv}_Num'] = df[iv].cat.codes.astype('int64')

    return df


In [None]:
# Funtion to print df gender and age info (also for warmth and competence)
def get_df_info(df, ivs_all=None):
    if ivs_all is None:
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        try:
            print('='*20)
            print(f'{iv}:')
            print('-'*20)
            print(f'{iv} Counts:\n{df[iv].value_counts()}')
            print('-'*20)
            print(f'{iv} Percentages:\n{df[iv].value_counts(normalize=True).mul(100).round(1).astype(float)}')
            try:
                print('-'*20)
                print(f'{iv} Mean: {df[iv].mean().round(2).astype(float)}')
                print('-'*20)
                print(f'{iv} Standard Deviation: {df[iv].std().round(2).astype(float)}')
            except Exception:
                pass
        except Exception:
            print(f'{iv} not available.')

    print('\n')


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_data.pkl').reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
# One-hot encode 'Gender' and 'Age' as int64 indicator columns, append them to
# the jobs frame, then shorten the two 'Mixed ...' indicator names.
gender_age_dummies = pd.get_dummies(df_jobs[['Gender', 'Age']], dtype='int64')
df_jobs = df_jobs.join(gender_age_dummies)
df_jobs = df_jobs.rename(
    columns={'Gender_Mixed Gender': 'Gender_Mixed', 'Age_Mixed Age': 'Age_Mixed'}
)


In [None]:
df_jobs = categorize_df_gender_age(df_jobs)

In [None]:
df_jobs['Gender'].value_counts()

In [None]:
df_jobs['Gender_Female'].value_counts()

In [None]:
df_jobs['Gender_Num'].value_counts()

In [None]:
df_jobs['Platform'].value_counts()

In [None]:
# Order the platforms, add their numeric codes and one indicator column each.
platform_order = ['LinkedIn', 'Indeed', 'Glassdoor']

# Fail loudly on a platform missing from platform_order; it would otherwise
# silently become NaN in the categorical below.
unexpected_platforms = set(df_jobs['Platform'].dropna().unique()) - set(platform_order)
if unexpected_platforms:
    raise ValueError(f'Platforms not listed in platform_order: {sorted(map(str, unexpected_platforms))}')

# Building the categorical directly also works when one of the platforms does
# not occur in the data, which reorder_categories rejects.
df_jobs['Platform'] = pd.Categorical(df_jobs['Platform'], categories=platform_order, ordered=True)
df_jobs['Platform_Num'] = df_jobs['Platform'].cat.codes.astype('int64')
df_jobs = df_jobs.join(pd.get_dummies(df_jobs[['Platform']], dtype='int64'))

In [None]:
df_jobs[['Platform_Num', 'Platform_LinkedIn', 'Platform_Indeed', 'Platform_Glassdoor']].value_counts()

In [None]:
# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
# The type is checked first so len() is only evaluated on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_including_sector_genage_data.csv', index=False)



# Use spacy to split job ads to sentences


### START HERE IF SOURCING FROM DF_JOBS_INCLUDING_SECTOR_GENAGE_DATA
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

# Locate the project's code directory among the closest ancestors of the
# current working directory so that `setup_module` can be imported.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Only the five nearest ancestors are inspected. Slicing the materialised list
# avoids an IndexError when the working directory has fewer than five parents.
for ancestor in list(Path.cwd().parents)[:5]:

    # `Path.name` is the final path component on every platform, unlike
    # splitting the string form on '/'.
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

# Do not push None (no match) or a duplicate entry onto sys.path.
if code_dir is not None and code_dir not in sys.path:
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl').reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
# Function to make a list of punctuations that determine sentence boundary, i.e., split characters
def make_custom_punct_chars(main_punct_chars=None, repeated_punct_chars=None, max_repeats=4):
    """Build the list of custom sentence-splitting strings.

    Each main character contributes itself and itself followed by a space.
    Each repeated character contributes, for every count from 1 to
    ``max_repeats``, the character repeated ``count`` times and the
    "character + space" pair repeated ``count`` times. A single ',' and a
    single ', ' are then removed if present, so that an ordinary comma does
    not end up in the result.

    Parameters
    ----------
    main_punct_chars : list of str, optional
        Defaults to [':', '|'].
    repeated_punct_chars : list of str, optional
        Defaults to ['\\n', ','].
    max_repeats : int, optional
        Highest repetition count to generate. Defaults to 4.

    Returns
    -------
    list of str
    """
    if main_punct_chars is None:
        main_punct_chars = [':', '|']
    if repeated_punct_chars is None:
        repeated_punct_chars = ['\n', ',']

    custom_punct_chars = []

    for punct_char in main_punct_chars:
        custom_punct_chars += [f'{punct_char}', f'{punct_char} ']

    for count in range(1, max_repeats + 1):
        for punct_char in repeated_punct_chars:
            custom_punct_chars += [f'{punct_char}' * count, f'{punct_char} ' * count]

    # Remove only what is there: a caller-supplied list without ',' must not
    # raise ValueError.
    for single_comma in (',', ', '):
        if single_comma in custom_punct_chars:
            custom_punct_chars.remove(single_comma)

    return custom_punct_chars

custom_punct_chars = make_custom_punct_chars()


In [None]:
custom_punct_chars


In [None]:
%%time
# Add sentencizer to spacy pipe and set custom punctuations
if 'sentencizer' not in nlp.pipe_names:
    sentencizer = nlp.add_pipe('sentencizer')
else:
    # The pipe is already registered (for example when this cell is re-run
    # after `sentencizer` was lost); fetch it instead of leaving the name
    # undefined.
    sentencizer = nlp.get_pipe('sentencizer')
sentencizer.punct_chars.update(custom_punct_chars)

# Persist the sentencizer's full set of punctuation characters once all custom
# ones are registered. Despite the .txt suffix the file is a pickle.
if all(custom_punct_char in sentencizer.punct_chars for custom_punct_char in custom_punct_chars):
    with open(f'{data_dir}punctuations.txt', 'wb') as f:
        pickle.dump(sentencizer.punct_chars, f)

# Reload the saved collection; from here on custom_punct_chars holds what was
# pickled above rather than the original list.
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)


In [None]:
# Add special cases to spacy
# Maps an abbreviation string to the list of token attribute dicts that should
# represent it.
# NOTE(review): the integer keys 65 and 67 are presumably spaCy's ORTH and NORM
# attribute IDs - confirm against spacy.attrs.
special_cases_dict = {
    'incl.': [{65: 'incl', 67: 'including'}],
    'incl. ': [{65: 'incl', 67: 'including'}],
    '(incl.': [{65: 'incl', 67: 'including'}],
    'etc.': [{65: 'etc', 67: 'et cetera'}],
    'etc. ': [{65: 'etc', 67: 'et cetera'}],
    'e.g.': [{65: 'e.g', 67: 'for example'}],
    'e.g. ': [{65: 'e.g', 67: 'for example'}],
}

# NOTE(review): this mutates the dict returned by `tokenizer.rules` in place;
# verify that the tokenizer actually picks the new entries up this way
# (spaCy documents `Tokenizer.add_special_case` for registering special cases).
nlp.tokenizer.rules.update(special_cases_dict)


In [None]:
%%time
# Spacy sentencize
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'
# Compile once instead of handing the pattern string to re.split for every sentence.
sentence_splitter = re.compile(pattern)


def _sentencize(job_description):
    """Split one job description into sentence strings.

    The text is first split into sentences by the spaCy pipeline; each
    sentence is then split again on `pattern`, and empty pieces are dropped.
    Non-string input (such as a missing description) yields an empty list
    instead of raising inside `nlp`.
    """
    if not isinstance(job_description, str):
        return []
    return [
        sent
        for sentence in nlp(job_description).sents
        for sent in sentence_splitter.split(sentence.text)
        if sent
    ]


# Sentencize only when every custom punctuation character is registered.
if all(custom_punct_char in sentencizer.punct_chars for custom_punct_char in custom_punct_chars):
    df_jobs['Job Description spacy_sentencized'] = df_jobs['Job Description'].progress_apply(_sentencize)

# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_sentencized.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_sentencized.csv', index=False)


In [None]:
df_jobs['Job Description spacy_sentencized'].head()


In [None]:
# Refuse to overwrite the saved files unless df_jobs is a non-empty DataFrame.
# The type is checked first so len() is only evaluated on a DataFrame.
assert isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0, f'ERROR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_sentencized.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_sentencized.csv', index=False)
