# ATTN: This script should be run AFTER language detection is completed.

# Drop non-English job descriptions

### START HERE IF SOURCING FROM DF_JOBS_RAW_LANGUAGE_DETECTED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [4]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_language_detected.pkl').reset_index(drop=True)


In [5]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62577 entries, 0 to 62576
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Search Keyword     62577 non-null  object 
 1   Platform           62577 non-null  object 
 2   Job ID             62577 non-null  object 
 3   Job Title          62577 non-null  object 
 4   Company Name       62574 non-null  object 
 5   Location           62577 non-null  object 
 6   Job Description    62577 non-null  object 
 7   Rating             3975 non-null   float64
 8   Employment Type    61995 non-null  object 
 9   Company URL        59263 non-null  object 
 10  Job URL            62577 non-null  object 
 11  Job Age            62577 non-null  object 
 12  Job Age Number     62577 non-null  object 
 13  Collection Date    62577 non-null  object 
 14  Data Row           58599 non-null  float64
 15  Tracking ID        58599 non-null  object 
 16  Industry           591

In [6]:
# nl = 44863, en = 17591, ['en', 'nl'] = 8
df_jobs['Language'].value_counts()


nl              44863
en              17591
de                 53
fr                 36
['nl', 'en']        9
['en', 'nl']        8
pl                  5
id                  4
da                  4
tr                  1
['nl', 'af']        1
st                  1
af                  1
Name: Language, dtype: int64

In [7]:
%%time
# The majority of ['en', 'nl'] labeled job descriptions contain mostly English
df_jobs['Language'] = df_jobs['Language'].progress_apply(
    lambda lang: 'en' if lang == "['en', 'nl']" else lang
)


In [8]:
# Drop non-English ads
df_jobs = df_jobs.drop(
    df_jobs[
        df_jobs['Language'] != 'en'
    ].index, 
        axis='index', 
        errors='ignore'
)


In [9]:
# 17599
len(df_jobs)


17599

In [10]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_english_only.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_english_only.csv', index=False)


# Fix abbreviations in job descriptions

### START HERE IF SOURCING FROM DF_JOBS_RAW_ENGLISH_ONLY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [11]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [14]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_english_only.pkl').reset_index(drop=True)


In [15]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Search Keyword     17599 non-null  object 
 1   Platform           17599 non-null  object 
 2   Job ID             17599 non-null  object 
 3   Job Title          17599 non-null  object 
 4   Company Name       17597 non-null  object 
 5   Location           17599 non-null  object 
 6   Job Description    17599 non-null  object 
 7   Rating             3780 non-null   float64
 8   Employment Type    17017 non-null  object 
 9   Company URL        15959 non-null  object 
 10  Job URL            17599 non-null  object 
 11  Job Age            17599 non-null  object 
 12  Job Age Number     17599 non-null  object 
 13  Collection Date    17599 non-null  object 
 14  Data Row           13816 non-null  float64
 15  Tracking ID        13816 non-null  object 
 16  Industry           144

In [16]:
df_jobs['Job Description'] = df_jobs.loc[df_jobs['Job Description'].notnull(), 'Job Description'].progress_apply(
    lambda job_description: ' '.join(job_description.split('/')) if '/' in job_description else job_description
)


In [17]:
abb_dict = {
    r'incl\.': 'including', 
    r'e\.g\.': 'for example', 
    r'e\.g': 'for example', 
    r'etc\.': 'et cetera', 
}


In [18]:
%%time
df_jobs['Job Description'] = df_jobs['Job Description'].replace(abb_dict, regex=True)


In [19]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.csv', index=False)


# Add English and Dutch language requirement columns

### START HERE IF SOURCING FROM DF_JOBS_RAW_FIXED_ABBREVIATIONS
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [20]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [23]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_fixed_abbreviations.pkl').reset_index(drop=True)


In [24]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Search Keyword     17599 non-null  object 
 1   Platform           17599 non-null  object 
 2   Job ID             17599 non-null  object 
 3   Job Title          17599 non-null  object 
 4   Company Name       17597 non-null  object 
 5   Location           17599 non-null  object 
 6   Job Description    17599 non-null  object 
 7   Rating             3780 non-null   float64
 8   Employment Type    17017 non-null  object 
 9   Company URL        15959 non-null  object 
 10  Job URL            17599 non-null  object 
 11  Job Age            17599 non-null  object 
 12  Job Age Number     17599 non-null  object 
 13  Collection Date    17599 non-null  object 
 14  Data Row           13816 non-null  float64
 15  Tracking ID        13816 non-null  object 
 16  Industry           144

In [25]:
%%time
# Add language requirement column
# Use regex to find language requirement
dutch_requirement_pattern = r'[Ll]anguage: [Dd]utch|[Dd]utch [Pp]referred|[Dd]utch [Re]quired|[Dd]utch [Ll]anguage|[Pp]roficient in [Dd]utch|[Ss]peak [Dd]utch|[Kk]now [Dd]utch'
english_requirement_pattern = r'[Ll]anguage: [Ee]nglish|[Ee]nglish [Pp]referred|[Ee]nglish [Re]quired|[Ee]nglish [Ll]anguage|[Pp]roficient in [Ee]nglish|[Ss]peak [Ee]nglish|[Kk]now [Ee]nglish'

lang_requirements = {
    'Dutch Requirement': dutch_requirement_pattern, 'English Requirement': english_requirement_pattern
}

for lang_req, lang_req_pattern in lang_requirements.items():
    
    if lang_req in df_jobs.columns:
        df_jobs = df_jobs.drop(columns=[lang_req])
    df_jobs[lang_req] = np.where(
        df_jobs['Job Description'].str.contains(lang_req_pattern),
        'Yes',
        'No',
    )

if len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame):
    df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl')
    
    df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_english_requirement.csv', index=False)
else:
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')


In [26]:
# Yes = 235
df_jobs['Dutch Requirement'].value_counts()


No     17364
Yes      235
Name: Dutch Requirement, dtype: int64

In [27]:
# Yes = 526
df_jobs['English Requirement'].value_counts()

No     17073
Yes      526
Name: English Requirement, dtype: int64

In [28]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_raw_language_requirement.csv', index=False)


# Add data from Sectors dataframe (see CBS directory under scrapped_data directory)


### START HERE IF SOURCING FROM DF_JOBS_RAW_LANGUAGE_REQUIREMENT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [29]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [32]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_raw_language_requirement.pkl').reset_index(drop=True)


In [33]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Search Keyword       17599 non-null  object 
 1   Platform             17599 non-null  object 
 2   Job ID               17599 non-null  object 
 3   Job Title            17599 non-null  object 
 4   Company Name         17597 non-null  object 
 5   Location             17599 non-null  object 
 6   Job Description      17599 non-null  object 
 7   Rating               3780 non-null   float64
 8   Employment Type      17017 non-null  object 
 9   Company URL          15959 non-null  object 
 10  Job URL              17599 non-null  object 
 11  Job Age              17599 non-null  object 
 12  Job Age Number       17599 non-null  object 
 13  Collection Date      17599 non-null  object 
 14  Data Row             13816 non-null  float64
 15  Tracking ID          13816 non-null 

In [34]:
df_sectors = pd.read_pickle(f'{scraped_data}CBS/Data/Sectors Output from script.pkl').reset_index(drop=True)


In [35]:
df_sectors.columns


MultiIndex([('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            ('SBI Sector Titles', 'Industry class / branch (SIC2008)', ...),
            (           'Gender',                            'Female', ...),
            (           'Gender',                            'Female', ...),
            (           'Gender',                            'Female', ...),
            (           'Gender',                            'Female', ...),
            (           'Gender',                              'Male', ...),
            (           'Gender',                              'Male', ...),

In [36]:
df_sectors.columns = [
    '_'.join(col) 
    if 'SBI Sector Titles' not in col 
    and 'Total Workforce' not in col 
    else col[-1] 
    for col in df_sectors.columns
]


In [37]:
df_sectors = df_sectors.rename(
    columns={
        'Keywords': 'Search Keyword', 
        'Code': 'Sector Code', 
        'Sector Name': 'Sector', 
        'Gender_Female_n': 'Female Count (x 1000)', 
        'Gender_Male_n': 'Male Count (x 1000)', 
        'Gender_Sectoral Gender Segregation_Dominant Category': 'Gender', 
        'Age_Sectoral Age Segregation_Dominant Category': 'Age', 
        'n': 'Sector Count (x 1000)', 
    },
    errors='ignore',
)


In [38]:
df_sectors = df_sectors.explode(
    'Search Keyword', ignore_index=True
).reset_index(drop=True)


In [39]:
len(df_sectors)


101

#### Before adding sector data, make sure keywords are correct as to not have any missing sector data when merging

In [40]:
# This is a manually collected dictionary of incorrect/faulty keywords in scraped site data
with open(f'{scraped_data}CBS/Data/keyword_trans_dict.txt') as f:
    keyword_trans_dict = json.load(f)

def fix_keywords(df_temp):

    if len(df_temp) > 0 and isinstance(df_temp, pd.DataFrame):
        for key, value in keyword_trans_dict.items():
            df_temp.loc[
                df_temp[df_temp['Search Keyword'].notnull()]['Search Keyword'].astype(str).progress_apply(
                lambda x: x.lower().strip()
                ) == str(key).lower().strip(), 'Search Keyword'
            ] = str(value).lower().strip()

        unfixed = df_temp.loc[
            df_temp[df_temp['Search Keyword'].notnull()]['Search Keyword'].astype(str).progress_apply(lambda x: x.lower().strip()).isin([x.lower().strip() for x in list(keyword_trans_dict.keys())])
        ]

        if len(unfixed) != 0:
            for key, value in keyword_trans_dict.items():
                for idx, row in df_temp.iterrows():
                    if row['Search Keyword'].astype(str).lower().strip() == str(key).lower().strip():
                        df_temp.loc[idx, 'Search Keyword'] = str(value).lower().strip()
    
        unfixed = df_temp.loc[
                df_temp[df_temp['Search Keyword'].notnull()]['Search Keyword'].astype(str).progress_apply(lambda x: x.lower().strip()).isin([x.lower().strip() for x in list(keyword_trans_dict.keys())])
            ]
        if len(unfixed) != 0:
            print('Some keywords were not fixed. Please check file unfixed_keywords.txt in data directory.')
            with open(f'{data_dir}unfixed_keywords.txt', 'w') as f:
                json.dump(unfixed, f)
    
    return df_temp

# Fix keywords
if len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]) != 0:
    print('Some search keywords did not match a sector. Fixing')
#     print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))
    df_jobs = fix_keywords(df_jobs)
#     print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))


In [41]:
df_jobs = df_jobs.merge(df_sectors, on='Search Keyword', how='left')


In [42]:
len(df_jobs)


17599

In [43]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 17599 entries, 0 to 17598
Data columns (total 48 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Search Keyword                                  17599 non-null  object 
 1   Platform                                        17599 non-null  object 
 2   Job ID                                          17599 non-null  object 
 3   Job Title                                       17599 non-null  object 
 4   Company Name                                    17597 non-null  object 
 5   Location                                        17599 non-null  object 
 6   Job Description                                 17599 non-null  object 
 7   Rating                                          3780 non-null   float64
 8   Employment Type                                 17017 non-null  object 
 9   Company URL                            

#### Check if there is any missing sector data in the merged dataframe

In [44]:
df_jobs['Sector'].isna().sum()

0

In [45]:
if df_jobs['Sector'].isna().sum() != 0:
    print('Some search keywords did not match a sector. Fixing')
    print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))
    df_jobs = fix_keywords(df_jobs)
    print(set(df_jobs['Search Keyword'].loc[df_jobs['Sector'].isna()].to_list()))
    print(len(df_jobs['Search Keyword'].loc[df_jobs['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))


In [46]:
df_jobs['Gender'].value_counts()


Mixed Gender    12256
Male             3830
Female           1513
Name: Gender, dtype: int64

In [47]:
df_jobs['Age'].value_counts()


Mixed Age    11467
Older         3778
Younger       2354
Name: Age, dtype: int64

In [48]:
if df_jobs['Sector'].isna().sum() == 0:
    assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
    df_jobs.to_pickle(f'{df_save_dir}df_jobs_including_sector_data.pkl')
    df_jobs.to_csv(f'{df_save_dir}df_jobs_including_sector_data.csv', index=False)

else:
    print(f"MISSING SECTOR DATA: COUNT {df_jobs['Sector'].isna().sum()}")

# Add categorical data


### START HERE IF SOURCING FROM DF_JOBS_INCLUDING_SECTOR_DATA
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [49]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [52]:
# Function to order categories
def categorize_df_gender_age(df, gender_order=None, age_order=None, ivs=None):
    if gender_order is None:
        gender_order = ['Female', 'Mixed Gender', 'Male']
    if age_order is None:
        age_order = ['Older', 'Mixed Age', 'Younger']
    if ivs is None:
        ivs = ['Gender', 'Age']
    # Arrange Categories
    for iv in ivs:
        if iv == 'Gender':
            order = gender_order
        elif iv == 'Age':
            order = age_order
        try:
            df[iv] = df[iv].astype('category').cat.reorder_categories(order, ordered=True)

            df[iv] = pd.Categorical(
                df[iv], categories=order, ordered=True
            )
            df[f'{iv}_Num'] = pd.to_numeric(df[iv].cat.codes).astype('int64')
        except ValueError as e:
            print(e)

    return df


In [53]:
# Funtion to print df gender and age info
def df_gender_age_info(df, ivs_all=None):
    if ivs_all is None:
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        try:
            print('='*20)
            print(f'{iv}:')
            print('-'*20)
            print(f'{iv} Counts:\n{df[f"{iv}"].value_counts()}')
            print('-'*20)
            print(f'{iv} Percentages:\n{df[f"{iv}"].value_counts(normalize=True).mul(100).round(1).astype(float)}')
            try:
                print('-'*20)
                print(f'{iv} Mean: {df[f"{iv}"].mean().round(2).astype(float)}')
                print('-'*20)
                print(f'{iv} Standard Deviation: {df[f"{iv}"].std().round(2).astype(float)}')
            except Exception:
                pass
        except Exception:
            print(f'{iv} not available.')

    print('\n')


In [54]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_data.pkl').reset_index(drop=True)


In [55]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 48 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Search Keyword                                  17599 non-null  object 
 1   Platform                                        17599 non-null  object 
 2   Job ID                                          17599 non-null  object 
 3   Job Title                                       17599 non-null  object 
 4   Company Name                                    17597 non-null  object 
 5   Location                                        17599 non-null  object 
 6   Job Description                                 17599 non-null  object 
 7   Rating                                          3780 non-null   float64
 8   Employment Type                                 17017 non-null  object 
 9   Company URL                            

In [56]:
df_jobs = df_jobs.join(pd.get_dummies(df_jobs[['Gender', 'Age']], dtype='int64'))
df_jobs = categorize_df_gender_age(df_jobs)


In [57]:
df_jobs['Gender'].value_counts()

Mixed Gender    12256
Male             3830
Female           1513
Name: Gender, dtype: int64

In [58]:
df_jobs['Gender_Num'].value_counts()

1    12256
2     3830
0     1513
Name: Gender_Num, dtype: int64

In [59]:
df_jobs['Gender_Female'].value_counts()


0    16086
1     1513
Name: Gender_Female, dtype: int64

In [60]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_including_sector_genage_data.csv', index=False)



# Use spacy to split job ads to sentences


### START HERE IF SOURCING FROM DF_JOBS_INCLUDING_SECTOR_GENAGE_DATA
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [61]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [64]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl').reset_index(drop=True)


In [65]:
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 56 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL                 

In [66]:
# Function to make a list of punctuations that determine sentence boundry, i.e., split characters
def make_custom_punct_chars(main_punct_chars=None, repeated_punct_chars=None):
    if main_punct_chars is None:
        main_punct_chars = [':', '|']
    if repeated_punct_chars is None:
        repeated_punct_chars = ['\n', ',']
    custom_punct_chars = []
    temp_multi = []
    temp_spaced = []

    for punct_char in main_punct_chars:
        custom_punct_chars+= f'{punct_char}', f'{punct_char} '

    for idx in range(4):
        for punct_char in repeated_punct_chars:
            temp_multi.append(f'{punct_char}'*int(idx+1))
            temp_spaced.append(f'{punct_char} '*int(idx+1))

    for multi, spaced in zip(temp_multi, temp_spaced):
        custom_punct_chars+= multi, spaced

    custom_punct_chars.remove(',')
    custom_punct_chars.remove(', ')

    return custom_punct_chars

custom_punct_chars = make_custom_punct_chars()


In [67]:
custom_punct_chars


[':',
 ': ',
 '|',
 '| ',
 '\n',
 '\n ',
 '\n\n',
 '\n \n ',
 ',,',
 ', , ',
 '\n\n\n',
 '\n \n \n ',
 ',,,',
 ', , , ',
 '\n\n\n\n',
 '\n \n \n \n ',
 ',,,,',
 ', , , , ']

In [68]:
%%time
# Add sentencizer to spacy pipe and set custom punctuations
if 'sentencizer' not in nlp.pipe_names:
    sentencizer = nlp.add_pipe('sentencizer')
sentencizer.punct_chars.update(custom_punct_chars)

if all(custom_punct_char in sentencizer.punct_chars for custom_punct_char in custom_punct_chars):
    with open(f'{data_dir}punctuations.txt', 'wb') as f:
        pickle.dump(sentencizer.punct_chars, f)

with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)


CPU times: user 5.92 ms, sys: 3.94 ms, total: 9.86 ms
Wall time: 11.3 ms


In [69]:
# Add special cases to spacy
special_cases_dict = {
    'incl.': [{65: 'incl', 67: 'including'}],
    'incl. ': [{65: 'incl', 67: 'including'}],
    '(incl.': [{65: 'incl', 67: 'including'}],
    'etc.': [{65: 'etc', 67: 'et cetera'}],
    'etc. ': [{65: 'etc', 67: 'et cetera'}],
    'e.g.': [{65: 'e.g', 67: 'for example'}],
    'e.g. ': [{65: 'e.g', 67: 'for example'}],
}

nlp.tokenizer.rules.update(special_cases_dict)


In [70]:
%%time
# Spacy sentencize
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'
if all(custom_punct_char in sentencizer.punct_chars for custom_punct_char in custom_punct_chars):
    df_jobs['Job Description spacy_sentencized'] = df_jobs['Job Description'].progress_apply(
        lambda job_description: [
            sent 
            for sentence in nlp(job_description).sents 
            for sent in re.split(pattern, sentence.text) 
            if len(sent) != 0 
        ]
    )

assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
df_jobs.to_pickle(f'{df_save_dir}df_jobs_sentencized.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_sentencized.csv', index=False)


CPU times: user 19min 6s, sys: 32.1 s, total: 19min 38s
Wall time: 23min 33s


In [71]:
df_jobs['Job Description spacy_sentencized'].head()


0    [About Our Client, The Global KYC organisation...
1    [Your role:, We’re in business to save our hom...
2    [During the past four years Colourful Rebel ha...
3    [Job Description, We are currently recruiting ...
4    [KARL LAGERFELD COMPANY PROFILE, The house of ...
Name: Job Description spacy_sentencized, dtype: object

In [72]:
assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
 df_jobs.to_pickle(f'{df_save_dir}df_jobs_sentencized.pkl')
df_jobs.to_csv(f'{df_save_dir}df_jobs_sentencized.csv', index=False)
