# ATTN: This script uses Google translate to detect job description language. Google translate will limit requests and take a very long time. Only run this script if redoing language detection.

# Read from scrapped data

In [1]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *


Using MPS


0it [00:00, ?it/s]

#### Read paths

In [3]:
glob_paths = list(set(glob.glob(f'{scraped_data}Coding Material/*Folder/*/Job ID -*- Codebook (Automating Equity).xlsx')))


In [4]:
# 244 xlsx files
len(glob_paths)


244

#### Use paths to open files, fix keywords, and drop unneeded columns

In [5]:
%%time
# columns
cols=['Sector', 
      'Sector Code', 
      'Gender', 
      'Age', 
      'Language', 
      'Dutch Requirement', 
      'English Requirement', 
      'Gender_Female', 
      'Gender_Mixed', 
      'Gender_Male', 
      'Age_Older', 
      'Age_Mixed', 
      'Age_Younger', 
      'Gender_Num', 
      'Age_Num', 
      '% Female', 
      '% Male', 
      '% Older', 
      '% Younger']

# Fix list catches all incorrect/faculty keyword search terms
fix_list = []

# Appended data catches all the fixed and cleaned dfs
appended_data = []

for glob_path in glob_paths:

    try:
        df_temp = pd.read_excel(glob_path).reset_index(drop=True)
    except ValueError:
        fix_list.append(glob_path)

    if len(df_temp) > 0 and isinstance(df_temp, pd.DataFrame):
        df_temp = df_temp.reset_index(drop=True)
        df_temp = df_temp.drop(columns=cols, axis='columns', errors='ignore')
        df_temp = df_temp.drop(
        df_temp.columns[
                df_temp.columns.str.contains(
                    'unnamed|index|level', regex=True, case=False, flags=re.I
                )
            ],
            axis='columns',
            errors='ignore',
        )

        appended_data.append(df_temp.reset_index(drop=True))

# Concatonate list of dfs into one large df_manual
df_manual = pd.concat(appended_data, axis='index').reset_index(drop=True)

# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw.csv', index=False)


CPU times: user 5.23 s, sys: 84.7 ms, total: 5.31 s
Wall time: 5.37 s


In [6]:
# If we couldn't fix some keywords, we add them to list fix_list and write to file
if len(fix_list) != 0:
    print('Some keywords to fix!')
    with open(f'{data_dir}fix_list.txt', 'w') as f:
        json.dump(fix_list, f)


In [7]:
# List of dfs, len = 244
len(appended_data)


244

In [8]:
# Concatonate list of dfs into one large df_manual
df_manual = pd.concat(appended_data, axis='index').reset_index(drop=True)


In [9]:
# len = 12400
len(df_manual)


12400

In [10]:
# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw.csv', index=False)


# Drop duplicated and missing data

### START HERE IF SOURCING FROM df_manual_RAW
### PLEASE SET CORRECT DIRECTORY PATHS BELOW

In [11]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [12]:
from setup_module.imports import *

In [13]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw.pkl').reset_index(drop=True)


In [14]:
# len = 12400
len(df_manual)


12400

In [15]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12400 entries, 0 to 12399
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Job ID           12400 non-null  object 
 1   Sentence         12396 non-null  object 
 2   Warmth           12398 non-null  float64
 3   Competence       12400 non-null  int64  
 4   Task_Mentioned   12398 non-null  float64
 5   Task_Warmth      12398 non-null  float64
 6   Task_Competence  12398 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 678.2+ KB


In [16]:
# Clean columns
df_manual.columns = df_manual.columns.to_series().apply(lambda x: str(x).strip())

In [17]:
# Remove columns 'Task_Mentioned', 'Task_Warmth', 'Task_Competence'
df_manual = df_manual.drop(
    columns=['Task_Mentioned', 'Task_Warmth', 'Task_Competence'],
    axis='columns',
    errors='ignore'
)

In [18]:
df_manual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12400 entries, 0 to 12399
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Job ID      12400 non-null  object 
 1   Sentence    12396 non-null  object 
 2   Warmth      12398 non-null  float64
 3   Competence  12400 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 387.6+ KB


In [19]:
# Missing values: Sentence = 4, Warmth = 2, Competence = 0
df_manual.isna().sum()

Job ID        0
Sentence      4
Warmth        2
Competence    0
dtype: int64

In [20]:
# Drop NA
df_manual = df_manual.dropna(axis='index', how='all')
df_manual = df_manual.dropna(axis='columns', how='all')
df_manual = df_manual.dropna(
    subset = ['Sentence', 'Warmth', 'Competence'],
)

In [21]:
# No na values
df_manual.isna().sum()

Job ID        0
Sentence      0
Warmth        0
Competence    0
dtype: int64

In [22]:
df_manual.columns

Index(['Job ID', 'Sentence', 'Warmth', 'Competence'], dtype='object')

In [23]:
# Convert Warmth and Competence to int
int_cols = [
    'Warmth',
    'Competence',
]

for col in int_cols:
    df_manual[col] = df_manual[col].astype(np.int64, errors='ignore')
    print(f'{col} converted to int.' if all(df_manual[col].apply(lambda x: isinstance(x, int))) else f'{col} NOT converted to int.')
    print(f'{col} value counts:\n{df_manual[col].value_counts()}')


Warmth converted to int.
Warmth value counts:
0    9568
1    2826
Name: Warmth, dtype: int64
Competence converted to int.
Competence value counts:
0    7330
1    5064
Name: Competence, dtype: int64


In [24]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 12394 entries, 0 to 12399
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Job ID      12394 non-null  object
 1   Sentence    12394 non-null  object
 2   Warmth      12394 non-null  int64 
 3   Competence  12394 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 484.1+ KB


In [25]:
%%time
# Conver Job ID and Sentence to str
str_cols = [
    'Job ID',
    'Sentence',
]

for col in str_cols:
    df_manual[col] = df_manual[col].astype(str, errors='ignore').apply(lambda x: x.strip().replace('[', '').replace(']', ''))
    print(f'{col} converted to str.' if all(df_manual[col].apply(lambda x: isinstance(x, str))) else f'{col} NOT converted to str.')


Job ID converted to str.
Sentence converted to str.
CPU times: user 12.7 ms, sys: 650 µs, total: 13.3 ms
Wall time: 12.9 ms


In [26]:
# len = 12394
len(df_manual)

12394

In [27]:
df_manual.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12394 entries, 0 to 12399
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Job ID      12394 non-null  object
 1   Sentence    12394 non-null  object
 2   Warmth      12394 non-null  int64 
 3   Competence  12394 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 484.1+ KB


In [28]:
# Rename Sentence to 'Job Description spacy_sentencized'
df_manual = df_manual.rename(
    columns = {
        'Sentence': 'Job Description spacy_sentencized'
    },
    errors='ignore'
)

In [29]:
df_manual.columns


Index(['Job ID', 'Job Description spacy_sentencized', 'Warmth', 'Competence'], dtype='object')

In [30]:
# Drop NA
df_manual = df_manual.dropna(axis='index', how='all')
df_manual = df_manual.dropna(axis='columns', how='all')
df_manual = df_manual.dropna(
    subset = ['Job Description spacy_sentencized', 'Warmth', 'Competence'],
)


In [31]:
# len = 12394
len(df_manual)


12394

In [32]:
# len = 133
df_manual.groupby(['Job ID'])['Job ID'].unique()


Job ID
2466455525                      [2466455525]
3768944208                      [3768944208]
4023920432                      [4023920432]
4039450758                      [4039450758]
4040119601                      [4040119601]
4052472440                      [4052472440]
p_00793660bce7b2ed      [p_00793660bce7b2ed]
p_09725507026d21ef      [p_09725507026d21ef]
p_0d554b9204e398f7      [p_0d554b9204e398f7]
p_0dc4e2d857181631      [p_0dc4e2d857181631]
p_0f58b766aae13693      [p_0f58b766aae13693]
p_116303c4194c07ff      [p_116303c4194c07ff]
p_12ae47322f756789      [p_12ae47322f756789]
p_12d2cf02c0b8eec6      [p_12d2cf02c0b8eec6]
p_12e252b3cd0d118a      [p_12e252b3cd0d118a]
p_15a42cd4b082799e      [p_15a42cd4b082799e]
p_181bbb09cdbf0744      [p_181bbb09cdbf0744]
p_19f3c631b6e3e918      [p_19f3c631b6e3e918]
p_19f6c2f5306a5d5b      [p_19f6c2f5306a5d5b]
p_1b37ad5237066811      [p_1b37ad5237066811]
p_1c679fe6fac7b908      [p_1c679fe6fac7b908]
p_1f636629bc8a52ce      [p_1f636629bc8a52ce]
p_2

In [33]:
# Drop duplicates on subset of 'Job ID' and 'Sentence'
df_manual = df_manual.drop_duplicates(subset=['Job ID', 'Job Description spacy_sentencized'], keep='first', ignore_index=True)


In [34]:
# len = 6400
len(df_manual)


6400

In [35]:
# Remove any rows with missing 'Job ID'
df_manual = df_manual.drop(
    df_manual[
        (df_manual['Job ID'].isin(nan_list)) | 
        (df_manual['Job ID'].isnull()) | 
        (df_manual['Job ID'].isna())
    ].index, 
    axis='index',
    errors='ignore'
)


In [36]:
# len = 6400
len(df_manual)


6400

In [37]:
# Save df_manual to file
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_dropped.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_dropped.csv', index=False)


# Add English and Dutch language requirement columns

### START HERE IF SOURCING FROM df_manual_RAW_DROPPED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [38]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [39]:
from setup_module.imports import *

In [40]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw_dropped.pkl').reset_index(drop=True)


In [41]:
# 6400
len(df_manual)


6400

In [42]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Job ID                             6400 non-null   object
 1   Job Description spacy_sentencized  6400 non-null   object
 2   Warmth                             6400 non-null   int64 
 3   Competence                         6400 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 200.1+ KB


In [43]:
%%time
# Add language requirement column
# Use regex to find language requirement
dutch_requirement_pattern = r'[Ll]anguage: [Dd]utch|[Dd]utch [Pp]referred|[Dd]utch [Re]quired|[Dd]utch [Ll]anguage|[Pp]roficient in [Dd]utch|[Ss]peak [Dd]utch|[Kk]now [Dd]utch'
english_requirement_pattern = r'[Ll]anguage: [Ee]nglish|[Ee]nglish [Pp]referred|[Ee]nglish [Re]quired|[Ee]nglish [Ll]anguage|[Pp]roficient in [Ee]nglish|[Ss]peak [Ee]nglish|[Kk]now [Ee]nglish'

lang_requirements = {
    'Dutch Requirement': dutch_requirement_pattern, 'English Requirement': english_requirement_pattern
}

for lang_req, lang_req_pattern in lang_requirements.items():
    
    if lang_req in df_manual.columns:
        df_manual = df_manual.drop(columns=[lang_req])
    df_manual[lang_req] = np.where(
        df_manual['Job Description spacy_sentencized'].str.contains(lang_req_pattern),
        'Yes',
        'No',
    )

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_english_requirement.csv', index=False)


CPU times: user 75.8 ms, sys: 4.46 ms, total: 80.3 ms
Wall time: 79.9 ms


In [44]:
# Yes = 235
df_manual['Dutch Requirement'].value_counts()


No     6393
Yes       7
Name: Dutch Requirement, dtype: int64

In [45]:
# Yes = 526
df_manual['English Requirement'].value_counts()

No     6392
Yes       8
Name: English Requirement, dtype: int64

In [46]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_raw_language_requirement.csv', index=False)


# Add data from Sectors dataframe (see CBS directory under scrapped_data directory) and Categorical data


### START HERE IF SOURCING FROM df_manual_RAW_LANGUAGE_REQUIREMENT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [47]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [48]:
from setup_module.imports import *

In [49]:
def df_gender_age_info(df, ivs_all=None):
    if ivs_all is None:
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]
    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        try:
            counts = df[f"{iv}"].value_counts()
            percentages = df[f"{iv}"].value_counts(normalize=True).mul(100).round(1).astype(float)
            print('='*20)
            print(f'{iv}:')
            print('-'*20)
            print(f'{iv} Counts:\n{counts}')
            print('-'*20)
            print(f'{iv} Percentages:\n{percentages}')

            with contextlib.suppress(Exception):
                mean = df[f"{iv}"].mean().round(2).astype(float)
                sd = df[f"{iv}"].std().round(2).astype(float)
                print('-'*20)
                print(f'{iv} Mean: {mean}')
                print('-'*20)
                print(f'{iv} Standard Deviation: {sd}')

        except Exception:
            print(f'{iv} not available.')

    print('\n')


In [50]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_raw_language_requirement.pkl').reset_index(drop=True)


In [51]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Job ID                             6400 non-null   object
 1   Job Description spacy_sentencized  6400 non-null   object
 2   Warmth                             6400 non-null   int64 
 3   Competence                         6400 non-null   int64 
 4   Dutch Requirement                  6400 non-null   object
 5   English Requirement                6400 non-null   object
dtypes: int64(2), object(4)
memory usage: 300.1+ KB


In [52]:
df_manual['Job ID'] = df_manual['Job ID'].apply(lambda x: str(x).lower().strip())


In [53]:
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_including_sector_genage_data.pkl').reset_index(drop=True)


In [54]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17599 entries, 0 to 17598
Data columns (total 56 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Search Keyword                                  17599 non-null  object  
 1   Platform                                        17599 non-null  object  
 2   Job ID                                          17599 non-null  object  
 3   Job Title                                       17599 non-null  object  
 4   Company Name                                    17597 non-null  object  
 5   Location                                        17599 non-null  object  
 6   Job Description                                 17599 non-null  object  
 7   Rating                                          3780 non-null   float64 
 8   Employment Type                                 17017 non-null  object  
 9   Company URL                 

In [55]:
df_jobs['Job ID'] = df_jobs['Job ID'].apply(lambda x: str(x).lower().strip())


In [56]:
df_jobs.columns


Index(['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name', 'Location', 'Job Description', 'Rating', 'Employment Type', 'Company URL', 'Job URL', 'Job Age', 'Job Age Number', 'Collection Date', 'Data Row', 'Tracking ID', 'Industry', 'Job Date', 'Type of ownership', 'Language', 'Dutch Requirement', 'English Requirement', 'Sector Code', 'Sector', 'Keywords Count', '% per Sector', '% per Social Category', '% per Workforce', 'Female Count (x 1000)', 'Gender_Female_% per Sector', 'Gender_Female_% per Social Category', 'Gender_Female_% per Workforce', 'Male Count (x 1000)', 'Gender_Male_% per Sector', 'Gender_Male_% per Social Category', 'Gender_Male_% per Workforce', 'Gender', 'Age_Older (>= 45 years)_n', 'Age_Older (>= 45 years)_% per Sector', 'Age_Older (>= 45 years)_% per Social Category', 'Age_Older (>= 45 years)_% per Workforce', 'Age_Younger (< 45 years)_n', 'Age_Younger (< 45 years)_% per Sector', 'Age_Younger (< 45 years)_% per Social Category', 'Age_Younger (< 45 ye

In [57]:
df_jobs = df_jobs.drop(
    columns = [
        'Job Description', 'Rating', 'Employment Type',
        'Company URL', 'Job URL', 'Job Age', 'Job Age Number',
        'Collection Date', 'Data Row', 'Tracking ID', 'Job Date',
        'Type of ownership', 'Language', 'Dutch Requirement', 'English Requirement', 
    ],
    errors='ignore'
)

In [58]:
df_jobs.columns


Index(['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name', 'Location', 'Industry', 'Sector Code', 'Sector', 'Keywords Count', '% per Sector', '% per Social Category', '% per Workforce', 'Female Count (x 1000)', 'Gender_Female_% per Sector', 'Gender_Female_% per Social Category', 'Gender_Female_% per Workforce', 'Male Count (x 1000)', 'Gender_Male_% per Sector', 'Gender_Male_% per Social Category', 'Gender_Male_% per Workforce', 'Gender', 'Age_Older (>= 45 years)_n', 'Age_Older (>= 45 years)_% per Sector', 'Age_Older (>= 45 years)_% per Social Category', 'Age_Older (>= 45 years)_% per Workforce', 'Age_Younger (< 45 years)_n', 'Age_Younger (< 45 years)_% per Sector', 'Age_Younger (< 45 years)_% per Social Category', 'Age_Younger (< 45 years)_% per Workforce', 'Age', 'Sector Count (x 1000)', '% Sector per Workforce', 'Gender_Female', 'Gender_Male', 'Gender_Mixed Gender', 'Age_Mixed Age', 'Age_Older', 'Age_Younger', 'Gender_Num', 'Age_Num'], dtype='object')

In [59]:
# Add sector and categorical data from df_jobs
df_manual = df_manual.merge(df_jobs, on='Job ID', how='inner')


In [60]:
# len = 5978
len(df_manual)


5978

In [61]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5978 entries, 0 to 5977
Data columns (total 46 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Job ID                                          5978 non-null   object  
 1   Job Description spacy_sentencized               5978 non-null   object  
 2   Warmth                                          5978 non-null   int64   
 3   Competence                                      5978 non-null   int64   
 4   Dutch Requirement                               5978 non-null   object  
 5   English Requirement                             5978 non-null   object  
 6   Search Keyword                                  5978 non-null   object  
 7   Platform                                        5978 non-null   object  
 8   Job Title                                       5978 non-null   object  
 9   Company Name                  

In [62]:
df_manual.head()

Unnamed: 0,Job ID,Job Description spacy_sentencized,Warmth,Competence,Dutch Requirement,English Requirement,Search Keyword,Platform,Job Title,Company Name,Location,Industry,Sector Code,Sector,Keywords Count,% per Sector,% per Social Category,% per Workforce,Female Count (x 1000),Gender_Female_% per Sector,Gender_Female_% per Social Category,Gender_Female_% per Workforce,Male Count (x 1000),Gender_Male_% per Sector,Gender_Male_% per Social Category,Gender_Male_% per Workforce,Gender,Age_Older (>= 45 years)_n,Age_Older (>= 45 years)_% per Sector,Age_Older (>= 45 years)_% per Social Category,Age_Older (>= 45 years)_% per Workforce,Age_Younger (< 45 years)_n,Age_Younger (< 45 years)_% per Sector,Age_Younger (< 45 years)_% per Social Category,Age_Younger (< 45 years)_% per Workforce,Age,Sector Count (x 1000),% Sector per Workforce,Gender_Female,Gender_Male,Gender_Mixed Gender,Age_Mixed Age,Age_Older,Age_Younger,Gender_Num,Age_Num
0,pj_61a43be4d808c9f8,About Our Client,0,0,No,No,specialised nurse,Indeed,MSL - Pharma - 12 months contract - Dutch Speaker,Michael Page,Amsterdam,,Q,Health and social work activities,11.0,0.01,0.11,0.0,1208.0,0.84,0.1,0.05,224.0,0.16,0.02,0.01,Female,661.0,0.46,0.06,0.03,770.0,0.54,0.05,0.03,Mixed Age,1433.0,0.06,1,0,0,1,0,0,0,1
1,pj_61a43be4d808c9f8,International pharma organization focusing on ...,0,0,No,No,specialised nurse,Indeed,MSL - Pharma - 12 months contract - Dutch Speaker,Michael Page,Amsterdam,,Q,Health and social work activities,11.0,0.01,0.11,0.0,1208.0,0.84,0.1,0.05,224.0,0.16,0.02,0.01,Female,661.0,0.46,0.06,0.03,770.0,0.54,0.05,0.03,Mixed Age,1433.0,0.06,1,0,0,1,0,0,0,1
2,pj_61a43be4d808c9f8,Job Description,0,0,No,No,specialised nurse,Indeed,MSL - Pharma - 12 months contract - Dutch Speaker,Michael Page,Amsterdam,,Q,Health and social work activities,11.0,0.01,0.11,0.0,1208.0,0.84,0.1,0.05,224.0,0.16,0.02,0.01,Female,661.0,0.46,0.06,0.03,770.0,0.54,0.05,0.03,Mixed Age,1433.0,0.06,1,0,0,1,0,0,0,1
3,pj_61a43be4d808c9f8, Develop and lead Key External Experts and in...,1,0,No,No,specialised nurse,Indeed,MSL - Pharma - 12 months contract - Dutch Speaker,Michael Page,Amsterdam,,Q,Health and social work activities,11.0,0.01,0.11,0.0,1208.0,0.84,0.1,0.05,224.0,0.16,0.02,0.01,Female,661.0,0.46,0.06,0.03,770.0,0.54,0.05,0.03,Mixed Age,1433.0,0.06,1,0,0,1,0,0,0,1
4,pj_61a43be4d808c9f8, Discuss scientific data on products with all...,1,1,No,No,specialised nurse,Indeed,MSL - Pharma - 12 months contract - Dutch Speaker,Michael Page,Amsterdam,,Q,Health and social work activities,11.0,0.01,0.11,0.0,1208.0,0.84,0.1,0.05,224.0,0.16,0.02,0.01,Female,661.0,0.46,0.06,0.03,770.0,0.54,0.05,0.03,Mixed Age,1433.0,0.06,1,0,0,1,0,0,0,1


#### Check if there is any missing sector data in the merged dataframe

In [63]:
df_manual['Sector'].isna().sum()

0

In [64]:
if df_manual['Sector'].isna().sum() != 0:
    print('Some search keywords did not match a sector. Fixing')
    print(set(df_manual['Search Keyword'].loc[df_manual['Sector'].isna()].to_list()))
    print(len(df_manual['Search Keyword'].loc[df_manual['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))
    df_manual = fix_keywords(df_manual)
    print(set(df_manual['Search Keyword'].loc[df_manual['Sector'].isna()].to_list()))
    print(len(df_manual['Search Keyword'].loc[df_manual['Search Keyword'].isin(list(keyword_trans_dict.keys()))]))


In [65]:
# Manual Job Ad info, len = 117
df_gender_age_info(df_manual.groupby(['Job ID']).first())



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Index: 117 entries, 3768944208 to pj_a4ac3e531abef752
Data columns (total 45 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Job Description spacy_sentencized               117 non-null    object  
 1   Warmth                                          117 non-null    int64   
 2   Competence                                      117 non-null    int64   
 3   Dutch Requirement                               117 non-null    object  
 4   English Requirement                             117 non-null    object  
 5   Search Keyword                                  117 non-null    object  
 6   Platform                                        117 non-null    object  
 7   Job Title                                       117 non-null    object  
 8   Company Name                                    117 non-null    object  
 9   L

In [66]:
# Manual Job Sentence info
df_gender_age_info(df_manual)



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5978 entries, 0 to 5977
Data columns (total 46 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Job ID                                          5978 non-null   object  
 1   Job Description spacy_sentencized               5978 non-null   object  
 2   Warmth                                          5978 non-null   int64   
 3   Competence                                      5978 non-null   int64   
 4   Dutch Requirement                               5978 non-null   object  
 5   English Requirement                             5978 non-null   object  
 6   Search Keyword                                  5978 non-null   object  
 7   Platform                                        5978 non-null   object  
 8   Job Title                                       5978 non-null   object  
 9   Company Name       

In [67]:
# Manual Job Sentence info
df_gender_age_info(df_manual, ivs_all=['Warmth', 'Competence'])



DF INFO:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5978 entries, 0 to 5977
Data columns (total 46 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   Job ID                                          5978 non-null   object  
 1   Job Description spacy_sentencized               5978 non-null   object  
 2   Warmth                                          5978 non-null   int64   
 3   Competence                                      5978 non-null   int64   
 4   Dutch Requirement                               5978 non-null   object  
 5   English Requirement                             5978 non-null   object  
 6   Search Keyword                                  5978 non-null   object  
 7   Platform                                        5978 non-null   object  
 8   Job Title                                       5978 non-null   object  
 9   Company Name       

In [68]:
if df_manual['Sector'].isna().sum() == 0:
    assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
    df_manual.to_pickle(f'{df_save_dir}df_manual_including_sector_genage_data.pkl')
    df_manual.to_csv(f'{df_save_dir}df_manual_including_sector_genage_data.csv', index=False)
else:
    print(f"MISSING SECTOR DATA: COUNT {df_manual['Sector'].isna().sum()}")

# ATTN: This script should be run AFTER spacy sentence splitting is completed.


# Use spacy to tokenize sentences


### START HERE IF SOURCING FROM df_manual_SENTENCIZED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [69]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [70]:
from setup_module.imports import *

In [71]:
def get_word_num_and_frequency(row, text_col):

    row['Job Description num_words'] = len(str(row[f'{text_col}']).split())
    row['Job Description num_unique_words'] = len(set(str(row[f'{text_col}']).split()))
    row['Job Description num_chars'] = len(str(row[f'{text_col}']))
    row['Job Description num_punctuations'] = len([c for c in str(row[f'{text_col}']) if c in string.punctuation])

    return row


In [72]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_including_sector_genage_data.pkl').reset_index(drop=True)


In [73]:
df_manual['Job Description spacy_sentencized_lower'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda job_sentence: job_sentence.strip().lower()
)


In [74]:
df_manual[['Job Description spacy_sentencized', 'Job Description spacy_sentencized_lower']].head()


Unnamed: 0,Job Description spacy_sentencized,Job Description spacy_sentencized_lower
0,About Our Client,about our client
1,International pharma organization focusing on ...,international pharma organization focusing on ...
2,Job Description,job description
3, Develop and lead Key External Experts and in..., develop and lead key external experts and in...
4, Discuss scientific data on products with all..., discuss scientific data on products with all...


In [75]:
%%time
# Spacy tokenize
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)

df_manual['Job Description spacy_tokenized'] = df_manual[
    'Job Description spacy_sentencized'
].apply(
    lambda job_sentence: [
        str(token.text.strip().lower())
        for token in nlp.tokenizer(job_sentence)
        if len(token) != 0
        and not token.is_space
        and not token.is_stop
        and not token.is_punct
        and not token.is_bracket
        and not token.like_email
        and token.text not in custom_punct_chars
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy.csv', index=False)


CPU times: user 489 ms, sys: 9.3 ms, total: 498 ms
Wall time: 500 ms


In [76]:
df_manual['Job Description spacy_sentencized_cleaned'] = df_manual['Job Description spacy_tokenized'].str.join(' ')


In [77]:
%%time
# Get sentence word frequencies
df_manual = df_manual.apply(
    lambda row: get_word_num_and_frequency(
        row=row, text_col='Job Description spacy_sentencized'
    ), 
    axis='columns',
    
)


CPU times: user 4.05 s, sys: 8.96 ms, total: 4.06 s
Wall time: 4.06 s


In [78]:
df_manual.columns


Index(['Job ID', 'Job Description spacy_sentencized', 'Warmth', 'Competence', 'Dutch Requirement', 'English Requirement', 'Search Keyword', 'Platform', 'Job Title', 'Company Name', 'Location', 'Industry', 'Sector Code', 'Sector', 'Keywords Count', '% per Sector', '% per Social Category', '% per Workforce', 'Female Count (x 1000)', 'Gender_Female_% per Sector', 'Gender_Female_% per Social Category', 'Gender_Female_% per Workforce', 'Male Count (x 1000)', 'Gender_Male_% per Sector', 'Gender_Male_% per Social Category', 'Gender_Male_% per Workforce', 'Gender', 'Age_Older (>= 45 years)_n', 'Age_Older (>= 45 years)_% per Sector', 'Age_Older (>= 45 years)_% per Social Category', 'Age_Older (>= 45 years)_% per Workforce', 'Age_Younger (< 45 years)_n', 'Age_Younger (< 45 years)_% per Sector', 'Age_Younger (< 45 years)_% per Social Category', 'Age_Younger (< 45 years)_% per Workforce', 'Age', 'Sector Count (x 1000)', '% Sector per Workforce', 'Gender_Female', 'Gender_Male', 'Gender_Mixed Gender

In [79]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy.csv', index=False)


# Use NLTK to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [80]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [81]:
from setup_module.imports import *

In [82]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy.pkl').reset_index(drop=True)


In [83]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 53 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [84]:
%%time
# Tokenize with NLTK
# stop_words = set(stopwords.words('english'))
# punctuations = list(string.punctuation)
# lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

df_manual['Job Description nltk_tokenized'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        str(token.strip().lower()) 
        for token in word_tokenize(job_sentence) 
        if len(token) != 0 
        and token != '...' 
        and not token.lower() in set(stopwords.words('english')) 
        and not token.lower() in list(string.punctuation) 
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk.csv', index=False)


CPU times: user 5.04 s, sys: 1.58 s, total: 6.62 s
Wall time: 6.63 s


In [85]:
df_manual['Job Description nltk_tokenized'].head()


0                                             [client]
1    [international, pharma, organization, focusing...
2                                   [job, description]
3    [, develop, lead, key, external, experts, inv...
4    [, discuss, scientific, data, products, relev...
Name: Job Description nltk_tokenized, dtype: object

In [86]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 54 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [87]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk.csv', index=False)


# Use gensim to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [88]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [89]:
from setup_module.imports import *


In [90]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk.pkl').reset_index(drop=True)


In [91]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 54 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [92]:
%%time
df_manual['Job Description gensim_tokenized'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda sentence: preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.csv', index=False)


CPU times: user 521 ms, sys: 11.1 ms, total: 532 ms
Wall time: 533 ms


In [93]:
df_manual['Job Description gensim_tokenized'].head()


0                                             [client]
1      [intern, pharma, organ, focus, oncolog, market]
2                                      [job, descript]
3    [develop, lead, kei, extern, expert, investig,...
4    [discuss, scientif, data, product, relev, stak...
Name: Job Description gensim_tokenized, dtype: object

In [94]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 55 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [95]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.csv', index=False)


# Use BERT to tokenize sentences


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [96]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [97]:
from setup_module.imports import *


In [98]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim.pkl').reset_index(drop=True)


In [99]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 55 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [100]:
%%time
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
device = torch.device('mps') if torch.has_mps and torch.backends.mps.is_built() and torch.backends.mps.is_available() else torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(bert_model_name, strip_accents = True)
bert_model = BertForSequenceClassification.from_pretrained(bert_model_name).to(device)

df_manual['Job Description bert_encodings'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda sentence: bert_tokenizer(
        str(sentence), truncation=True, padding=True, max_length=max_length, return_tensors=returned_tensor
    )
)

df_manual['Job Description bert_tokenized'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda sentence: bert_tokenizer.tokenize(str(sentence))
)

df_manual['Job Description bert_tokenized_to_id'] = df_manual['Job Description bert_tokenized'].apply(
    lambda sentence: bert_tokenizer.convert_tokens_to_ids(str(sentence))
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.csv', index=False)


Using MPS


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

CPU times: user 3.11 s, sys: 373 ms, total: 3.48 s
Wall time: 4.74 s


In [101]:
df_manual['Job Description bert_encodings'].head()


0    [input_ids, token_type_ids, attention_mask]
1    [input_ids, token_type_ids, attention_mask]
2    [input_ids, token_type_ids, attention_mask]
3    [input_ids, token_type_ids, attention_mask]
4    [input_ids, token_type_ids, attention_mask]
Name: Job Description bert_encodings, dtype: object

In [102]:
df_manual['Job Description bert_tokenized'].head()


0                                 [about, our, client]
1    [international, ph, ##arm, ##a, organization, ...
2                                   [job, description]
3    [develop, and, lead, key, external, experts, a...
4    [discuss, scientific, data, on, products, with...
Name: Job Description bert_tokenized, dtype: object

In [103]:
df_manual['Job Description bert_tokenized_to_id'].head()

0    100
1    100
2    100
3    100
4    100
Name: Job Description bert_tokenized_to_id, dtype: int64

In [104]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.csv', index=False)


# ATTN: This script should be run AFTER all tokenization (spacy, nltk, gensim, and BERT) completed.


# Use spacy to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TOKENIZED_SPACY_NLTK_GENSIM_BERT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [105]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [106]:
from setup_module.imports import *


In [107]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tokenized_spacy_nltk_gensim_bert.pkl').reset_index(drop=True)


In [108]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 58 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [109]:
%%time
# Load customer characters
with open(f'{data_dir}punctuations.txt', 'rb') as f:
    custom_punct_chars = pickle.load(f)

# POS tagging
df_manual['Job Description spacy_token_tags'] = df_manual[
    'Job Description spacy_sentencized'
].apply(
    lambda job_sentence: [
        (token.text.strip().lower(), token.tag_) for token in nlp(job_sentence)
    ]
)

# Lemmatization
df_manual['Job Description spacy_lemmas'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        token.lemma_.strip().lower()
        for token in nlp(job_sentence)
        if len(token) != 0 and not token.is_stop and not token.is_punct and token.text not in custom_punct_chars
    ]
)

# Stemming
df_manual['Job Description spacy_stems'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda job_sentence: [
        stemmer.stem(token.text.strip().lower())
        for token in nlp(job_sentence)
        if len(token) != 0 and not token.is_stop and not token.is_punct and token.text not in custom_punct_chars
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.csv', index=False)


CPU times: user 1min 28s, sys: 218 ms, total: 1min 28s
Wall time: 1min 28s


In [110]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 61 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [111]:
df_manual[
    [
        'Job Description spacy_token_tags',
        'Job Description spacy_lemmas',
        'Job Description spacy_stems'
    ]
].head()


Unnamed: 0,Job Description spacy_token_tags,Job Description spacy_lemmas,Job Description spacy_stems
0,"[(about, IN), (our, PRP$), (client, NNP)]",[client],[client]
1,"[(international, JJ), (pharma, NN), (organizat...","[international, pharma, organization, focus, o...","[intern, pharma, organ, focus, oncolog, market]"
2,"[(job, NNP), (description, NNP)]","[job, description]","[job, descript]"
3,"[(, .), (develop, VB), (and, CC), (lead, VB),...","[, develop, lead, key, external, experts, inv...","[, develop, lead, key, extern, expert, invest..."
4,"[(, .), (discuss, VB), (scientific, JJ), (dat...","[, discuss, scientific, datum, product, relev...","[, discuss, scientif, data, product, relev, s..."


In [112]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.csv', index=False)


# Use NLTK to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [113]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [114]:
from setup_module.imports import *


In [115]:
def get_wordnet_pos(token):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [116]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy.pkl').reset_index(drop=True)


In [117]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 61 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [118]:
%%time
# POS tagging
df_manual['Job Description nltk_token_tags'] = df_manual['Job Description spacy_tokenized'].apply(
    lambda token: pos_tag(token)
)

# Lemmatization
df_manual['Job Description nltk_lemmas'] = df_manual['Job Description spacy_tokenized'].apply(
    lambda tokens: [
        lemmatizer.lemmatize(
            token, get_wordnet_pos(
                unicodedata.normalize('NFKD', str(token.strip().lower())).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            )
        )
        for token in tokens
    ]
)

# Stemming
df_manual['Job Description nltk_stems'] = df_manual['Job Description spacy_tokenized'].apply(
    lambda tokens: [
        stemmer.stem(
            unicodedata.normalize('NFKD', str(token.strip().lower())).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        )
        for token in tokens
    ]
)

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.csv', index=False)


CPU times: user 7.94 s, sys: 1.15 s, total: 9.09 s
Wall time: 9.09 s


In [119]:
df_manual.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5978 entries, 0 to 5977
Data columns (total 64 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Job ID                                          5978 non-null   object 
 1   Job Description spacy_sentencized               5978 non-null   object 
 2   Warmth                                          5978 non-null   int64  
 3   Competence                                      5978 non-null   int64  
 4   Dutch Requirement                               5978 non-null   object 
 5   English Requirement                             5978 non-null   object 
 6   Search Keyword                                  5978 non-null   object 
 7   Platform                                        5978 non-null   object 
 8   Job Title                                       5978 non-null   object 
 9   Company Name                             

In [120]:
df_manual[['Job Description nltk_token_tags', 'Job Description nltk_lemmas', 'Job Description nltk_stems']].head()


Unnamed: 0,Job Description nltk_token_tags,Job Description nltk_lemmas,Job Description nltk_stems
0,"[(client, NN)]",[client],[client]
1,"[(international, JJ), (pharma, NN), (organizat...","[international, pharma, organization, focus, o...","[intern, pharma, organ, focus, oncolog, market]"
2,"[(job, NN), (description, NN)]","[job, description]","[job, descript]"
3,"[(, NNS), (develop, VBP), (lead, JJ), (key, J...","[, develop, lead, key, external, expert, inve...","[, develop, lead, key, extern, expert, investi..."
4,"[(, JJ), (discuss, NN), (scientific, JJ), (da...","[, discus, scientific, data, product, relevan...","[, discuss, scientif, data, product, relev, st..."


In [121]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.csv', index=False)


# Use BERT to create Parts-Of-Speech (POS) tags, lemmas, and stems


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [122]:
# import os
# import sys
# import importlib
# from pathlib import Path
# import numpy as np

# mod = sys.modules[__name__]

# code_dir = None
# code_dir_name = 'Code'
# unwanted_subdir_name = 'Analysis'

# for _ in range(5):

#     parent_path = str(Path.cwd().parents[_]).split('/')[-1]

#     if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

#         code_dir = str(Path.cwd().parents[_])

#         if code_dir is not None:
#             break

# sys.path.append(code_dir)
# # %load_ext autoreload
# # %autoreload 2


In [123]:
# from setup_module.imports import *


In [124]:
# df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl').reset_index(drop=True)


In [125]:
# %%time
# bert_pos_model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
# bert_pos_model = AutoModelForTokenClassification.from_pretrained(bert_pos_model_name).to(device)
# bert_pos_tagger = TokenClassificationPipeline(model=bert_pos_model, tokenizer=bert_tokenizer).to(device)

# df_manual['Job Description bert_token_tags_with_scores'] = df_manual['Job Description spacy_sentencized'].apply(
#     lambda sentence: [
#         (bert_pos_tag['word'], bert_pos_tag['entity'], bert_pos_tag['score'])
#         for i in range(len(sentence.split()))
#         for bert_pos_tag in bert_pos_tagger(sentence)
#     ]
# )

# df_manual['Job Description bert_token_tags'] = df_manual['Job Description bert_token_tags_with_scores'].apply(
#     lambda tag_list: [
#         [(tag_list[i][0], tag_list[i][1])]
#         for tag_tuple in tag_list
#         for i in range(len(tag_list))
#     ]
# )


# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)


In [126]:
# df_manual['Job Description bert_token_tags'].head()

In [127]:
# assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
# df_manual.to_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.pkl')
# df_manual.to_csv(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk_bert.csv', index=False)


# ATTN: This script should be run AFTER all POS tagging, lemmatization, and stemming (spacy and nltk) completed.
# If BERT POS tagging was done, change pkl file loading


### START HERE IF SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY_NLTK
### IF BERT POS TAGGING WAS DONE, SOURCING FROM df_manual_TAGS_LEMMAS_STEMS_SPACY_NLTK_BERT
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


# Use spacy to create bi and trigrams


In [128]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [129]:
from setup_module.imports import *


In [130]:
def spacy_make_ngrams(sentence, matcher, gram_type):

    doc = nlp(sentence)
    matches = matcher(doc)
    matches_list = []

    for idx in range(len(matches)):
        for match_id, start, end in matches:
            if nlp.vocab.strings[match_id].split('_')[0] == gram_type:
                match = doc[matches[idx][1]: matches[idx][2]].text
                matches_list.append(match.lower())
    
    return list(set(matches_list))


In [131]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_tags_lemmas_stems_spacy_nltk.pkl').reset_index(drop=True)


In [132]:
%%time
df_manual['Job Description spacy_1grams_original_list'] = df_manual['Job Description spacy_tokenized']
df_manual['Job Description spacy_1grams'] = df_manual['Job Description spacy_tokenized'].apply(
    lambda tokens: [
        tuple(token.split())
        for token in tokens
    ]
)


CPU times: user 9.66 ms, sys: 1.01 ms, total: 10.7 ms
Wall time: 10.4 ms


In [133]:
%%time
# Spacy bi and trigrams
matcher = Matcher(nlp.vocab)

bigram_rules = [
    ['NOUN', 'VERB'],
    ['VERB', 'NOUN'],
    ['ADJ', 'NOUN'],
    ['ADJ', 'PROPN'],
    # more rules here...
]

trigram_rules = [
    ['VERB', 'ADJ', 'NOUN'],
    ['NOUN', 'VERB', 'ADV'],
    ['NOUN', 'ADP', 'NOUN'],
    # more rules here...
]

patters_dict = {
    'bigram_patterns': [[{'POS': i} for i in j] for j in bigram_rules],
    'trigram_patterns': [[{'POS': i} for i in j] for j in trigram_rules],
}

ngram_dict = {
    'bigram': 2,
    'trigram': 3,
}

for ngram_name, ngram_num in ngram_dict.items():
    
    
    matcher.add(f'{ngram_name}_patterns', patters_dict[f'{ngram_name}_patterns'])

    df_manual[f'Job Description spacy_{str(ngram_num)}grams_original_list'] = df_manual['Job Description spacy_sentencized'].apply(
        lambda sentence: 
            [
                '_'.join(ngram_.split())
                for ngram_ in spacy_make_ngrams(sentence, matcher, ngram_name)
            ]
    )
    
    df_manual[f'Job Description spacy_{str(ngram_num)}grams'] = df_manual['Job Description spacy_sentencized'].apply(
        lambda sentence: 
            [
                tuple(ngram_.split())
                for ngram_ in spacy_make_ngrams(sentence, matcher, ngram_name)
            ]
    )

    df_manual[f'Job Description spacy_{str(ngram_num)}grams_in_sent'] = df_manual['Job Description spacy_sentencized'].str.lower().replace(
        regex = {
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[f'Job Description spacy_{str(ngram_num)}grams_original_list']
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
    
    if f'{ngram_name}_patterns' in matcher:
        matcher.remove(f'{ngram_name}_patterns')
    assert f'{ngram_name}_patterns' not in matcher


CPU times: user 2min 13s, sys: 359 ms, total: 2min 14s
Wall time: 2min 14s


In [134]:
%%time
# Spacy Allgrams
df_manual['Job Description spacy_123grams_original_list'] = df_manual['Job Description spacy_tokenized'] + df_manual['Job Description spacy_2grams_original_list'] + df_manual['Job Description spacy_3grams_original_list']
df_manual['Job Description spacy_123grams'] = df_manual['Job Description spacy_1grams'] + df_manual['Job Description spacy_2grams'] + df_manual['Job Description spacy_3grams']
df_manual['Job Description spacy_123grams_in_sent'] = (
    df_manual['Job Description spacy_sentencized']
    .str.lower()
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[
                'Job Description spacy_123grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)


CPU times: user 22.3 s, sys: 53.2 ms, total: 22.3 s
Wall time: 22.3 s


In [135]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy.csv', index=False)


# Use NLTK to create bi and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [136]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [137]:
from setup_module.imports import *


In [138]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy.pkl').reset_index(drop=True)


In [139]:
%%time
df_manual['Job Description nltk_1grams_original_list'] = df_manual['Job Description nltk_tokenized']
df_manual['Job Description nltk_1grams'] = df_manual['Job Description nltk_tokenized'].apply(
    lambda tokens: [
        tuple(token.split())
        for token in tokens
    ]
)


CPU times: user 9.13 ms, sys: 285 µs, total: 9.41 ms
Wall time: 9.26 ms


In [141]:
%%time
# NLTK bi and trigrams
ngram_dict = {
    'bigram': 2,
    'trigram': 3
}

for ngram_name, ngram_num in ngram_dict.items():

    df_manual[f'Job Description nltk_{str(ngram_num)}grams_original_list'] = df_manual['Job Description nltk_tokenized'].apply(
        lambda tokens:
            list(
                '_'.join(ngram_list)
                for ngram_list in nltk.ngrams(tokens, ngram_num)
            )
    )

    df_manual[f'Job Description nltk_{str(ngram_num)}grams'] = df_manual['Job Description nltk_tokenized'].apply(
        lambda tokens: list(nltk.ngrams(tokens, ngram_num))
    )

    df_manual[f'Job Description nltk_{str(ngram_num)}grams_in_sent'] = df_manual['Job Description spacy_sentencized'].str.lower().replace(
        regex = {
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[f'Job Description nltk_{str(ngram_num)}grams_original_list']
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )


CPU times: user 3min 19s, sys: 329 ms, total: 3min 19s
Wall time: 3min 19s


In [142]:
%%time
# NLTK Allgrams
df_manual['Job Description nltk_123grams_original_list'] = (
    df_manual['Job Description nltk_tokenized']
    + df_manual['Job Description nltk_2grams_original_list']
    + df_manual['Job Description nltk_3grams_original_list']
)
df_manual['Job Description nltk_123grams'] = (
    df_manual['Job Description nltk_1grams']
    + df_manual['Job Description nltk_2grams']
    + df_manual['Job Description nltk_3grams']
)
df_manual['Job Description nltk_123grams_in_sent'] = (
    df_manual['Job Description spacy_sentencized']
    .str.lower()
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[
                'Job Description nltk_123grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)


CPU times: user 3min 19s, sys: 329 ms, total: 3min 19s
Wall time: 3min 19s


In [143]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy_nltk.csv', index=False)


# Use Gensim to create bi and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [144]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [145]:
from setup_module.imports import *


In [146]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk.pkl').reset_index(drop=True)


In [147]:
df_manual['Job Description gensim_1grams_original_list'] = df_manual['Job Description gensim_tokenized']
df_manual['Job Description gensim_1grams'] = df_manual['Job Description gensim_tokenized'].apply(
    lambda tokens: [
        tuple(token.split())
        for token in tokens
    ]
)


In [148]:
%%time
# Gensim bi and trigrams
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'

# Gensim Bigrams
bigram = Phraser(Phrases(df_manual['Job Description gensim_tokenized'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
df_manual['Job Description gensim_2grams_original_list_all'] = bigram[df_manual['Job Description gensim_tokenized']]
df_manual['Job Description gensim_2grams_original_list'] = df_manual['Job Description gensim_2grams_original_list_all'].apply(
    lambda ngrams_list: [
        ngram_
        for ngram_ in ngrams_list
        if len(re.findall('[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
    ]
)
df_manual['Job Description gensim_2grams'] = df_manual['Job Description gensim_2grams_original_list'].apply(
    lambda ngrams: [
        tuple(ngram.split('_'))
        for ngram in ngrams
        if '_' in ngram
    ]
)
df_manual['Job Description gensim_2grams_in_sent'] = (
    df_manual['Job Description spacy_sentencized']
    .str.lower()
    .apply(
        lambda sentence: ' '.join(
            preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
        )
    )
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[
                'Job Description gensim_2grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)

# Gensim Trigrams
trigram = Phraser(Phrases(df_manual['Job Description gensim_2grams_original_list_all'], connector_words=ENGLISH_CONNECTOR_WORDS, min_count=1, threshold=1))
df_manual['Job Description gensim_3grams_original_list_all'] = trigram[df_manual['Job Description gensim_2grams_original_list_all']]
df_manual['Job Description gensim_3grams_original_list'] = df_manual['Job Description gensim_3grams_original_list_all'].apply(
    lambda ngrams_list: [
        ngram_
        for ngram_ in ngrams_list
        if len(re.findall('[a-zA-Z]*\_[a-zA-Z]*\_[a-zA-Z]*', ngram_)) != 0
    ]
)
df_manual['Job Description gensim_3grams'] = df_manual['Job Description gensim_3grams_original_list'].apply(
    lambda ngrams: [
        tuple(ngram.split('_'))
        for ngram in ngrams
        if '_' in ngram
    ]
)
df_manual['Job Description gensim_3grams_in_sent'] = (
    df_manual['Job Description spacy_sentencized']
    .str.lower()
    .apply(
        lambda sentence: ' '.join(
            preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
        )
    )
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[
                'Job Description gensim_3grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)


CPU times: user 19.6 s, sys: 81.3 ms, total: 19.6 s
Wall time: 19.6 s


In [149]:
%%time
# Gensim Allgrams
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'

df_manual['Job Description gensim_123grams_original_list'] = (
    df_manual['Job Description gensim_tokenized']
    + df_manual['Job Description gensim_2grams_original_list']
    + df_manual['Job Description gensim_3grams_original_list']
)
df_manual['Job Description gensim_123grams'] = (
    df_manual['Job Description gensim_1grams']
    + df_manual['Job Description gensim_2grams']
    + df_manual['Job Description gensim_3grams']
)
df_manual['Job Description gensim_123grams_in_sent'] = (
    df_manual['Job Description spacy_sentencized']
    .str.lower()
    .apply(
        lambda sentence: ' '.join(
            preprocess_string(re.sub(pattern, ' ', sentence.strip().lower()))
        )
    )
    .replace(
        regex={
            re.escape(' '.join(ngram_.split('_'))): re.escape(ngram_)
            for ngrams_list in df_manual[
                'Job Description gensim_123grams_original_list'
            ]
            for ngram_ in ngrams_list
            if '_' in ngram_
        }
    )
)


CPU times: user 18.8 s, sys: 25.6 ms, total: 18.8 s
Wall time: 18.8 s


In [150]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.csv', index=False)


# Create word frequencies for uni, bi, and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_SPACY_NLTK_GENSIM
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [151]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [152]:
from setup_module.imports import *


In [153]:
def get_abs_frequency(row, text_col, ngram_num, embedding_library):

    abs_word_freq = defaultdict(int)
    for word in row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']:
        abs_word_freq[word] += 1

        abs_wtd_df = (
            pd.DataFrame.from_dict(abs_word_freq, orient='index')
            .rename(columns={0: 'abs_word_freq'})
            .sort_values(by=['abs_word_freq'], ascending=False)
            )
        abs_wtd_df.insert(1, 'abs_word_perc', value=abs_wtd_df['abs_word_freq'] / abs_wtd_df['abs_word_freq'].sum())
        abs_wtd_df.insert(2, 'abs_word_perc_cum', abs_wtd_df['abs_word_perc'].cumsum())

        row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_freq'] = str(abs_wtd_df['abs_word_freq'].to_dict())
        row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc'] = str(abs_wtd_df['abs_word_perc'].to_dict())
        row[f'Job Description {embedding_library}_{ngram_num}grams_abs_word_perc_cum'] = str(abs_wtd_df['abs_word_perc_cum'].to_dict())

    return row


In [154]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_spacy_nltk_gensim.pkl').reset_index(drop=True)


In [155]:
%%time
ngrams_list=[1, 2, 3, 123]
embedding_libraries_list = ['spacy', 'nltk', 'gensim']

for embedding_library, ngram_num in itertools.product(embedding_libraries_list, ngrams_list):
    df_manual = df_manual.apply(lambda row: get_abs_frequency(row=row, text_col='Job Description spacy_tokenized', ngram_num=ngram_num, embedding_library=embedding_library), axis='columns')


CPU times: user 5min 32s, sys: 6.02 s, total: 5min 38s
Wall time: 5min 33s


In [156]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_frequency.csv', index=False)


# Create BoW dictionary, corpus, and tfidf matrix for uni, bi, and trigrams


### START HERE IF SOURCING FROM df_manual_NGRAMS_FREQUENCY
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [157]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [158]:
from setup_module.imports import *


In [159]:
def get_corpus_and_dictionary(row, ngram_num, embedding_library):
    
    ngrams_original_list = row[f'Job Description {embedding_library}_{ngram_num}grams_original_list']
    dictionary = Dictionary([ngrams_original_list])
    BoW_corpus = [dictionary.doc2bow(ngrams_original_list)]
    tfidf = TfidfModel(BoW_corpus, smartirs='ntc')
    tfidf_matrix = [tfidf[doc] for doc in BoW_corpus]

    row[f'Job Description {embedding_library}_{ngram_num}grams_dictionary'] = dictionary
    row[f'Job Description {embedding_library}_{ngram_num}grams_BoW_corpus'] = BoW_corpus
    row[f'Job Description {embedding_library}_{ngram_num}grams_tfidf'] = tfidf
    row[f'Job Description {embedding_library}_{ngram_num}grams_tfidf_matrix'] = tfidf_matrix
    
    return row


In [160]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl').reset_index(drop=True)


In [161]:
%%time
ngrams_list=[1, 2, 3, 123]
embedding_libraries_list = ['spacy', 'nltk', 'gensim']

for embedding_library, ngram_num in itertools.product(embedding_libraries_list, ngrams_list):
    df_manual = df_manual.apply(
        lambda row: get_corpus_and_dictionary(
            row=row, ngram_num=ngram_num, embedding_library=embedding_library
        ),
        axis='columns'
    )

assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_frequency.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_BoW.csv', index=False)


CPU times: user 1min, sys: 2.18 s, total: 1min 3s
Wall time: 1min 1s


In [162]:
df_manual.columns


Index(['% Sector per Workforce', '% per Sector', '% per Social Category', '% per Workforce', 'Age', 'Age_Mixed Age', 'Age_Num', 'Age_Older', 'Age_Older (>= 45 years)_% per Sector', 'Age_Older (>= 45 years)_% per Social Category',
       ...
       'Job Description gensim_2grams_tfidf', 'Job Description gensim_2grams_tfidf_matrix', 'Job Description gensim_3grams_dictionary', 'Job Description gensim_3grams_BoW_corpus', 'Job Description gensim_3grams_tfidf', 'Job Description gensim_3grams_tfidf_matrix', 'Job Description gensim_123grams_dictionary', 'Job Description gensim_123grams_BoW_corpus', 'Job Description gensim_123grams_tfidf', 'Job Description gensim_123grams_tfidf_matrix'], dtype='object', length=183)

In [163]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_ngrams_BoW.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_ngrams_BoW.csv', index=False)


# ATTN: This script should be run AFTER all bi and trigrams (spacy, nltk, and gensim) completed.


# Use spacy and nltk for sentiment scoring


### START HERE IF SOURCING FROM df_manual_NGRAMS_BOW
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [164]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [165]:
from setup_module.imports import *


In [166]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_ngrams_BoW.pkl').reset_index(drop=True)


In [167]:
%%time
# Spacy sentiment
if 'spacytextblob' not in nlp.pipe_names:
    nlp.add_pipe('spacytextblob')

df_manual['Job Description spacy_sentiment'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda sentence: float(nlp(sentence)._.blob.polarity)
    if isinstance(sentence, str) else np.nan
)


CPU times: user 30.1 s, sys: 80 ms, total: 30.2 s
Wall time: 30.2 s


In [168]:
%%time
# NLTK sentiment
df_manual['Job Description nltk_sentiment'] = df_manual['Job Description spacy_sentencized'].apply(
    lambda sentence: float(sentim_analyzer.polarity_scores(sentence)['compound'])
    if isinstance(sentence, str) else np.nan
)


CPU times: user 539 ms, sys: 2.75 ms, total: 542 ms
Wall time: 541 ms


In [169]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_sentiment_spacy_nltk.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_sentiment_spacy_nltk.csv', index=False)


# ATTN: This script should be run AFTER all sentiment scoring (spacy and nltk) completed.


### START HERE IF SOURCING FROM df_manual_SENTIMENT_SPACY_NLTK
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


# Word2Vec and FastText embeddings


In [170]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [171]:
from setup_module.imports import *


In [172]:
def build_train_word2vec(df, ngram_number, embedding_library, size = 300, words=None, t = time.time(), cores = multiprocessing.cpu_count()):
    if words is None:
        words = [
            'she',
            'he',
            'support',
            'leader',
            'management',
            'team',
            'business',
            'customer',
            'risk',
            'build',
            'computer',
            'programmer',
        ]
    sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

    w2v_model = Word2Vec(
        sentences=sentences,
        vector_size=size,
        min_count=0,
        window=2,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=cores - 1,
        sg = 1,
    )

    w2v_model.build_vocab(sentences, progress_per=10000)
    print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')

    w2v_model.train(
        sentences,
        total_examples=w2v_model.corpus_count,
        epochs=30,
        report_delay=1,
    )

    print(f'Time to build w2v_vocab for {size}: {round((time.time() - t) / 60, 2)} mins')
    w2v_vocab = list(w2v_model.wv.index_to_key)

    print(f'Checking words form list of length {len(words)}')
    print(f'WORDS LIST: {words}')

#     for word in words:
#         print(f'Checking word:\n{word.upper()}:')
#         try:
# #             print(f'Word2Vec {size}: {w2v_model.wv[word]}')
#             print(f'Length of {size} model vobal: {len(w2v_vocab)}')
#             print(f'{size} - Positive most similar to {word}: {w2v_model.wv.most_similar(positive=word, topn=5)}')
#             print(f'{size} - Negative most similar to {word}: {w2v_model.wv.most_similar(negative=word, topn=5)}')

#         except KeyError as e:
#             print(e)

    return w2v_vocab, w2v_model

def word2vec_embeddings(sentences, w2v_vocab, w2v_model, size=300):

    sentences = [word for word in sentences if word in w2v_vocab]

    return (
        np.mean(w2v_model.wv[sentences], axis=0)
        if sentences
        else np.zeros(size)
    )



In [173]:
def build_train_fasttext(df, ngram_number, embedding_library, size = 300, words=None, t = time.time(), cores = multiprocessing.cpu_count()):
    if words is None:
        words = [
            'she',
            'he',
            'support',
            'leader',
            'management',
            'team',
            'business',
            'customer',
            'risk',
            'build',
            'computer',
            'programmer',
        ]
    sentences = df[f'Job Description {embedding_library}_{ngram_number}grams_original_list'].values

    ft_model = FastText(
        sentences=sentences,
        vector_size=size,
        min_count=0,
        window=2,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=cores - 1,
        sg = 1,
    )

    ft_model.build_vocab(sentences, progress_per=10000)
    print(f'Time to train the model for {size}: {round((time.time() - t) / 60, 2)} mins')

    ft_model.train(
        sentences,
        total_examples=ft_model.corpus_count,
        epochs=30,
        report_delay=1,
    )

    print(f'Time to build vocab for {size}: {round((time.time() - t) / 60, 2)} mins')
    ft_vocab = list(ft_model.wv.index_to_key)

    print(f'Checking words form list of length {len(words)}')
    print(f'WORDS LIST: {words}')

#     for word in words:
#         print(f'Checking word:\n{word.upper()}:')
#         try:
# #             print(f'FastText {size}: {ft_model_300.wv[word]}')
#             print(f'Length of {size} model vobal: {len(ft_vocab)}')
#             print(f'{size} - Positive most similar to {word}: {ft_model.wv.most_similar(positive=word, topn=5)}')
#             print(f'{size} - Negative most similar to {word}: {ft_model.wv.most_similar(negative=word, topn=5)}')

#         except KeyError as e:
#             print(e)

    return ft_vocab, ft_model

def fasttext_embeddings(sentences, ft_vocab, ft_model, size=300):

    sentences = [word for word in sentences if word in ft_vocab]

    return np.mean(ft_model.wv[sentences], axis=0) if sentences else np.zeros(size)


In [174]:
def get_glove(glove_file = f'{llm_path}/gensim/glove/glove.840B.300d.txt'):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf8') as glove:

        for line in glove:
            values = line.split()
            word = values[0]

            with contextlib.suppress(ValueError):
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
    print(f'Found {len(embeddings_index)} word vectors.')

    return embeddings_index


In [175]:
def sent2vec(sentences, embeddings_index=None, external_glove=True, extra_preprocessing_enabled=False):

    if external_glove is False and embeddings_index is None:
        embeddings_index= get_glove()

    if extra_preprocessing_enabled is False:
        words = sentences

    elif extra_preprocessing_enabled is True:
        stop_words = set(sw.words('english'))
        words = str(sentences).lower()
        words = word_tokenize(words)
        words = [w for w in words if (w not in stop_words) and (w.isalpha())]

    M = []

    try:
        for w in words:
            try:
                M.append(embeddings_index[w])
            except Exception:
                continue

        M = np.array(M)
        v = M.sum(axis='index')
        return np.zeros(300) if type(v) != np.ndarray else v / np.sqrt((v ** 2).sum())

    except Exception:
        return np.zeros(300)


In [176]:
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_sentiment_spacy_nltk.pkl').reset_index(drop=True)


In [177]:
embedding_models_dict = {
    'w2v': [build_train_word2vec, word2vec_embeddings, Word2Vec],
    'ft': [build_train_fasttext, fasttext_embeddings, FastText],
}


In [178]:
%%time
# Make embeddings
ngrams_list=[1, 2, 3, 123]
embedding_libraries_list = ['spacy', 'nltk', 'gensim']

for embedding_library, ngram_number in itertools.product(embedding_libraries_list, ngrams_list):
    print(f'Building {embedding_library}_{ngram_number}grams model and vocabulary.')

    for embed_model_name, embed_func_list in embedding_models_dict.items():

        build_train_func, embed_func, model_loader = embed_func_list
        print(f'Building {embed_model_name} from {embed_func.__name__} function.')

        vocab, model = build_train_func(
            df=df_manual,
            ngram_number=ngram_number,
            embedding_library=embedding_library,
        )

        print(f'Getting {embed_model_name} embeddings.')

        df_manual[
            f'Job Description {embedding_library}_{ngram_number}grams_mean_{embed_model_name}_embeddings'
        ] = df_manual[
            f'Job Description {embedding_library}_{ngram_number}grams_original_list'
        ].apply(
            lambda sentences: embed_func(sentences, vocab, model)
        )
        model.save(f'{data_dir}embeddings models/{embedding_library}_{ngram_number}grams_{embed_model_name}_model.model')

    # Sent2Vec
    print('Getting sent2vec embeddings.')
    embeddings_index = get_glove()
    df_manual[f'Job Description {embedding_library}_{ngram_number}grams_sent2vec_embeddings'] = df_manual[f'Job Description {embedding_library}_{ngram_number}grams'].apply(lambda sentences: sent2vec(sentences, embeddings_index=embeddings_index, external_glove=True, extra_preprocessing_enabled=False))
    print('Done getting sent2vec embeddings.')
        
    

Building spacy_1grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 0.07 mins
Time to build w2v_vocab for 300: 0.12 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 0.2 mins
Time to build vocab for 300: 0.29 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building spacy_2grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 1.45 mins
Time to build w2v_vocab for 300: 1.46 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 1.53 mins
Time to build vocab for 300: 1.6 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.




Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building spacy_3grams model and vocabulary.
Building w2v from word2vec_embeddings function.
Time to train the model for 300: 2.87 mins
Time to build w2v_vocab for 300: 2.87 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 2.93 mins
Time to build vocab for 300: 2.94 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building spacy_123grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 4.29 mins
Time to build w2v_vocab for 300: 4.38 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 4.49 mins
Time to build vocab for 300: 4.67 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building nltk_1grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 6.03 mins
Time to build w2v_vocab for 300: 6.08 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 6.15 mins
Time to build vocab for 300: 6.25 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building nltk_2grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 7.72 mins
Time to build w2v_vocab for 300: 7.94 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 8.26 mins
Time to build vocab for 300: 8.64 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building nltk_3grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 10.32 mins
Time to build w2v_vocab for 300: 10.57 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 10.95 mins
Time to build vocab for 300: 11.79 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building nltk_123grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 13.55 mins
Time to build w2v_vocab for 300: 14.12 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 15.16 mins
Time to build vocab for 300: 16.33 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building gensim_1grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 18.52 mins
Time to build w2v_vocab for 300: 18.62 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 18.8 mins
Time to build vocab for 300: 18.98 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building gensim_2grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 20.84 mins
Time to build w2v_vocab for 300: 20.89 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 21.01 mins
Time to build vocab for 300: 21.19 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building gensim_3grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 22.96 mins
Time to build w2v_vocab for 300: 22.97 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 23.03 mins
Time to build vocab for 300: 23.04 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
Building gensim_123grams model and vocabulary.
Building w2v from word2vec_embeddings function.




Time to train the model for 300: 24.47 mins
Time to build w2v_vocab for 300: 24.56 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting w2v embeddings.
Building ft from fasttext_embeddings function.




Time to train the model for 300: 24.68 mins
Time to build vocab for 300: 24.87 mins
Checking words form list of length 12
WORDS LIST: ['she', 'he', 'support', 'leader', 'management', 'team', 'business', 'customer', 'risk', 'build', 'computer', 'programmer']
Getting ft embeddings.
Getting sent2vec embeddings.
Found 2195885 word vectors.
Done getting sent2vec embeddings.
CPU times: user 31min 31s, sys: 2min 14s, total: 33min 45s
Wall time: 26min 15s


In [179]:
assert len(df_manual) > 0 and isinstance(df_manual, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_manual)}'
df_manual.to_pickle(f'{df_save_dir}df_manual_for_trainning.pkl')
df_manual.to_csv(f'{df_save_dir}df_manual_for_trainning.csv', index=False)
