In [None]:
import os
import sys
import importlib
from pathlib import Path

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up at most five ancestors of the working directory and keep the first one whose
# folder name contains `code_dir_name` but not `unwanted_subdir_name`.
# Slicing the ancestor list (instead of indexing parents[0..4]) avoids an IndexError when
# the working directory is fewer than five levels deep, and `.name` yields the last path
# component regardless of the platform's separator.
for ancestor in list(Path.cwd().parents)[:5]:

    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2

In [None]:
# numpy supplies `np.nan` for nan_list below. None of the import cells visible in this
# notebook import it, so it is imported here to keep the cell runnable on a fresh kernel.
import numpy as np

# MAIN DIR: parent folder of the code directory, with a trailing slash
main_dir = f'{str(Path(code_dir).parents[0])}/'

# code_dir: exactly one trailing slash, so re-running this cell does not keep appending '/'
code_dir = f"{str(code_dir).rstrip('/')}/"
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns that are dropped from every scraped file when it is cleaned
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# column names and settings shared by the cells below
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'
str_cols = ['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name', 'Location', 'Job Description', 'Company URL', 'Job URL', 'Tracking ID']
# values treated as "missing" when filtering job descriptions
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']
# regex of split points (newlines, repeated commas/pipes, sentence boundaries, ':' before a capital)
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
import os
import sys
import re
import time
import json
import csv
import glob
import pickle
import pandas as pd
from pathlib import Path
from googletrans import Translator
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.pipeline import Sentencizer


In [None]:
# Manually collected mapping of incorrect/faulty scraped search keywords to their corrections
keyword_trans_path = Path(f'{scraped_data}CBS/Data/keyword_trans_dict.txt')
keyword_trans_dict = json.loads(keyword_trans_path.read_text())


In [None]:
# Number of faulty keywords in the dictionary (111 per the original note)
len(keyword_trans_dict)


In [None]:
def fix_broken_linkedin_files(glob_path):
    """Rewrite a CSV-formatted file that carries a ``.json`` extension as real JSON.

    The file at ``glob_path`` is parsed with ``csv.DictReader``. Rows are keyed on the
    value of their first column, so when several rows share that value only the last
    one is kept (at the position of the first occurrence). The surviving rows are then
    written back to the same path as a JSON array of objects, overwriting the file.

    Parameters
    ----------
    glob_path : str
        Path of the file to repair. Paths that do not end in ``.json`` are left untouched.

    Returns
    -------
    list of dict
        The rows written back to the file, or an empty list when nothing was done.
    """
    if not glob_path.endswith('.json'):
        return []

    rows_by_first_column = {}

    # newline='' is what the csv module asks for, so that line breaks embedded in
    # quoted fields are passed to the parser unchanged.
    with open(glob_path, encoding='utf-8', newline='') as csv_file_handler:
        for row in csv.DictReader(csv_file_handler):
            first_key = list(row)[0]
            rows_by_first_column[row[first_key]] = row

    data_list = list(rows_by_first_column.values())

    with open(glob_path, 'w', encoding='utf-8') as json_file_handler:
        json.dump(data_list, json_file_handler, indent=4)

    return data_list


In [None]:
def fix_keywords(df_temp, trans_dict=None):
    """Replace faulty values in the 'Search Keyword' column with their corrections.

    Keywords are compared case-insensitively and ignoring surrounding whitespace.
    Replacements are applied in dictionary order; if any faulty keyword is still
    present afterwards (e.g. a replacement value is itself a faulty keyword that was
    processed earlier), the replacements are applied a second time.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame with a 'Search Keyword' column. It is modified in place. Anything that
        is not a non-empty DataFrame is returned unchanged.
    trans_dict : dict, optional
        Mapping of faulty keyword -> corrected keyword. Defaults to the module-level
        ``keyword_trans_dict``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # isinstance is checked first so that non-sized inputs (e.g. None) are passed through
    if not isinstance(df_temp, pd.DataFrame) or len(df_temp) == 0:
        return df_temp

    if trans_dict is None:
        trans_dict = keyword_trans_dict

    # Normalise both sides once instead of once per comparison
    replacements = [
        (str(faulty).lower().strip(), str(fixed).lower().strip())
        for faulty, fixed in trans_dict.items()
    ]
    faulty_keywords = [faulty for faulty, _ in replacements]

    def _apply_replacements():
        """Apply every replacement in order; return the normalised keyword column."""
        normalized = df_temp['Search Keyword'].astype(str).str.lower().str.strip()
        for faulty, fixed in replacements:
            mask = normalized == faulty
            if mask.any():
                df_temp.loc[mask, 'Search Keyword'] = fixed
                # keep the normalised view in sync so later keys see the new value
                normalized.loc[mask] = fixed
        return normalized

    normalized_keywords = _apply_replacements()

    # Second pass for keywords that are still faulty after the first one
    if normalized_keywords.isin(faulty_keywords).any():
        _apply_replacements()

    return df_temp


#### Read paths

In [None]:
# Collect every scraped .json and .csv file of each site.
# `scraped_data` already ends with '/', so no extra separator is inserted here: a doubled
# slash is carried into the paths glob returns and breaks substring checks on them
# (such as the 'scraped_data/LinkedIn/Data/...' test used when loading the files).
glob_paths = []

for site in site_list:
    for extension in ('json', 'csv'):
        # sorted() makes the processing order reproducible; glob's own order is arbitrary
        glob_paths.extend(sorted(glob.glob(f'{scraped_data}{site}/Data/*.{extension}')))


In [None]:
# Number of .json and .csv files found (955 per the original note)
len(glob_paths)


#### Use paths to open files, fix keywords, and drop unneeded columns

In [None]:
# fix_list records every .json file that pandas could not parse on the first attempt
fix_list = []

# appended_data collects the cleaned DataFrame of every file that was loaded
appended_data = []

for glob_path in glob_paths:

    # Start every iteration from a clean slate: without this reset, a file that fails to
    # load would silently reuse the previous file's DataFrame and be overwritten with it.
    df_temp = None

    if glob_path.endswith('.json'):
        try:
            df_temp = pd.read_json(glob_path).reset_index(drop=True)
        except ValueError:
            fix_list.append(glob_path)
            # as_posix() collapses doubled slashes and converts backslashes, so the check
            # also matches paths such as '.../scraped_data//LinkedIn/Data/...'.
            if 'scraped_data/LinkedIn/Data/linkedin_jobs_df_' in Path(glob_path).as_posix():
                # Rewrites the CSV-formatted file as JSON in place, then retry the read
                fix_broken_linkedin_files(glob_path)
                try:
                    df_temp = pd.read_json(glob_path).reset_index(drop=True)
                except ValueError:
                    # Still unreadable after the repair attempt; it is already in fix_list
                    df_temp = None
    elif glob_path.endswith('.csv'):
        df_temp = pd.read_csv(glob_path).reset_index(drop=True)

    # Skip files that could not be loaded or that contain no rows
    if not isinstance(df_temp, pd.DataFrame) or len(df_temp) == 0:
        continue

    df_temp = fix_keywords(df_temp)
    df_temp.reset_index(drop=True, inplace=True)

    # Drop the derived columns listed in `cols`, plus leftover index-like columns
    df_temp.drop(columns=cols, inplace=True, errors='ignore')
    df_temp.drop(
        df_temp.columns[
            df_temp.columns.str.contains(
                'unnamed|index|level', regex=True, case=False, flags=re.I
            )
        ],
        axis='columns',
        inplace=True,
        errors='ignore',
    )

    # NOTE: the cleaned frame is written back over the original scraped file
    if glob_path.endswith('.json'):
        df_temp.to_json(glob_path, orient='records')
    elif glob_path.endswith('.csv'):
        df_temp.to_csv(glob_path, index=False)

    appended_data.append(df_temp.reset_index(drop=True))

# Concatenate the list of dfs into one large df_jobs.
# pd.concat raises on an empty list, so fall back to an empty frame and let the guard
# below report it.
if appended_data:
    df_jobs = pd.concat(appended_data).reset_index(drop=True)
else:
    df_jobs = pd.DataFrame()

# Save df_jobs to file
if isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0:
    df_jobs.to_pickle(f'{data_dir}df_jobs_raw.pkl')

    df_jobs.to_csv(f'{data_dir}df_jobs_raw.csv', index=False)
else:
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')


In [None]:
# Number of per-file DataFrames collected (527 per the original note)
len(appended_data)


In [None]:
# Number of rows in the concatenated raw frame (204394 per the original note)
len(df_jobs)


In [None]:
# fix_list holds the paths of files that failed to parse; persist it when it is not empty
if fix_list:
    print('Some keywords to fix!')
    with open(f'{data_dir}fix_list.txt', 'w') as fix_list_file:
        fix_list_file.write(json.dumps(fix_list))


### START HERE IF SOURCING FROM DF_RAW
### PLEASE SET CORRECT DIRECTORY PATHS BELOW

In [None]:
import os
import sys
import importlib
from pathlib import Path

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up at most five ancestors of the working directory and keep the first one whose
# folder name contains `code_dir_name` but not `unwanted_subdir_name`.
# Slicing the ancestor list (instead of indexing parents[0..4]) avoids an IndexError when
# the working directory is fewer than five levels deep, and `.name` yields the last path
# component regardless of the platform's separator.
for ancestor in list(Path.cwd().parents)[:5]:

    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2

In [None]:
# numpy supplies `np.nan` for nan_list below. None of the import cells visible in this
# notebook import it, so it is imported here to keep the cell runnable on a fresh kernel.
import numpy as np

# MAIN DIR: parent folder of the code directory, with a trailing slash
main_dir = f'{str(Path(code_dir).parents[0])}/'

# code_dir: exactly one trailing slash, so re-running this cell does not keep appending '/'
code_dir = f"{str(code_dir).rstrip('/')}/"
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns that are dropped from every scraped file when it is cleaned
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# column names and settings shared by the cells below
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'
str_cols = ['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name', 'Location', 'Job Description', 'Company URL', 'Job URL', 'Tracking ID']
# values treated as "missing" when filtering job descriptions
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']
# regex of split points (newlines, repeated commas/pipes, sentence boundaries, ':' before a capital)
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
import os
import sys
import re
import time
import json
import csv
import glob
import pickle
import pandas as pd
import googletrans
from pathlib import Path
from googletrans import Translator
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.pipeline import Sentencizer


In [None]:
# Reload the concatenated raw frame from df_jobs_raw.pkl and renumber its index from 0
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_raw.pkl').reset_index(drop=True)


In [None]:
# Number of rows after reloading the raw frame (204394 per the original note)
len(df_jobs)


In [None]:
# Summarise columns, non-null counts and dtypes of the raw frame
df_jobs.info()


In [None]:
# Show the column labels before they are cleaned
df_jobs.columns

In [None]:
# Clean columns: cast every label to str and strip surrounding whitespace
df_jobs.columns = df_jobs.columns.map(lambda label: str(label).strip())


In [None]:
# Drop NA: first rows, then columns, that consist entirely of missing values
for axis_name in ('index', 'columns'):
    df_jobs.dropna(axis=axis_name, how='all', inplace=True)


In [None]:
# Number of rows at this point (204394 per the original note)
len(df_jobs)


In [None]:
# Drop rows that duplicate both 'Company Name' and 'Job Description', keeping the first
# occurrence; ignore_index=True renumbers the remaining rows from 0.
df_jobs.drop_duplicates(subset=['Company Name', 'Job Description'], keep='first', ignore_index=True, inplace=True)


In [None]:
# Number of rows at this point (64694 per the original note)
len(df_jobs)


In [None]:
# Remove every row whose 'Job Description' is one of the placeholder values in nan_list
missing_description_mask = df_jobs['Job Description'].isin(nan_list)
df_jobs.drop(
    index=df_jobs.index[missing_description_mask],
    inplace=True,
    errors='ignore',
)

# Save df_jobs to file (pickle + CSV), or report when there is nothing to save
if not (len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_raw_dropped.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_raw_dropped.csv', index=False)


### START HERE IF SOURCING FROM DF_RAW_DROPPED
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


In [1]:
import os
import sys
import importlib
from pathlib import Path

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Walk up at most five ancestors of the working directory and keep the first one whose
# folder name contains `code_dir_name` but not `unwanted_subdir_name`.
# Slicing the ancestor list (instead of indexing parents[0..4]) avoids an IndexError when
# the working directory is fewer than five levels deep, and `.name` yields the last path
# component regardless of the platform's separator.
for ancestor in list(Path.cwd().parents)[:5]:

    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2

In [2]:
# numpy supplies `np.nan` for nan_list below. None of the import cells visible in this
# notebook import it, so it is imported here to keep the cell runnable on a fresh kernel.
import numpy as np

# MAIN DIR: parent folder of the code directory, with a trailing slash
main_dir = f'{str(Path(code_dir).parents[0])}/'

# code_dir: exactly one trailing slash, so re-running this cell does not keep appending '/'
code_dir = f"{str(code_dir).rstrip('/')}/"
if code_dir not in sys.path:
    sys.path.append(code_dir)

# scraping dir
scraped_data = f'{code_dir}scraped_data/'

# data dir
data_dir = f'{code_dir}data/'

# lang models dir (no trailing slash)
llm_path = f'{data_dir}Language Models'

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns that are dropped from every scraped file when it is cleaned
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# column names and settings shared by the cells below
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'
str_cols = ['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name', 'Location', 'Job Description', 'Company URL', 'Job URL', 'Tracking ID']
# values treated as "missing" when filtering job descriptions
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']
# regex of split points (newlines, repeated commas/pipes, sentence boundaries, ':' before a capital)
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [3]:
import os
import sys
import re
import time
import json
import csv
import glob
import pickle
import pandas as pd
import googletrans
from pathlib import Path
from googletrans import Translator
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.pipeline import Sentencizer


In [4]:
# Reload the de-duplicated frame from df_jobs_raw_dropped.pkl and renumber its index from 0
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_raw_dropped.pkl').reset_index(drop=True)


In [5]:
# Number of rows in the reloaded frame (64684 in the recorded output)
len(df_jobs)


64684

In [6]:
# Summarise columns, non-null counts and dtypes of the reloaded frame
df_jobs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64684 entries, 0 to 64683
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Search Keyword     64684 non-null  object 
 1   Platform           64684 non-null  object 
 2   Job ID             64684 non-null  object 
 3   Job Title          64684 non-null  object 
 4   Company Name       64679 non-null  object 
 5   Location           64684 non-null  object 
 6   Job Description    64684 non-null  object 
 7   Rating             4130 non-null   float64
 8   Employment Type    63959 non-null  object 
 9   Company URL        61360 non-null  object 
 10  Job URL            64684 non-null  object 
 11  Job Age            64684 non-null  object 
 12  Job Age Number     64684 non-null  object 
 13  Collection Date    64684 non-null  object 
 14  Data Row           60551 non-null  float64
 15  Tracking ID        60551 non-null  object 
 16  Industry           612

In [7]:
# Replace df_jobs with the frame saved as df_jobs_raw_language_detected.pkl.
# NOTE(review): that file is written by the language-detection cell further down, so this
# cell presumably only works after an earlier (partial) detection run — confirm.
df_jobs = pd.read_pickle(f'{data_dir}df_jobs_raw_language_detected.pkl').reset_index(drop=True)


In [8]:
# Number of rows in the language-detected frame (64684 in the recorded output)
len(df_jobs)


64684

In [9]:
# Count the rows per detected language code
df_jobs['Language'].value_counts()

en    1785
nl     193
Name: Language, dtype: int64

In [None]:
translator = Translator()
googletrans_readtime_error = googletrans.client.httpx._client.httpcore._exceptions.ReadTimeout

# Pause before every detection request, and back-off after a read timeout (seconds)
DETECTION_PAUSE_SECONDS = 60
TIMEOUT_BACKOFF_SECONDS = 3600


def _save_language_detected(df):
    """Write `df` to the language-detected pickle and CSV, or report an empty frame."""
    if isinstance(df, pd.DataFrame) and len(df) > 0:
        df.to_pickle(f'{data_dir}df_jobs_raw_language_detected.pkl')

        df.to_csv(f'{data_dir}df_jobs_raw_language_detected.csv', index=False)
    else:
        print(f'ERORR: LENGTH OF DF = {len(df)}')


# On a first run the frame has no 'Language' column yet; create it empty so that every
# row counts as pending (previously such a frame was skipped entirely).
if 'Language' not in df_jobs.columns:
    df_jobs['Language'] = None

# Only rows without a detected language are processed, so a run that was interrupted and
# saved resumes where it stopped instead of starting again from index 0.
pending_mask = df_jobs['Language'].isna() & (
    df_jobs['Job Description'].astype(str).str.len() != 0
)

try:
    for idx, job_description in df_jobs.loc[pending_mask, 'Job Description'].items():
        description_text = str(job_description).lower().strip()

        try:
            print(f'Row {idx}: Translation in progress.')
            time.sleep(DETECTION_PAUSE_SECONDS)
            df_jobs.loc[idx, 'Language'] = str(translator.detect(description_text).lang)
            print(f'Row {idx}: Translation done.')
        except googletrans_readtime_error:
            # The service timed out: wait, then retry this row once
            print(f'Row {idx}: Sleeping for an hour.')
            print('-'*30)
            time.sleep(TIMEOUT_BACKOFF_SECONDS)
            print(f'Row {idx}: Done sleeping.')
            print('-'*30)
            df_jobs.loc[idx, 'Language'] = str(translator.detect(description_text).lang)
finally:
    # Save whatever has been detected so far, even when the loop is interrupted or a
    # request fails, so the progress of this very long run is not lost.
    _save_language_detected(df_jobs)


Row 1978: Translation in progress.
Row 1978: Translation done.
Row 1979: Translation in progress.
Row 1979: Translation done.
Row 1980: Translation in progress.
Row 1980: Translation done.
Row 1981: Translation in progress.
Row 1981: Translation done.
Row 1982: Translation in progress.
Row 1982: Translation done.
Row 1983: Translation in progress.
Row 1983: Translation done.
Row 1984: Translation in progress.
Row 1984: Translation done.
Row 1985: Translation in progress.
Row 1985: Translation done.
Row 1986: Translation in progress.
Row 1986: Translation done.
Row 1987: Translation in progress.
Row 1987: Translation done.
Row 1988: Translation in progress.
Row 1988: Translation done.
Row 1989: Translation in progress.
Row 1989: Translation done.
Row 1990: Translation in progress.
Row 1990: Translation done.
Row 1991: Translation in progress.
Row 1991: Translation done.
Row 1992: Translation in progress.
Row 1992: Translation done.
Row 1993: Translation in progress.
Row 1993: Translatio

Row 2108: Translation done.
Row 2109: Translation in progress.
Row 2109: Translation done.
Row 2110: Translation in progress.
Row 2110: Translation done.
Row 2111: Translation in progress.
Row 2111: Translation done.
Row 2112: Translation in progress.
Row 2112: Translation done.
Row 2113: Translation in progress.
Row 2113: Translation done.
Row 2114: Translation in progress.
Row 2114: Translation done.
Row 2115: Translation in progress.
Row 2115: Translation done.
Row 2116: Translation in progress.
Row 2116: Translation done.
Row 2117: Translation in progress.
Row 2117: Translation done.
Row 2118: Translation in progress.
Row 2118: Translation done.
Row 2119: Translation in progress.
Row 2119: Translation done.
Row 2120: Translation in progress.
Row 2120: Translation done.
Row 2121: Translation in progress.
Row 2121: Translation done.
Row 2122: Translation in progress.
Row 2122: Translation done.
Row 2123: Translation in progress.
Row 2123: Translation done.
Row 2124: Translation in pr

In [None]:
# Drop non-english job descriptions (i.e. keep only rows whose 'Language' equals `language`)
# df_jobs.drop(df_jobs.index[df_jobs['Language'] != str(language)], axis='index', inplace=True, errors='ignore')


In [None]:
# Save the language-detected frame as pickle and CSV, or report when it has no rows
if not (len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame)):
    print(f'ERORR: LENGTH OF DF = {len(df_jobs)}')
else:
    df_jobs.to_pickle(f'{data_dir}df_jobs_raw_language_detected.pkl')
    df_jobs.to_csv(f'{data_dir}df_jobs_raw_language_detected.csv', index=False)

In [None]:
# df_jobs = pd.read_pickle(f'{data_dir}df_jobs_raw_language_detected.pkl').reset_index(drop=True)


In [None]:
# Collect the distinct search keywords. Going through set() removes duplicates but does
# not guarantee any particular order in the resulting list.
search_keywords = list(set(df_jobs['Search Keyword'].to_list()))


In [None]:
# Number of distinct search keywords
len(search_keywords)


In [None]:
# Display the distinct search keywords
search_keywords


In [None]:
# Collect the distinct job descriptions (de-duplicated via set(), so order is not guaranteed)
job_descriptions = list(set(df_jobs['Job Description'].to_list()))

In [None]:
# Number of distinct job descriptions
len(job_descriptions)

In [None]:
# job_descriptions

In [None]:
# Load NLTK: register a data directory under llm_path and download the needed resources into it
nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

for nltk_resource in ('words', 'punkt', 'stopwords'):
    nltk.download(nltk_resource, download_dir = nltk_path)


In [None]:
# NOTE(review): `sentencizer` is not defined in any visible cell (only the `Sentencizer`
# class is imported) and the loop variable `job_description` is never used, so this cell
# appears to raise NameError as written — confirm the intended call before running it.
for job_description in job_descriptions:
    print([doc for doc in sentencizer()])


In [None]:
# Flatten all job descriptions into one list of sentences using NLTK's sent_tokenize
job_sentences = [
    sentence
    for job_description in job_descriptions
    for sentence in sent_tokenize(job_description)
]


In [None]:
# Inspect the first tokenized sentence, split on newline characters
job_sentences[0].split('\n')



In [None]:
# Use spacy with nlp to sent tokenize

In [None]:
# The visible import cells only bring in `Sentencizer` (from spacy.pipeline); the name
# `spacy` itself is never bound, so import it here before using spacy.load.
import spacy

# Load spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')