In [1]:
import os
import sys
import re
import time
import json
import csv
import glob
import pickle
import pandas as pd
from pathlib import Path
from googletrans import Translator
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.pipeline import Sentencizer


In [2]:
# MAIN DIR
# The project root can be overridden through the STUDY1_MAIN_DIR environment variable so the
# notebook is not tied to a single machine; the original location remains the default.
main_dir = os.environ.get('STUDY1_MAIN_DIR') or (
    '/Users/nyxinsane/Documents/Work - UvA/Automating Equity/Study 1/Study1_Code/'
)
# Every path below is built by plain string concatenation, so main_dir must end with a slash.
if not main_dir.endswith('/'):
    main_dir = f'{main_dir}/'

# scraping dir
scraped_data = f'{main_dir}scraped_data/'

# data dir
data_dir = f'{main_dir}data/'

# lang models dir
llm_path = f'{data_dir}Language Models'

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns that are dropped from every scraped file before the files are combined
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# column names / constants referenced by later cells
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

In [3]:
# This is a manually collected dictionary of incorrect/faulty keywords in scraped site data
# (faulty keyword -> corrected keyword).
# The file is decoded explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default encoding -- assumes the file was saved as UTF-8.
with open(f'{scraped_data}CBS/Data/keyword_trans_dict.txt', encoding='utf-8') as f:
    keyword_trans_dict = json.load(f)

In [4]:
# 111 words to fix
len(keyword_trans_dict)

111

In [5]:
def fix_broken_linkedin_files(glob_path):
    """Rewrite a CSV-formatted file that carries a ``.json`` extension as real JSON.

    The file is parsed with ``csv.DictReader``. Rows are de-duplicated on the value of their
    first column (the last row with a given value wins, at the position where that value was
    first seen) and the result is written back to ``glob_path`` as an indented JSON array.

    Parameters
    ----------
    glob_path : str
        Path of the file to repair. Paths that do not end in ``.json`` are left untouched.

    Returns
    -------
    list of dict
        The rows written to the file; an empty list when ``glob_path`` is not a ``.json`` path.
    """
    if not glob_path.endswith('.json'):
        return []

    rows_by_key = {}

    # newline='' lets the csv module handle line breaks itself, so line breaks inside quoted
    # fields are kept exactly as stored instead of being translated.
    with open(glob_path, encoding='utf-8', newline='') as csv_file_handler:
        for row in csv.DictReader(csv_file_handler):
            first_key = next(iter(row))
            rows_by_key[row[first_key]] = row

    data_list = list(rows_by_key.values())

    # The original file is overwritten in place with the JSON representation.
    with open(glob_path, 'w', encoding='utf-8') as json_file_handler:
        json_file_handler.write(json.dumps(data_list, indent=4))

    return data_list


In [6]:
def fix_keywords(df_temp, keyword_map=None):
    """Replace faulty values in the 'Search Keyword' column with their corrected form.

    Matching ignores case and surrounding whitespace. A matched cell is overwritten with the
    lower-cased, stripped correction; cells that match no faulty keyword are left exactly as
    they are. When a correction is itself listed as a faulty keyword the chain is followed to
    its end, so no corrected cell is left holding a keyword that still needs fixing.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame with a 'Search Keyword' column. It is modified in place.
    keyword_map : dict, optional
        Mapping of faulty keyword -> corrected keyword. Defaults to the notebook-level
        ``keyword_trans_dict``.

    Returns
    -------
    The same ``df_temp`` object. Anything that is not a non-empty DataFrame is returned
    unchanged.
    """
    if keyword_map is None:
        keyword_map = keyword_trans_dict

    # Type check first: len() on a non-DataFrame such as None would raise before it is reached.
    if not isinstance(df_temp, pd.DataFrame) or len(df_temp) == 0:
        return df_temp

    # Normalise both sides once; if two keys normalise to the same text the first one wins.
    normalised_map = {}
    for faulty, corrected in keyword_map.items():
        normalised_map.setdefault(str(faulty).lower().strip(), str(corrected).lower().strip())

    resolved_map = {
        faulty: _resolve_keyword(faulty, normalised_map) for faulty in normalised_map
    }

    normalised_keywords = df_temp['Search Keyword'].astype(str).str.lower().str.strip()
    corrections = normalised_keywords.map(resolved_map)
    needs_fix = corrections.notna().to_numpy()

    if needs_fix.any():
        # Plain arrays on both sides keep the assignment positional (no index alignment).
        df_temp.loc[needs_fix, 'Search Keyword'] = corrections[needs_fix].to_numpy()

    return df_temp


def _resolve_keyword(keyword, normalised_map):
    """Follow ``keyword`` through ``normalised_map`` until the result is no longer a key."""
    visited = {keyword}
    target = normalised_map[keyword]
    # The visited set guarantees termination for self-references and cycles in the mapping.
    while target in normalised_map and target not in visited:
        visited.add(target)
        target = normalised_map[target]
    return target

In [7]:
# Collect every scraped .json and .csv file, site by site (json files first, then csv).
glob_paths = []

for site in site_list:
    # os.path.join avoids the doubled separator that f'{scraped_data}/{site}' produced
    # (scraped_data already ends with '/'); the resulting '//' kept the substring test
    # 'scraped_data/LinkedIn/Data/...' on these paths from ever matching.
    site_data_dir = os.path.join(scraped_data, site, 'Data')
    for extension in ('json', 'csv'):
        # glob returns files in arbitrary order; sorting keeps the run reproducible.
        glob_paths.extend(sorted(glob.glob(os.path.join(site_data_dir, f'*.{extension}'))))

In [8]:
# 955 json and csv files
len(glob_paths)

955

In [9]:
# Fix list catches every scraped file that could not be parsed on the first attempt

fix_list = []

# Appended data catches all the fixed and cleaned dfs
appended_data = []


def _is_linkedin_export(glob_path):
    """Return True when ``glob_path`` is one of the LinkedIn ``linkedin_jobs_df_*`` exports."""
    # Collapse doubled (or OS-specific) separators so the substring test is reliable.
    normalised_path = os.path.normpath(glob_path).replace(os.sep, '/')
    return 'scraped_data/LinkedIn/Data/linkedin_jobs_df_' in normalised_path


def _read_scraped_file(glob_path, failed_paths):
    """Read one scraped ``.json`` / ``.csv`` file into a DataFrame.

    A path that fails to parse is recorded once in ``failed_paths``. LinkedIn exports get one
    repair attempt through ``fix_broken_linkedin_files`` before they are given up on.

    Returns the DataFrame, or ``None`` when the file could not be read.
    """
    if glob_path.endswith('.json'):
        try:
            return pd.read_json(glob_path).reset_index(drop=True)
        except ValueError:
            failed_paths.append(glob_path)

        if not _is_linkedin_export(glob_path):
            return None

        fix_broken_linkedin_files(glob_path)
        try:
            return pd.read_json(glob_path).reset_index(drop=True)
        except ValueError:
            return None

    if glob_path.endswith('.csv'):
        try:
            return pd.read_csv(glob_path).reset_index(drop=True)
        except ValueError:
            # pandas' EmptyDataError / ParserError are ValueError subclasses.
            failed_paths.append(glob_path)

    return None


def _clean_scraped_frame(df_scraped):
    """Fix the search keywords and drop derived / index-artefact columns of one frame."""
    df_scraped = fix_keywords(df_scraped).reset_index(drop=True)
    df_scraped = df_scraped.drop(columns=cols, errors='ignore')

    # Column labels are cast to str so non-string labels cannot break the pattern match.
    # NOTE(review): this removes every column whose name merely contains 'unnamed', 'index'
    # or 'level' -- confirm that no real data column matches.
    artefact_columns = df_scraped.columns[
        df_scraped.columns.astype(str).str.contains('unnamed|index|level', regex=True, case=False)
    ]
#     df_temp.drop(columns=df_temp.filter(regex=re.compile(r"^unnamed", re.IGNORECASE)).columns, axis='columns', inplace=True, errors='ignore')
    return df_scraped.drop(columns=artefact_columns)


for glob_path in glob_paths:

    loaded = _read_scraped_file(glob_path, fix_list)

    # Skip unreadable or empty files. Without this, the frame of the previous file would be
    # cleaned again, written over the current file and appended a second time.
    if not isinstance(loaded, pd.DataFrame) or len(loaded) == 0:
        continue

    # df_temp is only rebound for usable frames, so it always refers to the last file that was
    # actually processed.
    df_temp = _clean_scraped_frame(loaded)

    # The cleaned frame is written back over the source file.
    if glob_path.endswith('.json'):
        df_temp.to_json(glob_path, orient='records')
    elif glob_path.endswith('.csv'):
        df_temp.to_csv(glob_path, index=False)

    appended_data.append(df_temp.reset_index(drop=True))


In [10]:
# len = 527
len(appended_data)

527

In [11]:
# Persist the entries collected in fix_list (if any) as JSON for later inspection
if fix_list:
    print('Some keywords to fix!')
    fix_list_file = f'{data_dir}fix_list.txt'
    with open(fix_list_file, 'w') as fix_list_handle:
        fix_list_handle.write(json.dumps(fix_list))


In [12]:
# Stack all cleaned per-file frames into one frame with a fresh 0..n-1 index
df_jobs = pd.concat(appended_data, ignore_index=True)


In [13]:
# Save the combined raw data. The guard checks df_jobs itself (the frame being saved) rather
# than the last per-file frame of the loading loop, and tests the type before calling len().
if isinstance(df_jobs, pd.DataFrame) and len(df_jobs) > 0:
    # Written as .pkl because the reload cell of this notebook reads 'df_jobs_raw.pkl'.
    df_jobs.to_pickle(f'{data_dir}df_jobs_raw.pkl')

    df_jobs.to_csv(f'{data_dir}df_jobs_raw.csv', index=False)


### START HERE FROM FILE

In [14]:
import csv
import glob
import json
import os
import pickle
import re
import sys
import time
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import spacy
from googletrans import Translator
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.pipeline import Sentencizer

In [15]:
# MAIN DIR
# The project root can be overridden through the STUDY1_MAIN_DIR environment variable so the
# notebook is not tied to a single machine; the original location remains the default.
main_dir = os.environ.get('STUDY1_MAIN_DIR') or (
    '/Users/nyxinsane/Documents/Work - UvA/Automating Equity/Study 1/Study1_Code/'
)
# Every path below is built by plain string concatenation, so main_dir must end with a slash.
if not main_dir.endswith('/'):
    main_dir = f'{main_dir}/'

# scraping dir
scraped_data = f'{main_dir}scraped_data/'

# data dir
data_dir = f'{main_dir}data/'

# lang models dir
llm_path = f'{data_dir}Language Models'

# sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# columns
cols = [
    'Sector',
    'Sector Code',
    'Gender',
    'Age',
    'Language',
    'Dutch Requirement',
    'English Requirement',
    'Gender_Female',
    'Gender_Mixed',
    'Gender_Male',
    'Age_Older',
    'Age_Mixed',
    'Age_Younger',
    'Gender_Num',
    'Age_Num',
    '% Female',
    '% Male',
    '% Older',
    '% Younger',
]

# column names / constants referenced by later cells
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'

In [16]:
# Reload the raw job postings from disk and give them a fresh 0..n-1 index
raw_jobs_pickle = f'{data_dir}df_jobs_raw.pkl'
df_jobs = pd.read_pickle(raw_jobs_pickle)
df_jobs = df_jobs.reset_index(drop=True)


In [17]:
# len = 204394
len(df_jobs)

204394

In [18]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204394 entries, 0 to 204393
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Search Keyword     204394 non-null  object 
 1   Platform           204394 non-null  object 
 2   Job ID             204394 non-null  object 
 3   Job Title          204394 non-null  object 
 4   Company Name       204384 non-null  object 
 5   Location           204394 non-null  object 
 6   Job Description    204379 non-null  object 
 7   Rating             51158 non-null   float64
 8   Employment Type    203334 non-null  object 
 9   Company URL        193929 non-null  object 
 10  Job URL            204394 non-null  object 
 11  Job Age            204394 non-null  object 
 12  Job Age Number     204394 non-null  object 
 13  Collection Date    204394 non-null  object 
 14  Data Row           153230 non-null  float64
 15  Tracking ID        153230 non-null  object 
 16  In

In [19]:
df_jobs.columns

Index(['Search Keyword', 'Platform', 'Job ID', 'Job Title', 'Company Name',
       'Location', 'Job Description', 'Rating', 'Employment Type',
       'Company URL', 'Job URL', 'Job Age', 'Job Age Number',
       'Collection Date', 'Data Row', 'Tracking ID', 'Industry', 'Job Date',
       'Type of ownership'],
      dtype='object')

In [None]:
# One retry after a long pause, as before; named so the values are easy to tune.
LANGUAGE_DETECT_RETRIES = 1
LANGUAGE_DETECT_WAIT_SECONDS = 3600


def detect_language(text, translator, retries=LANGUAGE_DETECT_RETRIES,
                    wait_seconds=LANGUAGE_DETECT_WAIT_SECONDS):
    """Return the language code that ``translator.detect`` reports for ``text``.

    A failed call is retried up to ``retries`` times, sleeping ``wait_seconds`` before each
    retry. The error of the final attempt is re-raised.
    """
    for attempts_left in range(retries, -1, -1):
        try:
            return translator.detect(text).lang
        # `Exception` rather than a bare except: interrupting the kernel must stop the cell
        # instead of sending it to sleep.
        except Exception:
            if attempts_left == 0:
                raise
            time.sleep(wait_seconds)


translator = Translator()

descriptions = df_jobs['Job Description']

# Only real, non-blank strings are sent for detection. The previous check
# `len(str(value)) != 0` was always true, so missing values were submitted as the text 'nan'.
has_text = descriptions.apply(lambda value: isinstance(value, str) and value.strip() != '')
normalised_descriptions = descriptions[has_text].map(lambda value: value.lower().strip())

# Detect each distinct description once and reuse the answer for every row that shares it.
language_by_description = {}
for description in normalised_descriptions.unique():
    language_by_description[description] = detect_language(description, translator)

# Rows without a usable description get a missing 'Language' value.
df_jobs['Language'] = normalised_descriptions.map(language_by_description).reindex(df_jobs.index)

In [None]:
# translator = Translator()
# try:
#     df_jobs['Language'] = df_jobs['Job Description'].apply(lambda x: translator.detect(str(x).lower().strip()).lang)
# except:
#     time.sleep(3600)
#     df_jobs['Language'] = df_jobs['Job Description'].apply(lambda x: translator.detect(str(x).lower().strip()).lang)
    

In [None]:
df_jobs['Language'].value_counts()

In [None]:
# Save the language-tagged postings as both pickle and CSV, provided there are rows to save
translated_pickle = f'{data_dir}df_jobs_translated.pkl'
translated_csv = f'{data_dir}df_jobs_translated.csv'

frame_has_rows = len(df_jobs) > 0
if frame_has_rows and isinstance(df_jobs, pd.DataFrame):
    df_jobs.to_pickle(translated_pickle)
    df_jobs.to_csv(translated_csv, index=False)

In [None]:
# Clean columns: cast every column label to str and trim surrounding whitespace
df_jobs.columns = df_jobs.columns.map(lambda column: str(column).strip())


In [None]:
# Drop NA: remove rows, then columns, that hold no values at all
df_jobs = (
    df_jobs
    .dropna(axis='index', how='all')
    .dropna(axis='columns', how='all')
)

In [None]:
len(df_jobs)

In [None]:
# Columns that must hold real text (used by the placeholder filter further down)
subset_list = [int_variable, str_variable]

# Drop postings that repeat the same platform, job id, search keyword and description;
# the first occurrence is kept and the index is renumbered.
duplicate_key_columns = ['Platform', 'Job ID', 'Search Keyword', 'Job Description']

# df_jobs.drop_duplicates(keep='first', ignore_index=True, inplace=True)
df_jobs = df_jobs.drop_duplicates(
    subset=duplicate_key_columns,
    keep='first',
    ignore_index=True,
)


In [None]:
len(df_jobs)

In [None]:
# Placeholder values that mark a missing entry
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']

# Only string cells survive the filter below, so only the string placeholders can ever match.
# Comparing against those alone gives the same result and keeps the unhashable `[]` entry
# out of the membership test.
nan_strings = {value for value in nan_list if isinstance(value, str)}


def _is_real_text(value):
    """Return True for a string that is not one of the placeholder strings."""
    return isinstance(value, str) and value not in nan_strings


# NOTE(review): rows whose value is not a str (e.g. a 'Job ID' stored as a number) are dropped
# as well -- confirm that this is intended.
for variable in subset_list:
    # astype(bool) keeps the mask boolean even when the frame is already empty.
    df_jobs = df_jobs.loc[df_jobs[variable].apply(_is_real_text).astype(bool)]



In [None]:
len(df_jobs)

In [None]:
df_jobs['Job Description']

In [None]:
# Drop non-english job descriptions
# df_jobs.drop(df_jobs.index[df_jobs['Language'] == str(language)], axis='index', inplace=True, errors='ignore')


In [None]:
# 44
len(df_jobs)

In [None]:
# df_jobs = pd.read_pickle(f'{data_dir}df_jobs_dropped.pickle')


In [None]:
# Find and count unique search keywords.
# drop_duplicates keeps first-appearance order, so the list is identical on every run
# (list(set(...)) reorders strings from one kernel session to the next).
search_keywords = df_jobs['Search Keyword'].drop_duplicates().to_list()


In [None]:
len(search_keywords)


In [None]:
search_keywords


In [None]:
# Unique job descriptions in first-appearance order, so that positional look-ups such as
# job_sentences[0] return the same text on every run.
job_descriptions = df_jobs['Job Description'].drop_duplicates().to_list()

In [None]:
len(job_descriptions)

In [None]:
# job_descriptions

In [None]:
# Load NLTK: register the project-local data directory and download the corpora into it
nltk_path = f'{llm_path}/nltk'
nltk.data.path.append(nltk_path)

for nltk_resource in ('words', 'punkt', 'stopwords'):
    nltk.download(nltk_resource, download_dir=nltk_path)

In [None]:
# Load spaCy's rule-based sentence splitter with custom boundary markers
sentence_boundary_chars = [',,', ',,,', ',,,,']
sentencizer = Sentencizer(punct_chars=sentence_boundary_chars)


In [None]:
# Preview the sentence split for a handful of descriptions instead of printing every one.
SENTENCIZER_PREVIEW_SIZE = 5

# A blank English pipeline is only used to tokenise the text into a Doc.
blank_nlp = spacy.blank('en')

for job_description in job_descriptions[:SENTENCIZER_PREVIEW_SIZE]:
    # The sentencizer component has to be called on a Doc; calling it with no argument, as the
    # loop did before, raises a TypeError.
    doc = sentencizer(blank_nlp.make_doc(job_description))
    print([sentence.text for sentence in doc.sents])

In [None]:
# Flatten every job description into one list of sentences (NLTK sentence tokenizer)
job_sentences = [
    sentence
    for description in job_descriptions
    for sentence in sent_tokenize(description)
]

In [None]:
job_sentences[0].split('\n')



In [None]:
# Use spaCy's nlp pipeline for sentence tokenization

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline; the model package has to be installed in the
# kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')