# ATTN: This script should be run AFTER all embeddings are completed.


### START HERE IF SOURCING FROM DF_JOBS
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


# Descriptives and visualization


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the first folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list avoids an IndexError when
# the working directory is fewer than five levels deep, and `.name` gives the
# last path component regardless of the platform's path separator.
# `code_dir` stays None when no ancestor matches.
for ancestor in list(Path.cwd().parents)[:5]:

    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [None]:
# Main directory: the folder that contains the code directory
main_dir = f'{Path(code_dir).parents[0]}/'

# Code directory, rewritten with a trailing slash and added to the import path
code_dir = f'{code_dir}/'
sys.path.append(code_dir)

# Scraped data directory
scraped_data = f'{code_dir}scraped_data/'

# Data directory and the folders/paths derived from it
data_dir = f'{code_dir}data/'
df_save_dir = f'{data_dir}final dfs/'  # saved dataframes
llm_path = f'{data_dir}Language Models'  # language models (note: no trailing slash)
models_save_path = f'{data_dir}classification models/'  # classification models
table_save_path = f'{data_dir}output tables/'  # output tables
plot_save_path = f'{data_dir}plots/'  # plots

# Sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# Column names
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age', 'Language',
    'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male',
    '% Older', '% Younger',
]

# Frequently used column names and language settings
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'
languages = ["en", "['nl', 'en']", ['en', 'nl']]

# String-typed columns
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Values listed under the name `nan_list` (presumably treated as missing data
# by code outside this cell -- TODO confirm)
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']

# Regular expression stored under `pattern`; it matches newline runs, repeated
# commas or pipes, and zero-width positions after a lowercase letter + period
# followed by a capital, or before colon(s) followed by a capital.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import itertools
import unicodedata
import contextlib
import pandas as pd
import numpy as np
import multiprocessing
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import googletrans
from googletrans import Translator
from collections import defaultdict


In [None]:
# Function to print df gender and age info
def df_gender_age_info(
    df,
    ivs_all=None,
):
    """Print a dataframe summary plus per-column counts, percentages, mean and S.D.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to describe.
    ivs_all : iterable of str, optional
        Column names to report on. When omitted, the gender and age columns
        listed below are used.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    Columns missing from ``df`` are reported as ``'<name> not available.'``.
    Mean and standard deviation are printed only for columns where pandas can
    compute them; other columns get counts and percentages only.
    """
    if ivs_all is None:
        # Built per call so the default is never a shared mutable object.
        ivs_all = [
            'Gender',
            'Gender_Num',
            'Gender_Female',
            'Gender_Mixed',
            'Gender_Male',
            'Age',
            'Age_Num',
            'Age_Older',
            'Age_Mixed',
            'Age_Younger',
        ]

    # Print Info
    print('\nDF INFO:\n')
    df.info()

    for iv in ivs_all:
        column_name = f'{iv}'

        # Explicit membership test instead of a catch-all except, so that
        # unrelated errors are not misreported as a missing column.
        if column_name not in df.columns:
            print(f'{iv} not available.')
            continue

        column = df[column_name]
        counts = column.value_counts()
        percentages = column.value_counts(normalize=True).mul(100).round(1).astype(float)
        print('='*20)
        print(f'{iv}:')
        print('-'*20)
        print(f'{iv} Counts:\n{counts}')
        print('-'*20)
        print(f'{iv} Percentages:\n{percentages}')

        try:
            mean = round(float(column.mean()), 2)
            sd = round(float(column.std()), 2)
        except (TypeError, ValueError):
            # No numeric mean / S.D. for this column (e.g. text labels).
            continue

        print('-'*20)
        print(f'{iv} Mean: {mean}')
        print('-'*20)
        print(f'{iv} Standard Deviation: {sd}')

    print('\n')


In [None]:
# Load the pickled dataframe from the dataframe save directory, then renumber
# its rows from zero (the previous index is discarded).
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_trainning.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
# Plot styling: load the APA style sheet shipped in the code directory, then
# apply matplotlib's 'tableau-colorblind10' style.
mpl.style.use(f'{code_dir}/setup_module/apa.mplstyle-main/apa.mplstyle')
plt.style.use('tableau-colorblind10')

# Pandas display options, applied in the listed order.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 5000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
    ('display.float_format', '{:.3f}'.format),
):
    pd.set_option(option_name, option_value)


In [None]:
# All info
df_jobs.info()

# Gender and Age info by sentence (rows of df_jobs as loaded)
# NOTE: this previously referenced `df_manual`, which is never defined in this
# script; the dataframe loaded above is `df_jobs`.
print('='*30)
print('Gender and Age info at Sentence Level')
print('-'*30)
df_gender_age_info(df_jobs)

# Gender and Age info by job ad (first row of each 'Job ID' group)
print('='*30)
print('Gender and Age info at Job Advertisement Level')
print('-'*30)
df_gender_age_info(df_jobs.groupby(['Job ID']).first())


In [None]:
# Plotting Gender and Age
# Percentage of rows per category, transposed into a single-row frame so that
# each axis shows one stacked bar.
df_gender_transposed = df_jobs['Gender'].value_counts(normalize=True).mul(100).round(2).astype(float).to_frame().T
df_age_transposed = df_jobs['Age'].value_counts(normalize=True).mul(100).round(2).astype(float).to_frame().T

fig, axs = plt.subplots(1, 2)
fig.suptitle('Training Dataset: Gender and Age Sentence Percentages')

df_gender_transposed.plot(
    kind='bar', legend=True, stacked=True, ax=axs[0], color=['C5', 'C2', 'C0']
)
df_age_transposed.plot(
    kind='bar', legend=True, stacked=True, ax=axs[1], color=['C5', 'C2', 'C0']
)

for ax in axs:
    for container in ax.containers:
        # One label per bar: zero-height bars get an empty label rather than
        # being dropped, so labels always line up with the bars they describe.
        labels = [
            f'{height:.1f}%' if (height := bar.get_height()) > 0 else ''
            for bar in container
        ]
        ax.bar_label(container, labels=labels, label_type='center', color='white')
    # The legend only needs to be placed once per axis.
    ax.legend(loc='upper right', fontsize=8)

# Save the figure in each output format.
for save_format in ['eps', 'png']:
    fig.savefig(
        f'{plot_save_path}Collected Gender and Age Sentences.{save_format}',
        format=save_format, dpi=3000, bbox_inches='tight'
    )


# ATTN: This script should be run AFTER all visualizations are completed.


### START HERE IF SOURCING FROM DF_JOBS_FOR_TRAINNING
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


# Make descriptive tables


In [None]:
import os
import sys
import importlib
from pathlib import Path
import numpy as np

# Reference to the module object this code is running in.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Look through (at most) the five nearest ancestors of the working directory
# for the first folder whose name contains `code_dir_name` but not
# `unwanted_subdir_name`. Slicing the ancestor list avoids an IndexError when
# the working directory is fewer than five levels deep, and `.name` gives the
# last path component regardless of the platform's path separator.
# `code_dir` stays None when no ancestor matches.
for ancestor in list(Path.cwd().parents)[:5]:

    if (code_dir_name in ancestor.name) and (unwanted_subdir_name not in ancestor.name):

        code_dir = str(ancestor)
        break

# %load_ext autoreload
# %autoreload 2


In [None]:
# Main directory: the folder that contains the code directory
main_dir = f'{Path(code_dir).parents[0]}/'

# Code directory, rewritten with a trailing slash and added to the import path
code_dir = f'{code_dir}/'
sys.path.append(code_dir)

# Scraped data directory
scraped_data = f'{code_dir}scraped_data/'

# Data directory and the folders/paths derived from it
data_dir = f'{code_dir}data/'
df_save_dir = f'{data_dir}final dfs/'  # saved dataframes
llm_path = f'{data_dir}Language Models'  # language models (note: no trailing slash)
models_save_path = f'{data_dir}classification models/'  # classification models
table_save_path = f'{data_dir}output tables/'  # output tables
plot_save_path = f'{data_dir}plots/'  # plots

# Sites
site_list = ['Indeed', 'Glassdoor', 'LinkedIn']

# Column names
cols = [
    'Sector', 'Sector Code',
    'Gender', 'Age', 'Language',
    'Dutch Requirement', 'English Requirement',
    'Gender_Female', 'Gender_Mixed', 'Gender_Male',
    'Age_Older', 'Age_Mixed', 'Age_Younger',
    'Gender_Num', 'Age_Num',
    '% Female', '% Male',
    '% Older', '% Younger',
]

# Frequently used column names and language settings
int_variable: str = 'Job ID'
str_variable: str = 'Job Description'
gender: str = 'Gender'
age: str = 'Age'
language: str = 'en'
languages = ["en", "['nl', 'en']", ['en', 'nl']]

# String-typed columns
str_cols = [
    'Search Keyword',
    'Platform',
    'Job ID',
    'Job Title',
    'Company Name',
    'Location',
    'Job Description',
    'Company URL',
    'Job URL',
    'Tracking ID',
]

# Values listed under the name `nan_list` (presumably treated as missing data
# by code outside this cell -- TODO confirm)
nan_list = [None, 'None', '', ' ', [], -1, '-1', 0, '0', 'nan', np.nan, 'Nan']

# Regular expression stored under `pattern`; it matches newline runs, repeated
# commas or pipes, and zero-width positions after a lowercase letter + period
# followed by a capital, or before colon(s) followed by a capital.
pattern = r'[\n]+|[,]{2,}|[|]{2,}|[\n\r]+|(?<=[a-z]\.)(?=\s*[A-Z])|(?=\:+[A-Z])'


In [None]:
import string
import re
import time
import json
import csv
import glob
import pickle
import random
import itertools
import unicodedata
import contextlib
import pandas as pd
import numpy as np
import multiprocessing
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import googletrans
from googletrans import Translator
from collections import defaultdict


In [None]:
# Function to make descriptives tables
def make_descriptives_table(df, v, level):
    """Build a descriptives table (n, %, M, S.D.) for one variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a 'Job ID' column, the column ``v`` and, for 'Gender' / 'Age',
        the matching dummy columns (e.g. 'Gender_Female', 'Gender_Male',
        'Gender_Mixed').
    v : str
        Variable to describe: 'Gender', 'Age', 'Warmth' or 'Competence'.
    level : str
        'Job' (first row of every 'Job ID' group) or 'Sentence' (all rows);
        matched after ``str.title()``.

    Returns
    -------
    pandas.DataFrame
        One row per category of ``v``, indexed by a label under the index name
        'Sectors', with columns 'n', '%', 'M' and 'S.D.'.

    Raises
    ------
    ValueError
        If ``level`` or ``v`` is not one of the supported values.

    Notes
    -----
    NOTE(review): the previous code could not produce 'M' / 'S.D.' (it referred
    to an undefined name and indexed the frame with the wrong object). This
    version reports the mean and standard deviation of each category's
    indicator: the dummy column for 'Gender' / 'Age', and ``v == category`` for
    'Warmth' / 'Competence'. Confirm this is the intended statistic.
    """
    gender_order = ['Female', 'Male', 'Mixed Gender']
    age_order = ['Older', 'Younger', 'Mixed Age']
    ivs_dict = {'Gender': gender_order, 'Age': age_order}

    binary_order = [0, 1]
    dvs_dict = {'Warmth': binary_order, 'Competence': binary_order}

    if level.title() == 'Job':
        level_df = df.groupby(['Job ID']).first()
    elif level.title() == 'Sentence':
        level_df = df
    else:
        raise ValueError(f'Specified level {level} not in data.')

    if v in ivs_dict:
        categories = ivs_dict[v]
        # 'Female' -> 'Female-dominated'; 'Mixed Gender' -> 'Mixed-Gender'
        index = [
            f'{v_cat}-dominated' if 'Mixed' not in v_cat else '-'.join(v_cat.split())
            for v_cat in categories
        ]
        # 'Mixed Gender' -> dummy column 'Gender_Mixed'
        indicators = [level_df[f'{v}_{v_cat.split()[0]}'] for v_cat in categories]
    elif v in dvs_dict:
        categories = dvs_dict[v]
        index = [f'{v_cat}' for v_cat in categories]
        indicators = [(level_df[v] == v_cat).astype(int) for v_cat in categories]
    else:
        raise ValueError(f'Specified variable {v} not supported.')

    # Computed once; categories absent from the data count as zero.
    counts = level_df[v].value_counts()
    percentages = level_df[v].value_counts(normalize=True).mul(100).round(2).astype(float)

    desc_dict = {
        'Sectors': index,
        'n': [counts.get(v_cat, 0) for v_cat in categories],
        '%': [percentages.get(v_cat, 0.0) for v_cat in categories],
        'M': [round(float(indicator.mean()), 2) for indicator in indicators],
        'S.D.': [round(float(indicator.std()), 2) for indicator in indicators],
    }

    # Make DF from dict
    df_desc = pd.DataFrame(desc_dict)
    df_desc.set_index('Sectors', inplace=True)

    return df_desc


In [None]:
def make_multiindex_cols(data_type = 'Collected', iv = 'Gender', data_structure = 'Job Advertisements'):
    """Return the column tuples for a descriptives table's MultiIndex header.

    Parameters
    ----------
    data_type : str
        Data label; title-cased and used in the top-level header
        '<Data Type> Job Advertisements' (Gender tables only).
    iv : str
        'Gender' or 'Age' (matched after ``str.title()``); the header level
        '<iv> Groups' uses ``iv`` exactly as passed.
    data_structure : str
        Unit of analysis, e.g. 'Job Advertisements' or 'Sentences'; title-cased.

    Returns
    -------
    list of tuple
        One tuple per statistic ('n', '%', 'M', 'S.D.'). Gender tuples have
        four levels; Age tuples have three because they omit the top-level
        header. NOTE(review): this asymmetry is kept from the existing
        behaviour -- confirm it is intended.

    Raises
    ------
    ValueError
        If ``iv`` is neither 'Gender' nor 'Age' (previously this surfaced as an
        unbound-local error on return).
    """
    statistics = ('n', '%', 'M', 'S.D.')
    iv_kind = iv.title()

    if iv_kind == 'Gender':
        header = (
            f'{data_type.title()} Job Advertisements',
            f'{iv} Groups',
            data_structure.title(),
        )
    elif iv_kind == 'Age':
        header = (f'{iv} Groups', data_structure.title())
    else:
        raise ValueError(f'Unsupported iv {iv!r}; expected "Gender" or "Age".')

    cols = [(*header, statistic) for statistic in statistics]

    return cols


In [None]:
# Load the pickled dataframe from the dataframe save directory, then renumber
# its rows from zero (the previous index is discarded).
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_trainning.pkl')
df_jobs = df_jobs.reset_index(drop=True)


In [None]:
# Descriptives tables for Warmth and Competence: job level first, then
# sentence level (same call order as writing the four calls out by hand).
df_desc_warmth_job, df_desc_comp_job = [
    make_descriptives_table(df_jobs, dv, 'Job') for dv in ('Warmth', 'Competence')
]
df_desc_warmth_sent, df_desc_comp_sent = [
    make_descriptives_table(df_jobs, dv, 'Sentence') for dv in ('Warmth', 'Competence')
]


In [None]:
# Descriptives tables for Gender and Age: job level first, then sentence
# level (same call order as writing the four calls out by hand).
df_desc_gender_job, df_desc_age_job = [
    make_descriptives_table(df_jobs, iv, 'Job') for iv in ('Gender', 'Age')
]
df_desc_gender_sent, df_desc_age_sent = [
    make_descriptives_table(df_jobs, iv, 'Sentence') for iv in ('Gender', 'Age')
]


In [None]:
# Replace each table's flat column names with a MultiIndex header built by
# make_multiindex_cols for its (variable, unit of analysis) pair.
for _table, _iv, _structure in (
    (df_desc_gender_job, 'Gender', 'Job Advertisements'),
    (df_desc_gender_sent, 'Gender', 'Sentences'),
    (df_desc_age_job, 'Age', 'Job Advertisements'),
    (df_desc_age_sent, 'Age', 'Sentences'),
):
    _header = make_multiindex_cols(
        data_type='Collected', iv=_iv, data_structure=_structure
    )
    _table.columns = pd.MultiIndex.from_tuples(_header)


In [None]:
# Join the job-level and sentence-level tables side by side on their shared
# row index, once for Gender and once for Age.
df_desc_gender = df_desc_gender_job.merge(
    df_desc_gender_sent, left_index=True, right_index=True
)
df_desc_age = df_desc_age_job.merge(
    df_desc_age_sent, left_index=True, right_index=True
)


In [None]:
# Save Tables
# Each table (Gender first, then Age) is written as CSV, pickle, LaTeX and
# Markdown under the same base file name in the output tables directory.
# NOTE(review): the CSV and LaTeX exports use index=False, so the row labels
# held in the index are not written, while the Markdown export keeps them --
# confirm this is intended.
# NOTE(review): the LaTeX caption and label refer to sectors rather than to
# these descriptives and look carried over from another table -- confirm.
for _label, _table in (('Gender', df_desc_gender), ('Age', df_desc_age)):
    _base_path = f'{table_save_path}{_label} - Collected Job Advertisement Descriptives'

    _table.to_csv(f'{_base_path}.csv', index=False)
    _table.to_pickle(f'{_base_path}.pkl')

    with pd.option_context('max_colwidth', 10_000_000_000):
        _table.to_latex(
            f'{_base_path}.tex',
            index=False,
            longtable=True,
            escape=True,
            multicolumn=True,
            multicolumn_format='c',
            position='H',
            caption='Sectoral Gender and Age Composition and Segregation, Keywords, Counts, and Percentages',
            label='Jobs Count per Sector (x 1000)',
        )

    _table.to_markdown(f'{_base_path}.md', index=True)
