In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

if code_dir_name not in str(Path.cwd()).split('/')[-1]:
    for _ in range(5):

        parent_path = str(Path.cwd().parents[_]).split('/')[-1]

        if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

            code_dir = str(Path.cwd().parents[_])

            if code_dir is not None:
                break
else:
    code_dir = str(Path.cwd())
sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *


Using MPS


0it [00:00, ?it/s]

<Figure size 640x480 with 0 Axes>

In [3]:
# Function to remove stopwords
def remove_code(keywords_lst: list, keyword_clean_lst=None) -> list:

    if keyword_clean_lst is None:
        keyword_clean_lst = []

    for s in keywords_lst:
        lst = s.split()
        for i in lst:
            if len(i) <= 2:
                lst.remove(i)
            keyword_clean_lst.append(' '.join(lst))

    # map(lambda n: n * 2, [1, 2, 3, 4, 5])
    return [x for x in keyword_clean_lst if x != '']


In [4]:
# Function to to check file exists and not empty
def is_non_zero_file(fpath):

    return (os.path.isfile(fpath) and os.path.getsize(fpath) > 0)


In [5]:
# Function to validate path or file
def validate_path(file: str) -> str:

    if not os.path.isdir(file) and is_non_zero_file(file) is False:
        # file = input(f'No file found at {file}.\nPlease enter correct path.')
        try:
            print(f'File {file} not found.')
        except Exception as e:
            print(e.json())
    return file


In [6]:
# Function to clean keyword list
def clean_and_translate_keyword_list(
    keywords_lst: list,
    translate_enabled = None
) -> list:

    if translate_enabled is None:
        translate_enabled = False
    elif translate_enabled is True:
        translator = Translator()

    assert all(isinstance(i, str) for i in keywords_lst), 'Keywords must be strings.'

    # Collect all and and comma containing keywords
    and_comma = [i for i in keywords_lst if (',' in i) or ('and' in i)]

    # Remove ands and commas and append to keywords
    if len(and_comma) > 0:
        for i in and_comma:
            for x in re.split('and|,', i.strip().lower()):
                keywords_lst.append(x.strip().lower())

        # Remove duplicates
        keywords_lst = list(set(keywords_lst) ^ set(and_comma))

    else:
        keywords_lst = list(set(keywords_lst))

    # # Remove codes
    keywords_lst = remove_code(keywords_lst)

    # Singularize and remove duplicates
    keywords_list = list(
        set(
            list(
                map(
                    lambda line: (Word(line.lower()).singularize()).lower(),
                    keywords_lst,
                )
            )
        )
    )

    # Remove all non-specific keywords
    for i in keywords_list:
        if 'other ' in i.lower() and i.lower() not in ['other business support', 'other service activities']:
            keywords_list.append(i.lower().split('other')[1])
            keywords_list.remove(i)
        if ' (excl.' in i.lower():
            keywords_list.append(i.lower().split(' (excl.')[0].lower())
            keywords_list.remove(i)
        if '_(excl' in i.lower():
            keywords_list.append(i.lower().split('_(excl')[0].lower())
            keywords_list.remove(i)
    for i in keywords_list:
        if ' (' in i.lower():
            keywords_list.append(i.lower().split(' (')[0].lower())
            keywords_list.remove(i)
        if "-Noon's" in i.lower():
            keywords_list.append(i.lower().split('-Noon')[0].lower())
            keywords_list.remove(i)
        if len(i) <= 2:
            keywords_list.remove(i)
    for i in keywords_list:
        for w_keyword, r_keyword in keyword_trans_dict.items():
            if str(i.lower()) == w_keyword.lower():
                keywords_list.remove(i)
                keywords_list.append(r_keyword)

    # Remove duplicates
    keywords_list = list(filter(None, list(set(keywords_list))))

    # Translate to Dutch
    if translate_enabled is True:
        for english_keyword in keywords_list:
            while True:
                try:
                    dutch_keyword = translator.translate(english_keyword).text
                except Exception as e:
                    time.sleep(0.3)
                    continue
                break
            keywords_list.append(dutch_keyword.lower())

        # Remove duplicates
        keywords_list = list(filter(None, list(set(keywords_list))))

    return list(
        filter(None, list({i.lower().strip() for i in keywords_list if i}))
    )


In [7]:
# %% 1. CBS Data request
def get_cbs_odata(
    cbs_path: str = None,
    sectors_file_path = None,
    table_url: str = None,
    table_id: str = None,
    year: str = None,
    addition_url: str = None,
    select: list = None,
    odata_colnames_normalized: dict = None
):
    if cbs_path is None:
        cbs_path: str = validate_path(f'{code_dir}/1. Scraping/CBS/')
    if sectors_file_path is None:
        sectors_file_path: str = validate_path(f'{cbs_path}Found Data/')
    if table_url is None:
        table_url='https://opendata.cbs.nl/ODataAPI/OData/'
    if table_id is None:
        table_id='81434ENG'
    if year is None:
        year='2020'
    if addition_url is None:
        addition_url='/UntypedDataSet'
    if select is None:
        select=['SexOfEmployee', 'TypeOfEmploymentContract', 'OtherCharacteristicsEmployee', 'IndustryClassBranchSIC2008', 'Periods', 'Jobs_1']
    if odata_colnames_normalized is None:
        odata_colnames_normalized = {'IndustryClassBranchSIC2008': 'Industry class / branch (SIC2008)', 'SexOfEmployee': 'Sex of employee', 'OtherCharacteristicsEmployee': 'Other characteristics employee', 'Jobs_1': 'Employment/Jobs (x 1 000)', 'TypeOfEmploymentContract': 'Type of employment contract'}
    # data: https://opendata.cbs.nl/#/CBS/en/dataset/81434ENG/table?ts=1663627369191
    # instruction: https://data.overheid.nl/dataset/410-bevolking-op-eerste-van-de-maand--geslacht--leeftijd--migratieachtergrond
    # github: https://github.com/statistiekcbs/CBS-Open-Data-v4

    tables = cbsodata.get_table_list()
    for table in tables:
        if table['Identifier'] == table_id:
            data_info = table
    info = cbsodata.get_info(table_id)
    diffs = list(set(info.keys()) - set(data_info.keys()))
    for i in diffs:
        data_info[i] = info[i]

    with open(f'{sectors_file_path}cbs_data_info.json', 'w', encoding='utf8') as f:
        json.dump(data_info, f)

    dimensions = defaultdict(dict)
    for sel in select:
        if sel != 'Jobs_1':
            meta_data = pd.DataFrame(cbsodata.get_meta(table_id, sel))
        if sel == 'TypeOfEmploymentContract':
            meta_data = meta_data.loc[~meta_data['Title'].str.contains('Type of employment contract:')]
        if sel == 'OtherCharacteristicsEmployee':
            meta_data = meta_data.loc[~meta_data['Key'].str.contains('NAT')]
        if sel == 'Periods':
            meta_data = meta_data.loc[meta_data['Title'].astype(str) == year]

        for title, key in zip(meta_data['Title'].tolist(), meta_data['Key'].tolist()):
            if sel != 'Jobs_1':
                dimensions[sel][title] = key
    with open(f'{sectors_file_path}cbs_data_dimensions.json', 'w', encoding='utf8') as f:
        json.dump(dimensions, f)

    while True:
        try:
            data = pd.DataFrame(cbsodata.get_data(table_id, select=select))
            break
        except ConnectionError:
            time.sleep(5)

    data = data.loc[~data['TypeOfEmploymentContract'].str.contains('Type of employment contract:') & ~data['OtherCharacteristicsEmployee'].str.contains('Nationality:') & data['Periods'].str.contains(str(year))]
    data = data.rename(columns=odata_colnames_normalized)

    data.to_csv(f'{sectors_file_path}{year}_Gender x Age_CBS_DATA_from_code.csv')
    data.to_csv(f'{sectors_file_path}Sectors Tables/FINAL/{year}_Gender x Age_CBS_DATA_from_code.csv')
    data.to_pickle(f'{sectors_file_path}{year}_Gender x Age_CBS_DATA_from_code.pkl')
    data.to_pickle(f'{sectors_file_path}Sectors Tables/FINAL/{year}_Gender x Age_CBS_DATA_from_code.pkl')

    # target_url = table_url + table_id + addition_url

    # data = pd.DataFrame()
    # while target_url:
    #     r = requests.get(target_url).json()
    #     data = data.append(pd.DataFrame(r['value']))

    #     if '@odata.nextLink' in r:
    #         target_url = r['@odata.nextLink']
    #     else:
    #         target_url = None

    return data


In [8]:
# Function to rearrgane gender and age columns
def get_only_df(df_sectors, col_name, opp_col_name):
    df_only = df_sectors.pivot_table(values='n', index=['Code', 'Sector Name', opp_col_name], columns=[col_name], aggfunc='sum')
    df_only = df_only.reset_index(drop=False)
    df_only = df_only.loc[df_only[opp_col_name] == 'Total']
    df_only = df_only.drop(columns=[opp_col_name, 'Total'])
    df_only.name = col_name

    return df_only


In [9]:
def save_sector_excel(
    df_sectors_all,
    file_save_path,
    sheet_name=None,
    age_limit=None,
    startrow=None,
    startcol=None,
    year: str = None,
):
    if sheet_name is None:
        sheet_name = 'All'
    if age_limit is None:
        age_limit = 45
    if startrow is None:
        startrow = 4 # rows 1 - 3 reserved for titles
    if startcol is None:
        startcol = 1
    if year is None:
        year = '2020'

    # Define last rows and cols locs
    header_range = len(df_sectors_all.columns.levels)
    endrow = startrow + header_range + df_sectors_all.shape[0]
    endcol = startcol + df_sectors_all.shape[1]

    # Write
    writer = pd.ExcelWriter(f'{file_save_path}.xlsx')
    df_sectors_all.to_excel(writer, sheet_name=sheet_name, merge_cells=True, startrow=startrow, startcol=startcol)
    workbook  = writer.book
    worksheet = writer.sheets[sheet_name]
    worksheet.set_row(startrow + header_range, None, None, {'hidden': True}) # hide the empty row that appears after the headers
    worksheet.set_column(startrow, 1, None, None, {'hidden': True}) # hide the index column

    # Title and others, len=startrow+1 (1=empty startrow)
    worksheet.merge_range(1, startcol, 1, endcol, 'Table', workbook.add_format({'bold': True, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'left', 'top': True, 'bottom': True, 'left': False, 'right': False}))
    worksheet.merge_range(2, startcol, 2, endcol, f'Sectoral Gender and Age Composition and Segregation, Keywords, Counts, and Percentages ({year})', workbook.add_format({'italic': True, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'left', 'top': True, 'bottom': True, 'left': False, 'right': False}))
    worksheet.merge_range(3, startcol, 3, endcol, 'Jobs Count per Sector (x 1000)', workbook.add_format({'bold': False, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'top': True, 'bottom': True, 'left': False, 'right': False}))

    # Format column headers
    col_width_dict = {
        'n': 5.5,
        'Code': 4.5,
        'Sector Name': 28.5,
        'Keywords': 30,
        'Keywords n': 13.5,
        '% per Sector': 12,
        '% per Social Category': 19.5,
        '% per Workforce': 15.5,
        'Dominant Category': 24.5,
        '% Sector per Workforce': 21.5
    }
    for i, (col_num, col_value) in tqdm_product(range(header_range), (enumerate(df_sectors_all.columns.values))):
        row_to_write = startrow + i
        col_to_write = startcol + 1 + col_num # 1 is for index
        header_formats = {'bold': False, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'top': True, 'bottom': True, 'left': False, 'right': False, 'text_wrap': True}

        if col_value[i] == 'n':
            header_formats |= {'italic': True}
        if col_value[i] == 'Keywords n':
            header_formats |= {'italic': True}
        for col_name, col_width in col_width_dict.items():
            if col_value[i] == col_name:
                worksheet.set_column(col_to_write, col_to_write, col_width)

        worksheet.write(row_to_write, col_to_write, col_value[i], workbook.add_format(header_formats))

    # Format body
    perc = [col_num for col_num, value in enumerate(df_sectors_all.columns.values) if '%' in value[-1]]
    num = [col_num for col_num, value in enumerate(df_sectors_all.columns.values) if value[-1] in ['n' or 'Keywords n']]
    word = [col_num for col_num, value in enumerate(df_sectors_all.columns.values) if value[-1] in ['Code', 'Sector Name', 'Dominant Category']]
    keyword = [col_num for col_num, value in enumerate(df_sectors_all.columns.values) if value[-1] == 'Keywords']

    row_idx, col_idx = df_sectors_all.shape
    for c, r in tqdm_product(range(col_idx), range(row_idx)):
        row_to_write = startrow + header_range + 1 + r # 1 is for the hidden empty column under the header
        col_to_write = startcol + 1 + c # 1 is for index
        body_formats = {'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'left': False, 'right': False}

        if r == row_idx-1:
            body_formats |= {'top': True, 'bottom': True}
        if c in perc:
            body_formats |= {'num_format': '0.0', 'align': 'center'}
        elif c in word or c in keyword:
            body_formats |= {'align': 'left', 'text_wrap': True}
        elif c in num:
            body_formats |= {'num_format': '0', 'align': 'center'}

        try:
            worksheet.write(row_to_write, col_to_write, df_sectors_all.iloc[r, c], workbook.add_format(body_formats))
        except TypeError:
            value = (
                str(df_sectors_all.iloc[r, c])
                if isinstance(df_sectors_all.iloc[r, c], list)
                else ''
            )
            worksheet.write(row_to_write, col_to_write, value, workbook.add_format(body_formats))

    # Add Note
    note_format = {'italic': True, 'font_name': 'Times New Roman', 'font_size': 10, 'font_color': 'black', 'align': 'left', 'left': False, 'right': False}
    worksheet.merge_range(endrow + 1 , startcol, endrow + 1, endcol, 'Note.', workbook.add_format(note_format))
    # Add gender thresholds
    worksheet.merge_range(endrow + 2 , startcol, endrow + 2, endcol, f'Threshold for gender = {df_sectors_all.loc[df_sectors_all.index[-1], ("Gender", "Female", "% per Workforce")]:.2f}% ± 20%', workbook.add_format(note_format))
    # Add age thresholds
    worksheet.merge_range(endrow + 3 , startcol, endrow + 3, endcol, f'Threshold for age = {df_sectors_all.loc[df_sectors_all.index[-1], ("Age", f"Older (>= {age_limit} years)", "% per Workforce")]:.2f}% ± 10%', workbook.add_format(note_format))
    # Add source
    note_format['font_size'] = 8
    worksheet.merge_range(endrow + 4 , startcol, endrow + 4, endcol, 'Source: Centraal Bureau voor de Statistiek (CBS)', workbook.add_format(note_format))

    writer.close()


In [10]:
# Function to get sector df from cbs
def get_sector_df_from_cbs(
    save_enabled: bool = True,
    parent_dir: str = f'{scraped_data}CBS/',
    cols: list = None,
    select: list = None,
    get_cbs_odata_enabled: bool = False,
    odata_colnames_normalized: dict = None,
    year: str = None,
    age_limit: int = None,
    age_ratio: int = None,
    gender_ratio: int = None,
):
    if cols is None:
        cols = ['Industry class / branch (SIC2008)', 'Sex of employee', 'Other characteristics employee', 'Employment/Jobs (x 1 000)']
    if select is None:
        select = ['SexOfEmployee', 'TypeOfEmploymentContract', 'OtherCharacteristicsEmployee', 'IndustryClassBranchSIC2008', 'Periods', 'Jobs_1']
    if odata_colnames_normalized is None:
        odata_colnames_normalized = {'IndustryClassBranchSIC2008': 'Industry class / branch (SIC2008)', 'SexOfEmployee': 'Sex of employee', 'OtherCharacteristicsEmployee': 'Other characteristics employee', 'Jobs_1': 'Employment/Jobs (x 1 000)', 'TypeOfEmploymentContract': 'Type of employment contract'}
    if year is None:
        year = '2020'
    if age_limit is None:
        age_limit = 45
    if age_ratio is None:
        age_ratio = 10
    if gender_ratio is None:
        gender_ratio = 15

    sectors_file_path: str = validate_path(f'{parent_dir}Found Data/')
    data_save_dir1: str = validate_path(f'{parent_dir}Data/')
    data_save_dir2: str = table_save_path

    with open(f'{parent_dir}/Found Data/keywords_dict_manual.json', 'r', encoding='utf8') as f:
        keywords_dict_manual = json.load(f)

    if get_cbs_odata_enabled is True:
        df_sectors = get_cbs_odata(year=year)
        df_sectors = df_sectors.rename(columns=odata_colnames_normalized)
    elif get_cbs_odata_enabled is False:
        # Read, clean, create code variable
        df_sectors_paths = [
            '_Gender x Age_CBS_DATA_from_code', '_Gender x Age_CBS_DATA_from_code'
        ]
        file_types = ['pkl', 'csv']
        for df_sectors_path, file_type in itertools.product(df_sectors_paths, file_types):
            try:
                if file_type == 'pkl':
                    df_sectors = pd.read_pickle(f'{sectors_file_path}{year}{df_sectors_path}.pkl')
                elif file_type == 'csv':
                    df_sectors = pd.read_csv(f'{sectors_file_path}{year}{df_sectors_path}.csv', delimiter=',')
                df_sectors = df_sectors.rename(columns=odata_colnames_normalized)
                df_sectors.to_pickle(f'{sectors_file_path}{year}{df_sectors_path}.pkl')
                break
            except Exception as e:
                print(f'{e}\nFile not found. Getting data from CBS OData API.')
                get_cbs_odata_enabled = True
                df_sectors = get_cbs_odata(year=year)
                df_sectors = df_sectors.rename(columns=odata_colnames_normalized)

    df_sectors = df_sectors[cols]
    df_sectors = df_sectors.rename({'Sex of employee': 'Gender', 'Other characteristics employee': 'Age Range (in years)', 'Industry class / branch (SIC2008)': 'Sector Name', 'Employment/Jobs (x 1 000)': 'n'}, axis = 1)
    df_sectors.insert(0, 'Code', df_sectors['Sector Name'].progress_apply(lambda row: row[0]))
    df_sectors['Sector Name'] = df_sectors['Sector Name'].progress_apply(lambda row: row[2:].strip() if '-' not in row else row[3:].strip())

    # Categorize by age label
    all_age = df_sectors['Age Range (in years)'].unique().tolist()[1:]
    for i, word in enumerate(all_age):
        if word.startswith(str(age_limit)):
            young = all_age[:i]
            old = all_age[i:]
    conditions = [
        (df_sectors['Age Range (in years)'].isin(old)),
        (df_sectors['Age Range (in years)'].isin(young))
    ]
    choices = [f'Older (>= {age_limit} years)', f'Younger (< {age_limit} years)']
    age_cat = np.select(conditions, choices, default='Total')
    df_sectors.insert(3, 'Age', age_cat)
    choices.append('Total')

    # Change gender label
    df_sectors['Gender'] = df_sectors['Gender'].replace({'Sex: Female': 'Female', 'Sex: Male': 'Male'})

    # Rearrgane columns
    # Gender
    df_gender_only = get_only_df(df_sectors, 'Gender', 'Age')

    # Age
    df_age_only = get_only_df(df_sectors, 'Age', 'Gender')

    # Total
    df_total_only = df_sectors.pivot_table(values='n', index=['Code', 'Sector Name', 'Gender', 'Age'], aggfunc='sum')
    df_total_only = df_total_only.reset_index(drop=False)
    df_total_only = df_total_only.loc[(df_total_only['Gender'] == 'Total') & (df_total_only['Age'] == 'Total')]
    df_total_only = df_total_only.drop(columns=['Gender', 'Age'])
    df_total_only = df_total_only.rename(columns={'n': 'Total Workforce'})
    df_total_only.name = 'Total'

    # Merge all
    df_sectors_all = pd.merge(pd.merge(df_gender_only, df_age_only, how='outer'), df_total_only, how='outer')

    # Take out "All economic activities" row
    au = df_sectors_all.loc[df_sectors_all['Sector Name'] == 'All economic activities']
    au.loc[au['Code'] != 'A-U', 'Code'] = 'A-U'
    df_sectors_all = df_sectors_all[df_sectors_all['Sector Name'] != 'All economic activities']
    df_sectors_all = df_sectors_all.groupby(['Code'], as_index=True).agg({'Sector Name': 'first', **dict.fromkeys(df_sectors_all.loc[:, ~df_sectors_all.columns.isin(['Code', 'Sector Name'])].columns.to_list(), 'sum')})
    df_sectors_all = df_sectors_all.reset_index(drop=False)

    # Add keywords
    df_sectors_all.insert(2, 'Keywords', df_sectors_all['Code'].progress_apply(lambda row: keywords_dict_manual[row]['Used_Sector_Keywords'] if row in keywords_dict_manual and isinstance(row, str) else np.nan))
    df_sectors_all['Keywords'] = df_sectors_all['Keywords'].progress_apply(lambda row: clean_and_translate_keyword_list(row) if isinstance(row, list) else np.nan)
    df_sectors_all.insert(
        3,
        'Keywords n',
        df_sectors_all['Keywords'].progress_apply(
            lambda row: len(row) if isinstance(row, list) else np.nan
        ),
    )

    # Add totals in bottom row
    df_sectors_all.loc[df_sectors_all[df_sectors_all['Sector Name'] == 'Other service activities'].index.values.astype(int)[0]+1, 'Sector Name'] = 'Total (excluding A-U)'
    df_sectors_all.iloc[df_sectors_all[df_sectors_all['Sector Name'] == 'Total (excluding A-U)'].index.values.astype(int)[0], ~df_sectors_all.columns.isin(['Code', 'Sector Name', 'Keywords'])] = df_sectors_all.sum(numeric_only=True)
    df_sectors_all.columns = pd.MultiIndex.from_tuples([('Industry class / branch (SIC2008)', 'Code'), ('Industry class / branch (SIC2008)', 'Sector Name'), ('Industry class / branch (SIC2008)', 'Keywords'), ('Industry class / branch (SIC2008)', 'Keywords n'), ('Female', 'n'), ('Male', 'n'), (f'Older (>= {age_limit} years)', 'n'), (f'Younger (< {age_limit} years)', 'n'), ('Total Workforce', 'n')], names = ['Social category', 'Counts'])

    # Make percentages
    for index, row in df_sectors_all.items():
        if ('Total' not in index[0]) and ('Industry' not in index[0]) and ('%' not in index[1]) and ('n' in index[1]) and (not isinstance(row[0], str)) and (not math.isnan(row[0])):
            df_sectors_all[(index[0], '% per Sector')] = row/df_sectors_all[('Total Workforce', 'n')]*100
            df_sectors_all[(index[0], '% per Social Category')] = row/df_sectors_all.loc[df_sectors_all[df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')] == 'Total (excluding A-U)'].index.values.astype(int)[0], index]*100
            df_sectors_all[(index[0], '% per Workforce')] = row/df_sectors_all.loc[df_sectors_all[df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')] == 'Total (excluding A-U)'].index.values.astype(int)[0], ('Total Workforce', 'n')]*100
        if ('Total' in index[0]):
            df_sectors_all[(index[0], '% Sector per Workforce')] = row/df_sectors_all.loc[df_sectors_all[df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')] == 'Total (excluding A-U)'].index.values.astype(int)[0], ('Total Workforce', 'n')]*100

    # Set cut-off
    condition_exclude_total = df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')].astype(str) != 'Total (excluding A-U)'
    # Gender
    total_female = df_sectors_all.loc[df_sectors_all[df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')] == 'Total (excluding A-U)'].index.values.astype(int)[0], ('Female', '% per Workforce')]
    # female_dominated = total_female + (gender_ratio / 100)
    female_dominated = total_female + gender_ratio
    df_sectors_all.loc[df_sectors_all[('Female', '% per Sector')] >= female_dominated, ('Sectoral Gender Segregation', 'Dominant Category')] = 'Female'
    # male_dominated = total_female - (gender_ratio / 100)
    male_dominated = total_female - gender_ratio
    df_sectors_all.loc[df_sectors_all[('Female', '% per Sector')] <= male_dominated, ('Sectoral Gender Segregation', 'Dominant Category')] = 'Male'
    condition_male = df_sectors_all[('Female', '% per Sector')] > male_dominated
    condition_female = df_sectors_all[('Female', '% per Sector')] < female_dominated
    df_sectors_all.loc[(condition_male) & (condition_female) & (condition_exclude_total), ('Sectoral Gender Segregation', 'Dominant Category')] = 'Mixed Gender'
    # Age
    total_old = df_sectors_all.loc[df_sectors_all[df_sectors_all[('Industry class / branch (SIC2008)', 'Sector Name')] == 'Total (excluding A-U)'].index.values.astype(int)[0], (f'Older (>= {age_limit} years)', '% per Workforce')]
    # old_dominated = total_old + (age_ratio / 100)
    old_dominated = total_old + age_ratio
    df_sectors_all.loc[df_sectors_all[(f'Older (>= {age_limit} years)', '% per Sector')] >= old_dominated, ('Sectoral Age Segregation', 'Dominant Category')] = 'Older'
    # young_dominated = total_old - (age_ratio / 100)
    young_dominated = total_old - age_ratio
    df_sectors_all.loc[df_sectors_all[(f'Older (>= {age_limit} years)', '% per Sector')] <= young_dominated, ('Sectoral Age Segregation', 'Dominant Category')] = 'Younger'
    condition_old = df_sectors_all[(f'Older (>= {age_limit} years)', '% per Sector')] < old_dominated
    condition_young = df_sectors_all[(f'Older (>= {age_limit} years)', '% per Sector')] > young_dominated
    df_sectors_all.loc[(condition_old) & (condition_young) & (condition_exclude_total), ('Sectoral Age Segregation', 'Dominant Category')] = 'Mixed Age'

    # Add AU and other rows
    au.insert(2, 'Keywords', np.nan)
    au.insert(3, 'Keywords n', np.nan)
    au[['Sectoral Gender Segregation', 'Sectoral Age Segregation']] = np.nan
    au.columns = pd.MultiIndex.from_tuples([col for col in df_sectors_all.columns if '%' not in col[1]])
    df_sectors_all = pd.concat([au, df_sectors_all], ignore_index=True)

    # Arrange columns
    df_sectors_all = df_sectors_all.reindex(columns=df_sectors_all.columns.reindex(['Industry class / branch (SIC2008)', 'Female', 'Male', 'Sectoral Gender Segregation', f'Older (>= {age_limit} years)', f'Younger (< {age_limit} years)', 'Sectoral Age Segregation', 'Total Workforce'], level=0)[0])
    df_sectors_all = df_sectors_all.reindex(columns=df_sectors_all.columns.reindex(['Code', 'Sector Name', 'Keywords', 'Keywords n', 'n', '% per Sector', '% per Social Category', '% per Workforce', '% Sector per Workforce', 'Dominant Category'], level=1)[0])

    print('+'*30)
    print(f'Thresholds:\ngender ratio ={gender_ratio}%\nage ratio:{age_ratio}%')
    print('+'*30)
    print(f'Sector is female-domainate if it comprises {female_dominated:.1f}% females or more.')
    print('-'*10)
    print(f'Sector is male-domainate if it comprises {male_dominated:.1f}% females or less.')
    print('-'*10)
    print(f'Sector is older worker-domainate if it comprises {old_dominated:.1f}% older workers or more.')
    print('-'*10)
    print(f'Sector is younger worker-domainate if it comprises {young_dominated:.1f}% older workers or less.')
    print('*'*30)

    level1_cols_tuple = []
    for col in df_sectors_all.columns:
        if ('SIC2008' in col[0]):
            level1_cols_tuple.append(('SBI Sector Titles', *col))
        elif (re.search(r'[Mm]ale', col[0])) or ('Gender' in col[0]):
            level1_cols_tuple.append(('Gender', *col))
        elif ('45' in col[0]) or ('Age' in col[0]):
            level1_cols_tuple.append(('Age', *col))
        elif ('Total' in col[0]):
            level1_cols_tuple.append(('Total Workforce', *col))

    df_sectors_all.columns = pd.MultiIndex.from_tuples(level1_cols_tuple, names=['Variables', 'Categories', 'Counts'])

    if save_enabled:
        for data_save_dir in [data_save_dir1, data_save_dir2]:
            if not os.path.exists(data_save_dir):
                os.makedirs(data_save_dir)
            # File save path
            file_save_path = f'{data_save_dir}Sectors Output from script'
            # CSV
            df_sectors_all.to_csv(f'{file_save_path}.csv', index=False)
            # PKL
            df_sectors_all.to_pickle(f'{file_save_path}.pkl')
            # TEX
            with pd.option_context('max_colwidth', 10000000000):
                df_sectors_all.style.to_latex(
                    f'{file_save_path}.tex',
                    convert_css=True,
                    environment='longtable',
                    hrules=True,
                    # escape=True,
                    # multicolumn=True,
                    multicol_align='c',
                    position='H',
                    caption='Sectoral Gender and Age Composition and Segregation, Keywords, Counts, and Percentages', label='Jobs Count per Sector (x 1000)'
                )
            # MD
            df_sectors_all.to_markdown(f'{file_save_path}.md', index=True)
            # EXCEL
            save_sector_excel(df_sectors_all, file_save_path)

    return df_sectors_all


In [11]:
# Get the table for Sector Composition + used keyowrds + classification of dominant caterory
df_sectors_all = get_sector_df_from_cbs(year=2020)

progress-bar:   0%|          | 0/1218 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/1218 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/19 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/19 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/19 [00:00<?, ?it/s]

++++++++++++++++++++++++++++++
Thresholds:
gender ratio =15%
age ratio:10%
++++++++++++++++++++++++++++++
Sector is female-domainate if it comprises 62.6% females or more.
----------
Sector is male-domainate if it comprises 32.6% females or less.
----------
Sector is older worker-domainate if it comprises 52.1% older workers or more.
----------
Sector is younger worker-domainate if it comprises 32.1% older workers or less.
******************************


0it [00:00, ?it/s]

  0%|          | 0/504 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/504 [00:00<?, ?it/s]

In [12]:
df_sectors_all

Variables,SBI Sector Titles,SBI Sector Titles,SBI Sector Titles,SBI Sector Titles,Gender,Gender,Gender,Gender,Gender,Gender,Gender,Gender,Gender,Age,Age,Age,Age,Age,Age,Age,Age,Age,Total Workforce,Total Workforce
Categories,Industry class / branch (SIC2008),Industry class / branch (SIC2008),Industry class / branch (SIC2008),Industry class / branch (SIC2008),Female,Female,Female,Female,Male,Male,Male,Male,Sectoral Gender Segregation,Older (>= 45 years),Older (>= 45 years),Older (>= 45 years),Older (>= 45 years),Younger (< 45 years),Younger (< 45 years),Younger (< 45 years),Younger (< 45 years),Sectoral Age Segregation,Total Workforce,Total Workforce
Counts,Code,Sector Name,Keywords,Keywords n,n,% per Sector,% per Social Category,% per Workforce,n,% per Sector,% per Social Category,% per Workforce,Dominant Category,n,% per Sector,% per Social Category,% per Workforce,n,% per Sector,% per Social Category,% per Workforce,Dominant Category,n,% Sector per Workforce
0,A-U,All economic activities,,,4029.0,,,,4362.0,,,,,3500.0,,,,4892.0,,,,,8391.0,
1,A,Agriculture and industry,"[agriculture, fishing, forestry]",3.0,310.0,22.16,2.58,1.23,1089.0,77.84,8.22,4.31,Male,690.0,49.32,6.49,2.73,708.0,50.61,4.84,2.8,Mixed Age,1399.0,5.54
2,B,"Industry (no construction), energy","[quarrying, mining, quarry]",3.0,423.0,20.92,3.52,1.67,1599.0,79.08,12.07,6.33,Male,1066.0,52.72,10.03,4.22,954.0,47.18,6.52,3.78,Older,2022.0,8.0
3,C,Manufacturing,"[production manager, industry, supply, manufac...",7.0,174.0,22.72,1.45,0.69,592.0,77.28,4.47,2.34,Male,413.0,53.92,3.88,1.63,354.0,46.21,2.42,1.4,Older,766.0,3.03
4,D,Energy supply,"[energy supply, energy, electricity]",3.0,8.0,27.59,0.07,0.03,21.0,72.41,0.16,0.08,Male,15.0,51.72,0.14,0.06,13.0,44.83,0.09,0.05,Mixed Age,29.0,0.11
5,E,Water supply and waste management,"[water stock, water supply, water, waste manag...",4.0,7.0,19.44,0.06,0.03,29.0,80.56,0.22,0.11,Male,21.0,58.33,0.2,0.08,16.0,44.44,0.11,0.06,Older,36.0,0.14
6,F,Construction,"[build, construction]",2.0,42.0,12.5,0.35,0.17,294.0,87.5,2.22,1.16,Male,160.0,47.62,1.51,0.63,178.0,52.98,1.22,0.7,Mixed Age,336.0,1.33
7,G,Commercial services,"[marketing manager, selling, wholesaling, buyi...",11.0,3421.0,43.13,28.47,13.54,4510.0,56.87,34.04,17.85,Mixed Gender,2704.0,34.09,25.44,10.7,5228.0,65.92,35.73,20.69,Mixed Age,7931.0,31.39
8,H,Transportation and storage,"[logistics manager, logistics, storage, transp...",5.0,95.0,24.61,0.79,0.38,291.0,75.39,2.2,1.15,Male,205.0,53.11,1.93,0.81,181.0,46.89,1.24,0.72,Older,386.0,1.53
9,I,Accommodation and food serving,"[hotel, accommodation, food service, service, ...",7.0,199.0,50.25,1.66,0.79,196.0,49.49,1.48,0.78,Mixed Gender,75.0,18.94,0.71,0.3,320.0,80.81,2.19,1.27,Younger,396.0,1.57


In [13]:
# %% Function to get sbi_sectors_dict
def get_sbi_sectors_list(
    save_enabled=None,
    parent_dir=None,
    ):

    if save_enabled is None:
        save_enabled=True
    if parent_dir is None:
        parent_dir=f'{scraped_data}CBS/'

    sib_5_loc = validate_path(f'{parent_dir}Found Data/SBI_ALL_NACE_REV2.csv')
    keywords_dict_manual_path = validate_path(f'{parent_dir}Found Data/keywords_dict_manual.json')
    data_save_dir = validate_path(f'{parent_dir}Data/')

    df_sbi_sectors = pd.read_csv(sib_5_loc, delimiter=',')
    df_sbi_sectors.columns = df_sbi_sectors.columns.str.strip()
    df_sbi_sectors = df_sbi_sectors.rename(columns = {'Description': 'Old_Sector_Name'})
    df_sbi_sectors = df_sbi_sectors.dropna(subset=['Old_Sector_Name', 'Code'])
    df_sbi_sectors['Old_Sector_Name'] = df_sbi_sectors['Old_Sector_Name'].progress_apply(lambda x: x.lower().strip())
    df_sbi_sectors = df_sbi_sectors.loc[df_sbi_sectors['Level'] == 1]
    df_sbi_sectors = df_sbi_sectors.drop(columns=['Level', 'Parent', 'This item includes', 'This item also includes', 'Rulings', 'This item excludes', 'Reference to ISIC Rev. 4'])

    df_sectors_all = pd.read_pickle(f'{data_save_dir}Sectors Output from script.pkl')[[('SBI Sector Titles'), ('Gender'), ('Age')]].droplevel('Categories', axis='columns')[[('SBI Sector Titles', 'Code'), ('SBI Sector Titles', 'Sector Name'), ('SBI Sector Titles', 'Keywords'), ('Gender', 'Dominant Category'), ('Age', 'Dominant Category')]].droplevel('Variables', axis='columns')
    df_sectors_all.columns = ['Code', 'Sector Name', 'Keywords', 'Gender Dominant Category', 'Age Dominant Category']
    df_sbi_sectors = df_sbi_sectors.merge(df_sectors_all, how='inner', on='Code')
    df_sbi_sectors = df_sbi_sectors.rename(columns = {'Sector Name': 'Sector_Name', 'Keywords': 'Used_Sector_Keywords', 'Gender Dominant Category': 'Gender_Dominant_Category', 'Age Dominant Category': 'Age_Dominant_Category'})
    df_sbi_sectors['Sector_Name'] = df_sbi_sectors['Sector_Name'].progress_apply(lambda x: x.strip().lower() if isinstance(x, str) else np.nan)
    df_sbi_sectors['Used_Sector_Keywords'] = df_sbi_sectors['Used_Sector_Keywords'].progress_apply(lambda x: clean_and_translate_keyword_list(x) if isinstance(x, list) else np.nan)
    df_sbi_sectors = df_sbi_sectors.set_index(df_sbi_sectors['Code'])

    df_sbi_sectors.to_csv(f'{data_save_dir}SBI-5_Sectors.csv', index=True)
    df_sbi_sectors.to_excel(f'{data_save_dir}SBI-5_Sectors.xlsx', index=True)
    df_sbi_sectors.to_pickle(f'{data_save_dir}SBI-5_Sectors.pkl')

    sbi_english_keyword_list = [i for index, row in df_sbi_sectors['Used_Sector_Keywords'].items() if isinstance(row, list) for i in row]
    sbi_english_keyword_list = clean_and_translate_keyword_list(sbi_english_keyword_list)

    sbi_english_keyword_dict = df_sbi_sectors['Used_Sector_Keywords'].to_dict()
    sbi_sectors_dict = df_sbi_sectors.to_dict('index')
    sbi_sectors_dict_full = {}
    sbi_sectors_dom_gen = {}
    sbi_sectors_dom_age = {}
    sbi_sectors_keywords_gen_dom = defaultdict(list)
    sbi_sectors_keywords_age_dom = defaultdict(list)
    sbi_sectors_keywords_full_dom = defaultdict(list)
    for index, row in df_sbi_sectors.iterrows():
        sbi_sectors_dict_full[row['Sector_Name']] = row['Used_Sector_Keywords']
        sbi_sectors_dom_gen[row['Sector_Name']] = row['Gender_Dominant_Category']
        sbi_sectors_dom_age[row['Sector_Name']] = row['Age_Dominant_Category']
    for cat_keywords in df_sbi_sectors[['Gender_Dominant_Category', 'Used_Sector_Keywords']].to_dict(orient='split')['data']:
        sbi_sectors_keywords_gen_dom[cat_keywords[0]].extend(cat_keywords[1])
    for cat_keywords in df_sbi_sectors[['Age_Dominant_Category', 'Used_Sector_Keywords']].to_dict(orient='split')['data']:
        sbi_sectors_keywords_age_dom[cat_keywords[0]].extend(cat_keywords[1])
    for d in (sbi_sectors_keywords_gen_dom, sbi_sectors_keywords_age_dom):
        sbi_sectors_keywords_full_dom |= d

    if save_enabled is True:
        with open(f'{data_save_dir}sbi_english_keyword_list.txt', 'w', encoding='utf8') as f:
            for i in sbi_english_keyword_list:
                f.write(f'{i.lower()}\n')
        with open(f'{data_save_dir}sbi_english_keyword_dict.json', 'w', encoding='utf8') as f:
            json.dump(sbi_english_keyword_dict, f)
        with open(f'{data_save_dir}sbi_sectors_dict.json', 'w', encoding='utf8') as f:
            json.dump(sbi_sectors_dict, f)

        with open(f'{data_save_dir}sbi_sectors_keywords_gen_dom.json', 'w', encoding='utf8') as f:
            json.dump(sbi_sectors_keywords_gen_dom, f)
        with open(f'{data_save_dir}sbi_sectors_keywords_age_dom.json', 'w', encoding='utf8') as f:
            json.dump(sbi_sectors_keywords_age_dom, f)
        with open(f'{data_save_dir}sbi_sectors_keywords_full_dom.json.json', 'w', encoding='utf8') as f:
            json.dump(sbi_sectors_keywords_full_dom, f)

    return (
        sbi_english_keyword_list,
        sbi_english_keyword_dict,
        sbi_sectors_dict,
        sbi_sectors_dict_full,
        sbi_sectors_dom_gen,
        sbi_sectors_dom_age,
        sbi_sectors_keywords_gen_dom,
        sbi_sectors_keywords_age_dom,
        sbi_sectors_keywords_full_dom
    )


In [14]:
# Get the list of keywords for the SBI sectors
(
    sbi_english_keyword_list,
    sbi_english_keyword_dict,
    sbi_sectors_dict,
    sbi_sectors_dict_full,
    sbi_sectors_dom_gen,
    sbi_sectors_dom_age,
    sbi_sectors_keywords_gen_dom,
    sbi_sectors_keywords_age_dom,
    sbi_sectors_keywords_full_dom
) = get_sbi_sectors_list()


progress-bar:   0%|          | 0/996 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/19 [00:00<?, ?it/s]

progress-bar:   0%|          | 0/19 [00:00<?, ?it/s]