# Demographic analysis

This notebook contains the analysis of the EU Labour Force Survey (EU LFS) microdata. It is included in the repository for the sake of completeness, but we are not able to share the input microdata, so these particular results cannot be independently replicated.

In [1]:
%run ../notebook_preamble.ipy

In [2]:
import os
import collections
from scipy.stats import chi2_contingency

In [3]:
def assign_risk_cat2(some_row):
    '''
    Map one row's quartile labels to an automation risk category.

    The row is read by key: 'risk_q' always, 'bottleneck_q' only when
    'risk_q' is 3 or 0.

    Categories:
    0 - High risk: risk_q == 3 and bottleneck_q == 0
    1 - Low risk:  risk_q == 0 and bottleneck_q == 3
    2 - Other:     every remaining combination
    '''
    risk_quartile = some_row['risk_q']

    if risk_quartile == 3 and some_row['bottleneck_q'] == 0:
        return 0
    if risk_quartile == 0 and some_row['bottleneck_q'] == 3:
        return 1
    return 2

In [4]:
base_dir = useful_paths.codebase_dir
eurostat_dir = 'n/a' # Can't share the microdata
eurostat_output_dir = ''

# 1. Generate automation risk estimates at ISCO 3-digit and 4-digit level

In [5]:
sml = pd.read_csv(os.path.join(base_dir, 'data/interim/automation_analysis', 'agg_job_task_scores_esco_revised_w_no_phys_bott.csv'))

In [6]:
sml.head(3)

Unnamed: 0,O*NET-SOC Code,Title,id,esco_occupation,isco_code,weighted_risk,high_risk,low_risk,weight_enablers,weight_bottlenecks,weight_phys_bottlenecks,bottleneck_q,risk_q,risk_cat,risk_cat_label,not_phys_weighted_risk,not_phys_bottlenecks
0,11-1011.00,chief executives,1104,member of parliament,1111.0,3.49985,0.563389,0.436611,0.962482,0.356627,0.163575,0,2,2,Other,3.548024,0.193053
1,11-1011.00,chief executives,1111,secretary general,1112.0,3.49985,0.563389,0.436611,0.962482,0.356627,0.163575,0,2,2,Other,3.548024,0.193053
2,11-1011.00,chief executives,1288,chief executive officer,1120.0,3.49985,0.563389,0.436611,0.962482,0.356627,0.163575,0,2,2,Other,3.548024,0.193053


In [8]:
sml['isco_code'] = sml['isco_code'].astype(int).astype(str)

In [9]:
sml['isco_3d'] = sml['isco_code'].apply(lambda x: x[:3])

## 1.1 Calculate risk using a combination of weighted risk and proportion of bottlenecks

In [10]:
isco_3d_risk = sml.groupby('isco_3d')[['weighted_risk', 'weight_bottlenecks']].mean()

In [11]:
isco_3d_risk['risk_q'] = pd.qcut(isco_3d_risk['weighted_risk'], 4, labels=False)
isco_3d_risk['bottleneck_q'] = pd.qcut(isco_3d_risk['weight_bottlenecks'], 4, labels=False)

In [12]:
isco_3d_risk['risk_cat'] = isco_3d_risk.apply(assign_risk_cat2, axis =1)

In [13]:
isco_3d_risk.head(3)

Unnamed: 0_level_0,weighted_risk,weight_bottlenecks,risk_q,bottleneck_q,risk_cat
isco_3d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
111,3.501665,0.35899,2,0,2
112,3.486389,0.295306,2,0,2
121,3.464577,0.404742,1,0,2


In [14]:
# Lookup from 3-digit ISCO code (the index of isco_3d_risk) to its risk category.
# Values are read from the per-row Series that iterrows() yields.
isco_risk_cat = {
    isco_code: risk_row['risk_cat']
    for isco_code, risk_row in isco_3d_risk.iterrows()
}

In [15]:
#isco_risk_cat['931']

In [16]:
isco_risk_cat['224'] = 2 #appears to be unmapped to O*NET occupations and thus missing from earlier analysis

In [17]:
#with open(os.path.join(framework_dir, 'isco_risk_cat'), 'wb') as f:
#    pickle.dump(isco_risk_cat, f)

In [18]:
#Same as above but for 4-d ISCO
isco_4d_risk = sml.groupby('isco_code')[['weighted_risk', 'weight_bottlenecks']].mean()

In [19]:
isco_4d_risk['risk_q'] = pd.qcut(isco_4d_risk['weighted_risk'], 4, labels=False)
isco_4d_risk['bottleneck_q'] = pd.qcut(isco_4d_risk['weight_bottlenecks'], 4, labels=False)

In [20]:
isco_4d_risk['risk_cat'] = isco_4d_risk.apply(assign_risk_cat2, axis =1)

## 1.2 Save risk estimates at 3-digit and 4-digit ISCO group level; add group titles

In [21]:
isco_titles = pd.read_excel(os.path.join('lookups', 'struct08.xls'))

In [22]:
isco_titles['isco_code'] = isco_titles['isco_code'].astype(str)

In [23]:
# ISCO title lookup: code (as string) -> title; a later duplicate code overwrites an earlier one
isco_titles_dict = dict(zip(isco_titles['isco_code'], isco_titles['title']))

In [24]:
# Add minor group title (e.g. mg_title)
isco_3d_risk['mg_title'] = isco_3d_risk.index.map(lambda x: isco_titles_dict[x])

In [25]:
# View 3-d ISCO occupations in high risk category
#isco_3d_risk[isco_3d_risk['risk_cat'] == 0]

In [28]:
# Add unit group title (e.g. ug_title)
isco_4d_risk['ug_title'] = isco_4d_risk.index.map(lambda x: isco_titles_dict[x])

In [29]:
# View 4-d ISCO occupations in high risk category
#isco_4d_risk[isco_4d_risk['risk_cat'] == 0]

In [33]:
isco_4d_risk.to_csv(os.path.join(base_dir, 'data/interim/automation_analysis', 'isco_4d_risk.csv'))

In [34]:
isco_3d_risk.to_csv(os.path.join(base_dir, 'data/interim/automation_analysis', 'isco_3d_risk.csv'))

# 2. Analyse Eurostat data

## 2.1 Clean LFS files and study breakdowns

In [36]:
def filter_df(input_df):
    '''
    Clean one EU LFS extract and return a new, independent dataframe.

    Steps, in order:
    1. Duplicates: drop rows repeating the same combination of HHNUM, HHSEQNUM,
       QUARTER and REM (the first occurrence is kept).
    2. Columns: keep only the variables used in the analysis.
    3. ISCO3D: drop rows with no ISCO3D (this will remove those younger than 15),
       then recode ISCO3D first as int, then as str.
    4. Excluded codes: drop '999' (this filters out inactive and unemployed),
       '200', '230' and '330', and every code with fewer than 3 characters.
       NOTE(review): why '200', '230' and '330' are excluded is not stated in
       the notebook -- presumably they have no risk estimate; confirm.
    5. AGE: keep rows with AGE <= 77.
       NOTE(review): the previous description said people older than 75 are
       removed and that the 75-79 band is shown in EU LFS as age 77; with
       `<= 77` that band is retained -- confirm which is intended.

    The input frame is not modified. The result keeps the original row labels
    and is an explicit copy, so callers can add columns to it without writing
    into (or being warned about) a slice of another dataframe.
    '''
    kept_columns = ['HHNUM', 'AGE', 'SEX', 'REGION',
                    'ILOSTAT', 'STAPRO', 'NACE1D', 'ISCO3D', 'HATLEV1D',
                    'INCDECIL', 'NATIONAL', 'FTPT', 'TEMP', 'TEMPDUR', 'COEFF']
    excluded_codes = ['999', '200', '330', '230']

    #drop duplicates
    deduplicated = input_df.drop_duplicates(subset = ['HHNUM', 'HHSEQNUM','QUARTER', 'REM'])

    #choose subset of columns and remove missing ISCO3D; copy so the recode below
    #assigns into a frame of its own rather than into a derived selection
    with_isco = deduplicated[kept_columns].dropna(subset = ['ISCO3D']).copy()

    #ensure that ISCO3D is a string (int first, so that e.g. 251.0 becomes '251')
    with_isco['ISCO3D'] = with_isco['ISCO3D'].astype(int).astype(str)

    #one boolean mask for all row filters: excluded codes, short codes, age cut-off
    keep_row = (
        ~with_isco['ISCO3D'].isin(excluded_codes)
        & (with_isco['ISCO3D'].str.len() >= 3)
        & (with_isco['AGE'] <= 77)
    )

    return with_isco[keep_row].copy()

In [37]:
def demo_breakdown(input_df, groupby_var, risk_var, agg_var):
    '''
    Break a dataframe down by a demographic variable and by risk category.

    agg_var is summed within every (groupby_var, risk_var) pair and within
    every groupby_var group.

    Returns a tuple (counts, shares), in that order:
    counts - summed agg_var per (groupby_var, risk_var) pair
             (employment in thousands when agg_var is the LFS weight)
    shares - the same sums as a percentage (0-100) of the total for the
             corresponding groupby_var value
    '''
    totals_per_group = input_df.groupby(groupby_var)[agg_var].sum()
    counts = input_df.groupby([groupby_var, risk_var])[agg_var].sum()
    shares = counts / totals_per_group * 100
    return counts, shares

In [38]:
def pivot_breakdown(input_df, vals, index_var, column_var, thousands=False):
    '''
    Produce contingency table for chi-square analysis.

    input_df   - Series or dataframe whose index levels / columns include
                 index_var, column_var and vals (e.g. the counts returned by
                 demo_breakdown); it is not modified.
    vals       - name of the column holding the cell values
    index_var  - name of the variable that becomes the table rows
    column_var - name of the variable that becomes the table columns
    thousands  - if True, vals is multiplied by 1000 before the integer cast
                 (use when vals is expressed in thousands)

    Both vals and column_var are cast to int; for vals this truncates any
    fractional part rather than rounding it. Cells are aggregated with
    pivot_table's default (mean), and combinations that do not occur in
    input_df come out as NaN.
    '''
    table_input = input_df.reset_index()
    if thousands:
        table_input[vals] = table_input[vals]*1000
    # cast the columns named by the arguments (previously hard-coded to 'RISK_CAT'/'COEFF')
    table_input[[column_var, vals]] = table_input[[column_var, vals]].astype(int)
    pivot_df = pd.pivot_table(table_input, values=vals, index=[index_var],
                              columns=[column_var])
    return pivot_df

In [39]:
# Bring it all together for all years

# Country being processed. The LFS input folder and the output-file prefix belong
# together and must be switched as a pair; pairs that appear in this notebook:
#   'UK_YEAR_1998_onwards' -> 'uk'
#   'IT_YEAR_1998_onwards' -> 'ita'
#   'FR_YEAR_1998_onwards' -> 'fra'
lfs_country_folder = 'IT_YEAR_1998_onwards'
output_prefix = 'ita'
lfs_country_dir = os.path.join(eurostat_dir, 'Yearly', lfs_country_folder)

#Generate list of files in folder. os.listdir returns names in arbitrary order, so
#sort them: the result dicts below are filled in file order, and that order becomes
#the column order of the combined tables built from them later on.
file_list = sorted(os.listdir(lfs_country_dir))

#Define demographic variables to explore
demographic_vars = ['SEX', 'AGE', 'REGION', 'NACE1D', 'HATLEV1D','INCDECIL', 'NATIONAL', 'FTPT', 'TEMP']

#Define output dicts for storing results (keyed by year, then by demographic variable)
breakdown_props = collections.defaultdict(dict)
breakdown_counts = collections.defaultdict(dict)
chi_square_res = collections.defaultdict(dict)
df_chunks = []

#Iterate over files in the folder
for file in file_list:
    #Characters 3-6 of the file name are used as the year label
    year = file[2:6]

    print(year)

    year_file = pd.read_csv(os.path.join(lfs_country_dir, file))
    filtered_df = filter_df(year_file)
    #Every ISCO3D code left after filtering must be a key of isco_risk_cat (KeyError otherwise)
    filtered_df['RISK_CAT'] = filtered_df['ISCO3D'].apply(lambda x: isco_risk_cat[x])
    filtered_df['YEAR'] = year

    df_chunks.append(filtered_df)

    for demographic_var in demographic_vars:
        print(demographic_var)
        breakdown_count, breakdown_prop = demo_breakdown(filtered_df, demographic_var, 'RISK_CAT', 'COEFF')
        breakdown_counts[year][demographic_var] = breakdown_count
        breakdown_props[year][demographic_var] = breakdown_prop

        #Contingency table with the counts rescaled from thousands (thousands=True)
        breakdown_p = pivot_breakdown(breakdown_count, 'COEFF', demographic_var, 'RISK_CAT', True)
        #This is condition for chi-square test, which requires at least 5 observations in each group
        #WARNING: given that numbers are in thousands, the threshold could have been 0.005, so technically
        #we could have run chi-square on more demographic breakdowns than we did for the report
        check_condition = (breakdown_p >=5).all(axis = 1).all()
        print(check_condition)

        if check_condition:
            stat, p, dof, expected = chi2_contingency(breakdown_p.values)

        else:
            #Test not run for this variable: record NaN for both statistic and p-value
            stat = np.nan
            p = np.nan

        print(stat,p)
        chi_square_res[year][demographic_var] = (stat, p)

all_years = pd.concat(df_chunks)

#Save outputs (file names are '<output_prefix>_..._r_KK.pkl')
with open(os.path.join(eurostat_output_dir, output_prefix + '_breakdown_props_r_KK.pkl'), 'wb') as f:
    pickle.dump(breakdown_props, f)

with open(os.path.join(eurostat_output_dir, output_prefix + '_chi_square_res_r_KK.pkl'), 'wb') as f:
    pickle.dump(chi_square_res, f)

all_years.to_pickle(os.path.join(eurostat_output_dir, output_prefix + '_all_years_r_KK.pkl'))

#Optional extra output (previously present, commented out, for the French files only):
# with open(os.path.join(eurostat_output_dir, output_prefix + '_breakdown_counts_r_KK.pkl'), 'wb') as f:
#     pickle.dump(breakdown_counts, f)

## 2.2 Proportion of workers by risk category by all demographic variables at a national level

In [142]:
#Combine the yearly breakdowns (as proportions) into one table per demographic variable,
#one column per year in the order the years are stored in breakdown_props
#NOTE(review): the file prefix is hard-coded to 'fra' (other prefixes used: 'uk', 'ita');
#confirm it matches the country whose results are currently held in breakdown_props
for demographic_var in demographic_vars:
    chunks = [yearly_tables[demographic_var] for yearly_tables in breakdown_props.values()]

    combined_chunks = pd.concat(chunks, axis =1)
    combined_chunks.columns = breakdown_props.keys()

    df_name = demographic_var.lower()
    df_file_name = 'fra_combined_breakdown'+'_'+df_name+'_r'+'.'+'csv'

    print(df_file_name)

    combined_chunks.to_csv(os.path.join(eurostat_output_dir, df_file_name))

fra_combined_breakdown_sex_r.csv
fra_combined_breakdown_age_r.csv
fra_combined_breakdown_region_r.csv
fra_combined_breakdown_nace1d_r.csv
fra_combined_breakdown_hatlev1d_r.csv
fra_combined_breakdown_incdecil_r.csv
fra_combined_breakdown_national_r.csv
fra_combined_breakdown_ftpt_r.csv
fra_combined_breakdown_temp_r.csv


## 2.3 Employment (thousands) by 3-digit ISCO occupations at national level

In [49]:
#National breakdown by 3d ISCO
country_file = 'fra_all_years_r.pkl'#'ita_all_years_r.pkl' #'uk_all_years_r.pkl'
country = pd.read_pickle(os.path.join(eurostat_output_dir, 'all_years_lfs', country_file))

In [50]:
country.head(3)

Unnamed: 0,HHNUM,AGE,SEX,REGION,ILOSTAT,STAPRO,NACE1D,ISCO3D,HATLEV1D,INCDECIL,NATIONAL,FTPT,TEMP,TEMPDUR,COEFF,RISK_CAT,YEAR
0,368040,32,1,10,1,3.0,J,251,H,,001-EU15,1,2,4.0,0.0,2.0,2014
1,368041,22,2,10,1,3.0,I,524,H,,000-OWN COUNTRY,2,2,1.0,0.0,0.0,2014
4,368044,17,1,10,1,0.0,P,235,M,99.0,000-OWN COUNTRY,2,9,9.0,0.0,2.0,2014


In [46]:
# country[country.YEAR=='2015'].COEFF.sum()

In [58]:
country[(country.YEAR=='2015') & (country.COEFF<0.10) & (country.COEFF>0.0)]

Unnamed: 0,HHNUM,AGE,SEX,REGION,ILOSTAT,STAPRO,NACE1D,ISCO3D,HATLEV1D,INCDECIL,NATIONAL,FTPT,TEMP,TEMPDUR,COEFF,RISK_CAT,YEAR
106924,475899,22,1,Y1,1,3.0,G,522,M,2.0,000-OWN COUNTRY,1,2,6.0,0.07933,0.0,2015
107037,475952,37,2,Y1,1,3.0,Q,911,L,1.0,000-OWN COUNTRY,2,1,9.0,0.09959,1.0,2015
107218,476026,22,1,Y1,1,3.0,O,962,M,3.0,000-OWN COUNTRY,1,2,7.0,0.06342,2.0,2015
107320,476068,47,2,Y1,1,3.0,P,232,H,1.0,000-OWN COUNTRY,2,2,2.0,0.07783,2.0,2015
107321,476068,22,1,Y1,1,3.0,G,522,M,1.0,000-OWN COUNTRY,2,2,3.0,0.07783,0.0,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503708,511263,37,1,Y3,1,0.0,G,522,L,99.0,012-EAST ASIA,1,9,9.0,0.07983,0.0,2015
503713,511265,37,1,Y3,1,3.0,O,411,H,6.0,000-OWN COUNTRY,1,1,9.0,0.09062,0.0,2015
503724,511267,47,2,Y3,1,3.0,O,234,H,10.0,000-OWN COUNTRY,1,1,9.0,0.04943,2.0,2015
503746,511272,47,2,Y3,1,0.0,C,751,H,99.0,000-OWN COUNTRY,2,9,9.0,0.07550,2.0,2015


In [59]:
# Sum the COEFF weights per 3-digit ISCO code, separately for every survey year.
# fillna(0) acts on each year's sums only; a code that is absent from a year has
# no entry in that year's Series at all.
isco_chunks = []
years = []
for year_label, year_rows in country.groupby('YEAR'):
    print(year_label)
#    (rescaling by 1000 is not applied: isco_breakdown = isco_breakdown*1000)
    isco_breakdown = year_rows.groupby('ISCO3D')['COEFF'].sum().fillna(0).round(3)
    isco_chunks.append(isco_breakdown)
    years.append(year_label)

2014
2015
2016
2017
2018


In [60]:
isco_chunks_combined = pd.concat(isco_chunks, axis =1 )

In [61]:
isco_chunks_combined.columns = years

In [44]:
#isco_chunks_combined

In [104]:
country_output_file = 'ita_breakdown_by_isco.csv'#'uk_breakdown_by_isco.csv' #'fra_breakdown_by_isco.csv'
isco_chunks_combined.to_csv(os.path.join(eurostat_output_dir, country_output_file))

## 2.4 Combine employment data with risk estimates and occupational group titles

In [62]:
#Read in breakdown by ISCO
#country_file = 'uk_breakdown_by_isco.csv'
#country_file = 'ita_breakdown_by_isco.csv'
country_file = 'fra_breakdown_by_isco.csv'

country_isco_breakdown = pd.read_csv(os.path.join(eurostat_output_dir, 'nat_breakdown_by_isco', country_file), 
                                     index_col = 0)

In [63]:
country_isco_breakdown.index = country_isco_breakdown.index.astype(str)

In [94]:
country_isco_breakdown2 = country_isco_breakdown.merge(isco_3d_risk[['mg_title', 'risk_cat']], 
                                                       left_index = True,
                                                      right_on = 'isco_3d')

In [95]:
#country_file2 = 'uk_breakdown_by_isco_w_risk.csv'
#country_file2 = 'ita_breakdown_by_isco_w_risk.csv'
country_file2 = 'fra_breakdown_by_isco_w_risk.csv'

#country_isco_breakdown2.to_csv(os.path.join(eurostat_output_dir, 'nat_breakdown_by_isco', country_file2))

## 2.5 Employment by risk category by all demographic variables at a national level

In [70]:
#Demographic variables to break employment down by
demographic_vars = ['SEX', 'AGE', 'REGION', 'NACE1D', 'HATLEV1D','INCDECIL', 'NATIONAL', 'FTPT', 'TEMP']

#Counts per year and demographic variable (the proportions returned alongside are not stored)
breakdown_counts = collections.defaultdict(dict)

#Iterate over the survey years held in `country`
for year_label, year_rows in country.groupby('YEAR'):
    print(year_label)

    counts_for_year = breakdown_counts[year_label]
    for demographic_var in demographic_vars:
        print(demographic_var)
        breakdown_count, breakdown_prop = demo_breakdown(year_rows, demographic_var, 'RISK_CAT', 'COEFF')
        counts_for_year[demographic_var] = breakdown_count

#Save outputs
#NOTE(review): the prefix is hard-coded to 'ita' (alternatives: 'uk', 'fra'); confirm it
#matches the country loaded into `country`
country_breakdown_output_file = 'ita_breakdown_counts_r.pkl'
with open(os.path.join(eurostat_output_dir, country_breakdown_output_file), 'wb') as f:
    pickle.dump(breakdown_counts, f)

2014
SEX
AGE
REGION
NACE1D
HATLEV1D
INCDECIL
NATIONAL
FTPT
TEMP
2015
SEX
AGE
REGION
NACE1D
HATLEV1D
INCDECIL
NATIONAL
FTPT
TEMP
2016
SEX
AGE
REGION
NACE1D
HATLEV1D
INCDECIL
NATIONAL
FTPT
TEMP
2017
SEX
AGE
REGION
NACE1D
HATLEV1D
INCDECIL
NATIONAL
FTPT
TEMP
2018
SEX
AGE
REGION
NACE1D
HATLEV1D
INCDECIL
NATIONAL
FTPT
TEMP


In [71]:
# Combine the yearly breakdowns (as employment in thousands) into one table per
# demographic variable: one column per year, missing combinations set to 0,
# values rounded to 3 decimals
for demographic_var in demographic_vars:
    chunks = [yearly_tables[demographic_var] for yearly_tables in breakdown_counts.values()]

    combined_chunks = pd.concat(chunks, axis =1)
    combined_chunks.columns = breakdown_counts.keys()
    combined_chunks = combined_chunks.fillna(0).round(3)

    df_name = demographic_var.lower()
    country_name = 'ita' #'uk' #'fra'
    df_file_name = country_name+'_combined_breakdown_k'+'_'+df_name+'.'+'csv'

    print(df_file_name)

    combined_chunks.to_csv(os.path.join(eurostat_output_dir, df_file_name))

ita_combined_breakdown_k_sex.csv
ita_combined_breakdown_k_age.csv
ita_combined_breakdown_k_region.csv
ita_combined_breakdown_k_nace1d.csv
ita_combined_breakdown_k_hatlev1d.csv
ita_combined_breakdown_k_incdecil.csv
ita_combined_breakdown_k_national.csv
ita_combined_breakdown_k_ftpt.csv
ita_combined_breakdown_k_temp.csv


In [72]:
combined_chunks

Unnamed: 0_level_0,Unnamed: 1_level_0,2014,2015,2016,2017,2018
TEMP,RISK_CAT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,3368.821,3390.139,3462.786,3472.961,3442.556
1,1.0,2155.867,2171.437,2163.946,2130.256,2090.212
1,2.0,8754.522,8806.336,9029.608,9124.98,9086.934
2,0.0,433.792,439.46,448.231,507.331,581.26
2,1.0,499.884,521.604,541.367,562.0,640.673
2,2.0,1329.523,1408.451,1427.791,1641.795,1809.812
9,0.0,1325.883,1335.883,1328.912,1282.721,1230.884
9,1.0,773.583,781.146,742.399,711.11,681.558
9,2.0,3379.259,3339.863,3356.959,3329.089,3382.505


## 2.6 Generate breakdowns for subnational regions

In [96]:
#check country
print(country_file)

fra_breakdown_by_isco.csv


In [97]:
country['REGION'] = country['REGION'].astype(str)

In [98]:
sorted(country['REGION'].unique())

['10',
 'B0',
 'C1',
 'C2',
 'D1',
 'D2',
 'E1',
 'E2',
 'F1',
 'F2',
 'F3',
 'G0',
 'H0',
 'I1',
 'I2',
 'I3',
 'J1',
 'J2',
 'K1',
 'K2',
 'L0',
 'M0',
 'Y1',
 'Y2',
 'Y3',
 'Y4']

In [176]:
region_name = 'I0'
#Italy: Lombardy C4, Lazio I4
#France: Ile-de-France 10 (NUTS3 FR101)
#UK: Scotland M0, London I0

In [177]:
region = country[country['REGION'] == region_name]

### By 3D ISCO

In [179]:
# Sum the COEFF weights per 3-digit ISCO code for the selected region, one Series per year.
# fillna(0) acts on each year's sums only; a code that is absent from a year has
# no entry in that year's Series at all.
reg_isco_chunks = []
years = []
for year_label, year_rows in region.groupby('YEAR'):
    print(year_label)
#    (rescaling by 1000 is not applied: isco_breakdown = isco_breakdown*1000)
    reg_isco_breakdown = year_rows.groupby('ISCO3D')['COEFF'].sum().fillna(0).round(3)
    reg_isco_chunks.append(reg_isco_breakdown)
    years.append(year_label)

2014
2015
2016
2017
2018


In [180]:
reg_isco_chunks_combined = pd.concat(reg_isco_chunks, axis =1 )
reg_isco_chunks_combined.columns = years

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [181]:
region_output_file = region_name+'_breakdown_by_isco.csv'#'uk_breakdown_by_isco.csv' #'fra_breakdown_by_isco.csv'
reg_isco_chunks_combined.to_csv(os.path.join(eurostat_output_dir, region_output_file))

In [182]:
reg_isco_chunks_combined.head()

Unnamed: 0,2014,2015,2016,2017,2018
111,,,1.567,0.978,4.206
112,24.281,24.058,25.604,21.566,61.141
121,98.999,115.831,110.769,146.854,119.652
122,69.791,77.265,76.301,80.688,69.668
131,,,,0.937,


### Now for all demographic variables

In [138]:
#Output dicts for the regional results (keyed by year, then by demographic variable)
reg_breakdown_props = collections.defaultdict(dict)
reg_breakdown_counts = collections.defaultdict(dict)
reg_chi_square_res = collections.defaultdict(dict)

#Iterate over the survey years available for the selected region
for name, group in region.groupby('YEAR'):
    print(name)
    for demographic_var in demographic_vars:
        print(demographic_var)
        reg_breakdown_count, reg_breakdown_prop = demo_breakdown(group, demographic_var, 'RISK_CAT', 'COEFF')
        reg_breakdown_counts[name][demographic_var] = reg_breakdown_count
        reg_breakdown_props[name][demographic_var] = reg_breakdown_prop

        #NOTE(review): pivot_breakdown is called here with its default thousands=False, whereas
        #the national loop passes True -- so the >=5 check and the chi-square statistic are
        #computed on different units in the two places; confirm which one is intended
        breakdown_p = pivot_breakdown(reg_breakdown_count, 'COEFF', demographic_var, 'RISK_CAT')
        check_condition = (breakdown_p >=5).all(axis = 1).all()
        print(check_condition)

        #NaN for both values unless every cell of the table passes the check
        stat, p = np.nan, np.nan
        if check_condition:
            stat, p, dof, expected = chi2_contingency(breakdown_p.values)

        print(stat,p)
        reg_chi_square_res[name][demographic_var] = (stat, p)



2014
SEX
True
126.81963217417317 2.8937884231616828e-28
AGE
False
nan nan
REGION
True
0.0 1.0
NACE1D
False
nan nan
HATLEV1D
True
460.49631977776244 2.336538693929838e-98
INCDECIL
True
406.83590434400406 7.798246025409434e-74
NATIONAL
False
nan nan
FTPT
True
78.31334333842307 9.873555332888645e-18
TEMP
True
49.21226883925394 5.272731074955377e-10
2015
SEX
True
127.17990115837009 2.416770209638365e-28
AGE
False
nan nan
REGION
True
0.0 1.0
NACE1D
False
nan nan
HATLEV1D
True
596.7063512222509 7.999190569774163e-128
INCDECIL
True
402.27336664116086 6.900540625982267e-73
NATIONAL
False
nan nan
FTPT
True
49.20081628731281 2.0709923308208866e-11
TEMP
True
34.439122494972366 6.05588310858137e-07
2016
SEX
True
84.05259325976137 5.600299827910657e-19
AGE
False
nan nan
REGION
True
0.0 1.0
NACE1D
False
nan nan
HATLEV1D
True
604.7957124234227 1.4200126578168812e-129
INCDECIL
True
522.7639708619542 4.9444767162272535e-98
NATIONAL
False
nan nan
FTPT
True
71.6790313284414 2.723303930969881e-16
TEMP
Tru

In [139]:
# Save the three regional result dicts as pickles named '<region_name>_<kind>.pkl'
counts_name = region_name+'_'+'breakdown_counts.pkl'
props_name = region_name+'_'+'breakdown_props.pkl'
chi_square_name = region_name+'_'+'chi_square_res.pkl'

regional_outputs = (
    (counts_name, reg_breakdown_counts),
    (props_name, reg_breakdown_props),
    (chi_square_name, reg_chi_square_res),
)
for pickle_name, results in regional_outputs:
    with open(os.path.join(eurostat_output_dir, pickle_name), 'wb') as f:
        pickle.dump(results, f)

In [140]:
# Combine the yearly regional breakdowns (proportions) into one table per
# demographic variable, one column per year, saved as '<region_name>_<variable>.csv'
for demographic_var in demographic_vars:
    chunks = [yearly_tables[demographic_var] for yearly_tables in reg_breakdown_props.values()]

    combined_chunks = pd.concat(chunks, axis =1)
    combined_chunks.columns = reg_breakdown_props.keys()

    df_name = demographic_var.lower()
    df_file_name = region_name+'_'+df_name+'.'+'csv'

    print(df_file_name)

    combined_chunks.to_csv(os.path.join(eurostat_output_dir, df_file_name))

10_sex.csv
10_age.csv
10_region.csv
10_nace1d.csv
10_hatlev1d.csv
10_incdecil.csv
10_national.csv
10_ftpt.csv
10_temp.csv


In [141]:
# Combine the yearly regional breakdowns (counts, employment in thousands) into one
# table per demographic variable: one column per year, missing combinations set to 0,
# values rounded to 3 decimals, saved as '<region_name>_<variable>_k.csv'
for demographic_var in demographic_vars:
    chunks = [yearly_tables[demographic_var] for yearly_tables in reg_breakdown_counts.values()]

    combined_chunks = pd.concat(chunks, axis =1)
    combined_chunks.columns = reg_breakdown_counts.keys()
    combined_chunks = combined_chunks.fillna(0).round(3)

    df_name = demographic_var.lower()
    df_file_name = region_name+'_'+df_name+'_k'+'.'+'csv'

    print(df_file_name)

    combined_chunks.to_csv(os.path.join(eurostat_output_dir, df_file_name))

10_sex_k.csv
10_age_k.csv
10_region_k.csv
10_nace1d_k.csv
10_hatlev1d_k.csv
10_incdecil_k.csv
10_national_k.csv
10_ftpt_k.csv
10_temp_k.csv


## 2.7 Check significance for chi-square test for a given variable

In [138]:
# France
# with open(os.path.join(eurostat_output_dir, 'all_prop_demo_breakdown', 'fra_chi_square_res_r.pkl'), 'rb') as f:
with open(os.path.join(eurostat_output_dir, '', 'fra_chi_square_res_r_KK.pkl'), 'rb') as f:
    fra_chi_square = pickle.load(f)

In [145]:
# demographic_vars = ['SEX', 'AGE', 'REGION', 'NACE1D', 'HATLEV1D','INCDECIL', 'NATIONAL', 'FTPT', 'TEMP']
demo_var = 'NATIONAL'
for k in fra_chi_square:
    print(fra_chi_square[k][demo_var])

(499840.77100020554, 0.0)
(417213.26518200483, 0.0)
(551292.139385549, 0.0)
(467488.1710338946, 0.0)
(529749.4657162151, 0.0)


In [20]:
# Italy
with open(os.path.join(eurostat_output_dir, 'all_prop_demo_breakdown', 'ita_chi_square_res_r.pkl'), 'rb') as f:
    ita_chi_square = pickle.load(f)

In [30]:
demo_var = 'INCDECIL'
for k in ita_chi_square:
    print(ita_chi_square[k][demo_var])

(1284.1033523011054, 7.498732717806765e-260)
(1262.4839257160058, 3.186581351538294e-255)
(1188.360553708729, 2.306230634126418e-239)
(1105.9221776772379, 9.63057236392359e-222)
(1105.437292131926, 1.2224544762533198e-221)


In [19]:
# UK
with open(os.path.join(eurostat_output_dir, 'all_prop_demo_breakdown', 'uk_chi_square_res_r.pkl'), 'rb') as f:
    uk_chi_square = pickle.load(f)

In [20]:
demo_var = 'AGE'
for k in uk_chi_square:
    print(uk_chi_square[k][demo_var])

(515.5678816590101, 9.713525533738354e-94)
(523.2752145609305, 2.423011948604621e-95)
(513.9229183887859, 2.1348679218602766e-93)
(414.3936021187407, 8.283211854651931e-73)
(458.3499213231952, 7.120740477140461e-82)
