In [3]:
import numpy as np
import pandas as pd

In [4]:
bgvar_22 = pd.read_csv('avars_202201_EN_1.0p.csv', sep=';')
hea_22 = pd.read_stata('ch22o_EN_1.0p.dta') # Maja
rel_22 = pd.read_stata('cr22o_EN_1.0p.dta') # Tyna
soc_22 = pd.read_stata('cs22o_EN_1.1p.dta') # Tyna
fam_22 = pd.read_stata('cf22o_EN_1.0p.dta') # Tyna
wor_22 = pd.read_stata('cw22o_EN_1.0p.dta') # Tyna
per_22 = pd.read_stata('cp22n_EN_1.0p.dta') # Maja
val_22 = pd.read_stata('cv22n_EN_1.0p.dta') # Maja
ass_22 = pd.read_stata('ca22h_EN_1.0p.dta') # Mate
inc_22 = pd.read_stata('ci22o_EN_1.0p.dta') # Mate
hou_22 = pd.read_stata('cd22o_EN_1.0p.dta') # Mate

In [3]:
#column combinators (replaces given columns with one containing their avg values)
def average_columns(df, *columns):
    new_column_name = columns[-1]
    
    try:
        selected_columns = list(columns[:-1])
        avg_col = df[selected_columns].astype(float).mean(axis=1)
        return pd.DataFrame({new_column_name: avg_col})
    except KeyError as e:
        print(f'KeyError: {e} not found in DataFrame columns.')
        return pd.DataFrame({new_column_name: np.nan})

#replaces given columns with one containing their sums (not working ;-())
def sum_columns(df, new_column, *columns):
    df[new_column] = df[columns].sum(axis=1)

Background variables:

1. geslacht to sex
2. leeftijd to age
3. lftdcat to age_cat
4. aantalhh to hhm_num
5. aantalki to hh_chi
6. burgstat to civ_sta
7. woning to dwe_typ
8. sted to urb_lvl (Urban character: Surrounding address density per km2 extremely urban 2,500 or more very 1,500 to 2,500 moderately 1,000 to 1,500 slightly 500 to 1,000 not less than 500) 
9. belbezig to pri_occ 
10. brutoink to mon_inc 
11. brutocat mon_inc_cat 12. brutohh_f to hh_inc 
13. oplzon to edu_lvl

In [4]:
bgvar_22[bgvar_22.select_dtypes(include='category').columns] = bgvar_22[bgvar_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
bg_var_names = {'nomem_encr':'nomem_encr', 'geslacht':'sex','leeftijd':'age', 'lftdcat':'age_cat', 'aantalhh':'hhm_num', 'aantalki':'hh_chi', 'burgstat':'civ_sta', 'woning':'dwe_typ', 'sted':'urb_lvl',  'belbezig':'pri_occ', 'brutoink':'mon_inc', 'brutocat':'mon_inc_cat','brutohh_f':'hh_inc', 'oplzon':'edu_lvl'}
bg_var = bgvar_22.loc[:, list(bg_var_names.keys())].rename(columns=bg_var_names)
bg_var.head()

Unnamed: 0,nomem_encr,sex,age,age_cat,hhm_num,hh_chi,civ_sta,dwe_typ,urb_lvl,pri_occ,mon_inc,mon_inc_cat,hh_inc,edu_lvl
0,800009,1,66,7,2,0,1,1,5,11,-13,13,510560846027652,5
1,800015,1,59,6,3,1,1,1,1,1,4519,10,549165915636499,6
2,800041,1,24,2,5,3,5,1,5,1,-13,13,799627402661866,5
3,800057,1,47,5,5,3,5,1,1,1,8000,12,16500,6
4,800058,2,24,2,1,0,5,2,1,7,550,2,550,6


Concepts in Health:
1. **Subjective health** (ch22o004 to sub_hea)
2. **Relative health** (ch22o005 to rel_hea)
3. Life expectancy
4. **Mental health** (all neg as men_hea_neg and and all pos as men_hea_pos)
5. Height, weight
6. **Chronical problems** (ch22o018 to disability)
7. Health problems
8. Impediment in labour
9. **Substance use** (ch22o126 to smoker, ch22o133 to alcohol, all substances to subs)
10. Medication
11. Physical activity - only until 2018
12. Health improving behaviour - - only until 2018
13. Use of healthcare
14. Sight and hearing
15. Health insurance

In [5]:
#select categorical columns and convert to numeric
hea_22[hea_22.select_dtypes(include='category').columns] = hea_22[hea_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)

#average for mental health and drugs
hea_toavg = [('ch22o011','ch22o014','ch22o012', 'men_hea_neg'),('ch22o013', 'ch22o015', 'men_hea_pos'), ('ch22o159', 'ch22o160', 'ch22o161', 'ch22o161', 'ch22o162', 'ch22o163', 'ch22o270', 'subs')]
hea_22 = pd.concat([hea_22] + [average_columns(hea_22, *column_set) for column_set in hea_toavg], axis=1)

#create a new df hea_22_con with selected columnns and new columns containing avgs (representing concepts)
hea_con = {'nomem_encr':'nomem_encr', 'ch22o004': 'sub_hea', 'ch22o005': 'rel_hea', 'ch22o018': 'disability', 'ch22o020':'mob_issues', 'ch22o126':'smoker', 'ch22o133':'alcohol', 'men_hea_neg':'men_hea_neg', 'men_hea_pos':'men_hea_pos', 'subs':'subs'}
hea_22_con = hea_22.loc[:, list(hea_con.keys())].rename(columns=hea_con)

#ready
hea_22_con.head()

Unnamed: 0,nomem_encr,sub_hea,rel_hea,disability,mob_issues,smoker,alcohol,men_hea_neg,men_hea_pos,subs
0,800001.0,1,2,0,0,1,4,1.0,3.0,0.0
1,800002.0,3,2,1,0,-1,3,1.0,3.5,0.0
2,800009.0,1,1,0,1,1,1,0.0,3.5,0.0
3,800015.0,2,2,0,0,-1,1,0.0,3.0,0.0
4,800028.0,2,2,1,3,1,3,0.333333,4.0,0.0


Concepts in Religion and Ethnicity:

1. **Religious upbringing** (cr23p135 to rel_upb)
2. **Religious affiliation** (cr23p144 to rel_aff)
3. **Religiosity** (cr23p162 to relig)
4. Religious orthodoxy
5. Nationality 
6. **Origin** (cr23p079 – cr23p087 to lang_ori, cr23p079 indicating Dutch or not)
7. **Ethnic identification** (cr23p164 to eth_id)
8. Language proficiency and use

In [6]:
rel_con = {'nomem_encr':'nomem_encr','cr22o135':'rel_upb', 'cr22o144':'rel_aff', 'cr22o162':'relig', 'cr22o079':'lang_ori', 'cr22o164':'eth_id'}
rel_22[rel_22.select_dtypes(include='category').columns] = rel_22[rel_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
rel_22_con = rel_22.loc[:, list(rel_con.keys())].rename(columns=rel_con)
rel_22_con.head()

Unnamed: 0,nomem_encr,rel_upb,rel_aff,relig,lang_ori,eth_id
0,800001.0,-1,-1,4,1,0
1,800002.0,2,5,1,1,1
2,800009.0,3,2,0,1,2
3,800015.0,1,-1,4,0,3
4,800028.0,-1,-1,4,1,1


Concepts in Social Intergation and Leisure:

1. Loneliness
2. Social contacts 
3. **Satisfaction leisure** (cs22o001 to lei_sat)
4. Social engagement
5. Volunteer aid
6. Time expenditure voluntary work
7. **Cultural participation** (sum cs22o494 - cs22o496 to cul_par)
8. **Holidays** (cs22o103 to ab_hol)
9. **Sport** (cs22o105 to spo_h)
10. Media usage

In [7]:
soc_22[soc_22.select_dtypes(include='category').columns] = soc_22[soc_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
cultosum = ['cs22o494', 'cs22o495', 'cs22o517', 'cs22o093', 'cs22o094', 'cs22o568', 'cs22o516', 'cs22o496']
soc_22['cul_sum'] = soc_22[cultosum].sum(axis=1)
soc_con = {'nomem_encr':'nomem_encr','cs22o001':'lei_sat', 'cul_sum':'cul_par', 'cs22o103':'ab_hol', 'cs22o105':'spo_h'}
soc_22_con = soc_22.loc[:, list(soc_con.keys())].rename(columns=soc_con)
print(soc_22_con)

      nomem_encr  lei_sat  cul_par  ab_hol  spo_h
0       800001.0        9        1       0    3.0
1       800002.0        8        0       1    NaN
2       800009.0       11        2       2    NaN
3       800015.0        5        0       1    NaN
4       800028.0       10        6       0    NaN
...          ...      ...      ...     ...    ...
5879    899794.0        9        4       1    3.0
5880    899891.0        8        5       2    6.0
5881    899908.0        3        0       1    4.0
5882    899923.0        8        3       3   10.0
5883    899928.0        9        8       3    5.0

[5884 rows x 5 columns]


  soc_22['cul_sum'] = soc_22[cultosum].sum(axis=1)


Concepts in Family and Household:

1. **Parental relations** (cf22o145+cf22o146 to par_int)
2. **Partnership** (cf22o025 to liv_tog)
3. Marriage
4. Marital history
5. **Children** (cf22o455 to chi_num, cf22o456 to chi1_bir)
6. **Partnership and childbirth intentions** (cf22o128 to chi_fut)
7. Infertility
8. Parental support
9. Parent support
10. Quality of relationship
11. Division of domestic tasks
12. Perceived burden domestic tasks
13. Perceived burden child care
14. Perceived fairness division tasks
15. Child education: Supervision
16. Childcare

In [8]:
fam_22[fam_22.select_dtypes(include='category').columns] = fam_22[fam_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
famtosum = ['cf22o143', 'cf22o144']
fam_22['fam_sum'] = fam_22[famtosum].sum(axis=1)
fam_con = {'nomem_encr':'nomem_encr', 'fam_sum':'par_int', 'cf22o025':'liv_tog', 'cf22o455':'chi_num', 'cf22o456':'chi1_bir', 'cf22o128':'chi_fut'}
fam_22_con = fam_22.loc[:, list(fam_con.keys())].rename(columns=fam_con)
print(fam_22_con)

      nomem_encr  par_int  liv_tog  chi_num  chi1_bir  chi_fut
0       800001.0       -2        0      NaN       NaN       -1
1       800002.0        2        0      4.0    2005.0        1
2       800009.0       -2        0      3.0    1982.0       -1
3       800015.0       -2        0      2.0    1994.0       -1
4       800028.0       -2        0      2.0    1983.0       -1
...          ...      ...      ...      ...       ...      ...
5936    899891.0        4       -1      NaN       NaN        0
5937    899908.0        4       -1      NaN       NaN        0
5938    899923.0        4        0      2.0    1993.0       -1
5939    899928.0       -2       -1      1.0    1970.0       -1
5940    899946.0        4       -1      NaN       NaN        1

[5941 rows x 6 columns]


  fam_22['fam_sum'] = fam_22[famtosum].sum(axis=1)


Concepts in Work and Schooling:

1. Employment / activity status
2. Hours of paid work
3. **Characteristics current or last job** (cw22o136 to wor_min, cw22o140 to wor_nig, cw22o142 to wor_hom, cw22o402 to wor_sect)
4. Average pay job
5. Second jobs
6. Work aspirations
7. **Satisfaction** (cw22o128 to inc_sat)
8. Employment conditions
9. Commuting
10. Pensions
11. Non-working, non-pension
12. Job search
13. Satisfaction education
14. Highest qualification
15. Highest education
16. Qualifications for job
17. Education and training in last year

cw22o439 to chi_<8

In [9]:
wor_22[wor_22.select_dtypes(include='category').columns] = wor_22[wor_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
wor_con = {'nomem_encr':'nomem_encr', 'cw22o136':'wor_min', 'cw22o140':'wor_nig', 'cw22o142':'wor_hom', 'cw22o402':'wor_sect', 'cw22o128':'inc_sat', 'cw22o439':'chi_<8'}
wor_22_con = wor_22.loc[:, list(wor_con.keys())].rename(columns=wor_con)
print(wor_22_con)

      nomem_encr  wor_min  wor_nig  wor_hom  wor_sect  inc_sat  chi_<8
0       800002.0     16.0        0        2         4        9       1
1       800009.0      NaN       -1       -1        -1       -1      -1
2       800015.0      5.0        1        3        10        7       1
3       800028.0      NaN       -1       -1        -1       -1      -1
4       800057.0     70.0        0        3         8        8       1
...          ...      ...      ...      ...       ...      ...     ...
5770    899891.0     30.0        0        3        10        8      -1
5771    899908.0      NaN       -1       -1        -1       -1      -1
5772    899923.0     20.0        0        3        10        7       1
5773    899928.0      NaN       -1       -1        -1       -1      -1
5774    899946.0      NaN       -1       -1        14       -1      -1

[5775 rows x 7 columns]


Concepts in Personality:
1. Happiness
2. **Life satisfaction** (cp22n014-016 to life_satf, cp22n070-079 as self_est_pos or self_est_neg)
3. Big five personality
4. Survey attitude
5. Self esteem
6. Mood
7. Need to evaluate
8. **Value orientation** (cp22n100 to res, cp22n110 to ind, cp22n118 to fam_sec, cp22n119 to free, cp22n129 to comf_life, cp22n133 to soc_rec)
9. Inclusion of Others in the Self
10. Social desirability
11. Affects
12. Need for Cognition
13. **Social trust** (cp22n019 to trust )

In [10]:
per_22[per_22.select_dtypes(include='category').columns] = per_22[per_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)

#averages for life satisfaction and self-esteem
per_toavg = [('cp22n014','cp22n015','cp22n016', 'cp22n017', 'cp22n018','life_satf'),('cp22n070', 'cp22n071', 'cp22n073', 'cp22n075', 'cp22n076', 'self_est_pos'), ('cp22n072', 'cp22n074', 'cp22n078', 'cp22n079', 'self_est_neg')]
per_22 = pd.concat([per_22] + [average_columns(per_22, *column_set) for column_set in per_toavg], axis=1)

#create a new df per_22_con with selected columnns and new columns containing avgs (representing concepts)
per_con = {'nomem_encr':'nomem_encr','cp22n100': 'resp', 'cp22n110': 'ind','cp22n118': 'fam_sec','cp22n119': 'free', 'cp22n133': 'soc_rec', 'cp22n129': 'comf_life', 'cp22n019': 'trust', 'life_satf':'life_satf', 'self_est_pos':'self_est_pos', 'self_est_neg':'self_est_neg' }
per_22_con = per_22.loc[:, list(per_con.keys())].rename(columns=per_con)

#ready
per_22_con.head()

Unnamed: 0,nomem_encr,resp,ind,fam_sec,free,soc_rec,comf_life,trust,life_satf,self_est_pos,self_est_neg
0,800002.0,6,5,5,5,4,4,9,6.0,4.8,0.5
1,800009.0,5,5,5,6,2,4,5,4.2,5.4,0.0
2,800015.0,6,6,6,6,0,6,6,4.0,5.6,1.5
3,800028.0,5,5,5,5,4,6,9,5.0,6.0,0.0
4,800057.0,6,6,5,6,4,6,8,5.4,5.8,0.0


Concepts in Values:
1. Beliefs about maternal employment
2. **Gender role attitudes** (cv22n113 to gend_roles)
3. Political position ethnic minorities *page not working*
4. Right wing attitudes *page not working*
5. **Attitude towards foreigners** (cv22n120 to nationalist)
6. Marriage attitudes
7. Family solidarity
8. Norm of having children
9. Work ethic
10. Conservatism
11. Economic conservatism

In [11]:
val_22[val_22.select_dtypes(include='category').columns] = val_22[val_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)

#create a new df val_22_con with selected columnns and new columns containing avgs (representing concepts)
val_con = {'nomem_encr':'nomem_encr','cv22n113': 'gend_roles', 'cv22n120': 'nationalist'}
val_22_con = val_22.loc[:, list(val_con.keys())].rename(columns=val_con)

#ready
val_22_con.head()

Unnamed: 0,nomem_encr,gend_roles,nationalist
0,800009.0,0,1
1,800015.0,0,0
2,800057.0,-1,-1
3,800058.0,1,1
4,800085.0,0,2


Concepts in Economic Situtation: Assets:
1. **Assets** (ca22h008 to ass_own, ca22h023 to ass_val, ca22h012 to tot_bal)

In [12]:
ass_22['ca22h023'] = pd.to_numeric(ass_22['ca22h023'], errors='coerce')
ass_22[ass_22.select_dtypes(include='category').columns] = ass_22[ass_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)
ass_con = {'nomem_encr':'nomem_encr', 'ca22h008':'ass_own','ca22h023':'ass_val', 'ca22h012':'tot_bal'}
ass_22_con = ass_22.loc[:, list(ass_con.keys())].rename(columns=ass_con)

ass_22_con.head()

Unnamed: 0,nomem_encr,ass_own,ass_val,tot_bal
0,800001.0,1,15000.0,25165.0
1,800002.0,0,,50000.0
2,800009.0,1,52000.0,4511.0
3,800015.0,0,,12000.0
4,800028.0,0,,90000.0


Concepts in Income: 
1. Subjective standard of living
2. Satisfaction income
3. Employment income / Earnings cj008
4. Self-employment income
5. Non-employment income: pension
6. Non-employment income: benefits
7. Non-employment income: other
8. Credit, debt, payments
9. **Total income** (ci22o226 to income)
10. **Financial/material well-being** (ci22o252 to fin_sit)
11. Financial expectations
12. Financial management
13. Material deprivation


In [13]:
inc_22[inc_22.select_dtypes(include='category').columns] = inc_22[inc_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)

#create a new df per_22_con with selected columnns and new columns containing avgs (representing concepts)
inc_con = {'nomem_encr':'nomem_encr','ci22o226': 'income', 'ci22o252': 'fin_sit' }
inc_22_con = inc_22.loc[:, list(inc_con.keys())].rename(columns=inc_con)

#ready
inc_22_con.head()

Unnamed: 0,nomem_encr,income,fin_sit
0,800002.0,-1,4
1,800009.0,-1,3
2,800015.0,-1,4
3,800028.0,-1,3
4,800057.0,-1,3


Concepts for Housing:
1. Satisfaction housing
2. Housing characteristics
3. Housing expenditure 
4. **Housing wealth** (cd22o025 to pur_price)

In [5]:
hou_22[hou_22.select_dtypes(include='category').columns] = hou_22[hou_22.select_dtypes(include='category').columns].apply(lambda col: col.astype('category').cat.codes)

hou_con = {'nomem_encr':'nomem_encr','cd22o025': 'pur_price'}
hou_22_con = hou_22.loc[:, list(hou_con.keys())].rename(columns=hou_con)
hou_22_con.head()

Unnamed: 0,nomem_encr,pur_price
0,800002.0,210000.0
1,800015.0,94463.0
2,800057.0,605000.0
3,800127.0,140000.0
4,800156.0,


In [18]:
#merge
all_22 = [bg_var, fam_22_con, hea_22_con, per_22_con, soc_22_con, val_22_con, wor_22_con, inc_22_con, ass_22_con, hou_22_con]

merged_22 = all_22[0]

for all in all_22[1:]:
    merged_22 = pd.merge(merged_22, all, on='nomem_encr', how='inner')

In [19]:
merged_22.head()

Unnamed: 0,nomem_encr,sex,age,age_cat,hhm_num,hh_chi,civ_sta,dwe_typ,urb_lvl,pri_occ,...,wor_hom,wor_sect,inc_sat,chi_<8,income,fin_sit,ass_own,ass_val,tot_bal,pro_price
0,800015,1,59,6,3,1,1,1,1,1,...,3,10,7,1,-1,4,0,,12000.0,94463.0
1,800127,2,38,4,1,0,3,1,1,1,...,3,10,7,-1,0,0,1,5000.0,-10000000000.0,140000.0
2,800170,2,63,6,2,1,3,1,2,1,...,0,12,2,1,9,3,0,,-10000000000.0,75000.0
3,800186,2,82,7,2,0,4,1,3,8,...,-1,-1,-1,-1,7,4,0,,-10000000000.0,29000.0
4,800201,1,48,5,2,0,1,2,4,1,...,0,12,7,-1,7,3,1,6000.0,40000.0,
