# Mental health (MH) of older workers, retirement age and working conditions

## Preprocessing SHARE data

Load libraries

In [1]:
import os
import sys

# Add the parent of the current working directory to the import path so that
# the `utils` package imported below can be found (assumes `utils` lives there
# and that the notebook is run from its own directory — TODO confirm).
src_path = os.path.abspath("../")
sys.path.append(src_path)

# NOTE(review): star imports hide where names come from. `pd` is used later in
# this notebook without an explicit import, so it presumably arrives through one
# of these modules or through import_libraries() — confirm, and consider
# explicit imports.
from utils.common import *
from utils.retirement import *
from utils.share import *

import_libraries()

Preprocess **SHARELIFE data**, apply first filters and create first variables

In [2]:
# Stata files handed to the loader for wave 7 only.
file_names = [
    "cv_r.dta",
    "technical_variables.dta",
    "dn.dta",
    "re.dta",
]
sharelife = import_share_stata1(waves=[7], file_names=file_names)

In [3]:
# Rebinds `sharelife` to the function's return value; the log printed below
# reports the observation count after each step.
# NOTE(review): re-running this cell on its own passes the already-processed
# data back in — re-run the loading cell first to start from raw input.
sharelife = sharelife_preprocessing(sharelife)

Initial n obs: 63248
Gender, country, 1st year in country - formatted, age 50+ filter - applied
N obs after processing gender and age: 56486
Years of education - calculated
N obs after processing education years: 56486
Current ISCO - identified, those changed job - deleted
N obs after isco job changes: 41719
Years of contribution, 1st year of contribution - calculated
Those worked less than 10 years / started work before age of 10 - deleted
N obs after contribution years: 41719


Preprocess **additional data from SHARE waves 6-8**, keeping only observations for which an ISCO code is available

In [4]:
# Stata files handed to the loader for waves 6, 7 and 8.
file_names = [
    "cv_r.dta",
    "dn.dta",
    "ep.dta",
]
sharelife_add = import_share_stata1(
    waves=[6, 7, 8],
    file_names=file_names,
    convert_categoricals=True,
)

In [5]:
# Rebinds `sharelife_add` to the function's return value. The processed
# `sharelife` data is passed as the second argument; judging by the log line
# "drop already present in Sharelife", it is presumably used to exclude
# observations already covered there — TODO confirm in utils.
sharelife_add = sharelife_add_preprocessing(sharelife_add, sharelife)

N obs initial: 192020
N obs dropping missing isco: 10679
N obs after drop already present in Sharelife: 6948
Gender, country, 1st year in country - formatted, age 50+ filter - applied
N obs after gender and age: 3358
Years of education - calculated
N obs after education: 3358
Current ISCO - identified, those changed job - deleted
N obs after job and isco: 2603
Years of contribution, 1st year of contribution - calculated
Those worked less than 10 years / started work before age of 10 - deleted
N obs after contribution years: 2603


In [6]:
# Stack the main SHARELIFE rows and the additional rows into one frame with a
# fresh 0..n-1 index.
df = pd.concat([sharelife, sharelife_add], axis=0, ignore_index=True)

Preprocess **main data from SHARE waves 4 and 6**

In [7]:
# Stata files handed to the loader for waves 4 and 6.
file_names = [
    "cv_r.dta",
    "dn.dta",
    "ep.dta",
    "ch.dta",
    "gv_health.dta",
    "as.dta",
]
share = import_share_stata1(
    waves=[4, 6],
    file_names=file_names,
    convert_categoricals=True,
)

In [8]:
# Rebinds `share` to the function's return value; `df` here is the concatenated
# SHARELIFE data built above. The log printed below reports the observation
# count after each step.
share = share_preprocessing(share, df)

Initial n obs: 126085
Those without ISCO codes - deleted
N obs with ISCO: 48510
N obs after age calculation: 48510
N obs after defining number of children: 48510
Current year, age, number of children and living with a partner - imputed
N obs after leaving only employed: 14944
N obs after defining industry: 14944
Job status, industry of employment - added
N obs after defining finance: 14944
Household income, investments, life insurance - added
N obs after dropping missing sphus:14935
N obs after dropping missing chronic:14935
N obs after dropping missing eurod:14580
Physical and mental health indicators - added
N obs after health: 14580


In [9]:
# Merge with Sharelife data
df = share.merge(df, on=["mergeid"], how="left")

Final preprocessing for **full SHARE dataset**

In [10]:
# Rebinds `df` to the function's return value; the log printed below reports
# the observation count after each step.
# NOTE(review): re-running this cell on its own passes the already-processed
# `df` back in — re-run the merge cell first.
df = share_final_preprocessing(df)

N obs initial: 14580
Current years of contribution - calculated
Data types - corrected
N obs after data types: 14580
N obs retirement age and filter to be under it: 10230
Retirement age, work horizon and work horizon change by reforms - calculated
Longitudinal weights imputed in STATA - added
N obs after weights: 4644


In [11]:
# Build a single `isco` column by taking, for each row, the value from the
# year-specific column that matches the row's `year`. Rows with any other year
# keep the missing-value placeholder.
df["isco"] = pd.NA
for wave_year, source_col in ((2011, "isco2011"), (2015, "isco2015")):
    in_wave = df["year"] == wave_year
    df.loc[in_wave, "isco"] = df.loc[in_wave, source_col]

In [12]:
# Sanity check: number of rows that are exact duplicates (across all columns)
# of an earlier row.
df.duplicated().sum()

0

In [13]:
# Distinct countries present in the final dataset.
df["country"].unique()

array(['Austria', 'Belgium', 'Czech Republic', 'Switzerland', 'Germany',
       'Denmark', 'Estonia', 'Spain', 'France', 'Italy', 'Sweden',
       'Slovenia'], dtype=object)

In [14]:
# Number of distinct `mergeid` values in the final dataset.
df["mergeid"].nunique()

2322

In [15]:
# Summary statistics of `work_horizon` for each survey year.
df.groupby("year")["work_horizon"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011,2322.0,8.619582,3.183506,1.0,6.0,8.0,11.0,15.0
2015,2322.0,5.334436,2.821142,0.17,3.0,5.0,7.5,11.5


In [16]:
# Summary statistics of `work_horizon_change` for each country.
# NOTE(review): the saved output of this cell lists 10 countries, while the
# `country` values shown earlier in the notebook include 12 (Switzerland and
# Sweden are absent here) — verify whether the saved output is stale.
df.groupby("country")["work_horizon_change"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Austria,310.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Belgium,558.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Czech Republic,350.0,1.136171,0.651663,0.0,1.0,1.0,1.0,5.0
Denmark,392.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estonia,856.0,1.496495,0.072463,0.0,1.5,1.5,1.5,1.5
France,438.0,0.156941,0.211782,0.0,0.0,0.0,0.34,0.5
Germany,64.0,0.17,0.0,0.17,0.17,0.17,0.17,0.17
Italy,264.0,3.850379,0.563345,0.0,3.75,3.75,3.75,4.75
Slovenia,230.0,2.33913,0.697932,2.0,2.0,2.0,2.0,4.0
Spain,270.0,0.025926,0.07636,0.0,0.0,0.0,0.0,0.25


In [17]:
# Save the resulting dataset.
# The output directory was hard-coded to one user's home directory, which breaks
# the notebook on any other machine. It can now be overridden with the
# SHARE_RESULTS_DIR environment variable; when that is unset the original
# location is used, so existing behaviour is unchanged.
results_dir = os.environ.get(
    "SHARE_RESULTS_DIR",
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/results",
)
df.to_csv(os.path.join(results_dir, "share_clean_w46.csv"), index=False)