# Mental health (MH) of older workers, retirement age and working conditions

## Preprocessing SHARE data

Load libraries

In [1]:
import os
import sys

# Resolve the parent of the current working directory and add it to the module
# search path so that the `utils` imports below can be found.
src_path = os.path.abspath("../")
sys.path.append(src_path)

# NOTE(review): star imports hide which module provides helpers such as
# `import_share_stata1` or `share_preprocessing`; consider explicit imports.
from utils.common import *
from utils.retirement import *
from utils.share import *

# Presumably brings the third-party libraries used below (e.g. `pd`) into scope
# — TODO confirm against the implementation in `utils`.
import_libraries()

Preprocess **SHARELIFE data**, apply the initial filters and create the initial variables

In [2]:
# Stata modules to read for the SHARELIFE sample (wave 7 only)
file_names = [
    "cv_r.dta",
    "technical_variables.dta",
    "dn.dta",
    "re.dta",
]
sharelife = import_share_stata1(
    file_names=file_names,
    waves=[7],
)

In [3]:
# Run the SHARELIFE preprocessing helper (its progress log is printed below).
# The result is bound back to `sharelife`, so re-running this cell without
# re-running the import cell feeds already-processed data into the helper.
sharelife = sharelife_preprocessing(sharelife)

Initial n obs: 63248
Gender, country, 1st year in country - formatted, age 50+ filter - applied
N obs after processing gender and age: 56486
Years of education - calculated
N obs after processing education years: 56486
Current ISCO - identified
N obs after isco job changes: 41945
Years of contribution, 1st year of contribution - calculated
N obs after contribution years: 41945


Preprocess **additional data from SHARE waves 6-8** where ISCO codes are available

In [4]:
# Stata modules to read for the supplementary sample (waves 6, 7 and 8)
file_names = [
    "cv_r.dta",
    "dn.dta",
    "ep.dta",
]
sharelife_add = import_share_stata1(
    file_names=file_names,
    waves=[6, 7, 8],
    convert_categoricals=True,
)

In [5]:
# Preprocess the supplementary sample. The processed SHARELIFE frame is passed
# as second argument; per the log below, respondents already present in
# SHARELIFE are dropped. The result overwrites `sharelife_add`, so this cell is
# not idempotent on its own.
sharelife_add = sharelife_add_preprocessing(sharelife_add, sharelife)

N obs initial: 192020
N obs dropping missing isco: 10679
N obs after drop already present in Sharelife: 6843
Gender, country, 1st year in country - formatted, age 50+ filter - applied
N obs after gender and age: 3262
Years of education - calculated
N obs after education: 3262
Current ISCO - identified, those changed job - deleted
N obs after job and isco: 2527
Years of contribution, 1st year of contribution - calculated
N obs after contribution years: 2527


In [6]:
# Stack the main and the additional SHARELIFE samples row-wise and renumber
# the rows 0..n-1 in a single step.
df = pd.concat(
    [sharelife, sharelife_add],
    axis=0,
    ignore_index=True,
)

Preprocess **main data from SHARE waves 4-6**

In [7]:
# Stata modules to read for the main SHARE panel (waves 4, 5 and 6)
file_names = [
    "cv_r.dta",
    "dn.dta",
    "ep.dta",
    "ch.dta",
    "gv_health.dta",
    "as.dta",
]
share = import_share_stata1(
    file_names=file_names,
    waves=[4, 5, 6],
    convert_categoricals=True,
)

In [8]:
# Preprocess the main SHARE panel. `df` is, at this point, the concatenated
# SHARELIFE frame built above and is passed as second argument.
# NOTE(review): how the helper uses `df` is not visible here — confirm in utils.
# The result overwrites `share`, so re-run the import cell before re-running this one.
share = share_preprocessing(share, df)

Initial n obs: 192150
Those without ISCO codes - deleted
N obs with ISCO: 74637
N obs after age calculation: 74637
N obs after defining number of children: 74637
Current year, age, number of children and living with a partner - imputed
N obs after leaving only employed: 23465
N obs after deleting special conditions pension: 19572
Currently not working and eligible to special pensions - deleted
N obs after defining industry: 19572
Job status, industry of employment - added
N obs after defining finance: 19572
Household income, investments, life insurance - added
N obs after dropping missing sphus:19561
N obs after dropping missing chronic:19559
N obs after dropping missing eurod:19136
Physical and mental health indicators - added
N obs after health: 19136


In [9]:
# Attach the SHARELIFE variables to every SHARE observation. A left join keeps
# all rows of `share`, matched on the respondent identifier.
# NOTE(review): a left join repeats a `share` row once per matching `mergeid`
# in the right-hand frame — confirm `mergeid` is unique there.
df = pd.merge(
    share,
    df,
    how="left",
    on="mergeid",
)

**Final full SHARE dataset**

In [10]:
# Final preprocessing of the merged dataset (see the progress log printed below).
# `df` is overwritten with the result, so re-running this cell alone would
# process the already-final frame a second time.
df = share_final_preprocessing(df)

N obs initial: 19136
Current years of contribution - calculated, those with less 10 years - deleted
Data types - corrected
N obs after data types: 17871
0
3031
Retirement age, work horizon and work horizon change by reforms - calculated
N obs after work horizon change: 15484
Longitudinal and crossectional weights - added
N obs after weights: 15418


In [11]:
# Countries that remain in the final sample
df["country"].unique()

array(['Austria', 'Belgium', 'Czech Republic', 'Switzerland', 'Germany',
       'Denmark', 'Estonia', 'Spain', 'France', 'Hungary', 'Italy',
       'Netherlands', 'Poland', 'Portugal', 'Sweden', 'Slovenia',
       'Luxembourg', 'Greece'], dtype=object)

In [20]:
# Number of distinct respondents
df["mergeid"].nunique()

5873

In [18]:
# Distribution of the number of rows per respondent:
# the inner count is rows per `mergeid`, the outer count is how many
# respondents have each row count.
df["mergeid"].value_counts().value_counts()

count
1    1922
2    1413
3    1409
4    1127
5       2
Name: count, dtype: int64

In [19]:
# Count rows that exactly repeat an earlier row across all columns
df.duplicated(keep="first").sum()

0

In [17]:
# Remove fully identical rows and renumber the remaining rows 0..n-1
df = df.drop_duplicates(ignore_index=True)

In [22]:
# Rows per respondent
mergeid_counts = df["mergeid"].value_counts()

# Show every row of the respondents that appear exactly five times
df.loc[df["mergeid"].isin(mergeid_counts[mergeid_counts.eq(5)].index)]

Unnamed: 0,mergeid,hhid,wave,year,age,nb_children,nb_grandchildren,partnerinhh,job_status,industry,...,work_horizon_change,work_horizon_change_minimum,wblock56,my_wgt,cciw_w4,cciw_w5,cciw_w6,cciw,isco,isco1
755,CZ-490962-02,CZ-490962-A,4,2011,51.0,0.0,0.0,1,Civil servant,Public administration and defence,...,0.67,0.67,0,724.409973,440.863068,,,440.863068,3353,3
824,CZ-906302-01,CZ-906302-A,4,2011,51.0,6.0,2.0,1,Employee,Manufacturing,...,1.67,1.67,0,699.797424,425.884308,,,425.884308,7531,7
3487,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,0.67,0.67,0,724.409973,,326.637238,,326.637238,3353,3
3488,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,0.67,0.67,0,724.409973,,326.637238,,326.637238,3353,3
3579,CZ-906302-01,CZ-906302-A,5,2013,53.0,4.0,2.0,1,Private sector employee,Manufacturing,...,1.67,1.67,0,699.797424,,314.776978,,314.776978,7531,7
3580,CZ-906302-01,CZ-906302-A,5,2013,53.0,4.0,2.0,1,Private sector employee,Manufacturing,...,1.67,1.67,0,699.797424,,314.776978,,314.776978,7531,7
7227,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,1.33,2.33,1,724.409973,,326.637238,,326.637238,3353,3
7306,CZ-906302-01,CZ-906302-A,5,2013,53.0,4.0,2.0,1,Private sector employee,Manufacturing,...,0.33,3.33,1,699.797424,,314.776978,,314.776978,7531,7
10660,CZ-490962-02,CZ-490962-A,6,2015,55.0,0.0,0.0,1,Public sector employee,Public administration and defence,...,1.33,2.33,1,724.409973,,,383.020538,383.020538,3353,3
10736,CZ-906302-01,CZ-906302-A,6,2015,55.0,4.0,2.0,1,Private sector employee,Manufacturing,...,0.33,3.33,1,699.797424,,,369.112976,369.112976,7531,7


In [23]:
# All rows of one of the five-row respondents
df.loc[df["mergeid"] == "CZ-490962-02"]

Unnamed: 0,mergeid,hhid,wave,year,age,nb_children,nb_grandchildren,partnerinhh,job_status,industry,...,work_horizon_change,work_horizon_change_minimum,wblock56,my_wgt,cciw_w4,cciw_w5,cciw_w6,cciw,isco,isco1
755,CZ-490962-02,CZ-490962-A,4,2011,51.0,0.0,0.0,1,Civil servant,Public administration and defence,...,0.67,0.67,0,724.409973,440.863068,,,440.863068,3353,3
3487,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,0.67,0.67,0,724.409973,,326.637238,,326.637238,3353,3
3488,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,0.67,0.67,0,724.409973,,326.637238,,326.637238,3353,3
7227,CZ-490962-02,CZ-490962-A,5,2013,53.0,1.0,0.0,1,Private sector employee,Public administration and defence,...,1.33,2.33,1,724.409973,,326.637238,,326.637238,3353,3
10660,CZ-490962-02,CZ-490962-A,6,2015,55.0,0.0,0.0,1,Public sector employee,Public administration and defence,...,1.33,2.33,1,724.409973,,,383.020538,383.020538,3353,3


In [30]:
# Columns at positions 20-39 of the two near-identical rows (positions 3487 and 3488)
df.iloc[[3487, 3488]].iloc[:, 20:40]

Unnamed: 0,motivation_lack,country,gender,yrbirth,mobirth,yr1country,yrseducation,isco2011,isco2013,isco2015,yr1contribution,yrscontribution,retirement_age,retirement_age_early,retirement_age_minimum,work_horizon,work_horizon_minimum,work_horizon_change,work_horizon_change_minimum,wblock56
3487,0,Czech Republic,1,1960.0,6.0,1960.0,17.0,3353,3353,3353,1983,31.0,60.67,57.67,57.67,7.67,4.67,0.67,0.67,0
3488,0,Czech Republic,1,1960.0,6.0,1960.0,17.0,3353,3353,3353,1983,31.0,61.67,58.67,58.67,8.67,5.67,0.67,0.67,0


In [24]:
# Compare the two suspected duplicate rows column by column.
row_3487 = df.iloc[3487]
row_3488 = df.iloc[3488]

# Find variables that are not equal. A column that is missing in BOTH rows is
# treated as equal; a plain `!=` would flag it because NaN != NaN is True.
both_missing = row_3487.isna() & row_3488.isna()
differences = (row_3487 != row_3488) & ~both_missing

# Keep only the names of the columns whose values differ
variables_with_differences = differences[differences].index.tolist()

# Display variables that are not equal (the original cell printed only the header)
print("Variables with differences between rows 3487 and 3488:")
print(variables_with_differences)

Variables with differences between rows 3487 and 3488:


In [14]:
# Summary statistics of the minimum work-horizon change, by country
df.groupby("country")["work_horizon_change_minimum"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Austria,880.0,0.993182,0.082337,0.0,1.0,1.0,1.0,1.0
Belgium,1497.0,0.658317,0.832165,0.0,0.0,0.0,1.0,3.0
Czech Republic,725.0,1.084221,1.10443,0.33,0.33,0.5,1.33,4.33
Denmark,808.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estonia,1197.0,0.484127,0.434409,0.0,0.0,0.5,1.0,1.0
France,761.0,0.083837,0.167072,0.0,0.0,0.0,0.0,1.0
Germany,1204.0,0.005249,0.020703,0.0,0.0,0.0,0.0,0.09
Italy,697.0,1.040818,1.34962,0.0,0.0,0.6,1.6,6.25
Luxembourg,249.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Netherlands,57.0,0.11,0.042804,0.08,0.08,0.08,0.17,0.17


In [15]:
# Save resulting dataset.
# The output directory defaults to the original author's local path, but it can
# be redirected without editing the notebook by setting the SHARE_RESULTS_DIR
# environment variable (the absolute path only exists on one machine).
results_dir = os.environ.get(
    "SHARE_RESULTS_DIR",
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/results",
)
df.to_csv(
    os.path.join(results_dir, "share_clean_w46.csv"),
    index=False,
)