## Creating the dataset

### Load libraries

In [1]:
import os
import sys

src_path = os.path.abspath("../")
sys.path.append(src_path)

from functools import reduce

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from scipy.linalg import eigh

from utils.retirement import *

### Load and merge all needed datasets

In [2]:
# Build one long DataFrame `df` holding the selected SHARE modules for each wave.
datasets = []

folders = [
    f"/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew{i}_rel8-0-0_ALL_datasets_stata"
    for i in [4, 6]
]

# Identifier columns of the SHARE modules. Not used by the merge below, which
# joins on every column the two frames have in common; kept for reference.
merge_columns = ["mergeid", "hhid1", "mergeidp1", "coupleid1", "country", "language"]

# File-name suffixes of the modules that are loaded from each wave folder.
module_suffixes = ("dn.dta", "ep.dta", "ch.dta", "gv_health.dta")

for folder in folders:
    # The wave number is encoded in the folder name: "...sharew<wave>_rel8-0-0_...".
    wave = int(folder.split("sharew")[1].split("_")[0])

    folder_datasets = []

    # os.listdir returns entries in arbitrary order; sorting keeps the merge
    # order (and therefore the resulting column order) reproducible across runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(module_suffixes):
            continue

        dataset = pd.read_stata(os.path.join(folder, filename))
        dataset["wave"] = wave
        folder_datasets.append(dataset)

    if folder_datasets:
        # Inner-join the modules of this wave on all columns they share
        # (this includes the "wave" column added above).
        merged_dataset = reduce(
            lambda left, right: pd.merge(
                left,
                right,
                on=left.columns.intersection(right.columns).tolist(),
                how="inner",
            ),
            folder_datasets,
        )

        datasets.append(merged_dataset)

# Stack the per-wave frames on top of each other with a fresh 0..n-1 index.
df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)

  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(dat

In [3]:
# Number of distinct respondents (mergeid) per wave
df.groupby("wave")["mergeid"].nunique()

wave
4    58000
6    68085
Name: mergeid, dtype: int64

### Choose only necessary columns

In [26]:
# Columns kept for the analysis, grouped by questionnaire module.
# ep026_ ... ep037_ are deliberately left out of the selection.
id_columns = ["mergeid", "wave", "country"]
demographic_columns = ["dn002_", "dn003_", "dn006_", "dn010_", "dn014_", "dn042_", "ch001_"]
employment_columns = [
    "ep005_",
    "ep009_",
    "ep010_",
    "ep012_",
    "ep016_",
    "ep616isco",
    "ep152isco",
    "ep141d1",
    "ep141d2",
    "ep141d3",
    "ep071dno",
    "ep671dno",
]
# euro1 ... euro12 followed by the two summary variables
eurod_columns = [f"euro{item}" for item in range(1, 13)] + ["eurod", "eurodcat"]

df_short = df[id_columns + demographic_columns + employment_columns + eurod_columns]

In [27]:
# Give the questionnaire codes readable names
readable_names = {
    "dn003_": "yrbirth",
    "dn006_": "yr1country",
    "dn010_": "education",
    "dn014_": "marital_status",
    "dn042_": "gender",
    "ch001_": "nb_children",
    "ep005_": "employment",
    "ep009_": "job_status",
    "ep010_": "job_start",
    "ep012_": "job_week_hours",
    "ep016_": "job_name",
    "ep616isco": "job_isco_current",
    "ep152isco": "last_main_job",
    "ep141d1": "job_change_type",
    "ep141d2": "job_change_employer",
    "ep141d3": "job_change_promotion",
    "ep071dno": "pension",
    "ep671dno": "pension1",
}
df_short = df_short.rename(columns=readable_names)

In [31]:
# Distribution of current-job ISCO codes, missing values included
df_short["job_isco_current"].value_counts(dropna=False)

job_isco_current
NaN           119670
Don't know       786
4110.0           175
9112.0           145
2221.0           125
               ...  
3252.0             1
7124.0             1
3230.0             1
9212.0             1
310.0              1
Name: count, Length: 406, dtype: int64

In [32]:
# Distribution of last-main-job ISCO codes, missing values included
df_short["last_main_job"].value_counts(dropna=False)

last_main_job
NaN           119661
Don't know       950
4110.0           148
9112.0           145
3343.0           136
               ...  
7316.0             1
2143.0             1
7127.0             1
2522.0             1
312.0              1
Name: count, Length: 379, dtype: int64

In [17]:
# Number of wave-6 rows. The column is called "job_isco_current" after the
# rename above; the former name "job_isco" no longer exists in df_short.
len(df_short[df_short.wave == 6]["job_isco_current"])

68085

In [18]:
# Number of wave-6 rows without a current-job ISCO code. The column is called
# "job_isco_current" after the rename above ("job_isco" no longer exists).
df_short[df_short.wave == 6]["job_isco_current"].isna().sum()

61670

### Delete those who had changes in job

In [6]:
# Rows in which at least one of the three job-change indicators is ticked.
# Vectorised comparison instead of a Python loop over row labels: same result,
# much faster, and independent of the frame having a 0..n-1 index.
changed_job = (
    (df_short["job_change_type"] == "Selected")
    | (df_short["job_change_employer"] == "Selected")
    | (df_short["job_change_promotion"] == "Selected")
)

# mergeids with a reported job change (one entry per flagged row)
list_changes = df_short.loc[changed_job, "mergeid"].tolist()

# Drop every row of those respondents, i.e. in all waves, not only the flagged one
df_filtered = df_short[~df_short["mergeid"].isin(list_changes)]

In [7]:
# Distinct respondents per wave after removing job changers
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    56718
6    66439
Name: mergeid, dtype: int64

### Leave only individuals present in both waves

In [9]:
# For every respondent, the set of waves in which they have a row
set_ids = df_filtered.groupby("mergeid")["wave"].apply(set)

# Respondents observed in exactly two distinct waves
in_two_waves = set_ids.map(len) == 2
mergeids_with_both_waves = set_ids[in_two_waves].index.tolist()

keep_rows = df_filtered["mergeid"].isin(mergeids_with_both_waves)
df_filtered = df_filtered[keep_rows]

In [10]:
# Distinct respondents per wave (both waves should now match)
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    33589
6    33589
Name: mergeid, dtype: int64

### Filter countries

In [11]:
# Countries still present in the sample
df_filtered["country"].unique()

array(['Austria', 'Belgium', 'Czech Republic', 'Switzerland', 'Germany',
       'Denmark', 'Estonia', 'Spain', 'France', 'Italy', 'Poland',
       'Portugal', 'Sweden', 'Slovenia'], dtype=object)

In [12]:
# Remove all Swedish respondents and renumber the rows
not_sweden = df_filtered["country"] != "Sweden"
df_filtered = df_filtered[not_sweden].reset_index(drop=True)

In [13]:
# Distinct respondents per wave after the country filter
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    32287
6    32287
Name: mergeid, dtype: int64

### Keep only respondents aged 50-67

In [14]:
# Survey year assigned to each wave
wave_to_year = {4: 2011, 6: 2015}
df_filtered["yrsurvey"] = df_filtered["wave"].map(wave_to_year).astype(int)

# Year of birth: non-numeric answers become NaN and those rows are dropped
df_filtered["yrbirth"] = pd.to_numeric(df_filtered["yrbirth"], errors="coerce")
birth_year_known = df_filtered["yrbirth"].notna()
df_filtered = df_filtered[birth_year_known].reset_index(drop=True)

# Age in the survey year (calendar-year difference; birth month is ignored)
df_filtered["age"] = df_filtered["yrsurvey"] - df_filtered["yrbirth"]

# Keep respondents aged 50 to 67, both bounds included
old_enough = df_filtered["age"] >= 50
young_enough = df_filtered["age"] <= 67
df_filtered = df_filtered[old_enough & young_enough].reset_index(drop=True)

In [15]:
# Distinct respondents per wave after the age filter
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    18664
6    14445
Name: mergeid, dtype: int64

### Filter for not retired and employed

In [16]:
# Keep only rows whose employment answer is the "employed" category
employed_label = "Employed or self-employed (including working for family business)"
is_employed = df_filtered["employment"] == employed_label
df_filtered = df_filtered[is_employed].reset_index(drop=True)

In [17]:
# Distinct respondents per wave after the employment filter
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    7757
6    5393
Name: mergeid, dtype: int64

### Filter out those who hold state pensions for disabilities or other special conditions

In [18]:
# "pension" (ep071dno) and "pension1" (ep671dno, from wave 6) are the
# "no current state pensions" indicators; keep a row when either is ticked.
no_pension = df_filtered["pension"] == "Selected"
no_pension_w6 = df_filtered["pension1"] == "Selected"
df_filtered = df_filtered[no_pension | no_pension_w6].reset_index(drop=True)

In [19]:
# Distinct respondents per wave after the pension filter
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    6796
6    4514
Name: mergeid, dtype: int64

### Calculate contribution years

In [33]:
# Job episodes panel, built from the retrospective waves 3 and 7
job_episodes_path = "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharewX_rel8-0-0_gv_job_episodes_panel.dta"
jobs = pd.read_stata(job_episodes_path)

In [34]:
# Inspect which variables the job episodes panel provides
jobs.columns

Index(['mergeid', 'hhid7', 'hhid3', 'jep_w', 'gender', 'yrbirth', 'age',
       'year', 'country', 'situation', 'working', 'unemployed', 'in_education',
       'retired', 'mainjob', 'ordjob', 'industry', 'job_title',
       'working_hours', 'first_wage', 'currency_fw', 'first_income',
       'currency_fi', 'reason_endjob', 'afterlast', 'lastwage', 'currency_lw',
       'lastincome', 'currency_li', 'first_pension', 'currency_fp',
       'country_res_', 'nchildren_nat', 'nchildren', 'age_youngest',
       'age_youngest_nat', 'withpartner', 'married', 'contrib_employee',
       'contrib_employer', 'early_ret_reduction', 'currency_min_pension',
       'currency_max_pension', 'ret_age', 'early_age', 'min_pension',
       'max_pension', 'current_wage', 'current_currency_w', 'current_income',
       'current_currency_i'],
      dtype='object')

In [30]:
# Panel rows whose "situation" counts as work
conditions = ["Employee or self-employed", "Short term job (less than 6 months)"]
is_work_row = jobs["situation"].isin(conditions)
relevant_rows = jobs[is_work_row]

work_rows_by_person = relevant_rows.groupby("mergeid")

# Number of work rows per respondent
result_jobs = work_rows_by_person.size().reset_index(name="yrscontribution2019")

# Earliest year with a work row per respondent
first_contribution = (
    work_rows_by_person["year"].min().reset_index(name="yr1contribution")
)

In [31]:
# Attach the total number of work rows to every respondent
df_filtered = df_filtered.merge(result_jobs, on="mergeid", how="left")

# Subtract the years between the survey and 2019 from that total.
# NOTE(review): this treats every year from the survey up to 2019 as a work
# year recorded in the panel - confirm this holds for the episodes data.
years_since_survey = 2019 - df_filtered["yrsurvey"]
df_filtered["yrscontribution"] = df_filtered["yrscontribution2019"] - years_since_survey

# Attach the first year of work
df_filtered = df_filtered.merge(first_contribution, on="mergeid", how="left")

# Attach the "withpartner" value of the panel row for the survey year
partner_by_year = jobs[["mergeid", "year", "withpartner"]].rename(
    columns={"year": "yrsurvey"}
)
df_filtered = df_filtered.merge(partner_by_year, on=["mergeid", "yrsurvey"], how="left")

# Keep respondents with at least 10 contribution years
# (rows without a match in the panel have NaN here and are dropped as well)
has_enough_years = df_filtered["yrscontribution"] >= 10
df_filtered = df_filtered[has_enough_years]

# Keep respondents whose first work year is at age 12 or later.
# NOTE(review): the previous comment said "before the age of 10" while the
# code uses 12 - confirm which threshold is intended.
first_work_year = df_filtered["yr1contribution"].astype(int)
earliest_plausible_year = df_filtered["yrbirth"].astype(int) + 12
df_filtered = df_filtered[first_work_year >= earliest_plausible_year]

In [32]:
# Distinct respondents per wave after the contribution filters
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    7124
6    8474
Name: mergeid, dtype: int64

### Set legal retirement ages

In [33]:
# Prepare the inputs used to derive the retirement age
month_names = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
# English month name -> month number (1-12)
month_to_numeric = {name: number for number, name in enumerate(month_names, start=1)}

# Month of birth as a number
df_filtered["mbirth"] = df_filtered["dn002_"].map(month_to_numeric)

# Where yr1country is missing, fall back to the year of birth
df_filtered["yr1country"] = df_filtered["yr1country"].fillna(df_filtered["yrbirth"])

# Drop Czech women whose number of children is not a number
is_czech = df_filtered["country"] == "Czech Republic"
is_female = df_filtered["gender"] == "Female"
children_unknown = pd.to_numeric(df_filtered["nb_children"], errors="coerce").isna()
df_filtered = df_filtered[~(is_czech & is_female & children_unknown)]

In [34]:
# Dispatch table: country name -> function that is called with a DataFrame row
# and returns that row's legal retirement age (see calculate_retirement_age).
# The *_age functions presumably come from the star import of utils.retirement
# - confirm.
# NOTE(review): countries without an entry (e.g. Portugal, which appears in the
# sample's country list) get None from calculate_retirement_age; Luxembourg is
# listed here although it is not among the sample countries shown earlier.
country_functions_age = {
    "Austria": austria_age,
    "Belgium": belgium_age,
    "Czech Republic": czech_republic_age,
    "Denmark": denmark_age,
    "Estonia": estonia_age,
    "France": france_age,
    "Germany": germany_age,
    "Italy": italy_age,
    "Luxembourg": luxembourg_age,
    "Poland": poland_age,
    "Slovenia": slovenia_age,
    "Spain": spain_age,
    "Switzerland": switzerland_age,
}


def calculate_retirement_age(row):
    """Return the legal retirement age for one row.

    Looks up the row's "country" in ``country_functions_age`` and calls the
    matching function with the whole row. Returns None when the country has
    no entry in the table.
    """
    age_rule = country_functions_age.get(row["country"])
    if age_rule is None:
        return None
    return age_rule(row)

In [35]:
# Legal retirement age, computed row by row with the country-specific rule
legal_retirement_age = df_filtered.apply(calculate_retirement_age, axis=1)
df_filtered["retirement_age"] = legal_retirement_age

# Keep only respondents who have not yet reached their retirement age
# (those at or above it keep working past the legal age and are dropped)
below_retirement_age = df_filtered["retirement_age"] > df_filtered["age"]
df_filtered = df_filtered[below_retirement_age].reset_index(drop=True)

In [36]:
# Distinct respondents per wave after the retirement-age filter
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    6566
6    7970
Name: mergeid, dtype: int64

### Calculate remaining work horizon and its change due to reforms

In [37]:
# Remaining work horizon: years left until the legal retirement age
years_until_retirement = df_filtered["retirement_age"] - df_filtered["age"]
df_filtered["work_horizon"] = years_until_retirement

In [38]:
# Dispatch table: country name -> function that is called with a DataFrame row
# and returns that row's change in work horizon (see calculate_horizon_change).
# The *_change functions presumably come from the star import of
# utils.retirement - confirm.
# Countries without an entry get None from calculate_horizon_change.
country_functions_change = {
    "Austria": austria_change,
    "Belgium": belgium_change,
    "Czech Republic": czech_republic_change,
    "Denmark": denmark_change,
    "Estonia": estonia_change,
    "France": france_change,
    "Germany": germany_change,
    "Italy": italy_change,
    "Luxembourg": luxembourg_change,
    "Poland": poland_change,
    "Slovenia": slovenia_change,
    "Spain": spain_change,
    "Switzerland": switzerland_change,
}


def calculate_horizon_change(row):
    """Return the reform-induced change in work horizon for one row.

    Looks up the row's "country" in ``country_functions_change`` and calls the
    matching function with the whole row. Returns None when the country has
    no entry in the table.
    """
    change_rule = country_functions_change.get(row["country"])
    if change_rule is None:
        return None
    return change_rule(row)

In [39]:
# Change in work horizon due to reforms, computed row by row per country
horizon_change = df_filtered.apply(calculate_horizon_change, axis=1)
df_filtered["work_horizon_change"] = horizon_change

### Calculate mental health indicators

In [20]:
# Keep only rows that have a value on the eurod scale
has_eurod = df_filtered["eurod"].notna()
df_filtered = df_filtered[has_eurod].reset_index(drop=True)

In [21]:
# Distinct respondents per wave after dropping missing eurod values
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    6693
6    4388
Name: mergeid, dtype: int64

In [42]:
# Transform the depression variables to numbers.

# eurod: the two labelled end points of the scale become 0 and 12.
# NOTE(review): if this column is categorical, replace() relabels the
# categories but the dtype stays categorical - confirm that is acceptable
# downstream.
df_filtered["eurod"] = df_filtered["eurod"].replace(
    {"Not depressed": 0, "Very depressed": 12}
)
# eurodcat: Yes/No -> 1/0
df_filtered["eurodcat"] = df_filtered["eurodcat"].replace({"Yes": 1, "No": 0})

# euro1 ... euro12: 1 when the item is "Selected", 0 for anything else
# (including missing values). A vectorised isin() replaces the element-wise
# DataFrame.applymap, which is deprecated in recent pandas versions.
euro_items = [f"euro{item}" for item in range(1, 13)]
df_filtered[euro_items] = df_filtered[euro_items].isin(["Selected"]).astype(int)

  ].applymap(


In [43]:
# Derive "affective suffering" and "lack of motivation" as two separate
# indicators from a two-factor analysis of the 12 binary euro items.
columns_for_pca = [f"euro{item}" for item in range(1, 13)]
data_pca = df_filtered[columns_for_pca]

# Pearson correlation matrix of the items (np.corrcoef) and its
# eigenvalues / eigenvectors; these are not used further in this cell.
corr_mat = np.corrcoef(data_pca, rowvar=False)
evals, evecs = eigh(corr_mat)

# Maximum-likelihood factor analysis with two varimax-rotated factors
fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
fa.fit(data_pca)

# One score per row and factor
factor_scores = fa.transform(data_pca)

# A row is flagged (1) when its factor score reaches the cutoff, else 0
cutoff = 0.55
suffering_flag = factor_scores[:, 0] >= cutoff
motivation_flag = factor_scores[:, 1] >= cutoff

df_filtered["affective_suffering"] = suffering_flag.astype(int)
df_filtered["motivation_lack"] = motivation_flag.astype(int)

In [44]:
# Loadings of the 12 euro items (rows) on the two rotated factors (columns)
fa.loadings_

array([[0.71778892, 0.05979648],
       [0.03365863, 0.24030064],
       [0.25207875, 0.33871141],
       [0.28795341, 0.197268  ],
       [0.40115568, 0.16228975],
       [0.17681905, 0.39142054],
       [0.41616835, 0.16286874],
       [0.19300032, 0.24843458],
       [0.37475538, 0.2162275 ],
       [0.17719192, 0.29956364],
       [0.01092714, 0.23075122],
       [0.49279713, 0.0648547 ]])

In [45]:
# How many rows are flagged for affective suffering
df_filtered["affective_suffering"].value_counts()

affective_suffering
0    10074
1     4163
Name: count, dtype: int64

In [46]:
# How many rows are flagged for lack of motivation
df_filtered["motivation_lack"].value_counts()

motivation_lack
0    12403
1     1834
Name: count, dtype: int64

### Leave those still present in both waves

In [22]:
# Re-apply the panel restriction: after the filters above some respondents
# are left with a single wave, so keep only those still observed in two waves.
set_ids = df_filtered.groupby("mergeid")["wave"].apply(set)

observed_twice = set_ids.map(len) == 2
mergeids_with_both_waves = set_ids[observed_twice].index.tolist()

still_in_panel = df_filtered["mergeid"].isin(mergeids_with_both_waves)
df_filtered = df_filtered[still_in_panel]

In [23]:
# Distinct respondents per wave in the final balanced sample
df_filtered.groupby("wave")["mergeid"].nunique()

wave
4    3732
6    3732
Name: mergeid, dtype: int64

### ISCO

In [24]:
# Rows without a current-job ISCO code. The column is named
# "job_isco_current" by the rename step; "job_isco" does not exist.
df_filtered["job_isco_current"].isna().sum()

7251

In [25]:
# Distribution of current-job ISCO codes in the final sample. The column is
# named "job_isco_current" by the rename step; "job_isco" does not exist.
df_filtered["job_isco_current"].value_counts(dropna=False)

job_isco
NaN           7251
Don't know      24
4110.0           8
2221.0           5
2341.0           5
              ... 
3253.0           0
3252.0           0
3251.0           0
3240.0           0
4132.0           0
Name: count, Length: 406, dtype: int64

In [36]:
# df_filtered.to_csv(
#   "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/data_clean.csv",
#    index=False,
# )