## Creating the dataset

### Load libraries

In [1]:
import os
import sys

src_path = os.path.abspath("../")
sys.path.append(src_path)

from functools import reduce

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from scipy.linalg import eigh

from utils.retirement import *

### Load and merge all needed datasets

In [2]:
datasets = []

folders = [
    f"/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew{i}_rel8-0-0_ALL_datasets_stata"
    for i in [1, 2, 4, 5, 6, 7, 8]
]

merge_columns = ["mergeid", "hhid1", "mergeidp1", "coupleid1", "country", "language"]

for folder in folders:
    wave = int(folder.split("sharew")[1].split("_")[0])

    folder_datasets = []

    for filename in os.listdir(folder):
        if (
            filename.endswith("dn.dta")
            or filename.endswith("ep.dta")
            or filename.endswith("ch.dta")
            or filename.endswith("gv_health.dta")
        ):
            file_path = os.path.join(folder, filename)
            dataset = pd.read_stata(file_path)

            dataset["wave"] = wave

            folder_datasets.append(dataset)

    if folder_datasets:
        merged_dataset = reduce(
            lambda left, right: pd.merge(
                left,
                right,
                on=left.columns.intersection(right.columns).tolist(),
                how="inner",
            ),
            folder_datasets,
        )

        datasets.append(merged_dataset)

df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)

  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  df = pd.concat(dat

### Choose only necessary columns

In [3]:
df_short = df[
    [
        "mergeid",
        "wave",
        "country",
        "dn002_",
        "dn003_",
        "dn006_",
        "dn042_",
        "ch001_",
        "ep005_",
        "ep009_",
        "ep026_",
        "ep027_",
        "ep028_",
        "ep029_",
        "ep030_",
        "ep031_",
        "ep032_",
        "ep033_",
        "ep034_",
        "ep035_",
        "ep036_",
        "ep037_",
        "ep071dno",
        "ep671dno",
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
        "eurod",
        "eurodcat",
    ]
]

### Filter countries

In [4]:
# Keep only countries present in both waves in each couple
filtered_countries = []

wave_pairs = [(1, 2), (2, 4), (4, 5), (5, 6), (6, 7), (7, 8)]

for wave1, wave2 in wave_pairs:
    wave1_data = df_short[df_short["wave"] == wave1]
    wave2_data = df_short[df_short["wave"] == wave2]

    common_countries = set(wave1_data["country"]) & set(wave2_data["country"])

    common_countries_wave1 = wave1_data[wave1_data["country"].isin(common_countries)]
    common_countries_wave2 = wave2_data[wave2_data["country"].isin(common_countries)]

    common_countries_wave1["wave_couple"] = f"w{wave1}{wave2}"
    common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"

    filtered_countries.append(common_countries_wave1)
    filtered_countries.append(common_countries_wave2)

df_filtered = pd.concat(filtered_countries, sort=False, axis=0).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave1["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set 

In [5]:
# Leave only 15 principal countries of interest
countries_list = [
    "Austria",
    "Belgium",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "France",
    "Germany",
    "Italy",
    "Luxembourg",
    "Netherlands",
    "Poland",
    "Portugal",
    "Slovenia",
    "Spain",
    "Switzerland",
]

df_filtered = df_filtered[df_filtered["country"].isin(countries_list)].reset_index(
    drop=True
)

In [6]:
print(
    f"After filtering we have {df_filtered.country.nunique()} out of {df_short.country.nunique()} countries left. They are: {df_filtered.country.unique()}"
)

After filtering we have 15 out of 29 countries left. They are: ['Austria' 'Belgium' 'Switzerland' 'Germany' 'Denmark' 'Spain' 'France'
 'Italy' 'Netherlands' 'Czech Republic' 'Poland' 'Estonia' 'Slovenia'
 'Luxembourg' 'Portugal']


In [7]:
for couple in df_filtered.wave_couple.unique():
    print(
        f"{couple} - {df_filtered[df_filtered.wave_couple == couple].country.nunique()} common countries:"
    )
    print(df_filtered[df_filtered.wave_couple == couple]["country"].unique())

w12 - 9 common countries:
['Austria' 'Belgium' 'Switzerland' 'Germany' 'Denmark' 'Spain' 'France'
 'Italy' 'Netherlands']
w24 - 11 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Spain' 'France' 'Italy' 'Netherlands' 'Poland']
w45 - 12 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Netherlands' 'Slovenia']
w56 - 12 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Luxembourg' 'Slovenia']
w67 - 14 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Luxembourg' 'Poland' 'Portugal'
 'Slovenia']
w78 - 13 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Luxembourg' 'Poland' 'Slovenia']


### Filter for only aged 50-67

In [8]:
wave_to_year = {1: 2004, 2: 2007, 4: 2011, 5: 2013, 6: 2015, 7: 2017, 8: 2020}

# Year of survey
df_filtered["yrsurvey"] = df_filtered["wave"].map(wave_to_year).astype(int)

# Year of birth
df_filtered["yrbirth"] = df_filtered["dn003_"]
df_filtered["yrbirth"] = pd.to_numeric(df_filtered["yrbirth"], errors="coerce")
df_filtered = df_filtered[df_filtered["yrbirth"].notna()].reset_index(drop=True)

# Age
df_filtered["age"] = df_filtered["yrsurvey"] - df_filtered["yrbirth"]

# Filter for 50+
df_filtered = df_filtered[
    (df_filtered["age"] >= 50) & (df_filtered["age"] <= 67)
].reset_index(drop=True)

### Filter for not retired and employed

In [9]:
df_filtered["employment"] = df_filtered["ep005_"]
df_filtered = df_filtered[
    df_filtered.employment
    == "Employed or self-employed (including working for family business)"
].reset_index(drop=True)

### Filter out those who hold state pensions for disabilities or other special conditions

In [10]:
# ep071dno and ep671dno (from wave 6) - no current state pensions
df_filtered = df_filtered[
    (df_filtered.ep071dno == "Selected") | (df_filtered.ep671dno == "Selected")
].reset_index(drop=True)

### Calculate contribution years

In [11]:
# Load job episodes panel data (from retrospective waves 3 and 7)
jobs = pd.read_stata(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharewX_rel8-0-0_gv_job_episodes_panel.dta"
)

In [12]:
# Calculate number of years of work for each individual
conditions = ["Employee or self-employed", "Short term job (less than 6 months)"]
relevant_rows = jobs[jobs["situation"].isin(conditions)]
result_jobs = (
    relevant_rows.groupby("mergeid").size().reset_index(name="yrscontribution")
)

In [13]:
first_contribution = (
    relevant_rows.groupby("mergeid")["year"].min().reset_index(name="yr1contribution")
)

In [14]:
# Merge with main dataset
df_filtered = df_filtered.merge(result_jobs, on="mergeid", how="left")
df_filtered = df_filtered.merge(first_contribution, on="mergeid", how="left")

# Delete those with less than 10 years of contributions
df_filtered = df_filtered[df_filtered["yrscontribution"] >= 10]
# Delete those who started work before the age of 10
df_filtered = df_filtered[
    df_filtered["yr1contribution"].astype(int)
    >= df_filtered["yrbirth"].astype(int) + 12
]

### Set legal retirement ages

In [15]:
# Make some necessary formatting
month_to_numeric = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

df_filtered["gender"] = df_filtered["dn042_"]
df_filtered["nb_children"] = df_filtered["ch001_"]
df_filtered["mbirth"] = df_filtered["dn002_"].map(month_to_numeric)
df_filtered["yr1country"] = df_filtered["dn006_"]
df_filtered["yr1country"] = df_filtered["yr1country"].fillna(df_filtered["yrbirth"])
df_filtered["job_status"] = df_filtered["ep009_"]
df_filtered = df_filtered[
    ~(
        (df_filtered["country"] == "Czech Republic")
        & (df_filtered["gender"] == "Female")
        & pd.to_numeric(df_filtered["nb_children"], errors="coerce").isna()
    )
]

In [16]:
country_functions_age = {
    "Austria": austria_age,
    "Belgium": belgium_age,
    "Czech Republic": czech_republic_age,
    "Denmark": denmark_age,
    "Estonia": estonia_age,
    "France": france_age,
    "Germany": germany_age,
    "Italy": italy_age,
    "Luxembourg": luxembourg_age,
    "Netherlands": netherlands_age,
    "Poland": poland_age,
    "Portugal": portugal_age,
    "Slovenia": slovenia_age,
    "Spain": spain_age,
    "Switzerland": switzerland_age,
}


def calculate_retirement_age(row):
    country = row["country"]
    if country in country_functions_age:
        return country_functions_age[country](row)
    else:
        return None

In [17]:
# Apply country-wise functions to calculate legal retirement age
df_filtered["retirement_age"] = df_filtered.apply(calculate_retirement_age, axis=1)

# Delete those who are above the retirement age (continue to work longer)
df_filtered = df_filtered[
    df_filtered["retirement_age"] > df_filtered["age"]
].reset_index(drop=True)

### Calculate resting work horizon and its change due to reforms

In [18]:
# Calculate resting work horizon
df_filtered["work_horizon"] = df_filtered["retirement_age"] - df_filtered["age"]

In [19]:
country_functions_change = {
    "Austria": austria_change,
    "Belgium": belgium_change,
    "Czech Republic": czech_republic_change,
    "Denmark": denmark_change,
    "Estonia": estonia_change,
    "France": france_change,
    "Germany": germany_change,
    "Italy": italy_change,
    "Luxembourg": luxembourg_change,
    "Netherlands": netherlands_change,
    "Poland": poland_change,
    "Portugal": portugal_change,
    "Slovenia": slovenia_change,
    "Spain": spain_change,
    "Switzerland": switzerland_change,
}


def calculate_horizon_change(row):
    country = row["country"]
    if country in country_functions_change:
        return country_functions_change[country](row)
    else:
        return None

In [20]:
# Apply country-wise functions to calculate work horizon change due to reforms
df_filtered["work_horizon_change"] = df_filtered.apply(calculate_horizon_change, axis=1)

# Filter for change <0
df_filtered = df_filtered[df_filtered["work_horizon_change"] >= 0].reset_index(
    drop=True
)

### Calculate mental health indicators

In [21]:
# Filter out those with missing values for eurod scale
df_filtered = df_filtered.dropna(subset=["eurod"]).reset_index(drop=True)

In [22]:
# Transform to numeric
df_filtered["eurod"] = df_filtered["eurod"].replace(
    {"Not depressed": 0, "Very depressed": 12}
)
df_filtered["eurodcat"] = df_filtered["eurodcat"].replace({"Yes": 1, "No": 0})
df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
] = df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
].applymap(
    lambda x: 1 if x == "Selected" else 0
)

  ].applymap(


In [23]:
# Conduct a PCA to deduct affective suffering and lack of motivation as separate indicators
columns_for_pca = [
    "euro1",
    "euro2",
    "euro3",
    "euro4",
    "euro5",
    "euro6",
    "euro7",
    "euro8",
    "euro9",
    "euro10",
    "euro11",
    "euro12",
]
data_pca = df_filtered[columns_for_pca]

corr_mat = np.corrcoef(data_pca, rowvar=False)  # Tetrachoric correlation matrix

evals, evecs = eigh(corr_mat)  # Eigenvalues and eigenvectors

fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
fa.fit(data_pca)

factor_scores = fa.transform(data_pca)

cutoff = 0.55

df_filtered["affective_suffering"] = 0
df_filtered["motivation_lack"] = 0

df_filtered["affective_suffering"] = (factor_scores[:, 0] >= cutoff).astype(int)
df_filtered["motivation_lack"] = (factor_scores[:, 1] >= cutoff).astype(int)

In [24]:
fa.loadings_

array([[0.72173158, 0.07005404],
       [0.02150356, 0.21090593],
       [0.2352875 , 0.30564443],
       [0.26387677, 0.21396168],
       [0.39401196, 0.17469246],
       [0.17411812, 0.37978196],
       [0.40375263, 0.19374724],
       [0.19070015, 0.25484749],
       [0.36144166, 0.27437803],
       [0.16484074, 0.32981706],
       [0.02631649, 0.21843803],
       [0.48822628, 0.07129066]])

In [25]:
df_filtered.affective_suffering.value_counts()

affective_suffering
0    42302
1    17969
Name: count, dtype: int64

In [26]:
df_filtered.motivation_lack.value_counts()

motivation_lack
0    52032
1     8239
Name: count, dtype: int64

### Working conditions: demand-control model

In [27]:
# Convert work characteristics questions to numeric
for i in [26, 27, 28, 29, 34, 35]:
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 5,
            "Agree": 4,
            "Don't know": 3,
            "Disagree": 2,
            "Strongly disagree": 1,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"])

for i in range(30, 34):
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 1,
            "Agree": 2,
            "Don't know": 3,
            "Disagree": 4,
            "Strongly disagree": 5,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"]).reset_index(drop=True)

In [28]:
# Calculate groups above and below median for job demands and control
df_filtered["job_demands"] = 0
df_filtered["job_control"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep027_"][i] + df_filtered["ep028_"][i]) / 2) > (
        (df_filtered["ep027_"] + df_filtered["ep028_"]) / 2
    ).median():
        df_filtered["job_demands"][i] = 1
    if (
        (df_filtered["ep029_"][i] + df_filtered["ep030_"][i] + df_filtered["ep031_"][i])
        / 3
    ) > (
        (df_filtered["ep029_"] + df_filtered["ep030_"] + df_filtered["ep031_"]) / 3
    ).median():
        df_filtered["job_control"][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_control"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_demands"][i] = 1


In [29]:
# Define categories of jobs - passive, active, high strain, low strain
df_filtered["job_passive"] = 0
df_filtered["job_active"] = 0
df_filtered["job_high_strain"] = 0
df_filtered["job_low_strain"] = 0

for i in range(len(df_filtered)):
    if df_filtered["job_demands"][i] == 0 and df_filtered["job_control"][i] == 0:
        df_filtered["job_passive"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 1:
        df_filtered["job_active"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 0:
        df_filtered["job_high_strain"][i] = 1
    else:
        df_filtered["job_low_strain"][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_passive"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_low_strain"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_active"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_high_strain"][i] = 1


In [30]:
# Add job recognition, prospects, insecurity
df_filtered["job_poor_recognition"] = 0
df_filtered["job_poor_prospects"] = 0
df_filtered["job_insecurity"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep032_"][i] + df_filtered["ep033_"][i]) / 2) > (
        (df_filtered["ep032_"] + df_filtered["ep033_"]) / 2
    ).median():
        df_filtered["job_poor_recognition"][i] = 1
    if df_filtered["ep034_"][i] > df_filtered["ep034_"].median():
        df_filtered["job_poor_prospects"][i] = 1
    if df_filtered["ep035_"][i] > df_filtered["ep035_"].median():
        df_filtered["job_insecurity"][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_poor_recognition"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_insecurity"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["job_poor_prospects"][i] = 1


In [31]:
# Unique individuals
print(f"There are {df_filtered.mergeid.nunique()} unique individuals in our sample.")
print(f"By wave couples:")
print(df_filtered.groupby("wave_couple").mergeid.nunique())
print(f"By wave:")
print(df_filtered.groupby("wave").mergeid.nunique())

There are 14380 unique individuals in our sample.
By wave couples:
wave_couple
w12    5047
w24    8207
w45    6723
w56    6535
w67    6592
w78    3250
Name: mergeid, dtype: int64
By wave:
wave
1    2987
2    4538
4    6762
5     668
6    6505
7     612
8    2779
Name: mergeid, dtype: int64


In [32]:
# df_filtered.to_csv('/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/data_clean.csv',index=False)