## Creating the dataset

### Load libraries

In [1]:
import os
import sys

src_path = os.path.abspath("../")
sys.path.append(src_path)

from functools import reduce

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from pandas.io.stata import StataReader
from scipy.linalg import eigh

from utils.retirement import *

### Load and merge all needed datasets for Sharelife

In [2]:
folder = "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew7_rel8-0-0_ALL_datasets_stata/"
merge_columns = ["mergeid", "hhid7", "mergeidp7", "coupleid7", "country", "language"]
datasets = []

for filename in os.listdir(folder):
    if (
        filename.endswith("cv_r.dta")
        or filename.endswith("technical_variables.dta")
        or filename.endswith("dn.dta")
        # or filename.endswith("ra.dta")
        # or filename.endswith("cc.dta")
        # or filename.endswith("dq.dta")
        or filename.endswith("fs.dta")
        # or filename.endswith("gl.dta")
        or filename.endswith("rh.dta")
        or filename.endswith("hs.dta")
        or filename.endswith("rc.dta")
        or filename.endswith("re.dta")
        or filename.endswith("rp.dta")
        or filename.endswith("wq.dta")
        or filename.endswith("gv_weights.dta")
    ):
        file_path = os.path.join(folder, filename)
        dataset = pd.read_stata(file_path, convert_categoricals=False)
        datasets.append(dataset)

df = reduce(lambda left, right: pd.merge(left, right, on=merge_columns), datasets)

In [3]:
# Leave only Sharelife part
df = df[df.mn103_ == 1].reset_index(drop=True)

In [4]:
# Unique individuals
df.mergeid.nunique()

63248

### Transform some variables, filter data and choose only necessary columns

Transform countries from codes to names

In [5]:
with StataReader(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew7_rel8-0-0_ALL_datasets_stata/sharew7_rel8-0-0_cv_r.dta",
    convert_categoricals=True,
) as reader:
    data = reader.read()
    value_labels = reader.value_labels()
df["country"] = df["country"].replace(value_labels.get("country"))

df["country"].value_counts(dropna=False)

country
Estonia           5115
Slovenia          3691
Poland            3559
Spain             3424
Belgium           3333
Czech Republic    3292
Italy             3000
Germany           2984
Austria           2693
Croatia           2408
France            2188
Israel            2131
Sweden            2130
Romania           2114
Slovakia          2077
Lithuania         2035
Finland           2007
Bulgaria          1998
Denmark           1962
Latvia            1734
Switzerland       1648
Hungary           1538
Portugal          1282
Malta             1261
Luxembourg        1250
Cyprus            1233
Greece            1161
Name: count, dtype: int64

Transform gender to 1=female, 0=male

In [6]:
df["gender"] = df["gender"].replace({1: 0, 2: 1})

df["gender"].value_counts(dropna=False)

gender
1    36040
0    27208
Name: count, dtype: int64

Filter for aged 50+ as for 2010 (57+ in 2017)

In [7]:
df = df[df.age2017 >= 57].reset_index(drop=True)

Transform partnerinhh to 1=lives with partner and 0=without

In [8]:
df["partnerinhh"] = df["partnerinhh"].replace({3: 0})

df["partnerinhh"].value_counts(dropna=False)

partnerinhh
1    38124
0    16443
Name: count, dtype: int64

Format variable indicating first year in the country

In [9]:
# Rename and fill missing values with year of birth
df["dn006_"] = df["dn006_"].fillna(df["yrbirth"])
df = df.rename(columns={"dn006_": "yr1country"})

# Drop individuals with missing answers
df = df[df.yr1country > 0].reset_index(drop=True)

df["yr1country"].describe()

count    54558.000000
mean      1947.486638
std          9.321115
min       1912.000000
25%       1941.000000
50%       1949.000000
75%       1955.000000
max       2017.000000
Name: yr1country, dtype: float64

Identify the number of education years

In [10]:
waves = [1, 2, 4, 5, 6, 7]
dfs = []

for wave in waves:
    file_path = f"/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew{wave}_rel8-0-0_ALL_datasets_stata/sharew{wave}_rel8-0-0_dn.dta"
    data = pd.read_stata(file_path, convert_categoricals=False)
    dfs.append(data)

dn_data = pd.concat(dfs, ignore_index=True)

edu_sum = dn_data.groupby("mergeid").dn041_.sum().to_frame().reset_index()
edu_sum = edu_sum[(edu_sum.dn041_ >= 0) & (edu_sum.dn041_ <= 40)].reset_index(drop=True)
edu_sum = edu_sum.rename(columns={"dn041_": "yrseducation"})

df = df.merge(edu_sum, on="mergeid", how="left")

# Drop individuals with missing values
df = df.dropna(subset="yrseducation").reset_index(drop=True)

df.yrseducation.describe()

count    53605.000000
mean        10.819513
std          4.462452
min          0.000000
25%          8.000000
50%         11.000000
75%         13.000000
max         40.000000
Name: yrseducation, dtype: float64

Calculate the number of children

In [11]:
# Drop individuals with missing values (drop if refusal and 0 if nan)
df[["rc023_", "rc039_"]] = df[["rc023_", "rc039_"]].fillna(0)
df = df[df.rc023_ >= 0].reset_index(drop=True)

# Calculate the number of children as sum of biological and adopted
df["nb_children"] = df["rc023_"] + df["rc039_"]

df["nb_children"].describe()

count    53566.000000
mean         2.088228
std          1.342086
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: nb_children, dtype: float64

Identify current job isco

In [12]:
isco_columns = [f"re012isco_{i}" for i in range(1, 21)]


def get_last_valid(row):
    last_valid_index = row.last_valid_index()
    if pd.notnull(last_valid_index):
        return row[last_valid_index]
    else:
        return pd.NA


df["isco"] = df[isco_columns].apply(get_last_valid, axis=1)

# Drop individuals with missing values
df = df[df.isco > 0].reset_index(drop=True)
df = df.dropna(subset="isco").reset_index(drop=True)

# Correct some of the codes that are wrong due to formatting - one 0 is missing at the end
df["isco"] = df["isco"].astype(int)
df["isco"] = df["isco"].apply(lambda x: x * 10 if 99 < x < 1000 else x)

# Drop codes with less than 10 individuals
isco_filter = df["isco"].value_counts().to_frame().reset_index()
selected_isco = isco_filter[isco_filter["count"] >= 10]["isco"].tolist()
df = df[df["isco"].isin(selected_isco)].reset_index(drop=True)

df["isco"].value_counts()

isco
4110    1249
5223    1033
9112     716
2221     635
2341     604
        ... 
8132      10
4413      10
2240      10
2432      10
7542      10
Name: count, Length: 392, dtype: int64

Leave only individuals that did not change job between 2010 and 2017

In [13]:
start_columns = [f"re011_{i}" for i in range(1, 21)]
df["job_start"] = df[start_columns].apply(get_last_valid, axis=1)
df = df[df["job_start"] < 2010].reset_index(drop=True)

Calculate contribution years and first year of contribution

In [14]:
# Load job episodes panel data (from retrospective waves 3 and 7)
jobs = pd.read_stata(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharewX_rel8-0-0_gv_job_episodes_panel.dta"
)

# Calculate number of years of work for each individual
conditions = ["Employee or self-employed", "Short term job (less than 6 months)"]
relevant_rows = jobs[jobs["situation"].isin(conditions)]
result_jobs = (
    relevant_rows.groupby("mergeid").size().reset_index(name="yrscontribution2017")
)

# Calculate the year of first contribution
first_contribution = (
    relevant_rows.groupby("mergeid")["year"].min().reset_index(name="yr1contribution")
)

# Merge with main dataset
df = df.merge(result_jobs, on="mergeid", how="left")
df = df.merge(first_contribution, on="mergeid", how="left")

# Delete those with less than 10 years of contributions in 2010
df = df[df["yrscontribution2017"] >= 10].reset_index(drop=True)

# Delete those who started work before the age of 10
df = df[df["yr1contribution"].astype(int) >= df["yrbirth"].astype(int) + 10]

In [15]:
# Individual left after filters
len(df)

36320

Choose only necessary columns

In [17]:
df = df[
    [
        "mergeid",
        "country",
        "firstwave",
        "gender",
        "yrbirth",
        "mobirth",
        "age2017",
        "yr1country",
        "partnerinhh",
        "yrseducation",
        "nb_children",
        "isco",
        "job_start",
        "yrscontribution2017",
        "yr1contribution",
    ]
]

### Load and merge all needed datasets for SHARE Waves 4 and 6

In [88]:
datasets = []

folders = [
    f"/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew{i}_rel8-0-0_ALL_datasets_stata"
    for i in [4, 6]
]

merge_columns = ["mergeid"]

for folder in folders:
    wave = int(folder.split("sharew")[1].split("_")[0])

    folder_datasets = []

    for filename in os.listdir(folder):
        if (
            filename.endswith("cv_r.dta")
            or filename.endswith("ch.dta")
            or filename.endswith("ep.dta")
            or filename.endswith("gv_health.dta")
        ):
            file_path = os.path.join(folder, filename)
            dataset = pd.read_stata(file_path)

            dataset["wave"] = wave

            folder_datasets.append(dataset)

    if folder_datasets:
        merged_dataset = reduce(
            lambda left, right: pd.merge(
                left,
                right,
                on=left.columns.intersection(right.columns).tolist(),
                how="inner",
            ),
            folder_datasets,
        )

        datasets.append(merged_dataset)

w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)

  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)
  w46 

In [89]:
# Unique individuals
w46.groupby("wave").mergeid.nunique()

wave
4    58000
6    68085
Name: mergeid, dtype: int64

In [111]:
unique_mergeid_w4 = set(w46[w46.wave == 4]["mergeid"].unique())
unique_mergeid_w6 = set(w46[w46.wave == 6]["mergeid"].unique())
unique_mergeid_w46 = unique_mergeid_w4.intersection(unique_mergeid_w6)
unique_mergeid_df = set(df["mergeid"].unique())

In [112]:
len(unique_mergeid_w46.intersection(unique_mergeid_df))

12266

### Transform some variables, filter data and choose only necessary columns

Add year column

In [73]:
wave_to_year = {4: 2011, 6: 2015}
w46["year"] = w46["wave"].map(wave_to_year).astype(int)

Identify number of children by year

In [74]:
# Drop those who refused to answer, put 0 for those who don't know
w46 = w46[(w46.ch001_ != "Refusal")].reset_index(drop=True)
w46["ch001_"] = w46["ch001_"].replace({"Don't know": 0})

children2010 = (
    w46[w46.wave == 4]
    .groupby("hhid4")["ch001_"]
    .max()
    .to_frame(name="nb_children2010")
    .reset_index()
    .fillna(0)
)
children2015 = (
    w46[w46.wave == 6]
    .groupby("hhid6")["ch001_"]
    .max()
    .to_frame(name="nb_children2015")
    .reset_index()
    .fillna(0)
)

w46 = w46.merge(children2010, on="hhid4", how="left")
w46 = w46.merge(children2015, on="hhid6", how="left")

In [76]:
w46.nb_children2010.value_counts(dropna=False)

nb_children2010
NaN     67980
2.0     23723
3.0     11096
1.0     10611
0.0      5452
4.0      4182
5.0      1615
6.0       672
7.0       334
8.0       150
9.0        65
10.0       35
11.0       20
12.0        5
13.0        4
14.0        3
17.0        1
Name: count, dtype: int64

In [77]:
# Unique individuals
w46.groupby("wave").mergeid.nunique()

wave
4    57968
6    67980
Name: mergeid, dtype: int64

Transform partnerinhh to 1=lives with partner and 0=without

In [78]:
w46["partnerinhh"] = w46["partnerinhh"].replace({"Yes": 1, "No": 0})

w46["partnerinhh"].value_counts(dropna=False)

partnerinhh
1                 91146
0                 34802
Not applicable        0
Name: count, dtype: int64

Leave only those with the status of employed

In [79]:
w46 = w46[
    w46.ep005_ == "Employed or self-employed (including working for family business)"
].reset_index(drop=True)

In [80]:
# Unique individuals
w46.groupby("wave").mergeid.nunique()

wave
4    15539
6    16635
Name: mergeid, dtype: int64

Delete those who hold state pensions for disabilities or other special conditions

In [81]:
# ep071dno and ep671dno (from wave 6) - no current state pensions
w46 = w46[(w46.ep071dno == "Selected") | (w46.ep671dno == "Selected")].reset_index(
    drop=True
)

In [82]:
# Unique individuals
w46.groupby("wave").mergeid.nunique()

wave
4    13367
6    14121
Name: mergeid, dtype: int64

In [83]:
unique_mergeid_w4 = set(w46[w46.wave == 4]["mergeid"].unique())
unique_mergeid_w6 = set(w46[w46.wave == 6]["mergeid"].unique())
unique_mergeid_df = set(df["mergeid"].unique())

# Find common unique 'mergeid' values among all three datasets
common_mergeid_count = len(
    unique_mergeid_w4.intersection(unique_mergeid_w6).intersection(unique_mergeid_df)
)

In [84]:
common_mergeid_count

1943

### Set legal retirement ages

In [21]:
# Make some necessary formatting
month_to_numeric = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

df_filtered["mbirth"] = df_filtered["dn002_"].map(month_to_numeric)
df_filtered["yr1country"] = df_filtered["yr1country"].fillna(df_filtered["yrbirth"])
df_filtered = df_filtered[
    ~(
        (df_filtered["country"] == "Czech Republic")
        & (df_filtered["gender"] == "Female")
        & pd.to_numeric(df_filtered["nb_children"], errors="coerce").isna()
    )
]

In [22]:
country_functions_age = {
    "Austria": austria_age,
    "Belgium": belgium_age,
    "Czech Republic": czech_republic_age,
    "Denmark": denmark_age,
    "Estonia": estonia_age,
    "France": france_age,
    "Germany": germany_age,
    "Italy": italy_age,
    "Luxembourg": luxembourg_age,
    "Netherlands": netherlands_age,
    "Poland": poland_age,
    "Slovenia": slovenia_age,
    "Spain": spain_age,
    "Switzerland": switzerland_age,
}


def calculate_retirement_age(row):
    country = row["country"]
    if country in country_functions_age:
        return country_functions_age[country](row)
    else:
        return None

In [23]:
# Apply country-wise functions to calculate legal retirement age
df_filtered["retirement_age"] = df_filtered.apply(calculate_retirement_age, axis=1)

# Delete those who are above the retirement age (continue to work longer)
df_filtered = df_filtered[
    df_filtered["retirement_age"] > df_filtered["age"]
].reset_index(drop=True)

In [24]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    2891
2    4483
4    6864
5    8383
6    8196
8    3080
Name: mergeid, dtype: int64

### Calculate resting work horizon and its change due to reforms

In [25]:
# Calculate resting work horizon
df_filtered["work_horizon"] = df_filtered["retirement_age"] - df_filtered["age"]

In [26]:
country_functions_change = {
    "Austria": austria_change,
    "Belgium": belgium_change,
    "Czech Republic": czech_republic_change,
    "Denmark": denmark_change,
    "Estonia": estonia_change,
    "France": france_change,
    "Germany": germany_change,
    "Italy": italy_change,
    "Luxembourg": luxembourg_change,
    "Netherlands": netherlands_change,
    "Poland": poland_change,
    "Slovenia": slovenia_change,
    "Spain": spain_change,
    "Switzerland": switzerland_change,
}


def calculate_horizon_change(row):
    country = row["country"]
    if country in country_functions_change:
        return country_functions_change[country](row)
    else:
        return None

In [27]:
# Apply country-wise functions to calculate work horizon change due to reforms
df_filtered["work_horizon_change"] = df_filtered.apply(calculate_horizon_change, axis=1)

### Calculate mental health indicators

In [28]:
# Filter out those with missing values for eurod scale
df_filtered = df_filtered.dropna(subset=["eurod"]).reset_index(drop=True)

In [29]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    2861
2    4431
4    6752
5    8253
6    8006
8    3052
Name: mergeid, dtype: int64

In [30]:
# Transform to numeric
df_filtered["eurod"] = df_filtered["eurod"].replace(
    {"Not depressed": 0, "Very depressed": 12}
)
df_filtered["eurodcat"] = df_filtered["eurodcat"].replace({"Yes": 1, "No": 0})
df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
] = df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
].applymap(
    lambda x: 1 if x == "Selected" else 0
)

  ].applymap(


In [31]:
# Conduct a PCA to deduct affective suffering and lack of motivation as separate indicators
columns_for_pca = [
    "euro1",
    "euro2",
    "euro3",
    "euro4",
    "euro5",
    "euro6",
    "euro7",
    "euro8",
    "euro9",
    "euro10",
    "euro11",
    "euro12",
]
data_pca = df_filtered[columns_for_pca]

corr_mat = np.corrcoef(data_pca, rowvar=False)  # Tetrachoric correlation matrix

evals, evecs = eigh(corr_mat)  # Eigenvalues and eigenvectors

fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
fa.fit(data_pca)

factor_scores = fa.transform(data_pca)

cutoff = 0.55

df_filtered["affective_suffering"] = 0
df_filtered["motivation_lack"] = 0

df_filtered["affective_suffering"] = (factor_scores[:, 0] >= cutoff).astype(int)
df_filtered["motivation_lack"] = (factor_scores[:, 1] >= cutoff).astype(int)

In [32]:
fa.loadings_

array([[0.72230676, 0.06609387],
       [0.01712821, 0.21397277],
       [0.2376367 , 0.30820253],
       [0.26465848, 0.21428754],
       [0.39408909, 0.17606154],
       [0.17611074, 0.38047347],
       [0.40292027, 0.19079989],
       [0.19200065, 0.25428392],
       [0.36061593, 0.2733768 ],
       [0.16281977, 0.32561402],
       [0.0287768 , 0.21387176],
       [0.48854601, 0.06975363]])

In [33]:
df_filtered.affective_suffering.value_counts()

affective_suffering
0    40948
1    17282
Name: count, dtype: int64

In [34]:
df_filtered.motivation_lack.value_counts()

motivation_lack
0    50322
1     7908
Name: count, dtype: int64

### Working conditions: demand-control model

# Convert work characteristics questions to numeric
for i in [26, 27, 28, 29, 34, 35]:
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 5,
            "Agree": 4,
            "Don't know": 3,
            "Disagree": 2,
            "Strongly disagree": 1,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"])

for i in range(30, 34):
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 1,
            "Agree": 2,
            "Don't know": 3,
            "Disagree": 4,
            "Strongly disagree": 5,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"]).reset_index(drop=True)

# Calculate groups above and below median for job demands and control
df_filtered["job_demands"] = 0
df_filtered["job_control"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep027_"][i] + df_filtered["ep028_"][i]) / 2) > (
        (df_filtered["ep027_"] + df_filtered["ep028_"]) / 2
    ).median():
        df_filtered["job_demands"][i] = 1
    if (
        (df_filtered["ep029_"][i] + df_filtered["ep030_"][i] + df_filtered["ep031_"][i])
        / 3
    ) > (
        (df_filtered["ep029_"] + df_filtered["ep030_"] + df_filtered["ep031_"]) / 3
    ).median():
        df_filtered["job_control"][i] = 1

# Define categories of jobs - passive, active, high strain, low strain
df_filtered["job_passive"] = 0
df_filtered["job_active"] = 0
df_filtered["job_high_strain"] = 0
df_filtered["job_low_strain"] = 0

for i in range(len(df_filtered)):
    if df_filtered["job_demands"][i] == 0 and df_filtered["job_control"][i] == 0:
        df_filtered["job_passive"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 1:
        df_filtered["job_active"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 0:
        df_filtered["job_high_strain"][i] = 1
    else:
        df_filtered["job_low_strain"][i] = 1

# Add job recognition, prospects, insecurity
df_filtered["job_poor_recognition"] = 0
df_filtered["job_poor_prospects"] = 0
df_filtered["job_insecurity"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep032_"][i] + df_filtered["ep033_"][i]) / 2) > (
        (df_filtered["ep032_"] + df_filtered["ep033_"]) / 2
    ).median():
        df_filtered["job_poor_recognition"][i] = 1
    if df_filtered["ep034_"][i] > df_filtered["ep034_"].median():
        df_filtered["job_poor_prospects"][i] = 1
    if df_filtered["ep035_"][i] > df_filtered["ep035_"].median():
        df_filtered["job_insecurity"][i] = 1

In [35]:
# Unique individuals
print(f"There are {df_filtered.mergeid.nunique()} unique individuals in our sample.")
print(f"By wave couples:")
print(df_filtered.groupby("wave_couple").mergeid.nunique())
print(f"By wave:")
print(df_filtered.groupby("wave").mergeid.nunique())

There are 16458 unique individuals in our sample.
By wave couples:
wave_couple
w12     4861
w24     8094
w45    10341
w56    10397
w68     8790
Name: mergeid, dtype: int64
By wave:
wave
1    2861
2    4431
4    6752
5    8253
6    8006
8    3052
Name: mergeid, dtype: int64
