## Creating the dataset

### Load libraries

In [1]:
import os
import sys

src_path = os.path.abspath("../")
sys.path.append(src_path)

from functools import reduce

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from pandas.io.stata import StataReader
from scipy.linalg import eigh

from utils.retirement import *

### Load and merge all needed datasets

In [2]:
folder = "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew7_rel8-0-0_ALL_datasets_stata/"
merge_columns = ["mergeid", "hhid7", "mergeidp7", "coupleid7", "country", "language"]
datasets = []

for filename in os.listdir(folder):
    if (
        filename.endswith("cv_r.dta")
        or filename.endswith("technical_variables.dta")
        or filename.endswith("dn.dta")
        # or filename.endswith("ra.dta")
        # or filename.endswith("cc.dta")
        # or filename.endswith("dq.dta")
        or filename.endswith("fs.dta")
        # or filename.endswith("gl.dta")
        or filename.endswith("rh.dta")
        or filename.endswith("hs.dta")
        or filename.endswith("rc.dta")
        or filename.endswith("re.dta")
        or filename.endswith("rp.dta")
        or filename.endswith("wq.dta")
        or filename.endswith("gv_weights.dta")
    ):
        file_path = os.path.join(folder, filename)
        dataset = pd.read_stata(file_path, convert_categoricals=False)
        datasets.append(dataset)

df = reduce(lambda left, right: pd.merge(left, right, on=merge_columns), datasets)

In [3]:
# Leave only Sharelife part
df = df[df.mn103_ == 1].reset_index(drop=True)

In [4]:
df.mergeid.nunique()

63248

### Transform some variables and choose only necessary columns

In [5]:
# Transform countries from codes to names
with StataReader(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew7_rel8-0-0_ALL_datasets_stata/sharew7_rel8-0-0_cv_r.dta",
    convert_categoricals=True,
) as reader:
    data = reader.read()
    value_labels = reader.value_labels()
df["country"] = df["country"].replace(value_labels.get("country"))

df["country"].value_counts(dropna=False)

country
Estonia           5115
Slovenia          3691
Poland            3559
Spain             3424
Belgium           3333
Czech Republic    3292
Italy             3000
Germany           2984
Austria           2693
Croatia           2408
France            2188
Israel            2131
Sweden            2130
Romania           2114
Slovakia          2077
Lithuania         2035
Finland           2007
Bulgaria          1998
Denmark           1962
Latvia            1734
Switzerland       1648
Hungary           1538
Portugal          1282
Malta             1261
Luxembourg        1250
Cyprus            1233
Greece            1161
Name: count, dtype: int64

In [6]:
# Transform gender to 1=female, 0=male
df["gender"] = df["gender"].replace({1: 0, 2: 1})

df["gender"].value_counts(dropna=False)

gender
1    36040
0    27208
Name: count, dtype: int64

In [7]:
# Transform partnerinhh to 1=lives with partner and 0=without
df["partnerinhh"] = df["partnerinhh"].replace({3: 0})

df["partnerinhh"].value_counts(dropna=False)

partnerinhh
1    45347
0    17901
Name: count, dtype: int64

In [8]:
# Rename variable indicating first year in country and fill missing values with year of birth
df["dn006_"] = df["dn006_"].fillna(df["yrbirth"])
df = df.rename(columns={"dn006_": "yr1country"})

# Drop individuals with missing answers
df = df[df.yr1country > 0].reset_index(drop=True)

df["yr1country"].describe()

count    63233.000000
mean      1949.768934
std         10.511566
min       1912.000000
25%       1943.000000
50%       1951.000000
75%       1958.000000
max       2017.000000
Name: yr1country, dtype: float64

In [9]:
# Identify the number of education years
waves = [1, 2, 4, 5, 6, 7]
dfs = []

for wave in waves:
    file_path = f"/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharew{wave}_rel8-0-0_ALL_datasets_stata/sharew{wave}_rel8-0-0_dn.dta"
    data = pd.read_stata(file_path, convert_categoricals=False)
    dfs.append(data)

dn_data = pd.concat(dfs, ignore_index=True)

edu_sum = dn_data.groupby("mergeid").dn041_.sum().to_frame().reset_index()
edu_sum = edu_sum[(edu_sum.dn041_ >= 0) & (edu_sum.dn041_ <= 40)].reset_index(drop=True)
edu_sum = edu_sum.rename(columns={"dn041_": "yrseducation"})

df = df.merge(edu_sum, on="mergeid", how="left")

# Drop individuals with missing values
df = df.dropna(subset="yrseducation").reset_index(drop=True)

df.yrseducation.describe()

count    62153.000000
mean        11.041068
std          4.415459
min          0.000000
25%          8.000000
50%         11.000000
75%         13.000000
max         40.000000
Name: yrseducation, dtype: float64

In [12]:
# Drop individuals with missing values for number of children (drop if refusal and 0 if nan)
df[["rc023_", "rc039_"]] = df[["rc023_", "rc039_"]].fillna(0)
df = df[df.rc023_ >= 0].reset_index(drop=True)

# Calculate the number of children as sum of biological and adopted
df["nb_children"] = df["rc023_"] + df["rc039_"]

df["nb_children"].describe()

count    62112.000000
mean         2.059892
std          1.333923
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: nb_children, dtype: float64

In [14]:
# Identify current job isco
isco_columns = [f"re012isco_{i}" for i in range(1, 21)]


def get_last_valid(row):
    last_valid_index = row.last_valid_index()
    if pd.notnull(last_valid_index):
        return row[last_valid_index]
    else:
        return pd.NA


df["isco"] = df[isco_columns].apply(get_last_valid, axis=1)

# Drop individuals with missing values
df = df[df.isco > 0].reset_index(drop=True)
df = df.dropna(subset="isco").reset_index(drop=True)

df["isco"].value_counts()

isco
4110.0    1447
5223.0    1249
9112.0     873
2221.0     776
2341.0     716
          ... 
2122.0       1
5161.0       1
8334.0       1
2358.0       1
3514.0       1
Name: count, Length: 449, dtype: int64

In [17]:
df["isco"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 46517 entries, 0 to 46516
Series name: isco
Non-Null Count  Dtype 
--------------  ----- 
46517 non-null  object
dtypes: object(1)
memory usage: 363.5+ KB


In [16]:
def modify_isco(value):
    if len(value) < 6:
        value = value[:3] + "0" + value[3:]
    return value


df["isco"].apply(modify_isco).unique()

TypeError: object of type 'numpy.float64' has no len()

In [18]:
df

Unnamed: 0,mergeid,hhid7,mergeidp7,coupleid7,country,language,fs002_,fs003_,fs004_,fs005_,...,hs063d2,hs063d3,hs063d4,hs063d5,hs063d6,hs063d7,hs063dno,hs063dot,nb_children,isco
0,AT-057726-04,AT-057726-A,AT-057726-01,AT-057726-01-04,Austria,11,5.0,,5.0,,...,,,,,,,,,0.0,5414.0
1,AT-059165-01,AT-059165-A,AT-059165-02,AT-059165-01-02,Austria,11,1.0,1990.0,5.0,,...,,,,,,,,,3.0,5221.0
2,AT-079798-01,AT-079798-A,AT-079798-02,AT-079798-01-02,Austria,11,1.0,1995.0,5.0,,...,,,,,,,,,2.0,3431.0
3,AT-084096-02,AT-084096-A,AT-084096-01,AT-084096-01-02,Austria,11,5.0,,5.0,,...,,,,,,,,,1.0,3313.0
4,AT-094882-01,AT-094882-A,AT-094882-02,AT-094882-01-02,Austria,11,5.0,,5.0,,...,,,,,,,,,5.0,6330.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12015,SK-995042-01,SK-995042-A,SK-995042-02,SK-995042-01-02,Slovakia,63,5.0,,5.0,,...,,,,,,,,,3.0,9211.0
12016,SK-995042-02,SK-995042-A,SK-995042-01,SK-995042-01-02,Slovakia,63,5.0,,5.0,,...,,,,,,,,,3.0,8211.0
12017,SK-996004-01,SK-996004-A,,,Slovakia,63,5.0,,5.0,,...,,,,,,,,,3.0,4414.0
12018,SK-999958-01,SK-999958-A,SK-999958-02,SK-999958-01-02,Slovakia,63,5.0,,5.0,,...,,,,,,,,,3.0,8342.0


In [15]:
df_short = df[
    [
        "mergeid",
        "country",
        "firstwave",
        "gender",
        "yrbirth",
        "mobirth",
        "age2017",
        "yr1country",
        "partnerinhh",
        "highest_education",
        "nb_children",
        "isco",
        "ep009_",
        "ep012_",
        "ep016_",
        "ep616isco",
        # "ep026_",
        # "ep027_",
        # "ep028_",
        # "ep029_",
        # "ep030_",
        # "ep031_",
        # "ep032_",
        # "ep033_",
        # "ep034_",
        # "ep035_",
        # "ep036_",
        # "ep037_",
        "ep071dno",
        "ep671dno",
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
        "eurod",
        "eurodcat",
    ]
]

In [16]:
# Rename some essential columns
df_short = df_short.rename(
    columns={
        "dn006_": "yr1country",
        "dn010_": "highest_education",
        "ch001_": "nb_children",
        "ep005_": "employment",
        "ep009_": "job_status",
        "ep012_": "job_week_hours",
        "ep016_": "job_name",
        "ep616isco": "job_isco",
        "ep071dno": "pension",
        "ep671dno": "pension1",
    }
)

In [21]:
for i in [1, 2, 4, 5, 6, 8]:
    print(
        f"Len wave {i}: {len(df_short[df_short.wave == i])}, Missing isco: {df_short[df_short.wave == i]['job_isco'].isna().sum()}"
    )

Len wave 1: 30419, Missing isco: 30419
Len wave 2: 37143, Missing isco: 37143
Len wave 4: 58000, Missing isco: 58000
Len wave 5: 66065, Missing isco: 66065
Len wave 6: 68085, Missing isco: 61670
Len wave 8: 46733, Missing isco: 42036


### Filter countries

In [6]:
# Keep only countries present in both waves in each couple
filtered_countries = []

wave_pairs = [(1, 2), (2, 4), (4, 5), (5, 6), (6, 8)]

for wave1, wave2 in wave_pairs:
    wave1_data = df_short[df_short["wave"] == wave1]
    wave2_data = df_short[df_short["wave"] == wave2]

    common_countries = set(wave1_data["country"]) & set(wave2_data["country"])

    common_countries_wave1 = wave1_data[wave1_data["country"].isin(common_countries)]
    common_countries_wave2 = wave2_data[wave2_data["country"].isin(common_countries)]

    common_countries_wave1["wave_couple"] = f"w{wave1}{wave2}"
    common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"

    filtered_countries.append(common_countries_wave1)
    filtered_countries.append(common_countries_wave2)

df_filtered = pd.concat(filtered_countries, sort=False, axis=0).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave1["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_countries_wave2["wave_couple"] = f"w{wave1}{wave2}"
A value is trying to be set 

In [7]:
# Leave only 15 principal countries of interest
countries_list = [
    "Austria",
    "Belgium",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "France",
    "Germany",
    "Italy",
    "Luxembourg",
    "Netherlands",
    "Poland",
    "Portugal",
    "Slovenia",
    "Spain",
    "Switzerland",
]

df_filtered = df_filtered[df_filtered["country"].isin(countries_list)].reset_index(
    drop=True
)

In [8]:
print(
    f"After filtering we have {df_filtered.country.nunique()} out of {df_short.country.nunique()} countries left. They are: {df_filtered.country.unique()}"
)

After filtering we have 14 out of 29 countries left. They are: ['Austria' 'Belgium' 'Switzerland' 'Germany' 'Denmark' 'Spain' 'France'
 'Italy' 'Netherlands' 'Czech Republic' 'Poland' 'Estonia' 'Slovenia'
 'Luxembourg']


In [9]:
for couple in df_filtered.wave_couple.unique():
    print(
        f"{couple} - {df_filtered[df_filtered.wave_couple == couple].country.nunique()} common countries:"
    )
    print(df_filtered[df_filtered.wave_couple == couple]["country"].unique())

w12 - 9 common countries:
['Austria' 'Belgium' 'Switzerland' 'Germany' 'Denmark' 'Spain' 'France'
 'Italy' 'Netherlands']
w24 - 11 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Spain' 'France' 'Italy' 'Netherlands' 'Poland']
w45 - 12 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Netherlands' 'Slovenia']
w56 - 12 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Luxembourg' 'Slovenia']
w68 - 13 common countries:
['Austria' 'Belgium' 'Czech Republic' 'Switzerland' 'Germany' 'Denmark'
 'Estonia' 'Spain' 'France' 'Italy' 'Luxembourg' 'Poland' 'Slovenia']


In [10]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    22024
2    27454
4    50946
5    58911
6    53052
8    28587
Name: mergeid, dtype: int64

### Filter for only aged 50-67

In [11]:
wave_to_year = {1: 2004, 2: 2007, 4: 2011, 5: 2013, 6: 2015, 8: 2020}

# Year of survey
df_filtered["yrsurvey"] = df_filtered["wave"].map(wave_to_year).astype(int)

# Year of birth
df_filtered["yrbirth"] = pd.to_numeric(df_filtered["yrbirth"], errors="coerce")
df_filtered = df_filtered[df_filtered["yrbirth"].notna()].reset_index(drop=True)

# Age
df_filtered["age"] = df_filtered["yrsurvey"] - df_filtered["yrbirth"]

# Filter for 50+
df_filtered = df_filtered[
    (df_filtered["age"] >= 50) & (df_filtered["age"] <= 67)
].reset_index(drop=True)

In [12]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    13403
2    16230
4    28745
5    32318
6    27299
8    10320
Name: mergeid, dtype: int64

### Filter for not retired and employed

In [13]:
df_filtered = df_filtered[
    df_filtered.employment
    == "Employed or self-employed (including working for family business)"
].reset_index(drop=True)

In [14]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1     5349
2     6909
4    12662
5    14852
6    12355
8     4329
Name: mergeid, dtype: int64

### Filter out those who hold state pensions for disabilities or other special conditions

In [15]:
# ep071dno and ep671dno (from wave 6) - no current state pensions
df_filtered = df_filtered[
    (df_filtered.pension == "Selected") | (df_filtered.pension1 == "Selected")
].reset_index(drop=True)

In [16]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1     4888
2     6359
4    11158
5    13167
6    10832
8     3658
Name: mergeid, dtype: int64

### Calculate contribution years

In [17]:
# Load job episodes panel data (from retrospective waves 3 and 7)
jobs = pd.read_stata(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/sharewX_rel8-0-0_gv_job_episodes_panel.dta"
)

In [18]:
# Calculate number of years of work for each individual
conditions = ["Employee or self-employed", "Short term job (less than 6 months)"]
relevant_rows = jobs[jobs["situation"].isin(conditions)]
result_jobs = (
    relevant_rows.groupby("mergeid").size().reset_index(name="yrscontribution2019")
)
# Calculate the year of first contribution
first_contribution = (
    relevant_rows.groupby("mergeid")["year"].min().reset_index(name="yr1contribution")
)

In [19]:
# Merge with main dataset
df_filtered = df_filtered.merge(result_jobs, on="mergeid", how="left")
df_filtered["yrscontribution"] = df_filtered["yrscontribution2019"] - (
    2019 - df_filtered["yrsurvey"]
)
df_filtered = df_filtered.merge(first_contribution, on="mergeid", how="left")
df_filtered = df_filtered.merge(
    jobs[["mergeid", "year", "withpartner"]].rename(columns={"year": "yrsurvey"}),
    on=["mergeid", "yrsurvey"],
    how="left",
)

# Delete those with less than 10 years of contributions
df_filtered = df_filtered[df_filtered["yrscontribution"] >= 10]
# Delete those who started work before the age of 10
df_filtered = df_filtered[
    df_filtered["yr1contribution"].astype(int)
    >= df_filtered["yrbirth"].astype(int) + 12
]

In [20]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    2935
2    4651
4    7238
5    8749
6    8569
8    3370
Name: mergeid, dtype: int64

### Set legal retirement ages

In [21]:
# Make some necessary formatting
month_to_numeric = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

df_filtered["mbirth"] = df_filtered["dn002_"].map(month_to_numeric)
df_filtered["yr1country"] = df_filtered["yr1country"].fillna(df_filtered["yrbirth"])
df_filtered = df_filtered[
    ~(
        (df_filtered["country"] == "Czech Republic")
        & (df_filtered["gender"] == "Female")
        & pd.to_numeric(df_filtered["nb_children"], errors="coerce").isna()
    )
]

In [22]:
country_functions_age = {
    "Austria": austria_age,
    "Belgium": belgium_age,
    "Czech Republic": czech_republic_age,
    "Denmark": denmark_age,
    "Estonia": estonia_age,
    "France": france_age,
    "Germany": germany_age,
    "Italy": italy_age,
    "Luxembourg": luxembourg_age,
    "Netherlands": netherlands_age,
    "Poland": poland_age,
    "Slovenia": slovenia_age,
    "Spain": spain_age,
    "Switzerland": switzerland_age,
}


def calculate_retirement_age(row):
    country = row["country"]
    if country in country_functions_age:
        return country_functions_age[country](row)
    else:
        return None

In [23]:
# Apply country-wise functions to calculate legal retirement age
df_filtered["retirement_age"] = df_filtered.apply(calculate_retirement_age, axis=1)

# Delete those who are above the retirement age (continue to work longer)
df_filtered = df_filtered[
    df_filtered["retirement_age"] > df_filtered["age"]
].reset_index(drop=True)

In [24]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    2891
2    4483
4    6864
5    8383
6    8196
8    3080
Name: mergeid, dtype: int64

### Calculate resting work horizon and its change due to reforms

In [25]:
# Calculate resting work horizon
df_filtered["work_horizon"] = df_filtered["retirement_age"] - df_filtered["age"]

In [26]:
country_functions_change = {
    "Austria": austria_change,
    "Belgium": belgium_change,
    "Czech Republic": czech_republic_change,
    "Denmark": denmark_change,
    "Estonia": estonia_change,
    "France": france_change,
    "Germany": germany_change,
    "Italy": italy_change,
    "Luxembourg": luxembourg_change,
    "Netherlands": netherlands_change,
    "Poland": poland_change,
    "Slovenia": slovenia_change,
    "Spain": spain_change,
    "Switzerland": switzerland_change,
}


def calculate_horizon_change(row):
    country = row["country"]
    if country in country_functions_change:
        return country_functions_change[country](row)
    else:
        return None

In [27]:
# Apply country-wise functions to calculate work horizon change due to reforms
df_filtered["work_horizon_change"] = df_filtered.apply(calculate_horizon_change, axis=1)

### Calculate mental health indicators

In [28]:
# Filter out those with missing values for eurod scale
df_filtered = df_filtered.dropna(subset=["eurod"]).reset_index(drop=True)

In [29]:
# Unique individuals
df_filtered.groupby("wave").mergeid.nunique()

wave
1    2861
2    4431
4    6752
5    8253
6    8006
8    3052
Name: mergeid, dtype: int64

In [30]:
# Transform to numeric
df_filtered["eurod"] = df_filtered["eurod"].replace(
    {"Not depressed": 0, "Very depressed": 12}
)
df_filtered["eurodcat"] = df_filtered["eurodcat"].replace({"Yes": 1, "No": 0})
df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
] = df_filtered[
    [
        "euro1",
        "euro2",
        "euro3",
        "euro4",
        "euro5",
        "euro6",
        "euro7",
        "euro8",
        "euro9",
        "euro10",
        "euro11",
        "euro12",
    ]
].applymap(
    lambda x: 1 if x == "Selected" else 0
)

  ].applymap(


In [31]:
# Conduct a PCA to deduct affective suffering and lack of motivation as separate indicators
columns_for_pca = [
    "euro1",
    "euro2",
    "euro3",
    "euro4",
    "euro5",
    "euro6",
    "euro7",
    "euro8",
    "euro9",
    "euro10",
    "euro11",
    "euro12",
]
data_pca = df_filtered[columns_for_pca]

corr_mat = np.corrcoef(data_pca, rowvar=False)  # Tetrachoric correlation matrix

evals, evecs = eigh(corr_mat)  # Eigenvalues and eigenvectors

fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
fa.fit(data_pca)

factor_scores = fa.transform(data_pca)

cutoff = 0.55

df_filtered["affective_suffering"] = 0
df_filtered["motivation_lack"] = 0

df_filtered["affective_suffering"] = (factor_scores[:, 0] >= cutoff).astype(int)
df_filtered["motivation_lack"] = (factor_scores[:, 1] >= cutoff).astype(int)

In [32]:
fa.loadings_

array([[0.72230676, 0.06609387],
       [0.01712821, 0.21397277],
       [0.2376367 , 0.30820253],
       [0.26465848, 0.21428754],
       [0.39408909, 0.17606154],
       [0.17611074, 0.38047347],
       [0.40292027, 0.19079989],
       [0.19200065, 0.25428392],
       [0.36061593, 0.2733768 ],
       [0.16281977, 0.32561402],
       [0.0287768 , 0.21387176],
       [0.48854601, 0.06975363]])

In [33]:
df_filtered.affective_suffering.value_counts()

affective_suffering
0    40948
1    17282
Name: count, dtype: int64

In [34]:
df_filtered.motivation_lack.value_counts()

motivation_lack
0    50322
1     7908
Name: count, dtype: int64

### Working conditions: demand-control model

# Convert work characteristics questions to numeric
for i in [26, 27, 28, 29, 34, 35]:
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 5,
            "Agree": 4,
            "Don't know": 3,
            "Disagree": 2,
            "Strongly disagree": 1,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"])

for i in range(30, 34):
    df_filtered[f"ep0{i}_"] = df_filtered[f"ep0{i}_"].replace(
        {
            "Strongly agree": 1,
            "Agree": 2,
            "Don't know": 3,
            "Disagree": 4,
            "Strongly disagree": 5,
        }
    )
    df_filtered[f"ep0{i}_"] = pd.to_numeric(df_filtered[f"ep0{i}_"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=[f"ep0{i}_"]).reset_index(drop=True)

# Calculate groups above and below median for job demands and control
df_filtered["job_demands"] = 0
df_filtered["job_control"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep027_"][i] + df_filtered["ep028_"][i]) / 2) > (
        (df_filtered["ep027_"] + df_filtered["ep028_"]) / 2
    ).median():
        df_filtered["job_demands"][i] = 1
    if (
        (df_filtered["ep029_"][i] + df_filtered["ep030_"][i] + df_filtered["ep031_"][i])
        / 3
    ) > (
        (df_filtered["ep029_"] + df_filtered["ep030_"] + df_filtered["ep031_"]) / 3
    ).median():
        df_filtered["job_control"][i] = 1

# Define categories of jobs - passive, active, high strain, low strain
df_filtered["job_passive"] = 0
df_filtered["job_active"] = 0
df_filtered["job_high_strain"] = 0
df_filtered["job_low_strain"] = 0

for i in range(len(df_filtered)):
    if df_filtered["job_demands"][i] == 0 and df_filtered["job_control"][i] == 0:
        df_filtered["job_passive"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 1:
        df_filtered["job_active"][i] = 1
    elif df_filtered["job_demands"][i] == 1 and df_filtered["job_control"][i] == 0:
        df_filtered["job_high_strain"][i] = 1
    else:
        df_filtered["job_low_strain"][i] = 1

# Add job recognition, prospects, insecurity
df_filtered["job_poor_recognition"] = 0
df_filtered["job_poor_prospects"] = 0
df_filtered["job_insecurity"] = 0

for i in range(len(df_filtered)):
    if ((df_filtered["ep032_"][i] + df_filtered["ep033_"][i]) / 2) > (
        (df_filtered["ep032_"] + df_filtered["ep033_"]) / 2
    ).median():
        df_filtered["job_poor_recognition"][i] = 1
    if df_filtered["ep034_"][i] > df_filtered["ep034_"].median():
        df_filtered["job_poor_prospects"][i] = 1
    if df_filtered["ep035_"][i] > df_filtered["ep035_"].median():
        df_filtered["job_insecurity"][i] = 1

In [35]:
# Unique individuals
print(f"There are {df_filtered.mergeid.nunique()} unique individuals in our sample.")
print(f"By wave couples:")
print(df_filtered.groupby("wave_couple").mergeid.nunique())
print(f"By wave:")
print(df_filtered.groupby("wave").mergeid.nunique())

There are 16458 unique individuals in our sample.
By wave couples:
wave_couple
w12     4861
w24     8094
w45    10341
w56    10397
w68     8790
Name: mergeid, dtype: int64
By wave:
wave
1    2861
2    4431
4    6752
5    8253
6    8006
8    3052
Name: mergeid, dtype: int64


In [36]:
df_filtered.to_csv(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/data_clean.csv",
    index=False,
)