### Load libraries

In [1]:
import os
import sys

src_path = os.path.abspath("../")
sys.path.append(src_path)

In [2]:
from functools import reduce

import pandas as pd

from utils.countries_age import *

### Load and merge all needed datasets

In [2]:
datasets = []

folders = [f"sharew{i}_rel8-0-0_ALL_datasets_stata" for i in [1, 2, 4, 5, 6, 7, 8]]

merge_columns = ["mergeid", "hhid1", "mergeidp1", "coupleid1", "country", "language"]

for folder in folders:
    wave = int(folder.split("sharew")[1].split("_")[0])

    folder_datasets = []

    for filename in os.listdir(folder):
        if (
            filename.endswith("dn.dta")
            or filename.endswith("ep.dta")
            or filename.endswith("ch.dta")
            or filename.endswith("gv_health.dta")
        ):
            file_path = os.path.join(folder, filename)
            dataset = pd.read_stata(file_path)

            dataset["wave"] = wave

            folder_datasets.append(dataset)

    if folder_datasets:
        merged_dataset = reduce(
            lambda left, right: pd.merge(
                left,
                right,
                on=left.columns.intersection(right.columns).tolist(),
                how="inner",
            ),
            folder_datasets,
        )

        datasets.append(merged_dataset)

all_waves = pd.concat(datasets, sort=False, axis=0).reset_index(drop=True)

### Group by couples of waves

In [4]:
w12 = all_waves[(all_waves["wave"] == 1) | (all_waves["wave"] == 2)].reset_index(
    drop=True
)
w24 = all_waves[(all_waves["wave"] == 2) | (all_waves["wave"] == 4)].reset_index(
    drop=True
)
w45 = all_waves[(all_waves["wave"] == 4) | (all_waves["wave"] == 5)].reset_index(
    drop=True
)
w56 = all_waves[(all_waves["wave"] == 5) | (all_waves["wave"] == 6)].reset_index(
    drop=True
)
w67 = all_waves[(all_waves["wave"] == 6) | (all_waves["wave"] == 7)].reset_index(
    drop=True
)
w78 = all_waves[(all_waves["wave"] == 7) | (all_waves["wave"] == 8)].reset_index(
    drop=True
)

In [5]:
dataset_names = ["w12", "w24", "w45", "w56", "w67", "w78"]

In [6]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67562 entries, 0 to 67561
Columns: 2900 entries, mergeid to ep811_
dtypes: category(1069), float32(13), float64(34), int64(1), object(1783)
memory usage: 1010.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95143 entries, 0 to 95142
Columns: 2900 entries, mergeid to ep811_
dtypes: category(1069), float32(13), float64(34), int64(1), object(1783)
memory usage: 1.4+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124065 entries, 0 to 124064
Columns: 2900 entries, mergeid to ep811_
dtypes: category(1069), float32(13), float64(34), int64(1), object(1783)
memory usage: 1.8+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134150 entries, 0 to 134149
Columns: 2900 entries, mergeid to ep811_
dtypes: category(1069), float32(13), float64(34), int64(1), object(1783)
memory usage: 2.0+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145287 entries, 0 to 145286
Columns: 2900 entries, mergeid to ep811_
dtypes

### Leave only common countries

In [7]:
# Filter each couple dataset for only common countries, exclude Israel, Croatia and Sweden
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    grouped = dataset.groupby("wave")["country"].unique()
    common_countries = set.intersection(*map(set, grouped))
    dataset = dataset[dataset["country"].isin(common_countries)]
    dataset = dataset[
        (dataset["country"] != "Israel")
        & (dataset["country"] != "Croatia")
        & (dataset["country"] != "Sweden")
    ].reset_index(drop=True)
    globals()[dataset_name] = dataset

# Exclude Poland from w67 and w78 because of a reversed reform
w67 = w67[w67["country"] != "Poland"].reset_index(drop=True)
w78 = w78[w78["country"] != "Poland"].reset_index(drop=True)

print(f'There were {all_waves.country.nunique()} unique countries before filtering. They are: {all_waves.country.unique()}')

grouped = all_waves.groupby('wave')['country'].unique()
common_countries = set.intersection(*map(set, grouped))
print(f'There were {len(common_countries)} countries present in all waves. They are: {list(common_countries)}')

### Common countries by couples of waves
data = []

for dataset_name in dataset_names:
    dataset = globals()[dataset_name] 

    grouped = dataset.groupby('wave')['country'].unique()
    unique_countries = set.union(*map(set, grouped))
    common_countries = set.intersection(*map(set, grouped))

    data.append({
        'Waves': dataset_name,
        'Unique Countries': len(unique_countries),
        'Common Countries': len(common_countries),
        'Common Country List': list(common_countries)
    })

pd.set_option('display.max_colwidth', None)
pd.DataFrame(data)

combined_data = pd.DataFrame()
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    combined_data = pd.concat([combined_data, dataset])

print("Number of unique countries in all datasets together after filtering:", combined_data['country'].nunique())
print("They are:", combined_data['country'].unique())

### Filter for only aged 50-67

In [8]:
wave_to_year = {1: 2004, 2: 2007, 4: 2011, 5: 2013, 6: 2015, 7: 2017, 8: 2020}

for dataset_name in dataset_names:
    dataset = globals()[dataset_name]

    # Year of survey
    dataset["yrsurvey"] = dataset["wave"].map(wave_to_year)
    dataset["yrsurvey"] = dataset["yrsurvey"].astype(int)
    # Year of birth
    dataset["yrbirth"] = dataset["dn003_"]
    dataset["yrbirth"] = pd.to_numeric(dataset["yrbirth"], errors="coerce")
    dataset = dataset[dataset["yrbirth"].notna()].reset_index(drop=True)
    # Age
    dataset["age"] = dataset["yrsurvey"] - dataset["yrbirth"]
    # Filter for 50+
    dataset = dataset[(dataset["age"] >= 50) & (dataset["age"] <= 67)].reset_index(
        drop=True
    )

    globals()[dataset_name] = dataset

### Filter for not retired and employed

In [9]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset["employment"] = dataset["ep005_"]
    dataset = dataset[
        dataset.employment
        == "Employed or self-employed (including working for family business)"
    ].reset_index(drop=True)
    globals()[dataset_name] = dataset

### Calculate contribution years

In [10]:
# Load job episodes panel data (from retrospective waves 3 and 7)
jobs = pd.read_stata("sharewX_rel8-0-0_gv_job_episodes_panel.dta")
jobs

Unnamed: 0,mergeid,hhid7,hhid3,jep_w,gender,yrbirth,age,year,country,situation,...,currency_min_pension,currency_max_pension,ret_age,early_age,min_pension,max_pension,current_wage,current_currency_w,current_income,current_currency_i
0,AT-001215-01,AT-001215-A,,7,Female,1939,1,1940,Austria,,...,,,,,,,,,,
1,AT-001215-01,AT-001215-A,,7,Female,1939,2,1941,Austria,,...,,,,,,,,,,
2,AT-001215-01,AT-001215-A,,7,Female,1939,3,1942,Austria,,...,,,,,,,,,,
3,AT-001215-01,AT-001215-A,,7,Female,1939,4,1943,Austria,,...,,,,,,,,,,
4,AT-001215-01,AT-001215-A,,7,Female,1939,5,1944,Austria,In education,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6166015,SK-999958-02,SK-999958-A,,7,Female,1945,68,2013,Slovakia,Retired from work,...,,,62.0,,Not applicable,Not applicable,,,,
6166016,SK-999958-02,SK-999958-A,,7,Female,1945,69,2014,Slovakia,Retired from work,...,,,62.0,,Not applicable,Not applicable,,,,
6166017,SK-999958-02,SK-999958-A,,7,Female,1945,70,2015,Slovakia,Retired from work,...,,,62.0,,Not applicable,Not applicable,,,,
6166018,SK-999958-02,SK-999958-A,,7,Female,1945,71,2016,Slovakia,Retired from work,...,,,62.0,,Not applicable,Not applicable,,,,


In [11]:
# Calculate number of years of work for each individual
conditions = ["Employee or self-employed", "Short term job (less than 6 months)"]
relevant_rows = jobs[jobs["situation"].isin(conditions)]
result_jobs = (
    relevant_rows.groupby("mergeid").size().reset_index(name="yrscontribution")
)
result_jobs

Unnamed: 0,mergeid,yrscontribution
0,AT-001215-01,29
1,AT-001492-01,3
2,AT-001492-02,43
3,AT-001881-01,43
4,AT-001881-02,41
...,...,...
85057,SK-995042-01,39
85058,SK-995042-02,46
85059,SK-996004-01,37
85060,SK-999958-01,36


In [12]:
first_contribution = (
    relevant_rows.groupby("mergeid")["year"].min().reset_index(name="yr1contribution")
)
first_contribution

Unnamed: 0,mergeid,yr1contribution
0,AT-001215-01,1956
1,AT-001492-01,1969
2,AT-001492-02,1970
3,AT-001881-01,1944
4,AT-001881-02,1943
...,...,...
85057,SK-995042-01,1974
85058,SK-995042-02,1971
85059,SK-996004-01,1981
85060,SK-999958-01,1964


In [13]:
# Merge with main datasets
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset = dataset.merge(result_jobs, on="mergeid", how="left")
    dataset = dataset.merge(first_contribution, on="mergeid", how="left")
    # Delete those with less than 10 years of contributions
    dataset = dataset[dataset["yrscontribution"] >= 10]
    # Delete those who started work before the age of 10
    dataset = dataset[
        dataset["yr1contribution"].astype(int) >= dataset["yrbirth"].astype(int) + 12
    ]
    globals()[dataset_name] = dataset

In [14]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9197 entries, 3 to 12730
Columns: 2906 entries, mergeid to yr1contribution
dtypes: category(1071), float32(13), float64(37), int64(2), object(1783)
memory usage: 138.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12036 entries, 3 to 17096
Columns: 2906 entries, mergeid to yr1contribution
dtypes: category(1071), float32(13), float64(37), int64(2), object(1783)
memory usage: 181.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18033 entries, 1 to 26916
Columns: 2906 entries, mergeid to yr1contribution
dtypes: category(1071), float32(13), float64(37), int64(2), object(1783)
memory usage: 271.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19330 entries, 0 to 25664
Columns: 2906 entries, mergeid to yr1contribution
dtypes: category(1071), float32(13), float64(37), int64(2), object(1783)
memory usage: 290.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19764 entries, 0 to 22965
Columns: 2906

### Set legal retirement ages

In [15]:
# Make some necessary formatting
month_to_numeric = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset["gender"] = dataset["dn042_"]
    dataset["nb_children"] = dataset["ch001_"]
    dataset = dataset[
        ~(
            (dataset["country"] == "Czech Republic")
            & (dataset["gender"] == "Female")
            & pd.to_numeric(dataset["nb_children"], errors="coerce").isna()
        )
    ]
    dataset["mbirth"] = dataset["dn002_"].map(month_to_numeric)
    dataset["yr1country"] = dataset["dn006_"]
    dataset["yr1country"] = dataset["yr1country"].fillna(dataset["yrbirth"])
    dataset["public_job"] = dataset["ep019_"]
    dataset["job_status"] = dataset["ep009_"]
    globals()[dataset_name] = dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['mbirth'] = dataset['dn002_'].map(month_to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['yr1country'] = dataset['dn006_']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['yr1country'] = dataset['yr1country'].fillna(dataset['yrbirth'])
A value is trying to be set on

In [18]:
def czech_republic(row):
    # Male
    if row["gender"] == "Male":
        # Wave 1
        if row["wave"] == 1:
            if row["yrscontribution"] + 61.33 - row["age"] >= 25:
                return 61.33
            elif row["age"] + 25 - row["yrscontribution"] < 65:
                return row["age"] + 25 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 2
        elif row["wave"] == 2:
            if row["yrscontribution"] + 61.67 - row["age"] >= 25:
                return 61.67
            elif row["age"] + 25 - row["yrscontribution"] < 65:
                return row["age"] + 25 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 4
        elif row["wave"] == 4:
            if row["yrscontribution"] + 62.17 - row["age"] >= 27:
                return 62.17
            elif row["age"] + 27 - row["yrscontribution"] < 65:
                return row["age"] + 27 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 17:
                return 65
            else:
                return row["age"] + 17 - row["yrscontribution"]
        # Wave 5
        elif row["wave"] == 5:
            if row["yrscontribution"] + 62.5 - row["age"] >= 29:
                return 62.5
            elif row["age"] + 29 - row["yrscontribution"] < 65:
                return row["age"] + 29 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 19:
                return 65
            else:
                return row["age"] + 19 - row["yrscontribution"]
        # Wave 6
        elif row["wave"] == 6:
            if row["yrscontribution"] + 62.83 - row["age"] >= 31:
                return 62.83
            elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                return row["age"] + 31 - row["yrscontribution"]
            elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                return 62.83 + 5
            else:
                return row["age"] + 20 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["yrscontribution"] + 63.17 - row["age"] >= 33:
                return 63.17
            elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                return row["age"] + 33 - row["yrscontribution"]
            elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                return 63.17 + 5
            else:
                return row["age"] + 20 - row["yrscontribution"]
        # Wave 8
        else:
            if row["yrscontribution"] + 63.67 - row["age"] >= 35:
                return 63.67
            elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                return row["age"] + 35 - row["yrscontribution"]
            elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                return 63.67 + 5
            else:
                return row["age"] + 20 - row["yrscontribution"]

    # Female
    else:
        # Wave 1
        if row["wave"] == 1:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 59.33 - row["age"] >= 25:
                    return 59.33
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 58.33 - row["age"] >= 25:
                    return 58.33
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 57.33 - row["age"] >= 25:
                    return 57.33
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 56.33 - row["age"] >= 25:
                    return 56.33
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 55.33 - row["age"] >= 25:
                    return 55.33
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
        # Wave 2
        elif row["wave"] == 2:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 60 - row["age"] >= 25:
                    return 60
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 59 - row["age"] >= 25:
                    return 59
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 58 - row["age"] >= 25:
                    return 58
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 57 - row["age"] >= 25:
                    return 57
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 56 - row["age"] >= 25:
                    return 56
                elif row["age"] + 25 - row["yrscontribution"] < 65:
                    return row["age"] + 25 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 15:
                    return 65
                else:
                    return row["age"] + 15 - row["yrscontribution"]
        # Wave 4
        elif row["wave"] == 4:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 61 - row["age"] >= 27:
                    return 61
                elif row["age"] + 27 - row["yrscontribution"] < 65:
                    return row["age"] + 27 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 17:
                    return 65
                else:
                    return row["age"] + 17 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 60 - row["age"] >= 27:
                    return 60
                elif row["age"] + 27 - row["yrscontribution"] < 65:
                    return row["age"] + 27 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 17:
                    return 65
                else:
                    return row["age"] + 17 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 59 - row["age"] >= 27:
                    return 59
                elif row["age"] + 27 - row["yrscontribution"] < 65:
                    return row["age"] + 27 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 17:
                    return 65
                else:
                    return row["age"] + 17 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 58 - row["age"] >= 27:
                    return 58
                elif row["age"] + 27 - row["yrscontribution"] < 65:
                    return row["age"] + 27 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 17:
                    return 65
                else:
                    return row["age"] + 17 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 57 - row["age"] >= 27:
                    return 57
                elif row["age"] + 27 - row["yrscontribution"] < 65:
                    return row["age"] + 27 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 17:
                    return 65
                else:
                    return row["age"] + 17 - row["yrscontribution"]
        # Wave 5
        elif row["wave"] == 5:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 61.67 - row["age"] >= 29:
                    return 61.67
                elif row["age"] + 29 - row["yrscontribution"] < 65:
                    return row["age"] + 29 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 19:
                    return 65
                else:
                    return row["age"] + 19 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 60.67 - row["age"] >= 29:
                    return 60.67
                elif row["age"] + 29 - row["yrscontribution"] < 65:
                    return row["age"] + 29 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 19:
                    return 65
                else:
                    return row["age"] + 19 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 59.67 - row["age"] >= 29:
                    return 59.67
                elif row["age"] + 29 - row["yrscontribution"] < 65:
                    return row["age"] + 29 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 19:
                    return 65
                else:
                    return row["age"] + 19 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 58.67 - row["age"] >= 29:
                    return 58.67
                elif row["age"] + 29 - row["yrscontribution"] < 65:
                    return row["age"] + 29 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 19:
                    return 65
                else:
                    return row["age"] + 19 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 57.67 - row["age"] >= 29:
                    return 57.67
                elif row["age"] + 29 - row["yrscontribution"] < 65:
                    return row["age"] + 29 - row["yrscontribution"]
                elif row["yrscontribution"] + 65 - row["age"] >= 19:
                    return 65
                else:
                    return row["age"] + 19 - row["yrscontribution"]
        # Wave 6
        elif row["wave"] == 6:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 62 - row["age"] >= 31:
                    return 62
                elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                    return row["age"] + 31 - row["yrscontribution"]
                elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                    return 62.83 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 61 - row["age"] >= 31:
                    return 61
                elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                    return row["age"] + 31 - row["yrscontribution"]
                elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                    return 62.83 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 60 - row["age"] >= 31:
                    return 60
                elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                    return row["age"] + 31 - row["yrscontribution"]
                elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                    return 62.83 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 59 - row["age"] >= 31:
                    return 59
                elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                    return row["age"] + 31 - row["yrscontribution"]
                elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                    return 62.83 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 58 - row["age"] >= 31:
                    return 58
                elif row["age"] + 31 - row["yrscontribution"] < 62.83 + 5:
                    return row["age"] + 31 - row["yrscontribution"]
                elif row["yrscontribution"] + 62.83 + 5 - row["age"] >= 20:
                    return 62.83 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 62.67 - row["age"] >= 33:
                    return 62.67
                elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                    return row["age"] + 33 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                    return 63.17 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 61.67 - row["age"] >= 33:
                    return 61.67
                elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                    return row["age"] + 33 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                    return 63.17 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 60.67 - row["age"] >= 33:
                    return 60.67
                elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                    return row["age"] + 33 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                    return 63.17 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 59.67 - row["age"] >= 33:
                    return 59.67
                elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                    return row["age"] + 33 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                    return 63.17 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 58.67 - row["age"] >= 33:
                    return 58.67
                elif row["age"] + 33 - row["yrscontribution"] < 63.17 + 5:
                    return row["age"] + 33 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.17 + 5 - row["age"] >= 20:
                    return 63.17 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
        # Wave 8
        else:
            if row["nb_children"] == 0:
                if row["yrscontribution"] + 63.67 - row["age"] >= 35:
                    return 63.67
                elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                    return row["age"] + 35 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                    return 63.67 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 1:
                if row["yrscontribution"] + 62.67 - row["age"] >= 35:
                    return 62.67
                elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                    return row["age"] + 35 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                    return 63.67 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 2:
                if row["yrscontribution"] + 61.67 - row["age"] >= 35:
                    return 61.67
                elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                    return row["age"] + 35 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                    return 63.67 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            elif row["nb_children"] == 3 or row["nb_children"] == 4:
                if row["yrscontribution"] + 60.67 - row["age"] >= 35:
                    return 60.67
                elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                    return row["age"] + 35 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                    return 63.67 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 59.67 - row["age"] >= 35:
                    return 59.67
                elif row["age"] + 35 - row["yrscontribution"] < 63.67 + 5:
                    return row["age"] + 35 - row["yrscontribution"]
                elif row["yrscontribution"] + 63.67 + 5 - row["age"] >= 20:
                    return 63.67 + 5
                else:
                    return row["age"] + 20 - row["yrscontribution"]

In [19]:
def denmark(row):
    # Waves 1-7
    if row["wave"] < 8:
        if (row["yrbirth"] == 1932 and row["mbirth"] < 7) or (row["yrbirth"] < 1932):
            return 67
        else:
            return 65
    # Wave 8
    else:
        return 66

In [20]:
def estonia(row):
    # Male
    if row["gender"] == "Male":
        # Waves 1-6
        if row["wave"] < 7:
            if row["yrscontribution"] + 63 - row["age"] >= 15:
                return 63
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["yrscontribution"] + 63.25 - row["age"] >= 15:
                return 63.25
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 8
        else:
            if row["yrscontribution"] + 63.75 - row["age"] >= 15:
                return 63.75
            else:
                return row["age"] + 15 - row["yrscontribution"]

    # Female
    else:
        # Wave 1
        if row["wave"] == 1:
            if row["yrscontribution"] + 59 - row["age"] >= 15:
                return 59
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 2
        elif row["wave"] == 2:
            if row["yrscontribution"] + 60 - row["age"] >= 15:
                return 60
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 4
        elif row["wave"] == 4:
            if row["yrscontribution"] + 61 - row["age"] >= 15:
                return 61
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 5
        elif row["wave"] == 5:
            if row["yrscontribution"] + 62 - row["age"] >= 15:
                return 62
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 6
        elif row["wave"] == 6:
            if row["yrscontribution"] + 62.5 - row["age"] >= 15:
                return 62.5
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["yrscontribution"] + 63.25 - row["age"] >= 15:
                return 63.25
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Wave 8
        else:
            if row["yrscontribution"] + 63.75 - row["age"] >= 15:
                return 63.75
            else:
                return row["age"] + 15 - row["yrscontribution"]

In [21]:
def france(row):
    # Waves 1-2
    if row["wave"] < 3:
        if row["yrscontribution"] + 60 - row["age"] >= 40:
            return 60
        elif row["age"] + 40 - row["yrscontribution"] >= 65:
            return 65
        else:
            return row["age"] + 40 - row["yrscontribution"]
    # Wave 4
    elif row["wave"] == 4:
        if (row["yrbirth"] == 1951 and row["mbirth"] < 7) or (row["yrbirth"] <= 1951):
            if row["yrscontribution"] + 60 - row["age"] >= 40:
                return 60
            elif row["age"] + 40 - row["yrscontribution"] >= 65:
                return 65
            else:
                return row["age"] + 40 - row["yrscontribution"]
        elif row["yrbirth"] == 1951 and row["mbirth"] >= 7:
            if row["yrscontribution"] + 60.33 - row["age"] >= 40:
                return 60.33
            elif row["age"] + 40 - row["yrscontribution"] >= 65.33:
                return 65.33
            else:
                return row["age"] + 40 - row["yrscontribution"]
        elif row["yrbirth"] == 1952:
            if row["yrscontribution"] + 60.67 - row["age"] >= 41:
                return 60.67
            elif row["age"] + 41 - row["yrscontribution"] >= 65.67:
                return 65.67
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1953:
            if row["yrscontribution"] + 61 - row["age"] >= 41:
                return 61
            elif row["age"] + 41 - row["yrscontribution"] >= 66:
                return 66
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1954:
            if row["yrscontribution"] + 61.33 - row["age"] >= 41:
                return 61.33
            elif row["age"] + 41 - row["yrscontribution"] >= 66.33:
                return 66.33
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1955:
            if row["yrscontribution"] + 61.67 - row["age"] >= 41:
                return 61.67
            elif row["age"] + 41 - row["yrscontribution"] >= 66.67:
                return 66.67
            else:
                return row["age"] + 41 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 62 - row["age"] >= 41:
                return 62
            elif row["age"] + 41 - row["yrscontribution"] >= 67:
                return 67
            else:
                return row["age"] + 41 - row["yrscontribution"]
    # Waves 5 and 6
    elif row["wave"] == 5 or row["wave"] == 6:
        if (row["yrbirth"] == 1951 and row["mbirth"] < 7) or (row["yrbirth"] <= 1951):
            if row["yrscontribution"] + 60 - row["age"] >= 41:
                return 60
            elif row["age"] + 41 - row["yrscontribution"] >= 65:
                return 65
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1951 and row["mbirth"] >= 7:
            if row["yrscontribution"] + 60.42 - row["age"] >= 41:
                return 60.42
            elif row["age"] + 41 - row["yrscontribution"] >= 65.42:
                return 65.42
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1952:
            if row["yrscontribution"] + 60.83 - row["age"] >= 41:
                return 60.83
            elif row["age"] + 41 - row["yrscontribution"] >= 65.83:
                return 65.83
            else:
                return row["age"] + 41 - row["yrscontribution"]
        elif row["yrbirth"] == 1953:
            if row["yrscontribution"] + 61.25 - row["age"] >= 41.25:
                return 61.25
            elif row["age"] + 41.25 - row["yrscontribution"] >= 66.25:
                return 66.25
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] == 1954:
            if row["yrscontribution"] + 61.67 - row["age"] >= 41.25:
                return 61.67
            elif row["age"] + 41.25 - row["yrscontribution"] >= 66.67:
                return 66.67
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 62 - row["age"] >= 41.5:
                return 62
            elif row["age"] + 41.5 - row["yrscontribution"] >= 67:
                return 67
            else:
                return row["age"] + 41.5 - row["yrscontribution"]
    # Waves 7 and 8
    elif row["wave"] == 7 or row["wave"] == 8:
        if (row["yrbirth"] == 1951 and row["mbirth"] < 7) or (row["yrbirth"] <= 1951):
            if row["yrscontribution"] + 60 - row["age"] >= 41.25:
                return 60
            elif row["age"] + 41.25 - row["yrscontribution"] >= 65:
                return 65
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] == 1951 and row["mbirth"] >= 7:
            if row["yrscontribution"] + 60.42 - row["age"] >= 41.25:
                return 60.42
            elif row["age"] + 41.25 - row["yrscontribution"] >= 65.42:
                return 65.42
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] == 1952:
            if row["yrscontribution"] + 60.83 - row["age"] >= 41.25:
                return 60.83
            elif row["age"] + 41.25 - row["yrscontribution"] >= 65.83:
                return 65.83
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] == 1953:
            if row["yrscontribution"] + 61.25 - row["age"] >= 41.25:
                return 61.25
            elif row["age"] + 41.25 - row["yrscontribution"] >= 66.25:
                return 66.25
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] == 1954:
            if row["yrscontribution"] + 61.67 - row["age"] >= 41.25:
                return 61.67
            elif row["age"] + 41.25 - row["yrscontribution"] >= 66.67:
                return 66.67
            else:
                return row["age"] + 41.25 - row["yrscontribution"]
        elif row["yrbirth"] >= 1955 and row["yrbirth"] < 1973:
            if row["yrscontribution"] + 62 - row["age"] >= 41.5:
                return 62
            elif row["age"] + 41.5 - row["yrscontribution"] >= 67:
                return 67
            else:
                return row["age"] + 41.5 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 62 - row["age"] >= 43:
                return 62
            elif row["age"] + 43 - row["yrscontribution"] >= 67:
                return 67
            else:
                return row["age"] + 43 - row["yrscontribution"]

In [22]:
def germany(row):
    # Waves 1-4
    if row["wave"] < 5:
        if row["yrscontribution"] + 65 - row["age"] >= 5:
            return 65
        else:
            return row["age"] + 5 - row["yrscontribution"]
    # Wave 5
    if row["wave"] == 5:
        if row["yrbirth"] >= 1963:
            if row["yrscontribution"] + 67 - row["age"] >= 5:
                return 67
            else:
                return row["age"] + 5 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 65.08 - row["age"] >= 5:
                return 65.08
            else:
                return row["age"] + 5 - row["yrscontribution"]
    # Wave 6
    if row["wave"] == 6:
        if row["yrbirth"] >= 1963:
            if row["yrscontribution"] + 67 - row["age"] >= 5:
                return 67
            else:
                return row["age"] + 5 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 65.17 - row["age"] >= 5:
                return 65.17
            else:
                return row["age"] + 5 - row["yrscontribution"]
    # Wave 7
    if row["wave"] == 7:
        if row["yrbirth"] >= 1963:
            if row["yrscontribution"] + 67 - row["age"] >= 5:
                return 67
            else:
                return row["age"] + 5 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 65.25 - row["age"] >= 5:
                return 65.25
            else:
                return row["age"] + 5 - row["yrscontribution"]
    # Wave 8
    else:
        if row["yrbirth"] >= 1963:
            if row["yrscontribution"] + 67 - row["age"] >= 5:
                return 67
            else:
                return row["age"] + 5 - row["yrscontribution"]
        else:
            if row["yrscontribution"] + 65.33 - row["age"] >= 5:
                return 65.33
            else:
                return row["age"] + 5 - row["yrscontribution"]

In [23]:
def greece(row):
    # Male
    if row["gender"] == "Male":
        # Waves 1-4
        if row["wave"] < 5:
            if row["age"] + 37 - row["yrscontribution"] < 65:
                return row["age"] + 37 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Waves 5-8
        else:
            if row["yrscontribution"] + 62 - row["age"] >= 40:
                return 62
            elif row["age"] + 40 - row["yrscontribution"] < 67:
                return row["age"] + 40 - row["yrscontribution"]
            elif row["yrscontribution"] + 67 - row["age"] >= 15:
                return 67
            else:
                return row["age"] + 15 - row["yrscontribution"]

    # Female
    if row["gender"] == "Female":
        # Waves 1-4
        if row["wave"] < 5:
            if row["age"] + 37 - row["yrscontribution"] < 60:
                return row["age"] + 37 - row["yrscontribution"]
            elif row["yrscontribution"] + 60 - row["age"] >= 15:
                return 60
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Waves 5-8
        else:
            if row["yrscontribution"] + 62 - row["age"] >= 40:
                return 62
            elif row["age"] + 40 - row["yrscontribution"] < 67:
                return row["age"] + 40 - row["yrscontribution"]
            elif row["yrscontribution"] + 67 - row["age"] >= 15:
                return 67
            else:
                return row["age"] + 15 - row["yrscontribution"]

In [24]:
def italy(row):
    # Male
    if row["gender"] == "Male":
        # Waves 1-2
        if row["wave"] < 3:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 65 - row["age"] >= 20:
                    return 65
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 65 - row["age"] >= 20:
                    return 65
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]
        # Wave 4
        elif row["wave"] == 4:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 66 - row["age"] >= 20:
                    return 66
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 66 - row["age"] >= 20:
                    return 66
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]
        # Waves 5 and 6
        elif row["wave"] == 5 or row["wave"] == 6:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                    return 66.25
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                    return 66.25
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 66.58 - row["age"] >= 20:
                    return 66.58
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 66.58 - row["age"] >= 20:
                    return 66.58
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]
        # Wave 8
        else:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 67 - row["age"] >= 20:
                    return 67
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 67 - row["age"] >= 20:
                    return 67
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]

    # Female
    else:
        # Waves 1-2
        if row["wave"] < 3:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 60 - row["age"] >= 20:
                    return 60
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 60 - row["age"] >= 20:
                    return 60
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]
        # Wave 4
        elif row["wave"] == 4:
            if row["public_job"] == "Yes":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 61 - row["age"] >= 20:
                        return 61
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 61 - row["age"] >= 20:
                        return 61
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            else:
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 60 - row["age"] >= 20:
                        return 60
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 60 - row["age"] >= 20:
                        return 60
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
        # Wave 5
        elif row["wave"] == 5:
            if row["job_status"] == "Self-employed":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 63.75 - row["age"] >= 20:
                        return 63.75
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 63.75 - row["age"] >= 20:
                        return 63.75
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            elif row["job_status"] == "Civil servant":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                        return 66.25
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                        return 66.25
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            else:
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 62.25 - row["age"] >= 20:
                        return 62.25
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 62.25 - row["age"] >= 20:
                        return 62.25
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
        # Wave 6
        elif row["wave"] == 6:
            if row["job_status"] == "Self-employed":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 64.75 - row["age"] >= 20:
                        return 64.75
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 64.75 - row["age"] >= 20:
                        return 64.75
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            elif row["job_status"] == "Civil servant":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                        return 66.25
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 66.25 - row["age"] >= 20:
                        return 66.25
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            else:
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 63.75 - row["age"] >= 20:
                        return 63.75
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 63.75 - row["age"] >= 20:
                        return 63.75
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
        # Wave 7
        elif row["wave"] == 7:
            if row["job_status"] == "Self-employed":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 66.08 - row["age"] >= 20:
                        return 66.08
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 66.08 - row["age"] >= 20:
                        return 66.08
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            elif row["job_status"] == "Civil servant":
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 66.58 - row["age"] >= 20:
                        return 66.58
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 66.58 - row["age"] >= 20:
                        return 66.58
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
            else:
                if row["yr1contribution"] < 1996:
                    if row["yrscontribution"] + 65.58 - row["age"] >= 20:
                        return 65.58
                    else:
                        return row["age"] + 20 - row["yrscontribution"]
                else:
                    if row["yrscontribution"] + 65.58 - row["age"] >= 20:
                        return 65.58
                    elif row["age"] + 20 - row["yrscontribution"] < 70:
                        return row["age"] + 20 - row["yrscontribution"]
                    elif row["yrscontribution"] + 70 - row["age"] >= 5:
                        return 70
                    else:
                        return row["age"] + 5 - row["yrscontribution"]
        # Wave 8
        else:
            if row["yr1contribution"] < 1996:
                if row["yrscontribution"] + 67 - row["age"] >= 20:
                    return 67
                else:
                    return row["age"] + 20 - row["yrscontribution"]
            else:
                if row["yrscontribution"] + 67 - row["age"] >= 20:
                    return 67
                elif row["age"] + 20 - row["yrscontribution"] < 70:
                    return row["age"] + 20 - row["yrscontribution"]
                elif row["yrscontribution"] + 70 - row["age"] >= 5:
                    return 70
                else:
                    return row["age"] + 5 - row["yrscontribution"]

In [25]:
def luxembourg(row):
    # Waves 1-4
    if row["wave"] < 5:
        if row["yrscontribution"] + 65 - row["age"] >= 10:
            return 65
        else:
            return row["age"] + 10 - row["yrscontribution"]
    # Wave 5
    elif row["wave"] == 5:
        if row["yrscontribution"] + 65.08 - row["age"] >= 10:
            return 65.08
        else:
            return row["age"] + 10 - row["yrscontribution"]
    # Wave 6
    elif row["wave"] == 6:
        if row["yrscontribution"] + 65.25 - row["age"] >= 10:
            return 65.25
        else:
            return row["age"] + 10 - row["yrscontribution"]
    # Wave 7
    elif row["wave"] == 7:
        if row["yrscontribution"] + 65.75 - row["age"] >= 10:
            return 65.75
        else:
            return row["age"] + 10 - row["yrscontribution"]
    # Wave 8
    else:
        if row["yrscontribution"] + 66.33 - row["age"] >= 10:
            return 66.33
        else:
            return row["age"] + 10 - row["yrscontribution"]

In [26]:
def netherlands(row):
    # Waves 1-7
    if row["wave"] < 8:
        return 65
    # Wave 8
    else:
        return 66

In [27]:
def poland(row):
    # Male
    if row["gender"] == "Male":
        # Waves 1-5
        if row["wave"] < 6:
            if row["yrscontribution"] + 65 - row["age"] >= 25:
                return 65
            else:
                return row["age"] + 25 - row["yrscontribution"]
        # Wave 6
        else:
            if row["yrscontribution"] + 65.75 - row["age"] >= 25:
                return 65.75
            else:
                return row["age"] + 25 - row["yrscontribution"]

    # Female
    else:
        # Waves 1-5
        if row["wave"] < 6:
            if row["yrscontribution"] + 60 - row["age"] >= 20:
                return 60
            else:
                return row["age"] + 20 - row["yrscontribution"]
        # Wave 6
        else:
            if row["yrscontribution"] + 60.75 - row["age"] >= 20:
                return 60.75
            else:
                return row["age"] + 20 - row["yrscontribution"]

In [28]:
def portugal(row):
    # Waves 1-5
    if row["wave"] < 6:
        if row["yrscontribution"] + 65 - row["age"] >= 15:
            return 65
        else:
            return row["age"] + 15 - row["yrscontribution"]
    # Waves 6-8
    else:
        if row["yrscontribution"] + 66 - row["age"] >= 15:
            return 66
        else:
            return row["age"] + 15 - row["yrscontribution"]

In [29]:
def slovenia(row):
    # Male
    if row["gender"] == "Male":
        # Waves 1-4
        if row["wave"] < 5:
            if row["yrscontribution"] + 58 - row["age"] >= 40:
                return 58
            elif row["age"] + 40 - row["yrscontribution"] < 63:
                return row["age"] + 40 - row["yrscontribution"]
            elif row["yrscontribution"] + 63 - row["age"] >= 20:
                return 63
            elif row["age"] + 20 - row["yrscontribution"] == 64:
                return 64
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Waves 5-8
        else:
            if row["yrscontribution"] + 60 - row["age"] >= 40:
                return 60
            elif row["age"] + 40 - row["yrscontribution"] < 65:
                return row["age"] + 40 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]

    # Female
    else:
        # Waves 1-4
        if row["wave"] < 5:
            if row["yrscontribution"] + 58 - row["age"] >= 38:
                return 58
            elif row["age"] + 38 - row["yrscontribution"] < 61:
                return row["age"] + 38 - row["yrscontribution"]
            elif row["yrscontribution"] + 61 - row["age"] >= 20:
                return 61
            elif row["age"] + 20 - row["yrscontribution"] == 62:
                return 62
            elif row["yrscontribution"] + 63 - row["age"] >= 15:
                return 63
            else:
                return row["age"] + 15 - row["yrscontribution"]
        # Waves 5-8
        else:
            if row["yrscontribution"] + 60 - row["age"] >= 40:
                return 60
            elif row["age"] + 40 - row["yrscontribution"] < 65:
                return row["age"] + 40 - row["yrscontribution"]
            elif row["yrscontribution"] + 65 - row["age"] >= 15:
                return 65
            else:
                return row["age"] + 15 - row["yrscontribution"]

In [30]:
def spain(row):
    # Waves 1-4
    if row["wave"] < 5:
        if row["yrscontribution"] + 65 - row["age"] >= 15:
            return 65
        else:
            return row["age"] + 15 - row["yrscontribution"]
    # Wave 5
    elif row["wave"] == 5:
        if row["yrscontribution"] + 65 - row["age"] >= 35.25:
            return 65
        elif row["yrscontribution"] + 65.08 - row["age"] >= 15:
            return 65.08
        else:
            return row["age"] + 15 - row["yrscontribution"]
    # Wave 6
    elif row["wave"] == 6:
        if row["yrscontribution"] + 65 - row["age"] >= 35.75:
            return 65
        elif row["yrscontribution"] + 65.25 - row["age"] >= 15:
            return 65.25
        else:
            return row["age"] + 15 - row["yrscontribution"]
    # Wave 7
    elif row["wave"] == 7:
        if row["yrscontribution"] + 65 - row["age"] >= 36.25:
            return 65
        elif row["yrscontribution"] + 65.42 - row["age"] >= 15:
            return 65.42
        else:
            return row["age"] + 15 - row["yrscontribution"]
    # Wave 8
    else:
        if row["yrscontribution"] + 65 - row["age"] >= 37:
            return 65
        elif row["yrscontribution"] + 65.83 - row["age"] >= 15:
            return 65.83
        else:
            return row["age"] + 15 - row["yrscontribution"]

In [31]:
def switzerland(row):
    # Male
    if row["gender"] == "Male":
        return 65

    # Female
    else:
        return 63

In [32]:
country_functions = {
    "Austria": austria,
    "Belgium": belgium,
    "Czech Republic": czech_republic,
    "Denmark": denmark,
    "Estonia": estonia,
    "France": france,
    "Germany": germany,
    "Greece": greece,
    "Italy": italy,
    "Luxembourg": luxembourg,
    "Netherlands": netherlands,
    "Poland": poland,
    "Portugal": portugal,
    "Slovenia": slovenia,
    "Spain": spain,
    "Switzerland": switzerland,
}


def calculate_retirement_age(row):
    country = row["country"]
    if country in country_functions:
        return country_functions[country](row)
    else:
        return None

In [33]:
# Calculate retirement age
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset["retirement_age"] = dataset.apply(calculate_retirement_age, axis=1)
    # Delete those who are above the retirement age (continue to work longer)
    dataset = dataset[dataset["retirement_age"] > dataset["age"]]
    globals()[dataset_name] = dataset

In [34]:
combined_data = pd.DataFrame()
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    combined_data = pd.concat([combined_data, dataset])

In [35]:
combined_data.groupby("country").retirement_age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Austria,3916.0,62.628192,2.504282,60.0,60.0,65.0,65.0,70.0
Belgium,10457.0,64.900736,0.370457,63.0,65.0,65.0,65.0,65.0
Czech Republic,4211.0,61.488806,1.798667,56.0,60.0,62.17,62.83,70.0
Denmark,11084.0,65.045832,0.20913,65.0,65.0,65.0,65.0,66.0
Estonia,7597.0,62.748782,0.7149,61.0,62.5,63.0,63.25,71.0
France,7224.0,62.845493,2.264551,60.0,61.67,62.0,65.0,67.0
Germany,9377.0,65.200309,0.362124,65.0,65.08,65.17,65.25,67.0
Greece,2001.0,63.411294,3.756362,51.0,62.0,64.0,67.0,67.0
Italy,6907.0,64.909593,2.094435,60.0,63.75,66.0,66.25,74.0
Luxembourg,1196.0,65.476212,0.364875,65.08,65.25,65.25,65.75,66.33


### Calculate resting work horizon

In [59]:
w12_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w12.csv"
)
w24_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w24.csv"
)
w45_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w45.csv"
)
w56_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w56.csv"
)
w67_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w67.csv"
)
w78_short = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w78.csv"
)

  w12_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w12.csv')
  w24_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w24.csv')
  w45_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w45.csv')
  w56_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w56.csv')
  w67_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w67.csv')
  w78_short = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w78.csv')


In [24]:
w12 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w12.csv"
)
w24 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w24.csv"
)
w45 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w45.csv"
)
w56 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w56.csv"
)
w67 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w67.csv"
)
w78 = pd.read_csv(
    "/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w78.csv"
)

  w12 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w12.csv')
  w24 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w24.csv')
  w45 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w45.csv')
  w56 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w56.csv')
  w67 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w67.csv')
  w78 = pd.read_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/w78.csv')


In [25]:
dataset_names = ["w12", "w24", "w45", "w56", "w67", "w78"]
dataset_names_short = [
    "w12_short",
    "w24_short",
    "w45_short",
    "w56_short",
    "w67_short",
    "w78_short",
]

In [74]:
value_counts.reset_index()[value_counts.reset_index()["index"] == "AT-011464-01"]

Unnamed: 0,index,mergeid


In [60]:
for dataset_name in dataset_names_short:
    dataset = globals()[dataset_name]
    print(f"Unique ids in {dataset_name} full: {dataset.mergeid.nunique()}")
    value_counts = dataset["mergeid"].value_counts()
    dataset = dataset[dataset["mergeid"].isin(value_counts.index[value_counts == 2])]
    print(f"Filtered for only present in both waves: {dataset.mergeid.nunique()}")
    print("------------------------------------------")

Unique ids in w12_short full: 5829
Filtered for only present in both waves: 2337
------------------------------------------
Unique ids in w24_short full: 8904
Filtered for only present in both waves: 2350
------------------------------------------
Unique ids in w45_short full: 11340
Filtered for only present in both waves: 5085
------------------------------------------
Unique ids in w56_short full: 11507
Filtered for only present in both waves: 6334
------------------------------------------
Unique ids in w67_short full: 10834
Filtered for only present in both waves: 7095
------------------------------------------
Unique ids in w78_short full: 8676
Filtered for only present in both waves: 3078
------------------------------------------


In [64]:
for dataset_name in dataset_names_short:
    dataset = globals()[dataset_name]
    dataset["work_horizon"] = dataset["retirement_age"] - dataset["age"]
    # dataset['work_horizon_change'] = dataset.groupby('mergeid')['work_horizon'].diff()
    globals()[dataset_name] = dataset

In [77]:
value_counts = w12_short["mergeid"].value_counts()
w12_short = w12_short[w12_short["mergeid"].isin(value_counts.index[value_counts == 2])]

In [81]:
w12_short[w12_short["mergeid"] == "AT-011464-01"]

Unnamed: 0,mergeid,hhid1,mergeidp1,coupleid1,country,language,adl,adl2,bmi,bmi2,...,yr1contribution,gender,nb_children,mbirth,yr1country,public_job,job_status,retirement_age,work_horizon_change,work_horizon
0,AT-011464-01,AT-011464-A,,,Austria,German (at),0.0,No adl limitations,24.622961044311523,18.5-24.9 - normal,...,1976,Male,0.0,3.0,1947.0,,,65.0,3,7.0
3559,AT-011464-01,,,,Austria,German (at),0.0,No adl limitations,25.648918,25-29.9 - overweight,...,1976,Male,0.0,3.0,1946.0,,Self-employed,65.0,3,4.0


In [82]:
w12_short["work_horizon_change"] = 0
for id in w12_short["mergeid"].unique():
    indexes = w12_short[w12_short.mergeid == id].index
    difference = abs(
        w12_short.loc[indexes[0], "work_horizon"]
        - w12_short.loc[indexes[1], "work_horizon"]
    )
    w12_short.loc[indexes, "work_horizon_change"] = difference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w12_short['work_horizon_change'] = 0


In [37]:
w12_short

Unnamed: 0,mergeid,hhid1,mergeidp1,coupleid1,country,language,adl,adl2,bmi,bmi2,...,yr1contribution,gender,nb_children,mbirth,yr1country,public_job,job_status,retirement_age,work_horizon,work_horizon_change
0,AT-011464-01,AT-011464-A,,,Austria,German (at),0.0,No adl limitations,24.622961044311523,18.5-24.9 - normal,...,1976,Male,0.0,3.0,1947.0,,,65.0,7.0,
1,AT-017298-01,AT-017298-A,AT-017298-02,AT-017298-01-02,Austria,German (at),0.0,No adl limitations,28.086259841918945,25-29.9 - overweight,...,1968,Male,2.0,6.0,1943.0,,,65.0,4.0,
2,AT-018838-02,AT-018838-A,AT-018838-01,AT-018838-01-02,Austria,German (at),0.0,No adl limitations,24.337480545043945,18.5-24.9 - normal,...,1965,Female,,9.0,1946.0,,,60.0,2.0,
3,AT-020412-01,AT-020412-A,AT-020412-02,AT-020412-01-02,Austria,German (at),0.0,No adl limitations,25.14285659790039,25-29.9 - overweight,...,1961,Male,1.0,12.0,1946.0,,,65.0,7.0,
4,AT-020412-02,AT-020412-A,AT-020412-01,AT-020412-01-02,Austria,German (at),0.0,No adl limitations,26.1065616607666,25-29.9 - overweight,...,1962,Female,,5.0,1948.0,,,60.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8161,NL-997920-01,,,,Netherlands,Dutch (nl),0.0,No adl limitations,26.892322540283203,25-29.9 - overweight,...,1966,Female,0.0,10.0,1951.0,No,Employee,65.0,9.0,3.0
8162,NL-997942-01,,,,Netherlands,Dutch (nl),0.0,No adl limitations,22.944087982177734,18.5-24.9 - normal,...,1967,Male,,3.0,1949.0,,Self-employed,65.0,7.0,3.0
8163,NL-998246-02,,,,Netherlands,Dutch (nl),1.0,1+ adl limitations,20.57613182067871,18.5-24.9 - normal,...,1968,Female,,12.0,1953.0,No,Employee,65.0,11.0,
8164,NL-999385-01,,,,Netherlands,Dutch (nl),0.0,No adl limitations,23.7955265045166,18.5-24.9 - normal,...,1975,Male,2.0,3.0,1953.0,,Employee,65.0,11.0,3.0


In [36]:
combined_data = combined_data.reset_index(drop=True)

for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset = dataset.reset_index(drop=True)
    globals()[dataset_name] = dataset

In [37]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8166 entries, 0 to 8165
Columns: 2913 entries, mergeid to retirement_age
dtypes: category(1072), float32(13), float64(40), int64(2), object(1786)
memory usage: 123.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11254 entries, 0 to 11253
Columns: 2913 entries, mergeid to retirement_age
dtypes: category(1072), float32(13), float64(40), int64(2), object(1786)
memory usage: 169.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16425 entries, 0 to 16424
Columns: 2913 entries, mergeid to retirement_age
dtypes: category(1072), float32(13), float64(39), int64(2), object(1787)
memory usage: 247.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17841 entries, 0 to 17840
Columns: 2913 entries, mergeid to retirement_age
dtypes: category(1072), float32(13), float64(39), int64(2), object(1787)
memory usage: 268.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17929 entries, 0 to 17928
Columns: 2913 entries, mergeid to reti

In [38]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    print(f"Unique ids in {dataset_name}:")
    print(dataset.mergeid.nunique())

Unique ids in w12:
5829
Unique ids in w24:
8904
Unique ids in w45:
11340
Unique ids in w56:
11507
Unique ids in w67:
10834
Unique ids in w78:
8676


In [8]:
for dataset_name in dataset_names:
    dataset = globals()[dataset_name]
    value_counts = dataset["mergeid"].value_counts()
    dataset = dataset[dataset["mergeid"].isin(value_counts.index[value_counts == 2])]
    print(f"Unique ids in {dataset_name}:")
    print(dataset.mergeid.nunique())

Unique ids in w12:
2337
Unique ids in w24:
2350
Unique ids in w45:
5085
Unique ids in w56:
6334
Unique ids in w67:
7095
Unique ids in w78:
3078


In [41]:
combined_data.mergeid.nunique()

18792

In [39]:
# combined_data.to_csv('/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/all_waves.csv',index=False)

# for dataset_name in dataset_names:
#    dataset = globals()[dataset_name]
#    dataset.to_csv(f'/Volumes/GoogleDrive/Mon Drive/USMB/MH of older workers/Data/datasets/{dataset_name}.csv',index=False)