In [1]:
# relevant imports
import pandas as pd
import numpy as np

# Reading the SOEP Household data

Because the individual questionnaire file is quite big, we merge it stepwise with the weather and household data. We therefore read the household (hh) files first. All the required hh data is spread across two different data sets, which are merged in the following:

In [2]:
# File locations of the two household-level SOEP-CORE v37 tables (CSV export).
hbrutto_path, hl_path = (
    "./data/SOEP-CORE.v37eu_CSV/CSV/hbrutto.csv",  # interview metadata
    "./data/SOEP-CORE.v37eu_CSV/CSV/hl.csv",  # household responses, long format
)

`hbrutto` contains meta data about the interviews. This includes data such as "day of interview", "location", etc. The `hl` file contains the interview responses of all waves in a long format.

The following lists the columns of interest within the `hl` and `hbrutto` files, together with their meaning:

In [3]:
# Columns shared by both household tables. Together they identify one
# household in one survey wave and are used as the merge key.
_hh_keys = ["syear", "hid"]  # survey year, household id

# Columns to load from `hl`: the merge keys followed by the household-level
# control variables.
hl_var = _hh_keys + [
    "hlc0005_h",  # monthly net household income [harmonised]
    "hlc0043",  # number of children
    "hlf0001_h",  # home ownership
    "hlk0056",  # type of interview
]

# Columns to load from `hbrutto`: the merge keys plus the household location.
hbrutto_var = {
    *_hh_keys,
    "bula_h",  # location -> federal state (Bundesland)
}

The two files are now read into their respective dataframes.

In [4]:
# Load only the selected columns of the household response table (`hl`).
df_hl = pd.read_csv(
    filepath_or_buffer=hl_path,
    usecols=hl_var,
)

In [5]:
# Load only the selected columns of the interview metadata table (`hbrutto`).
df_hbrutto = pd.read_csv(
    filepath_or_buffer=hbrutto_path,
    usecols=hbrutto_var,
)

The two dataframes can now be merged on their primary key. According to the documentation of the db the prim. keys are `hid` and `syear`.

In [6]:
# Attach the interview metadata to the household responses. The inner join
# keeps only the (hid, syear) combinations that occur in both tables.
df_hh = pd.merge(
    df_hl,
    df_hbrutto,
    on=["hid", "syear"],
    how="inner",
)

# Reading the SOEP Individual Data

Now we read the individual data. This dataset includes the target variable *SWB* as well as many other control variables. The datasets that contain the information used in this analysis are `ppathl` (tracking file), `pequiv` (additional covariates), and `pl` (data).

In [7]:
# File locations of the three person-level SOEP-CORE v37 tables (CSV export).
ppathl_path, pequiv_path, pl_path = (
    "./data/SOEP-CORE.v37eu_CSV/CSV/ppathl.csv",
    "./data/SOEP-CORE.v37eu_CSV/CSV/pequiv.csv",
    "./data/SOEP-CORE.v37eu_CSV/CSV/pl.csv",
)

In [8]:
# Key that identifies one person in one survey wave. It is part of every
# person-level column selection below.
_person_keys = ("pid", "syear")  # person id, survey year

# Columns to load from `ppathl`.
ppathl_var = {
    *_person_keys,
    # relevant covariates
    "sex",  # gender [1] female [2] male
    # NOTE(review): confirm this coding against the SOEP codebook; it may be
    # the other way round.
    "gebjahr",  # year of birth
    "partner",  # [0] no partner, [1] spouse, [2] partner,
    #             [3] probably spouse, [4] probably partner
    # NOTE: join 1&3 and 2&4
}

# Columns to load from `pequiv`.
pequiv_var = {
    *_person_keys,
    "d11109",  # years of education (numerical)
    # NOTE: many nan values
    "m11124",  # disability status: [0] not disabled, [1] disabled
    "e11103",  # labor participation: [1] full time, [2] part time, [3] not working
}

# Columns to load from `pl`.
pl_var = {
    # ids
    *_person_keys,
    "hid",  # household id -> key for the merge with the household data
    # target variable
    "plh0182",  # current life satisfaction [0-10]
    # relevant covariates
    "ptagin",  # day of interview
    "pmonin",  # month of interview
    "plh0171",  # current health: [1-5] (1=bad, 10=good)
    # NOTE(review): the stated range [1-5] contradicts the "10=good" end point;
    # confirm the scale in the SOEP codebook.
    "plb0021",  # registered as unemployed: [2] no, [1] yes
    "plh0173",  # satisfaction with work: [0-10] not satisfied <-> very satisfied
    "plh0174",  # satisfaction with housework: [0-10] not satisfied <-> very satisfied
    # NOTE: take max of plh0173 and plh0174
    "plh0175",  # satisfaction with household income
}

The `ppathl` file contains the tracking data of a person. This includes, for instance, the age or marital status.

In [9]:
# Load the selected columns of the person tracking file (`ppathl`).
df_ppathl = pd.read_csv(ppathl_path, usecols=ppathl_var)
# Fold the "probably" codes into their definite counterparts:
# [3] probably spouse -> [1] spouse, [4] probably partner -> [2] partner.
# Only the `partner` column is rewritten. Assigning through a bare boolean mask
# (`df[mask] = 1`) would overwrite EVERY column of the matching rows, including
# the `pid`/`syear` keys, so those persons would silently drop out of the later
# inner merges.
df_ppathl["partner"] = df_ppathl["partner"].replace({3: 1, 4: 2})

In [10]:
# Load only the selected columns of the `pequiv` table.
df_pequiv = pd.read_csv(
    filepath_or_buffer=pequiv_path,
    usecols=pequiv_var,
)

In the following we merge all the dataframes loaded into memory.

In [12]:
# Build the long-format person-level dataset: load the individual responses,
# attach the tracking, pequiv and household data, derive age and interview
# date, and write the result to disk.
soep = pd.read_csv(pl_path, usecols=pl_var)

# Satisfaction with work (plh0173) and with housework (plh0174) are treated as
# one measure: keep the larger of the two per respondent.
# `axis=1` is essential: without it `.max()` reduces over the rows and returns
# a two-element Series indexed by column name, which aligns with none of the
# row labels on assignment and turns the whole column into NaN.
soep["plh0173"] = soep[["plh0173", "plh0174"]].max(axis=1)
soep = soep.drop(columns="plh0174")

## MERGE WITH OTHER DATASETS
# Every merge is an inner join, so rows without a match are dropped.
# merge with tracking data
soep = soep.merge(df_ppathl, on=["syear", "pid"], how="inner")
# merge with pequiv (education, disability and labor participation covariates)
# TODO: document what the pequiv table is
soep = soep.merge(df_pequiv, on=["syear", "pid"], how="inner")
# merge with household
soep = soep.merge(df_hh, on=["syear", "hid"], how="inner")

## CALCULATE RELEVANT VARIABLES
# age at the time of the survey
# NOTE(review): presumably `gebjahr` can hold missing-value codes; if so, the
# resulting ages are meaningless for those rows -- confirm and filter if needed.
soep["age"] = soep["syear"] - soep["gebjahr"]
# time stamp: pd.to_datetime assembles a date from columns named
# year/month/day; combinations that form no valid date become NaT.
interview_date_parts = soep[["syear", "pmonin", "ptagin"]].rename(
    columns={"syear": "year", "pmonin": "month", "ptagin": "day"}
)
soep["time"] = pd.to_datetime(interview_date_parts, errors="coerce")
# drop the columns that are no longer needed
soep = soep.drop(columns=["syear", "pmonin", "ptagin", "gebjahr"])
# delete invalid time stamps as they cannot be merged with climate data
soep = soep[soep["time"].notna()]

## SAVE DATAFRAME
soep.to_csv("./prod/soeplong.csv", index=False)