In [1]:
#reading datasets
import numpy as np
import pandas as pd
# Load the four Stata extracts in a fixed order. The last file, "jugendl.dta",
# is bound to the name `jungendl` (sic) because a later cell refers to it by
# that spelling.
pgen, biol, pequiv, jungendl = (
    pd.read_stata(stata_file)
    for stata_file in ("pgen.dta", "biol.dta", "pequiv.dta", "jugendl.dta")
)

In [2]:
# Years of education of the parents: keep the identifier columns
# (cid, pid, syear) together with `pgbilzeit`, which is exposed under the
# readable name `years_of_education`.
pgen_relevant = pgen[['cid', 'pid', 'syear', 'pgbilzeit']].rename(
    columns={'pgbilzeit': 'years_of_education'}
)


In [3]:
# Gender of the parents: keep the identifier columns (cid, pid, syear) plus
# `d11102ll`, renamed to `sex_of_parents`.
pequiv_relevant = pequiv[['cid', 'pid', 'syear', 'd11102ll']].rename(
    columns={'d11102ll': 'sex_of_parents'}
)


In [4]:
# Migration-related controls of the parents, taken from `biol`.
# Note (from the original author): one of these variables has only a small
# number of valid observations.
biol_column_names = {
    'lb0011': 'year_of_birth',
    'lb0013': 'born_in_germany',
    'lb0014': 'german_nationality',
}
biol_relevant = biol[['cid', 'pid', 'syear', *biol_column_names]].rename(
    columns=biol_column_names
)


In [5]:
# Attach years of education to the gender records. The left join keeps every
# row of `pequiv_relevant`, matched on household, person and survey year.
parents = pd.merge(
    pequiv_relevant,
    pgen_relevant,
    on=['cid', 'pid', 'syear'],
    how='left',
)

In [6]:
# Add the biography-based controls. The left join keeps every row of
# `parents`; rows without a match on (cid, pid, syear) get NaN in the new
# columns.
parents_controls = pd.merge(
    parents,
    biol_relevant,
    on=['cid', 'pid', 'syear'],
    how='left',
)

In [7]:
# Restrict the parent records to rows whose `cid` also occurs in the
# `jungendl` dataset.
# NOTE(review): the match is on `cid`, not `pid` -- confirm this is the
# intended linking key.
childids = np.asarray(jungendl['cid']).tolist()
parents_controls_drop = parents_controls.loc[
    parents_controls['cid'].isin(childids)
]

In [8]:
# Turn missing-value codes into NaN. Only the codes listed below are replaced:
# the numeric codes -1, -2 and -8 and the labelled answers spelled out here.
# Any other negative code is left untouched.
#
# `np.nan` is used directly: the `pd.np` alias was deprecated in pandas 1.0
# and removed in pandas 2.0, so `pd.np.nan` raises AttributeError on current
# pandas versions.
dict_n = dict.fromkeys(
    [
        '[-1] keine Angabe',
        '[-8] Frage in diesem Jahr nicht Teil des Frageprograms',
        -2,
        -8,
        -1,
        '[-5] In Fragebogenversion nicht enthalten',
        '[-2] trifft nicht zu',
    ],
    np.nan,
)
parents_control = parents_controls_drop.replace(dict_n)

In [11]:
# Recode labelled answers into numeric indicators, which are easier to work
# with in the analysis. Intended coding:
#   Sex of parents:      0 = male, 1 = female
#   Born_in_germany:     1 = Yes,  0 = No
#   German Nationality:  1 = Yes,  0 = No
#
# '[2] Nein' is mapped to 0 (it used to be 2) so that the result matches the
# 1 = Yes / 0 = No coding documented above and the parallel
# '[2] ausserhalb Deutschlands' -> 0 mapping.
# NOTE(review): presumably the Ja/Nein labels come from `german_nationality`;
# confirm no other column relies on Nein being coded as 2.
dict_v = {
    '[1] Deutschland': 1,
    '[2] ausserhalb Deutschlands': 0,
    '[1] Male           1': 0,
    '[2] Female         2': 1,
    '[1] Ja': 1,
    '[2] Nein': 0,
}
parents_control_replace = parents_control.replace(dict_v)
parents_control_replace

Index(['cid', 'pid', 'syear', 'sex_of_parents', 'years_of_education',
       'year_of_birth', 'born_in_germany', 'german_nationality'],
      dtype='object')

In [10]:
# Persist the cleaned parent controls as CSV. With pandas' default settings
# the DataFrame index is written as the first column.
parents_control_replace.to_csv(path_or_buf="parent_control.csv")

In [22]:
# Build the mother-side view of the parent controls: drop `cid`, keep `syear`
# unchanged, and give every other column a mother-specific name.
# NOTE(review): rows are not filtered on `sex_of_parents` here, so this frame
# still contains every parent record -- presumably it is narrowed down later
# by joining on `pid_mother`; confirm.
mother_column_names = {
    'pid': 'pid_mother',
    'sex_of_parents': 'sex_of_mother',
    'years_of_education': 'years_of_education_mother',
    'year_of_birth': 'year_of_birth_mother',
    'born_in_germany': 'mother_born_in_germany',
    'german_nationality': 'mother_german_nationality',
}
mother_controls = parents_control_replace[
    ['pid', 'syear', 'sex_of_parents', 'years_of_education',
     'year_of_birth', 'born_in_germany', 'german_nationality']
].rename(columns=mother_column_names)

In [23]:
# Write the mother-side controls to disk. With pandas' default settings the
# DataFrame index is written as the first column.
mother_controls.to_csv(path_or_buf="mother_controls.csv")

In [24]:
# Build the father-side view of the parent controls: drop `cid`, keep `syear`
# unchanged, and give every other column a father-specific name.
# NOTE(review): rows are not filtered on `sex_of_parents` here, so this frame
# still contains every parent record -- presumably it is narrowed down later
# by joining on `pid_father`; confirm.
father_column_names = {
    'pid': 'pid_father',
    'sex_of_parents': 'sex_of_father',
    'years_of_education': 'years_of_education_father',
    'year_of_birth': 'year_of_birth_father',
    'born_in_germany': 'father_born_in_germany',
    'german_nationality': 'father_german_nationality',
}
father_controls = parents_control_replace[
    ['pid', 'syear', 'sex_of_parents', 'years_of_education',
     'year_of_birth', 'born_in_germany', 'german_nationality']
].rename(columns=father_column_names)

In [25]:
# Write the father-side controls to disk. With pandas' default settings the
# DataFrame index is written as the first column.
father_controls.to_csv(path_or_buf="father_controls.csv")