In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Apply the "ggplot" matplotlib style to every figure drawn in this notebook.
plt.style.use("ggplot")
# Render matplotlib figures inline in the cell output.
%matplotlib inline

### Import datasets

In [2]:
# Load the raw biography tables, one CSV file per table.
# BIOIMMIG: its variables relate to foreigners in (and migrants to) Germany.
biobirth, bioparen, bioimmig, biojob = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/biobirth.csv",
        "data/bioparen.csv",
        "data/bioimmig.csv",
        "data/biojob.csv",
    )
)

# Table of the variables chosen for the analysis; its 'dataset' and
# 'variable' columns are read later in the notebook.
vars_set = pd.read_csv("data/Variables_set.csv")

In [3]:
# Tables that are only available for later years:
#   biosoc   - data on youth and socialization, only from year 2000
#   biopupil - Pre-Teen Questionnaire, only from 2014
biosoc, biopupil = (
    pd.read_csv(csv_path)
    for csv_path in ("data/biosoc.csv", "data/biopupil.csv")
)

### Select the variables needed

In [4]:
# Names of the source tables. `varset[k]` holds the variable list that
# belongs to `dataset[k]`, so the order here matters to later cells that
# index `varset` by position.
dataset = ['biobirth', 'bioparen', 'bioimmig', 'biojob', 'biosoc', 'biopupil']

# For each table name, collect the entries of the 'variable' column of
# `vars_set` whose 'dataset' column equals that name.
# The list is built directly from `dataset`, so its length always matches
# the number of table names (the previous version pre-allocated exactly six
# slots and would raise IndexError if a seventh name were added).
varset = [
    vars_set.loc[vars_set['dataset'] == table_name, 'variable'].tolist()
    for table_name in dataset
]

In [5]:
# Reduce every table to its selected variables. The index into `varset`
# follows the order of the `dataset` list:
#   0 biobirth, 1 bioparen, 2 bioimmig, 3 biojob, 4 biosoc, 5 biopupil
biobirth = biobirth.loc[:, varset[0]]
bioparen = bioparen.loc[:, varset[1]]
bioimmig = bioimmig.loc[:, varset[2]]
biojob = biojob.loc[:, varset[3]]

# The next two tables only cover later years (biosoc: from 2000).
biosoc = biosoc.loc[:, varset[4]]
biopupil = biopupil.loc[:, varset[5]]

# NOTE(review): a rename of biosoc's 'syear' column to 'bioyear' was tried
# here and left disabled, so no column is renamed in this cell.

In [6]:
# Report the (rows, columns) shape of each table after variable selection.
print(f"biobirth:{biobirth.shape}")
print(f"bioparen:{bioparen.shape}")
print(f"bioimmig:{bioimmig.shape}")
print(f"biojob:{biojob.shape}")

print(f"biopupil:{biopupil.shape}")
print(f"biosoc:{biosoc.shape}")

biobirth:(88853, 7)
bioparen:(91790, 19)
bioimmig:(206340, 20)
biojob:(91790, 26)
biopupil:(4861, 37)
biosoc:(57411, 45)


### Merge datasets

In [7]:
# Inner-join biobirth with bioparen on the shared keys (cid, pid, bioyear),
# then keep only rows whose 'bioage' lies between 20 and 35 (both bounds
# included, the default of Series.between).
dfMerge2 = (
    biobirth
    .merge(bioparen, how='inner', on=['cid', 'pid', 'bioyear'])
    .loc[lambda merged: merged['bioage'].between(20, 35)]
)
print(f"dfMerge2:{dfMerge2.shape}")

dfMerge2:(17560, 23)


In [8]:
# Inner-join biojob (left side) with the age-restricted frame dfMerge2
# (right side) on the same three keys.
dfMerge3 = biojob.merge(
    dfMerge2,
    how='inner',
    on=['cid', 'pid', 'bioyear'],
)
print(f"dfMerge3:{dfMerge3.shape}")

dfMerge3:(15261, 46)


In [9]:
# Right-join bioimmig onto dfMerge3 by (cid, pid): every dfMerge3 row is
# kept, with bioimmig columns attached where a match exists.
# NOTE(review): the recorded shapes show 15261 rows in dfMerge3 but 34846
# rows here, so some (cid, pid) pairs match several bioimmig rows. Confirm
# that this row multiplication is intended before analysing dfMerge4.
dfMerge4 = bioimmig.merge(
    dfMerge3,
    how='right',
    on=['cid', 'pid'],
)
print(f"dfMerge4:{dfMerge4.shape}")

dfMerge4:(34846, 64)


In [19]:
# Control group 1: rows of dfMerge3 where neither 'forigin' nor 'morigin'
# equals 1 (per the original author's note, parents whose country is not
# German).
# NOTE(review): the recorded preview of this subset shows negative codes
# such as -2 in both columns; those also satisfy `!= 1`. If negative values
# are missing-value codes, they probably should not count as "not German".
# Confirm against the codebook.
df_c1 = dfMerge3.loc[
    lambda frame: frame['forigin'].ne(1) & frame['morigin'].ne(1)
]

In [20]:
# Show the (rows, columns) shape of the first control subset.
df_c1.shape

(9943, 46)

In [21]:
# Preview the four parental code columns of the first control subset
# (forigin/morigin and fnat/mnat).
df_c1.loc[:, ['forigin', 'morigin', 'fnat', 'mnat']]

Unnamed: 0,forigin,morigin,fnat,mnat
1,-2,-2,-8,-8
2,-2,-2,-8,-8
3,-2,-2,-8,-8
4,-2,-2,-8,-8
6,-2,-2,-8,-8
...,...,...,...,...
15256,-2,30,-5,2
15257,-2,-2,-5,-5
15258,-2,-2,-5,-5
15259,-2,-2,-5,-5


In [18]:
# Control group 2: rows of dfMerge3 where 'forigin' and 'morigin' are both
# different from 1 while 'fnat' and 'mnat' are both equal to 1 (per the
# original author's note: parents' country not German, nationality now
# German).
df_c2 = dfMerge3.loc[
    lambda frame: (
        frame['forigin'].ne(1)
        & frame['morigin'].ne(1)
        & frame['fnat'].eq(1)
        & frame['mnat'].eq(1)
    )
]
df_c2.shape

(2469, 46)

### Descriptive statistics

In [8]:
# Scatter plot of a y column against 'syear', coloured by 'morigin', with the
# per-'syear' maximum of the same y column overlaid as a black line.
#
# NOTE(review): this cell cannot run as written and needs the author's input:
#   * `df` is not defined in any cell visible in this notebook; decide which
#     merged frame is meant and reference it explicitly.
#   * The y column is an empty string both in `scatterplot(y='')` and in
#     `groupby('syear')['']`; fill in the intended column name in both places.
#   * The name `max_transistors_per_year` does not match anything else in
#     this notebook; rename it once the y column is known.
#   * The recorded output under this cell is a DataFrame table, not a figure,
#     so it does not come from this code. Re-run after fixing.
plt.figure(figsize=(15, 10))
ax = sns.scatterplot(x='syear',
                     y='',  # TODO: y column missing
                     hue='morigin',
                     legend='full',
                     data=df,
                     # one Set1 colour per distinct 'morigin' value
                     palette=sns.color_palette("Set1", n_colors=len(df.morigin.unique())))
# Per-year maximum of the (still unspecified) y column.
max_transistors_per_year = df.groupby('syear')[''].max()
# Draw the yearly maximum on the same axes as the scatter plot.
sns.lineplot(data=max_transistors_per_year,
             ax=ax.axes,
             color='black')
# Restrict the x axis to 2006-2021.
ax.set_xlim(2006, 2021)
plt.show()

Unnamed: 0,hhnrakt,cid,persnr,syear,hhnr,hid,pid,biimgrp,biresper,bicamp,...,bifamc,bifamcl,birbetr,birmoney,birfree,birfam,birpoor,birwar,birjust,birothr
0,175,175,1701,1984,175,175,1701,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,213,213,2101,1984,213,213,2101,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
2,221,221,2202,1984,221,221,2202,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
3,230,230,2302,1984,230,230,2302,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
4,329,329,3201,1984,329,329,3201,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206335,3923584,3923584,39235804,2018,3923584,3923584,39235804,5,-5,1,...,-2,-5,-2,-2,-2,-2,-2,-2,-2,-2
206336,3923614,3923614,39236101,2018,3923614,3923614,39236101,5,2,1,...,-5,-5,-2,-2,-2,-2,-2,-2,-2,-2
206337,3923746,3923746,39237402,2018,3923746,3923746,39237402,5,-5,1,...,-2,-5,-2,-2,-2,-2,-2,-2,-2,-2
206338,3923800,3923800,39238001,2018,3923800,3923800,39238001,5,2,1,...,-5,-5,-2,-2,-2,-2,-2,-2,-2,-2
