In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.style.use("ggplot")
%matplotlib inline

### Import datasets

In [2]:
# Variable catalogue: each row pairs a 'dataset' name with a 'variable'
# (column) name; it is used below to decide which columns to load per file.
vars_set = pd.read_csv("data/Test_varset.csv")

In [3]:
# Names of the source tables, in the order used to index `varset`.
dataset = ['biobirth', 'bioparen', 'bioimmig', 'biojob', 'biosoc', 'biopupil', 'biol']

# varset[k] holds the list of variable names catalogued for dataset[k].
varset = [
    vars_set.loc[vars_set['dataset'] == name, 'variable'].tolist()
    for name in dataset
]

In [4]:
# Load the first four tables, reading only the columns catalogued for each.
# The variables contained in BIOIMMIG relate to foreigners in (and migrants to) Germany.
biobirth, bioparen, bioimmig, biojob = (
    pd.read_csv(path, usecols=cols)
    for path, cols in (
        ("data/biobirth.csv", varset[0]),
        ("data/bioparen.csv", varset[1]),
        ("data/bioimmig.csv", varset[2]),
        ("data/biojob.csv", varset[3]),
    )
)

In [5]:
# Not used
#biosoc = pd.read_csv("data/biosoc.csv", usecols=varset[4]) # data on youth and socialization only from year 2000 
#biopupil = pd.read_csv("data/biopupil.csv", usecols=varset[5])  #Pre-Teen Questionnaire only from 2014 

In [5]:
# Load the BIOL table with only its catalogued columns
# (index 6 is the position of 'biol' in `dataset`).
biol = pd.read_csv("data/biol.csv", usecols=varset[6])

### Keep only the variables needed

In [6]:
# Report (rows, columns) of every loaded table as a quick sanity check.
print(f"biobirth:{biobirth.shape}")
print(f"bioparen:{bioparen.shape}")
print(f"bioimmig:{bioimmig.shape}")
print(f"biojob:{biojob.shape}")
print(f"biol:{biol.shape}")
# biopupil and biosoc are not loaded, so their shapes are not reported:
#print(f"biopupil:{biopupil.shape}")
#print(f"biosoc:{biosoc.shape}")

biobirth:(88853, 7)
bioparen:(91790, 14)
bioimmig:(206340, 20)
biojob:(91790, 26)
biol:(116499, 12)


### Merge datasets

In [7]:
# Inner join: keep only rows whose (cid, pid, syear) key appears in both
# BIOL and BIOIMMIG.
biol_immig = biol.merge(bioimmig, how='inner', on=['cid', 'pid', 'syear'])
print(f"biol_immig:{biol_immig.shape}")

biol_immig:(56589, 29)


In [8]:
# Derived column: survey year minus lb0011_h.
# NOTE(review): lb0011_h looks like the year of birth, which would make this
# the respondent's age at the survey — confirm against the codebook.
biol_immig['bioage'] = biol_immig['syear'] - biol_immig['lb0011_h']

In [9]:
# Restrict to rows whose bioage lies between 20 and 35 (both bounds inclusive).
biol_immig = biol_immig.loc[biol_immig['bioage'].between(20, 35)]

In [10]:
# Rename the survey-year column to 'bioyear', the key name used by the merge
# with bioparen further down. Reassigning (instead of inplace=True) avoids
# mutating a frame obtained by boolean filtering and keeps the step explicit.
biol_immig = biol_immig.rename(columns={"syear": "bioyear"})

In [12]:
# Frequency of each biresper code in the age-filtered BIOL/BIOIMMIG merge.
# NOTE(review): negative values appear to be missing-value codes — confirm.
biol_immig['biresper'].value_counts()

 2    8579
-2    4568
-5    4138
 1    2217
-1     124
Name: biresper, dtype: int64

In [35]:
# Attach the parental information: inner join on (cid, pid, bioyear).
biol_immig_paren = biol_immig.merge(bioparen, how='inner', on=['cid', 'pid', 'bioyear'])

In [48]:
# String copy of the biimgrp codes; kept as a separate column so the codes
# can be swapped for text labels without touching the original values.
biol_immig_paren['immg_gr']=biol_immig_paren['biimgrp'].astype(str)

In [49]:
# Translate the stringified biimgrp codes in 'immg_gr' into readable labels.
# The result is assigned back to the column: calling
# df['col'].replace(..., inplace=True) is chained assignment, which acts on a
# temporary Series and leaves the DataFrame unchanged under pandas
# Copy-on-Write (and emits a warning in pandas 2.x).
# Codes without an entry in the mapping are left as they are.
biol_immig_paren['immg_gr'] = biol_immig_paren['immg_gr'].replace(
    {"-6": "Version of questionnaire with modified filtering",
     "-5": "Not included in this version of the questionnaire",
     "-4": "Inadmissible multiple response",
     "-3": "Answer improbable",
     "-2": "Does not apply",
     "-1": "No Answer",
     "2": "Person Of German Descent From Eastern Europe",
     "3": "German Who Lived Abroad",
     "4": "Citizen Of EU Country (up to 2009 EC)",
     "5": "Asylum seeker, refugee",
     "6": "Other Foreigner"
    }
)

In [51]:
# Number of rows per immigrant-group label after the code-to-label replacement.
#biol_immig_paren['biimgrp'].astype('category')
biol_immig_paren['immg_gr'].value_counts()

Asylum seeker, refugee                               4869
Does not apply                                       3420
Person Of German Descent From Eastern Europe         1339
Other Foreigner                                      1151
Citizen Of EU Country (up to 2009 EC)                1067
Not included in this version of the questionnaire     862
German Who Lived Abroad                                89
No Answer                                              44
Name: immg_gr, dtype: int64

In [55]:
# Frequency of each fnat code.
# NOTE(review): presumably the father's nationality, with negative values as
# missing-value codes — confirm against the codebook.
biol_immig_paren['fnat'].value_counts()

-5    4404
 2    4006
 1    3464
-8     543
-2     346
-1      78
Name: fnat, dtype: int64

In [56]:
# Frequency of each forigin code.
# NOTE(review): presumably the father's country of origin — confirm against
# the codebook.
biol_immig_paren['forigin'].value_counts()

-2      7074
 1      1927
 2       518
 32      354
 22      328
        ... 
 36        1
 138       1
 114       1
 98        1
 155       1
Name: forigin, Length: 124, dtype: int64

In [57]:
# Rows where morigin is one of the codes 1, -2, -5 and fnat equals 1.
test = biol_immig_paren.loc[
    biol_immig_paren['morigin'].isin([1, -2, -5])
    & biol_immig_paren['fnat'].eq(1)
]

Unnamed: 0,pid,hid_x,cid,bioyear,pnr,lb0011_h,lb0014,lb0092,lb0093,lb0096,...,fprofstat,mprofstat,morigin,forigin,sibl,nums,numb,twin,siblup,immg_gr
0,413603,41360,41360,1984,1,1963,-2,-2,1,-2,...,530,10,1,1,-2,-2,-2,-2,-2,Does not apply
131,7018303,701831,701831,1994,3,1965,1,-2,-2,-2,...,13,-5,-2,32,-2,-2,-2,-2,-2,Person Of German Descent From Eastern Europe
596,2179603,217964,217964,1999,3,1967,1,-2,-2,-2,...,13,13,1,1,1,1,2,3,2003,Does not apply
619,660904,66095,21636,2000,4,1967,-2,-8,-8,-8,...,220,10,-2,-2,1,0,1,3,2003,Person Of German Descent From Eastern Europe
624,974202,97420,24473,2000,2,1969,1,-8,-8,-8,...,432,620,-2,-2,1,1,2,3,2003,Does not apply
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12603,33480102,3348015,3182915,2018,2,1998,-5,-8,-8,-8,...,522,10,1,1,1,1,6,3,2018,Not included in this version of the questionnaire
12606,33482002,3348201,3250180,2018,2,1991,-5,-8,-8,-8,...,424,530,1,1,1,0,1,2,2018,Not included in this version of the questionnaire
12607,33482702,3348279,3192929,2018,2,1996,-5,-8,-8,-8,...,630,540,1,1,1,0,1,3,2018,Not included in this version of the questionnaire
12610,34089802,3408980,3408980,2018,2,1988,-5,-8,-8,-8,...,630,522,1,1,1,3,1,3,2018,Not included in this version of the questionnaire


In [59]:
# Rows where morigin equals 2 and fnat equals 1; displayed for inspection.
test = biol_immig_paren.loc[
    biol_immig_paren['morigin'].eq(2)
    & biol_immig_paren['fnat'].eq(1)
]
test

Unnamed: 0,pid,hid_x,cid,bioyear,pnr,lb0011_h,lb0014,lb0092,lb0093,lb0096,...,fprofstat,mprofstat,morigin,forigin,sibl,nums,numb,twin,siblup,immg_gr
1430,1328402,132845,58190,2008,2,1985,-8,-8,-8,-8,...,434,530,2,2,1,1,1,3,2008,Does not apply
1509,8630601,863068,863068,2010,1,1976,-8,-8,-8,-8,...,10,10,2,2,1,4,6,3,2010,"Asylum seeker, refugee"
1511,8631703,863173,863173,2010,3,1988,-8,-8,-8,-8,...,10,530,2,2,1,1,0,3,2010,Does not apply
1534,8783401,878340,878340,2010,1,1979,-8,-8,-8,-8,...,220,10,2,2,1,2,4,3,2010,Does not apply
1559,8900101,890014,890014,2010,1,1986,-8,-8,-8,-8,...,434,530,2,2,1,1,1,3,2010,Other Foreigner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12320,22534901,2253499,2253499,2018,1,1995,2,-8,-8,-8,...,220,210,2,1,1,0,2,3,2018,Does not apply
12374,22866101,2286613,2286613,2018,1,1989,1,-8,-8,-8,...,434,521,2,2,1,3,2,3,2018,Does not apply
12423,23193001,2319309,2319309,2018,1,1986,1,-8,-8,-8,...,521,10,2,2,1,0,3,3,2018,Does not apply
12485,23635701,2363570,2363570,2018,1,1995,1,-8,-8,-8,...,522,10,2,2,1,0,3,3,2018,Does not apply


In [7]:
# Inner-join BIOBIRTH with BIOPAREN on (cid, pid, bioyear), then keep only
# rows whose bioage lies between 20 and 35 (both bounds inclusive).
dfMerge2 = (
    biobirth
    .merge(bioparen, how='inner', on=['cid', 'pid', 'bioyear'])
    .loc[lambda merged: merged['bioage'].between(20, 35)]
)
print(f"dfMerge2:{dfMerge2.shape}")

dfMerge2:(17560, 23)


In [8]:
# Add the BIOJOB columns: inner join with the previous merge on (cid, pid, bioyear).
dfMerge3 = biojob.merge(dfMerge2, how='inner', on=['cid', 'pid', 'bioyear'])
print(f"dfMerge3:{dfMerge3.shape}")

dfMerge3:(15261, 46)


In [9]:
#dfMerge4 = pd.merge(bioimmig,dfMerge3,on=['cid','pid'],how='right')
#print("dfMerge4:" + str(dfMerge4.shape))

In [12]:
# Control group 1: respondents for whom neither forigin nor morigin is one
# of the codes 1, -2, -5.
# NOTE(review): 1 is presumably Germany and the negative values missing-value
# codes, i.e. both parents have a known non-German origin — confirm.
df_c1 = dfMerge3[
    ~(dfMerge3['forigin'].isin([1, -2, -5]) | dfMerge3['morigin'].isin([1, -2, -5]))
]

In [13]:
# (rows, columns) of control group 1.
df_c1.shape

(2134, 46)

In [16]:
#Control for whose parent's country not German and now the nationality is German
df_c2 = dfMerge3[~(dfMerge3['forigin'].isin([1,-2,-5])) &
                 ~(dfMerge3['morigin'].isin([1,-2,-5])) &
                 (dfMerge3['fnat']==1) &
                 (dfMerge3['mnat']==1)
                ]
df_c2.shape

(408, 46)

In [17]:
# Show only the four columns used to build control group 2.
df_c2[['forigin','morigin','fnat','mnat']]

Unnamed: 0,forigin,morigin,fnat,mnat
156,32,32,1,1
306,3,22,1,1
1195,32,32,1,1
1219,32,32,1,1
1271,22,22,1,1
...,...,...,...,...
12333,74,74,1,1
12688,74,32,1,1
12951,74,74,1,1
13164,140,140,1,1


### Descriptive statistics

In [8]:
# Scatter plot of a per-year variable coloured by morigin, with a black line
# tracing the yearly maximum of that variable.
# NOTE(review): this cell cannot run as written:
#   * `df` is not defined anywhere in the visible notebook — decide which
#     merged frame is meant and reference it explicitly;
#   * the y column is the empty string '' (here and in the groupby below),
#     so the variable to plot still has to be filled in;
#   * `max_transistors_per_year` looks like a name left over from a copied
#     example and does not describe this data.
plt.figure(figsize=(15, 10))
ax = sns.scatterplot(x='syear',
                     y='',
                     hue='morigin',
                     legend='full',
                     data=df,
                     palette=sns.color_palette("Set1", n_colors=len(df.morigin.unique())))
# Yearly maximum of the (still unspecified) y variable.
max_transistors_per_year = df.groupby('syear')[''].max()
sns.lineplot(data=max_transistors_per_year,
             ax=ax.axes,
             color='black')
# Limit the x axis to survey years 2006-2021.
ax.set_xlim(2006, 2021)
plt.show()

Unnamed: 0,hhnrakt,cid,persnr,syear,hhnr,hid,pid,biimgrp,biresper,bicamp,...,bifamc,bifamcl,birbetr,birmoney,birfree,birfam,birpoor,birwar,birjust,birothr
0,175,175,1701,1984,175,175,1701,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,213,213,2101,1984,213,213,2101,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
2,221,221,2202,1984,221,221,2202,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
3,230,230,2302,1984,230,230,2302,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
4,329,329,3201,1984,329,329,3201,6,-2,-2,...,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206335,3923584,3923584,39235804,2018,3923584,3923584,39235804,5,-5,1,...,-2,-5,-2,-2,-2,-2,-2,-2,-2,-2
206336,3923614,3923614,39236101,2018,3923614,3923614,39236101,5,2,1,...,-5,-5,-2,-2,-2,-2,-2,-2,-2,-2
206337,3923746,3923746,39237402,2018,3923746,3923746,39237402,5,-5,1,...,-2,-5,-2,-2,-2,-2,-2,-2,-2,-2
206338,3923800,3923800,39238001,2018,3923800,3923800,39238001,5,2,1,...,-5,-5,-2,-2,-2,-2,-2,-2,-2,-2
