In [1]:
import pandas as pd

In [2]:
data05 = pd.read_stata("../original_data/vp.dta")
data09 = pd.read_stata("../original_data/zp.dta")
data13 = pd.read_stata("../original_data/bdp.dta")

In [3]:
# Extract Column of Big 5 Variables we need for the research.
data_05 = data05.loc[:, 'vp12501':'vp12515']
data_09 = data09.loc[:, 'zp12001':'zp12015']
data_13 = data13.loc[:, 'bdp15101':'bdp15115']

In [4]:
# Rename to meaningful names.
for x in [data_05, data_09, data_13]:
    x.columns =  ['work_carefully','communicative','abrasive', 'new_idea','often_worry',
                       'forgiving_nature','lazy','outgoing','esthetics','often_nervous','work_efficiently',
                           'reserved','considerate','lively_imagination','be_relaxed']


In [5]:
# Extract and rename personal information.
ids2005 = data05.loc[:, ['hhnr','persnr','welle','vp14701','vp14702','vp135']]
ids2009 = data09.loc[:, ['hhnr','persnr','welle','zp12901','zp12902','zp137']]
ids2013 = data13.loc[:, ['hhnr','persnr','welle','bdp13401','bdp13403','bdp143']]

#Rename identifiers to match the other data sets.                         
ids2005.columns = ['cid','pid','syear','sex_2005','birth_year_2005','german_nationality_2005']
ids2009.columns = ['cid','pid','syear','sex_2009','birth_year_2009','german_nationality_2009']
ids2013.columns = ['cid','pid','syear','sex_2013','birth_year_2013','german_nationality_2013'] 

In [6]:
# Merge it.
data_2005 = pd.concat([ids2005, data_05], axis=1)
data_2009 = pd.concat([ids2009, data_09], axis=1)
data_2013 = pd.concat([ids2013, data_13], axis=1)

In [7]:
#Replace all negative number into pd.np.nan and replace string into number.
dict_n = {'[-1] keine Angabe': pd.np.nan, '[7] Trifft voll zu': 7, '[1] Trifft ueberhaupt nicht zu' : 1, '[7] 7 stimme voll zu, (Skala 1-7)':7,
         '[6] 6 auf Skala 1-7':6, '[5] 5 auf Skala 1-7':5, '[4] 4 auf Skala 1-7':4, '[3] 3 auf Skala 1-7':3, '[2] 2 auf Skala 1-7':2
          ,'[1] 1 stimme ueberhaupt nicht zu, (Skala 1-7':1,'[1] Ja':1, '[2] Nein':0, '[-5] In Fragebogenversion nicht enthalten':pd.np.nan}

data_05_nan = data_2005.replace(dict_n)
data_09_nan = data_2009.replace(dict_n)
data_13_nan = data_2013.replace(dict_n)

In [8]:
#Reserves the scale for 'Negative' items.
#Create list of 'Negavie' items and dictionary for things I want to replace.
#Create a dict of number I want to replace
#Replace the number by creating new DataFrame and update

dict_r = {1:7, 7:1, 2:6, 6:2, 3:5, 5:3}
negative = ['lazy','abrasive','reserved','be_relaxed']

reverse_05 = data_05_nan.loc[:,negative].replace(dict_r)
reverse_09 = data_09_nan.loc[:,negative].replace(dict_r)
reverse_13 = data_13_nan.loc[:,negative].replace(dict_r)

a = [data_05_nan, data_09_nan, data_13_nan]
b = [reverse_05, reverse_09, reverse_13]

for x,y in zip(a,b): 
    x.update(y)


In [9]:
# Create list of variable corresond to Big 5.
openness_ls = ['lively_imagination', 'new_idea', 'esthetics']
conscientiousness_ls = ['lazy', 'work_efficiently', 'work_carefully']
extraversion_ls = ['reserved', 'work_efficiently', 'work_carefully']
agreeableness_ls = ['forgiving_nature', 'considerate', 'abrasive']
neuroticism_ls = ['often_worry', 'often_nervous', 'be_relaxed']


In [10]:
# Create list of coloumns we want to create.
trait_ls = [neuroticism_ls, agreeableness_ls, extraversion_ls, conscientiousness_ls, openness_ls]
trait = ['neuroticism', 'agreeableness', 'extraversion', 'conscientiousness', 'openness']

In [11]:
# Define a loop for creating the measure.
for x,y in zip(trait, trait_ls):
    data_05_nan[x] = data_05_nan[y].mean(1)
    data_09_nan[x] = data_09_nan[y].mean(1)
    data_13_nan[x] = data_13_nan[y].mean(1)

In [12]:
# Create function for standardising the data.
standardise = lambda x: (x - x.mean()) / x.std()

# Standardise all the measure and create as dataframe.
trait_05 = data_05_nan[trait].pipe(standardise)
trait_09 = data_09_nan[trait].pipe(standardise)
trait_13 = data_13_nan[trait].pipe(standardise)

In [13]:
data_05 = data_05_nan[['cid', 'pid', 'syear', 'neuroticism', 'agreeableness', 'extraversion', 'conscientiousness', 'openness']]
data_09 = data_09_nan[['cid', 'pid', 'syear', 'neuroticism', 'agreeableness', 'extraversion', 'conscientiousness', 'openness']]
data_13 = data_13_nan[['cid', 'pid', 'syear', 'neuroticism', 'agreeableness', 'extraversion', 'conscientiousness', 'openness']]

In [14]:
# Rename columns to seperate data from time to time.
data_05.columns = ['cid','pid','syear_05', 'neuroticism_05', 'agreeableness_05', 'extraversion_05', 'conscientiousness_05', 'openness_05' ]
data_09.columns = ['cid','pid','syear_09', 'neuroticism_09', 'agreeableness_09', 'extraversion_09', 'conscientiousness_09', 'openness_09' ]
data_13.columns = ['cid','pid','syear_13', 'neuroticism_13', 'agreeableness_13', 'extraversion_13', 'conscientiousness_13', 'openness_13' ]

In [15]:
# Merge the data set before standardise.
trait_not_sd = data_05.merge(data_09, on = ['cid','pid'],how = 'left')
trait_not_sd = trait_not_sd.merge(data_13, on = ['cid','pid'],how = 'left')

In [16]:
trait_not_sd = trait_not_sd.dropna()

In [17]:
# Merge the measure with id.

measures_05 = pd.concat([ids2005, trait_05], axis=1)
measures_09 = pd.concat([ids2009, trait_09], axis=1)
measures_13 = pd.concat([ids2013, trait_13], axis=1)

In [18]:
# Rename columns to seperate data from time to time.
measures_05.columns = ['cid','pid','syear_05','sex','birth_year','german_nationality_05', 'neuroticism_05', 'agreeableness_05', 'extraversion_05', 'conscientiousness_05', 'openness_05' ]
measures_09.columns = ['cid','pid','syear_09','sex','birth_year','german_nationality_09', 'neuroticism_09', 'agreeableness_09', 'extraversion_09', 'conscientiousness_09', 'openness_09' ]
measures_13.columns = ['cid','pid','syear_13','sex','birth_year','german_nationality_13', 'neuroticism_13', 'agreeableness_13', 'extraversion_13', 'conscientiousness_13', 'openness_13' ]

In [19]:
# Merge into one big dataset.
trait = measures_05.merge(measures_09, on = ['cid','pid','birth_year', 'sex'],how = 'left')
trait = trait.merge(measures_13, on = ['cid','pid','birth_year', 'sex'],how = 'left')

In [20]:
# Drop missing variable.
trait = trait.dropna()
trait = trait.drop(['syear_05','syear_09','syear_13', 'birth_year', 'german_nationality_05', 'german_nationality_09', 'german_nationality_13', 'sex'], axis=1)

In [21]:
# Define personality trait as average of 05, 09, 13 to check the impact of personality on unemployment.

openness = ['openness_05', 'openness_09', 'openness_13']
neuroticism = ['neuroticism_05', 'neuroticism_09', 'neuroticism_13']
agreeableness = ['agreeableness_05', 'agreeableness_09', 'agreeableness_13']
extraversion = ['extraversion_05', 'extraversion_09', 'extraversion_13']
conscientiousness = ['conscientiousness_05', 'conscientiousness_09', 'conscientiousness_13']

i= [neuroticism, agreeableness, extraversion, conscientiousness, openness]
j = ['neuroticism', 'agreeableness', 'extraversion', 'conscientiousness', 'openness']
l = ['neuroticism_05_09', 'agreeableness_05_09', 'extraversion_05_09', 'conscientiousness_05_09', 'openness_05_09']
m = ['neuroticism_09_13', 'agreeableness_09_13', 'extraversion_09_13', 'conscientiousness_09_13', 'openness_09_13']
t_05 = ['neuroticism_05', 'agreeableness_05', 'extraversion_05', 'conscientiousness_05', 'openness_05']
t_09 = ['neuroticism_09', 'agreeableness_09', 'extraversion_09', 'conscientiousness_09', 'openness_09']
t_13 = ['neuroticism_13', 'agreeableness_13', 'extraversion_13', 'conscientiousness_13', 'openness_13']

for x,y in zip(j,i):
    trait[x] = trait[y].mean(1)

for x,y,z in zip(l, t_05, t_09):
    trait_not_sd[x] = trait_not_sd[y] - trait_not_sd[z]

for x,y,z in zip(m, t_09, t_13):
    trait_not_sd[x] = trait_not_sd[y] - trait_not_sd[z]

In [22]:
trait_not_sd = trait_not_sd[['cid', 'pid', 'neuroticism_05_09', 'agreeableness_05_09', 'extraversion_05_09', 'conscientiousness_05_09', 'openness_05_09', 'neuroticism_09_13', 'agreeableness_09_13', 'extraversion_09_13', 'conscientiousness_09_13', 'openness_09_13']]

In [23]:
trait = trait.merge(trait_not_sd, on = ['cid','pid'],how = 'left')
big_five_mean_diff = trait.describe()

In [24]:
trait.to_csv("../clean_data/big5.csv")
big_five_mean_diff.to_csv("../clean_data/big5_mean_diff.csv")

In [25]:
trait.columns

Index(['cid', 'pid', 'neuroticism_05', 'agreeableness_05', 'extraversion_05',
       'conscientiousness_05', 'openness_05', 'neuroticism_09',
       'agreeableness_09', 'extraversion_09', 'conscientiousness_09',
       'openness_09', 'neuroticism_13', 'agreeableness_13', 'extraversion_13',
       'conscientiousness_13', 'openness_13', 'neuroticism', 'agreeableness',
       'extraversion', 'conscientiousness', 'openness', 'neuroticism_05_09',
       'agreeableness_05_09', 'extraversion_05_09', 'conscientiousness_05_09',
       'openness_05_09', 'neuroticism_09_13', 'agreeableness_09_13',
       'extraversion_09_13', 'conscientiousness_09_13', 'openness_09_13'],
      dtype='object')

In [26]:
trait.describe()

Unnamed: 0,cid,pid,neuroticism_05,agreeableness_05,extraversion_05,conscientiousness_05,openness_05,neuroticism_09,agreeableness_09,extraversion_09,...,neuroticism_05_09,agreeableness_05_09,extraversion_05_09,conscientiousness_05_09,openness_05_09,neuroticism_09_13,agreeableness_09_13,extraversion_09_13,conscientiousness_09_13,openness_09_13
count,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,...,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0,9953.0
mean,310084.623832,3134165.0,-0.006681,0.011835,0.049675,0.049018,0.047651,0.002389,-0.011843,0.009867,...,0.119294,0.131769,0.053602,0.091446,0.163418,0.039452,-0.035617,0.013614,-0.014619,-0.122794
std,253106.5731,2496454.0,0.99454,0.993842,0.948143,0.962907,0.976482,0.995657,0.995717,0.968433,...,1.11829,0.952452,0.823952,0.886766,1.077451,1.07225,0.932007,0.808031,0.865739,1.042334
min,60.0,901.0,-2.417141,-4.535868,-5.158941,-5.173184,-2.866649,-2.313728,-4.430437,-5.162894,...,-5.333333,-5.333333,-5.0,-5.333333,-5.333333,-5.0,-5.0,-6.0,-4.333333,-5.0
25%,42080.0,587905.0,-0.783829,-0.802547,-0.339459,-0.597254,-0.679805,-0.678242,-0.69284,-0.720363,...,-0.666667,-0.333333,-0.333333,-0.333333,-0.333333,-0.666667,-0.666667,-0.333333,-0.333333,-0.666667
50%,280445.0,2803001.0,0.032827,0.215631,0.062164,0.106735,0.140262,-0.13308,-0.013277,0.08737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,507741.0,5079703.0,0.577264,0.894417,0.865411,0.810724,0.686972,0.684664,0.666286,0.491237,...,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.333333,0.666667
max,824984.0,8262002.0,2.482795,1.573202,2.070282,1.162719,2.05375,2.592731,1.685631,2.106703,...,5.333333,5.666667,3.666667,5.0,6.0,5.0,5.666667,4.333333,5.0,5.666667
