In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three Stata source files, one per survey year used below
# (2005, 2010, 2015), in that order.
adults2005, adults2010, adults2015 = (
    pd.read_stata(stata_file)
    for stata_file in (
        "../original_data/vp.dta",
        "../original_data/bap.dta",
        "../original_data/bfp.dta",
    )
)

In [3]:
# Pull the locus-of-control item columns out of each wave.
# A label slice in .loc includes both end points, so each selection is
# every column stored between the first and the last item (inclusive).
loc_adults_2005 = adults2005.loc[:, slice('vp12701', 'vp12710')]
loc_adults_2010 = adults2010.loc[:, slice('bap0201', 'bap0210')]
loc_adults_2015 = adults2015.loc[:, slice('bfp0501', 'bfp0510')]

In [4]:
# Give the item columns readable names. All three waves get the same ten
# names in the same order, so the list is written once and applied to each.
# The spelling 'not_achieve_derserved' is kept on purpose: later cells look
# the column up under exactly this name.
loc_item_names = [
    'life_depends_on_self',
    'not_achieve_derserved',
    'achieve_luck',
    'change_through_activities',
    'others_determine',
    'workhard_to_success',
    'doubt_ability',
    'background_determines',
    'born_determine',
    'little_control',
]
for wave_items in (loc_adults_2005, loc_adults_2010, loc_adults_2015):
    wave_items.columns = list(loc_item_names)

In [5]:
# Select the identifier columns and the sex column of each wave. The sex
# column has a different source name in every file.
ids2005 = adults2005[['hhnr', 'persnr', 'welle', 'vp14701']]
ids2010 = adults2010[['hhnr', 'persnr', 'welle', 'bap15001']]
ids2015 = adults2015[['hhnr', 'persnr', 'welle', 'bfpsex']]

# Use one common set of names so the waves can be stacked and merged later.
id_names = ['cid', 'pid', 'syear', 'sex']
ids2005.columns = list(id_names)
ids2010.columns = list(id_names)
ids2015.columns = list(id_names)

In [6]:
# Put identifiers and items side by side for each wave (aligned on the row index).
data_adults_2005 = pd.concat([ids2005, loc_adults_2005], axis='columns')
data_adults_2010 = pd.concat([ids2010, loc_adults_2010], axis='columns')
data_adults_2015 = pd.concat([ids2015, loc_adults_2015], axis='columns')

# Stack the three waves on top of each other, then replace the repeated
# per-wave row labels with a fresh 0..n-1 index.
data_adults_whole = pd.concat(
    [data_adults_2005, data_adults_2010, data_adults_2015],
    axis='index',
    sort=False,
)
data_adults = data_adults_whole.reset_index(drop=True)


In [7]:
# Treat the "no answer" label as missing. Only the exact value
# '[-1] keine Angabe' is mapped here; nothing else is touched.
# np.nan is used instead of pd.np.nan: the pandas.np alias was deprecated in
# pandas 1.0 and removed in pandas 2.0, where pd.np raises AttributeError.
dict_n = {'[-1] keine Angabe': np.nan}
data_adults_nan = data_adults.replace(dict_n)

In [8]:
# Map the answer labels that are used to numbers.
# - 7-point scale labels become 1..7 (several spellings of the end points occur).
# - Yes/No ('[1] Ja' / '[2] Nein') becomes 1 / 0.
# - '[-5] In Fragebogenversion nicht enthalten' becomes missing.
# np.nan is used instead of pd.np.nan: the pandas.np alias was deprecated in
# pandas 1.0 and removed in pandas 2.0, where pd.np raises AttributeError.
dict_adults_f = {
    '[7] Trifft voll zu': 7,
    '[1] Trifft ueberhaupt nicht zu': 1,
    '[7] 7 stimme voll zu, (Skala 1-7)': 7,
    '[6] 6 auf Skala 1-7': 6,
    '[5] 5 auf Skala 1-7': 5,
    '[4] 4 auf Skala 1-7': 4,
    '[3] 3 auf Skala 1-7': 3,
    '[2] 2 auf Skala 1-7': 2,
    # NOTE(review): this key has no closing parenthesis, unlike its siblings;
    # kept as is, presumably matching the label stored in the data - confirm.
    '[1] 1 stimme ueberhaupt nicht zu, (Skala 1-7': 1,
    '[1] Ja': 1,
    '[2] Nein': 0,
    '[-5] In Fragebogenversion nicht enthalten': np.nan,
    '[7] 7 Stimme voll zu, (Skala 1-7)': 7,
    '[1] 1 Stimme ueberhaupt nicht zu, (Skala 1-7)': 1,
}

data_adults_replace = data_adults_nan.replace(dict_adults_f)


In [9]:
# Reverse the 1-7 scale for the negatively worded items.
# The mapping sends each score s to 8 - s (4 is its own mirror image and
# therefore not listed). The recoded columns are returned as a new frame.
#
# NOTE(review): `reverse` is only created here. The cells visible after this
# one keep reading `data_adults_replace`, so the reversed values are not part
# of the data that is analysed further down - confirm whether that is intended.
negative = [
    'not_achieve_derserved',
    'achieve_luck',
    'others_determine',
    'doubt_ability',
    'background_determines',
    'born_determine',
    'little_control',
]
dict_adults_r = {score: 8 - score for score in (1, 7, 2, 6, 3, 5)}

reverse = data_adults_replace[negative].replace(dict_adults_r)


In [10]:
# Split the recoded, stacked frame back into one frame per survey year.
survey_year = data_adults_replace['syear']
data_locus_2005 = data_adults_replace[survey_year == 2005]
data_locus_2010 = data_adults_replace[survey_year == 2010]
data_locus_2015 = data_adults_replace[survey_year == 2015]

In [11]:
# Build the input matrices for the analysis: remove the identifier columns so
# only the item columns remain, then keep complete cases only (rows without
# any missing item). Standardizing happens in the next step.
drop_list = ['cid', 'pid', 'syear', 'sex']

measure_matrix_2005 = data_locus_2005.drop(columns=drop_list)
measure_matrix_2010 = data_locus_2010.drop(columns=drop_list)
measure_matrix_2015 = data_locus_2015.drop(columns=drop_list)

measures_clean_2005 = measure_matrix_2005.dropna(axis=0, how='any')
measures_clean_2010 = measure_matrix_2010.dropna(axis=0, how='any')
measures_clean_2015 = measure_matrix_2015.dropna(axis=0, how='any')

In [12]:
# Standardize every item within each wave. A fresh scaler is fitted per wave,
# so the three waves are scaled independently of each other.
measures_clean_2005_std, measures_clean_2010_std, measures_clean_2015_std = (
    StandardScaler().fit_transform(complete_cases)
    for complete_cases in (measures_clean_2005, measures_clean_2010, measures_clean_2015)
)

In [13]:
# Principal component analysis for 2005, keeping only the first component.
sklearn_pca = sklearnPCA(n_components=1)
# The first-component score of each row is used as the locus-of-control measure.
# The sign of a principal component is arbitrary; the score is negated here
# (NOTE(review): presumably to fix the direction of the scale - confirm).
locus_of_control_2005 = -sklearn_pca.fit_transform(measures_clean_2005_std)
# Print the component weights with the same sign flip as the score.
print(-sklearn_pca.components_)
# Rescale the score itself to mean 0 and unit variance.
locus_of_control_2005_std = StandardScaler().fit_transform(locus_of_control_2005)


[[ 0.28871596 -0.37919486 -0.36000833 -0.01573771 -0.42493688  0.0646455
  -0.37404091 -0.31447256 -0.06148732 -0.46440284]]


In [14]:
# Principal component analysis for 2010, keeping only the first component.
sklearn_pca = sklearnPCA(n_components=1)
# The first-component score of each row is used as the locus-of-control measure.
# The sign of a principal component is arbitrary; the score is negated here
# (NOTE(review): presumably to fix the direction of the scale - confirm).
locus_of_control_2010 = -sklearn_pca.fit_transform(measures_clean_2010_std)
# Print the component weights with the same sign flip as the score.
print(-sklearn_pca.components_)
# Rescale the score itself to mean 0 and unit variance.
locus_of_control_2010_std = StandardScaler().fit_transform(locus_of_control_2010)
# Item order, for reading the printed weights (one weight per item column):
#   life_depends_on_self, not_achieve_derserved, achieve_luck,
#   change_through_activities, others_determine, workhard_to_success,
#   doubt_ability, background_determines, born_determine, little_control

[[ 0.28873533 -0.38415921 -0.35113504  0.02515359 -0.4260749   0.03107637
  -0.3805155  -0.31193084 -0.09913044 -0.45902048]]


In [15]:
# Principal component analysis for 2015, keeping only the first component.
sklearn_pca = sklearnPCA(n_components=1)
# The first-component score of each row is used as the locus-of-control measure.
# The sign of a principal component is arbitrary; the score is negated here
# (NOTE(review): presumably to fix the direction of the scale - confirm).
locus_of_control_2015 = -sklearn_pca.fit_transform(measures_clean_2015_std)
# Print the component weights with the same sign flip as the score.
print(-sklearn_pca.components_)
# Rescale the score itself to mean 0 and unit variance.
locus_of_control_2015_std = StandardScaler().fit_transform(locus_of_control_2015)

[[ 0.25722775 -0.38548691 -0.35350458  0.01261146 -0.42780401 -0.01318398
  -0.39238964 -0.31811775 -0.1020016  -0.45972409]]


In [16]:
# Save the scores in one dataframe per wave.
#
# The score arrays have one row per complete case of the ten items (that is
# how the PCA input was filtered), and they are attached by position. The
# frames built here must therefore contain exactly those rows. dropna is
# restricted to the item columns for that reason: dropping on every column
# would also remove rows with a missing identifier or sex value, and the
# frame would then no longer line up with the score arrays.
loc_items = [
    'life_depends_on_self',
    'not_achieve_derserved',
    'achieve_luck',
    'change_through_activities',
    'others_determine',
    'workhard_to_success',
    'doubt_ability',
    'background_determines',
    'born_determine',
    'little_control',
]

# 2005: all columns from 'cid' through 'little_control'.
data_locus_2005_clean = data_locus_2005.loc[:, 'cid':'little_control'].dropna(subset=loc_items)
data_locus_2005_clean['locus_of_control_05'] = locus_of_control_2005
data_locus_2005_clean['locus_of_control_std_05'] = locus_of_control_2005_std

# 2010: identifiers, items, then sex (same columns and order as before).
nondrops_2010 = ['cid', 'pid', 'syear'] + loc_items + ['sex']
data_locus_2010_clean = data_locus_2010.loc[:, nondrops_2010].dropna(subset=loc_items)
data_locus_2010_clean['locus_of_control_10'] = locus_of_control_2010
data_locus_2010_clean['locus_of_control_std_10'] = locus_of_control_2010_std

# 2015: identifiers, items, then sex (same columns and order as before).
nondrops_2015 = ['cid', 'pid', 'syear'] + loc_items + ['sex']
data_locus_2015_clean = data_locus_2015.loc[:, nondrops_2015].dropna(subset=loc_items)
data_locus_2015_clean['locus_of_control_15'] = locus_of_control_2015
data_locus_2015_clean['locus_of_control_std_15'] = locus_of_control_2015_std

In [17]:
# Combine the three waves into one wide table, matching rows on cid and pid.
# Both joins are left joins that start from the 2005 frame: every 2005 row is
# kept, and 2010/2015 rows without a 2005 match do not appear in the result.
data_loc = (
    data_locus_2005_clean
    .merge(data_locus_2010_clean, how='left', on=['cid', 'pid'])
    .merge(data_locus_2015_clean, how='left', on=['cid', 'pid'])
)

In [18]:
# Keep only the identifiers and the raw / standardized scores of each wave.
score_columns = [
    'pid',
    'cid',
    'locus_of_control_05',
    'locus_of_control_std_05',
    'locus_of_control_10',
    'locus_of_control_std_10',
    'locus_of_control_15',
    'locus_of_control_std_15',
]
new_loc = data_loc[score_columns]

In [19]:
# Order the rows by person id (ascending).
loc = new_loc.sort_values('pid')

In [20]:
# Write the result to disk. The row index is written too (to_csv default),
# so the file starts with an unnamed index column.
loc.to_csv(path_or_buf='../clean_data/loc.csv')