In [1]:
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

In [2]:
import numpy as np
import pandas as pd
# Load the three adult questionnaire files (Stata format); the variable names
# suggest one file per survey year (2005, 2010, 2015).
adults2005 = pd.read_stata("vp.dta")
adults2010 = pd.read_stata("bap.dta")
adults2015 = pd.read_stata("bfp.dta")

# Extract the locus-of-control item columns of each file.
# Label-based column slices include both end points.
loc_adults_2005 = adults2005.loc[:, 'vp12701':'vp12710']
loc_adults_2010 = adults2010.loc[:, 'bap0201':'bap0210']
loc_adults_2015 = adults2015.loc[:, 'bfp0501':'bfp0510']

# Rename to meaningful names. The same ten names apply to all three files, so
# they are defined once; each slice above must therefore contain exactly ten
# columns in this order, otherwise the assignment raises a length mismatch.
loc_item_names = ['life_depends_on_self', 'not_achieve_derserved', 'achieve_luck',
                  'change_through_activities', 'others_determine', 'workhard_to_success',
                  'doubt_ability', 'background_determines', 'born_determine', 'little_control']
loc_adults_2005.columns = loc_item_names
loc_adults_2010.columns = loc_item_names
loc_adults_2015.columns = loc_item_names

In [30]:
# Pull the identifier columns plus the respondent's sex variable from each file
# and give them the names used by the other data sets.
# The first three columns are renamed identically everywhere; only the sex
# column keeps a year-specific name.
shared_id_names = ['cid', 'pid_parents', 'syear']

ids2005 = adults2005.loc[:, ['hhnr', 'persnr', 'welle', 'vp14701']]
ids2005.columns = shared_id_names + ['sex_parent_2005']

ids2010 = adults2010.loc[:, ['hhnr', 'persnr', 'welle', 'bap15001']]
ids2010.columns = shared_id_names + ['sex_parent_2010']

ids2015 = adults2015.loc[:, ['hhnr', 'persnr', 'welle', 'bfpsex']]
ids2015.columns = shared_id_names + ['sex_parent_2015']

In [31]:
# Put identifiers and locus-of-control items side by side, per year.
data_adults_2005 = pd.concat([ids2005, loc_adults_2005], axis='columns')
data_adults_2010 = pd.concat([ids2010, loc_adults_2010], axis='columns')
data_adults_2015 = pd.concat([ids2015, loc_adults_2015], axis='columns')

# Stack the three years into one frame. The year-specific sex columns do not
# line up across frames, so they are filled with NaN where absent;
# sort=False keeps the columns in order of first appearance.
yearly_frames = [data_adults_2005, data_adults_2010, data_adults_2015]
data_adults_whole = pd.concat(yearly_frames, sort=False)

# Replace the repeated per-file row labels with a fresh 0..n-1 index.
data_adults = data_adults_whole.reset_index(drop=True)


In [32]:
# Turn the "no answer" value label ('[-1] keine Angabe') into a missing value.
# Only this one label is handled here; other labels are mapped in a later step.
# np.nan is used directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
dict_n = {'[-1] keine Angabe': np.nan}
data_adults_nan = data_adults.replace(dict_n)


In [33]:
# Map the labelled answers we use to numbers.
# - Agreement scale labels become their scale value 1..7.
# - '[1] Ja' / '[2] Nein' become 1 / 0.
# - '[-5] In Fragebogenversion nicht enthalten' becomes a missing value.
# np.nan is used directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
dict_adults_f = {
    '[7] Trifft voll zu': 7,
    '[1] Trifft ueberhaupt nicht zu': 1,
    '[7] 7 stimme voll zu, (Skala 1-7)': 7,
    '[6] 6 auf Skala 1-7': 6,
    '[5] 5 auf Skala 1-7': 5,
    '[4] 4 auf Skala 1-7': 4,
    '[3] 3 auf Skala 1-7': 3,
    '[2] 2 auf Skala 1-7': 2,
    # NOTE(review): this label has no closing parenthesis; presumably it matches
    # the value label exactly as stored in the data -- verify before "fixing" it.
    '[1] 1 stimme ueberhaupt nicht zu, (Skala 1-7': 1,
    '[1] Ja': 1,
    '[2] Nein': 0,
    '[-5] In Fragebogenversion nicht enthalten': np.nan,
    '[7] 7 Stimme voll zu, (Skala 1-7)': 7,
    '[1] 1 Stimme ueberhaupt nicht zu, (Skala 1-7)': 1,
}

data_adults_replace = data_adults_nan.replace(dict_adults_f)


In [34]:
# Reverse the scale of the 'negative' items.
# The mapping swaps each score s with 8 - s (1<->7, 2<->6, 3<->5); 4 is its
# own mirror image and therefore not listed.
dict_adults_r = {score: 8 - score for score in (1, 7, 2, 6, 3, 5)}

# Items whose scale is reversed.
negative = [
    'not_achieve_derserved', 'achieve_luck', 'others_determine', 'doubt_ability',
    'background_determines', 'born_determine', 'little_control',
]

# Build the reversed columns as a separate frame ...
reverse = data_adults_replace.loc[:, negative].replace(dict_adults_r)

# ... and write them back into data_adults_replace in place.
# NOTE: because this modifies data_adults_replace itself, running this cell a
# second time reverses the items again.
data_adults_replace.update(reverse)

In [35]:
# Split the combined frame back into one frame per survey year.
wave = data_adults_replace['syear']
data_locus_2005 = data_adults_replace.loc[wave == 2005]
data_locus_2010 = data_adults_replace.loc[wave == 2010]
data_locus_2015 = data_adults_replace.loc[wave == 2015]
data_locus_2005

Unnamed: 0,cid,pid_parents,syear,sex_parent_2005,life_depends_on_self,not_achieve_derserved,achieve_luck,change_through_activities,others_determine,workhard_to_success,doubt_ability,background_determines,born_determine,little_control,sex_parent_2010,sex_parent_2015
0,27,201,2005,[2] Weiblich,6.0,3.0,3.0,4.0,4.0,5.0,3.0,4.0,4.0,5.0,,
1,27,203,2005,[1] Maennlich,6.0,6.0,4.0,2.0,6.0,4.0,4.0,6.0,3.0,6.0,,
2,60,602,2005,[2] Weiblich,6.0,5.0,4.0,5.0,5.0,5.0,2.0,4.0,5.0,5.0,,
3,60,609102,2005,[2] Weiblich,6.0,1.0,6.0,6.0,5.0,7.0,6.0,2.0,2.0,3.0,,
4,60,609104,2005,[1] Maennlich,5.0,4.0,5.0,6.0,6.0,6.0,4.0,4.0,7.0,6.0,,
5,60,609105,2005,[1] Maennlich,5.0,5.0,4.0,5.0,3.0,6.0,3.0,3.0,3.0,6.0,,
6,60,1088902,2005,[2] Weiblich,5.0,7.0,7.0,1.0,2.0,4.0,6.0,6.0,6.0,2.0,,
7,60,609103,2005,[2] Weiblich,6.0,6.0,3.0,2.0,6.0,6.0,2.0,2.0,5.0,4.0,,
8,60,1203002,2005,[1] Maennlich,7.0,6.0,5.0,4.0,5.0,6.0,2.0,2.0,2.0,6.0,,
9,94,901,2005,[2] Weiblich,6.0,3.0,5.0,5.0,2.0,6.0,5.0,3.0,2.0,6.0,,


In [37]:
# Prepare the input for standardising: keep only the item columns and only
# the rows in which every item is present.
drop_list = ['cid', 'pid_parents', 'syear',
             'sex_parent_2005', 'sex_parent_2010', 'sex_parent_2015']

# Remove identifiers and sex columns.
measure_matrix_2005 = data_locus_2005.drop(columns=drop_list)
measure_matrix_2010 = data_locus_2010.drop(columns=drop_list)
measure_matrix_2015 = data_locus_2015.drop(columns=drop_list)

# Remove every row that has a missing value in any remaining column.
measures_clean_2005 = measure_matrix_2005.dropna()
measures_clean_2010 = measure_matrix_2010.dropna()
measures_clean_2015 = measure_matrix_2015.dropna()

In [38]:
# Standardise the items of each year separately; a fresh scaler is fitted per
# year, so each year is scaled with its own statistics.
scaler_2005 = StandardScaler()
measures_clean_2005_std = scaler_2005.fit_transform(measures_clean_2005)

scaler_2010 = StandardScaler()
measures_clean_2010_std = scaler_2010.fit_transform(measures_clean_2010)

scaler_2015 = StandardScaler()
measures_clean_2015_std = scaler_2015.fit_transform(measures_clean_2015)


In [39]:
# Principal component analysis for 2005: keep the first component only.
sklearn_pca = sklearnPCA(n_components=1)

# Locus-of-control score = projection on the first component, sign flipped.
locus_of_control_2005 = -sklearn_pca.fit_transform(measures_clean_2005_std)

# Print the component loadings with the same sign flip, so that they describe
# the score as stored above.
print(-sklearn_pca.components_)

# Standardised version of the score.
locus_of_control_2005_std = StandardScaler().fit_transform(locus_of_control_2005)


[[ 0.28871596  0.37919486  0.36000833 -0.01573771  0.42493688  0.0646455
   0.37404091  0.31447256  0.06148732  0.46440284]]


In [40]:
# Principal component analysis for 2010: keep the first component only.
sklearn_pca = sklearnPCA(n_components=1)

# Locus-of-control score = projection on the first component, sign flipped.
locus_of_control_2010 = -sklearn_pca.fit_transform(measures_clean_2010_std)

# Print the component loadings with the same sign flip, so that they describe
# the score as stored above.
print(-sklearn_pca.components_)

# Standardised version of the score.
locus_of_control_2010_std = StandardScaler().fit_transform(locus_of_control_2010)

# Column order of the item names used for the analysed frames:
# ['life_depends_on_self', 'not_achieve_derserved', 'achieve_luck',
#  'change_through_activities', 'others_determine', 'workhard_to_success',
#  'doubt_ability', 'background_determines', 'born_determine', 'little_control']

[[0.28873533 0.38415921 0.35113504 0.02515359 0.4260749  0.03107637
  0.3805155  0.31193084 0.09913044 0.45902048]]


In [41]:
# Principal component analysis for 2015: keep the first component only.
sklearn_pca = sklearnPCA(n_components=1)

# Locus-of-control score = projection on the first component, sign flipped.
locus_of_control_2015 = -sklearn_pca.fit_transform(measures_clean_2015_std)

# Print the component loadings with the same sign flip, so that they describe
# the score as stored above.
print(-sklearn_pca.components_)

# Standardised version of the score.
locus_of_control_2015_std = StandardScaler().fit_transform(locus_of_control_2015)


[[ 0.25722775  0.38548691  0.35350458  0.01261146  0.42780401 -0.01318398
   0.39238964  0.31811775  0.1020016   0.45972409]]


In [42]:
# Save the PCA scores next to the rows they were computed from.
#
# The score arrays carry no row labels; they are attached by position. They
# were computed from rows that have all ten items present, so the frames
# below must be filtered on exactly those ten item columns. A plain dropna()
# would also drop rows with a missing id or sex value, leaving fewer rows than
# scores and making the assignment fail.
loc_item_columns = ['life_depends_on_self',
                    'not_achieve_derserved',
                    'achieve_luck',
                    'change_through_activities',
                    'others_determine',
                    'workhard_to_success',
                    'doubt_ability',
                    'background_determines',
                    'born_determine',
                    'little_control']

# 2005: label slice from 'cid' through 'little_control' (both ends included).
data_locus_2005_clean = (
    data_locus_2005.loc[:, 'cid':'little_control'].dropna(subset=loc_item_columns)
)
assert len(data_locus_2005_clean) == len(locus_of_control_2005), \
    "2005: number of complete rows does not match number of PCA scores"
data_locus_2005_clean['locus_of_control'] = locus_of_control_2005
data_locus_2005_clean['locus_of_control_std'] = locus_of_control_2005_std

# 2010: identifiers, the ten items and the 2010 sex column.
nondrops_2010 = ['cid',
                 'pid_parents',
                 'syear',
                 'life_depends_on_self',
                 'not_achieve_derserved',
                 'achieve_luck',
                 'change_through_activities',
                 'others_determine',
                 'workhard_to_success',
                 'doubt_ability',
                 'background_determines',
                 'born_determine',
                 'little_control',
                 'sex_parent_2010']
data_locus_2010_clean = data_locus_2010.loc[:, nondrops_2010].dropna(subset=loc_item_columns)
assert len(data_locus_2010_clean) == len(locus_of_control_2010), \
    "2010: number of complete rows does not match number of PCA scores"
data_locus_2010_clean['locus_of_control'] = locus_of_control_2010
data_locus_2010_clean['locus_of_control_std'] = locus_of_control_2010_std

# 2015: identifiers, the ten items and the 2015 sex column.
nondrops_2015 = ['cid',
                 'pid_parents',
                 'syear',
                 'life_depends_on_self',
                 'not_achieve_derserved',
                 'achieve_luck',
                 'change_through_activities',
                 'others_determine',
                 'workhard_to_success',
                 'doubt_ability',
                 'background_determines',
                 'born_determine',
                 'little_control',
                 'sex_parent_2015']
data_locus_2015_clean = data_locus_2015.loc[:, nondrops_2015].dropna(subset=loc_item_columns)
assert len(data_locus_2015_clean) == len(locus_of_control_2015), \
    "2015: number of complete rows does not match number of PCA scores"
data_locus_2015_clean['locus_of_control'] = locus_of_control_2015
data_locus_2015_clean['locus_of_control_std'] = locus_of_control_2015_std
data_locus_2015

Unnamed: 0,cid,pid_parents,syear,sex_parent_2005,life_depends_on_self,not_achieve_derserved,achieve_luck,change_through_activities,others_determine,workhard_to_success,doubt_ability,background_determines,born_determine,little_control,sex_parent_2010,sex_parent_2015
47825,94,901,2015,,5.0,2.0,5.0,4.0,5.0,6.0,5.0,3.0,3.0,5.0,,[2] Weiblich
47826,159,1501,2015,,4.0,7.0,4.0,5.0,3.0,6.0,6.0,4.0,4.0,6.0,,[1] Maennlich
47827,167,1601,2015,,7.0,1.0,2.0,3.0,3.0,7.0,3.0,3.0,2.0,6.0,,[1] Maennlich
47828,230,2301,2015,,6.0,7.0,7.0,4.0,5.0,5.0,5.0,4.0,3.0,6.0,,[1] Maennlich
47829,230,2302,2015,,6.0,3.0,5.0,5.0,7.0,4.0,4.0,4.0,2.0,6.0,,[2] Weiblich
47830,523,5201,2015,,6.0,4.0,3.0,3.0,3.0,7.0,5.0,4.0,3.0,6.0,,[1] Maennlich
47831,523,5202,2015,,5.0,4.0,4.0,3.0,2.0,6.0,4.0,4.0,3.0,4.0,,[2] Weiblich
47832,523,5203,2015,,7.0,7.0,5.0,3.0,3.0,3.0,6.0,6.0,4.0,7.0,,[2] Weiblich
47833,531,5303,2015,,3.0,4.0,4.0,3.0,3.0,6.0,5.0,1.0,4.0,4.0,,[1] Maennlich
47834,531,1056003,2015,,6.0,5.0,5.0,5.0,5.0,6.0,3.0,4.0,6.0,5.0,,[1] Maennlich


In [54]:
# Merge three years of data together (rows stacked in the order 2005, 2010, 2015).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. The frames have different sex columns, so missing
# ones are filled with NaN; sort=True orders the columns alphabetically.
data_loc = pd.concat(
    [data_locus_2005_clean, data_locus_2010_clean, data_locus_2015_clean],
    sort=True,
)
data_loc

Unnamed: 0,achieve_luck,background_determines,born_determine,change_through_activities,cid,doubt_ability,life_depends_on_self,little_control,locus_of_control,locus_of_control_std,not_achieve_derserved,others_determine,pid_parents,sex_parent_2005,sex_parent_2010,sex_parent_2015,syear,workhard_to_success
0,3.0,4.0,4.0,4.0,27,3.0,6.0,5.0,-1.100730,-0.695943,3.0,4.0,201,[2] Weiblich,,,2005,5.0
1,4.0,6.0,3.0,2.0,27,4.0,6.0,6.0,1.086110,0.686700,6.0,6.0,203,[1] Maennlich,,,2005,4.0
2,4.0,4.0,5.0,5.0,60,2.0,6.0,5.0,-0.405696,-0.256504,5.0,5.0,602,[2] Weiblich,,,2005,5.0
3,6.0,2.0,2.0,6.0,60,6.0,6.0,3.0,-0.979128,-0.619060,1.0,5.0,609102,[2] Weiblich,,,2005,7.0
4,5.0,4.0,7.0,6.0,60,4.0,5.0,6.0,0.515888,0.326174,4.0,6.0,609104,[1] Maennlich,,,2005,6.0
5,4.0,3.0,3.0,5.0,60,3.0,5.0,6.0,-0.830393,-0.525021,5.0,3.0,609105,[1] Maennlich,,,2005,6.0
6,7.0,6.0,6.0,1.0,60,6.0,5.0,2.0,0.133098,0.084152,7.0,2.0,1088902,[2] Weiblich,,,2005,4.0
7,3.0,2.0,5.0,2.0,60,2.0,6.0,4.0,-0.787124,-0.497664,6.0,6.0,609103,[2] Weiblich,,,2005,6.0
8,5.0,2.0,2.0,4.0,60,2.0,7.0,6.0,0.040643,0.025697,6.0,5.0,1203002,[1] Maennlich,,,2005,6.0
9,5.0,3.0,2.0,5.0,94,5.0,6.0,6.0,-0.678362,-0.428899,3.0,2.0,901,[2] Weiblich,,,2005,6.0


In [55]:
# Keep one row per person: for a repeated pid_parents the first occurrence in
# data_loc is kept and later ones are discarded.
data_loc_drop = data_loc.drop_duplicates(subset='pid_parents', keep='first')

In [56]:
# Reduce the frame to the person id and the two score columns.
score_columns = ['pid_parents', 'locus_of_control', 'locus_of_control_std']
new_loc = data_loc_drop.loc[:, score_columns]

In [57]:
# Write the scores to disk; the row index is written as the first column.
new_loc.to_csv(path_or_buf="new_adult_loc.csv", index=True)

In [58]:
# Display the scores ordered by person id (ascending); new_loc itself is not modified.
new_loc.sort_values('pid_parents', ascending=True)

Unnamed: 0,pid_parents,locus_of_control,locus_of_control_std
0,201,-1.100730,-0.695943
1,203,1.086110,0.686700
2,602,-0.405696,-0.256504
9,901,-0.678362,-0.428899
10,1202,0.354765,0.224303
11,1501,0.725358,0.458612
12,1601,-0.804332,-0.508544
13,1602,0.441827,0.279348
14,1603,-0.354845,-0.224353
15,1701,2.840852,1.796147
