### Load Data

In [2]:
import pandas as pd
import numpy as np

#### Read datasets from all waves

In [3]:
data_anchor = {}
for i in [1,2,3,4,5,6,7,8,9,10,11]:
    data_anchor["wave"+str(i)] = pd.read_stata('../../data/pairfam_v11/Data/Stata/anchor'+str(i)+'.dta')
    print('anchor dataset of wave '+str(i)+' was loaded successfully')

anchor dataset of wave 1 was loaded successfully
anchor dataset of wave 2 was loaded successfully
anchor dataset of wave 3 was loaded successfully
anchor dataset of wave 4 was loaded successfully
anchor dataset of wave 5 was loaded successfully
anchor dataset of wave 6 was loaded successfully
anchor dataset of wave 7 was loaded successfully
anchor dataset of wave 8 was loaded successfully
anchor dataset of wave 9 was loaded successfully
anchor dataset of wave 10 was loaded successfully
anchor dataset of wave 11 was loaded successfully


In [4]:
data_partner = {}
for i in [1,2,3,4,5,6,7,8,9,10,11]:
    data_partner["wave"+str(i)] = pd.read_stata('../../data/pairfam_v11/Data/Stata/partner'+str(i)+'.dta')
    print('partner dataset of wave '+str(i)+' was loaded successfully')

partner dataset of wave 1 was loaded successfully
partner dataset of wave 2 was loaded successfully
partner dataset of wave 3 was loaded successfully
partner dataset of wave 4 was loaded successfully
partner dataset of wave 5 was loaded successfully
partner dataset of wave 6 was loaded successfully
partner dataset of wave 7 was loaded successfully
partner dataset of wave 8 was loaded successfully
partner dataset of wave 9 was loaded successfully
partner dataset of wave 10 was loaded successfully
partner dataset of wave 11 was loaded successfully


#### Create a meta data set with all subjects (anchors and partners combined):

In [5]:
def recode_values(df):
    for col in df.columns:
        df[col] = df[col].astype('str').str.extract(r"([-+]?\d*\.*\d+|\d+)",expand=False).astype('float')
        df[col] = df[col]
        df.loc[df[col]<0,[col]] = np.nan
    return df

In [6]:
def exclude_singles(df):
    df = df[df['pid']>=0]
    return df

In [7]:
subjects_anchor = data_anchor['wave1'].loc[:, ['id', 'pid', 'original_doby', 'sex_gen']]
subjects_anchor.rename(columns={'original_doby': 'doby', 'sex_gen': 'gen'}, inplace=True)
subjects_anchor['subj_type'] = 'anchor'

subjects_partner = data_partner['wave1'].loc[:, ['id', 'pid', 'pdoby', 'psex']]
subjects_partner.rename(columns={'id': 'pid', 'pid': 'id', 'pdoby': 'doby', 'psex': 'gen'}, inplace=True)
subjects_partner['subj_type'] = 'partner'

subjects = pd.concat([subjects_anchor, subjects_partner])
subjects = exclude_singles(subjects)
subjects.loc[:, ['doby', 'gen']] = recode_values(subjects.loc[:, ['doby', 'gen']])
subjects.drop(['pid'], axis=1, inplace=True, errors='ignore')

In [8]:
subjects

Unnamed: 0,id,doby,gen,subj_type
2,309000,1981.0,2.0,anchor
5,907000,1981.0,2.0,anchor
7,1028000,1983.0,2.0,anchor
12,1299000,1972.0,2.0,anchor
13,1300000,1973.0,1.0,anchor
...,...,...,...,...
3738,748577101,1970.0,1.0,partner
3739,748755101,1969.0,1.0,partner
3740,748982101,1982.0,1.0,partner
3741,749211101,1970.0,2.0,partner


#### Create subsets for each wave

In [9]:
variables = {'per1i2':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Sometimes I believe that I am worthless'}, 
             'per1i7':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'I like myself just the way I am'}, 
             'per1i13':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'All in all, I am pleased with myself'},                      
             'pa18i2':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Sometimes I am afraid that partner would rather spend time with others'}, 
             'pa18i7':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'I have the feeling that I like partner more than he/she likes me'}, 
             'pa18i10':       {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Sometimes not sure if partner enjoys being with me as much as I'}, 
             'pa18i12':       {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Afraid partner will think I am silly/stupid if I make a mistake'}, 
             'pa18i15':       {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'When I disappoint/annoy partner, I am afraid he/she will not like me'}, 
             'per1i6':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'I feel lonely'}, 
             'sat6':          {'max': 10, 'min': 0, 'inverted': 'no',  'desc': 'General satisfaction with life'},
             'atts':          {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Attachment: Model of self'},
             'atto':          {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Attachment: Model of others'}}

In [19]:
variables = {'bce1i5':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'VOP+: Affection and feeling of security in relationship'}, 
             'bce1i6':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'VOP+: Freedom to follow own interests by relationship '}, 
             'bce1i12':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'VOP+: Pursue own interests in partnership'}, 
             'bce1i10':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'VOP-: Constrained by partner'}, 
             'per1i2':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Sometimes I believe that I am worthless'}, 
             'per1i7':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'I like myself just the way I am'}, 
             'per1i13':       {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'All in all, I am pleased with myself'},                      
             'pa17i1':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Frequency: Telling partner what you are thinking'}, 
             'pa17i8':        {'max': 5,  'min': 1, 'inverted': 'yes', 'desc': 'Frequency: Sharing secrets and private feelings with partner'}, 
             'pa18i2':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Sometimes I am afraid that partner would rather spend time with others'}, 
             'pa18i4':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Partner clings to me so much that I feel like I am suffocating'}, 
             'pa18i7':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'I have the feeling that I like partner more than he/she likes me'}, 
             'pa18i10':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Sometimes not sure if partner enjoys being with me as much as I'}, 
             'pa18i12':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Afraid partner will think I am silly/stupid if I make a mistake'}, 
             'pa18i14':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Partner clings to me so tightly that I cannot do what I want '}, 
             'pa18i15':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'When I disappoint/annoy partner, I am afraid he/she will not like me'}, 
             'per1i6':        {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'I feel lonely'}, 
             'sat6':          {'max': 10, 'min': 0, 'inverted': 'no',  'desc': 'General satisfaction with life'},
             'att_anx':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Attachment Anxiety'},
             'att_avd':       {'max': 5,  'min': 1, 'inverted': 'no',  'desc': 'Attachment Avoidance'}}

In [20]:
def rename_columns(df, add_prfx='', rem_prfx='', add_sffx=''):
    df = df.copy()
    for var in variables:
        df.rename(columns = {rem_prfx+var:     add_prfx+var+add_sffx, 
                             rem_prfx+'p'+var: add_prfx+'p'+var+add_sffx}, inplace=True)
    return df

In [21]:
def extract_columns(df, variables):
    variables_avail = list(set(df.columns).intersection(variables))
    variables_avail.append('id')
    variables_avail.append('pid')
    df = df.loc[:, variables_avail]
    return df

In [22]:
def invert_scales(df, variables):
    for col in df:
        if(col in variables and variables[col]['inverted'] == 'yes'):
            df[col].mask(df[col]>=0, variables[col]['max']-df[col]+variables[col]['min'], inplace=True)
    return df

In [23]:
def combine_variables(df):
    
    # Calculate measure for 'model of self'
    if 'per1i2' in df.columns:
        df['atts'] = df[['per1i2', 'per1i7', 'per1i13']].mean(axis=1)
    df.loc[((df['per1i2']==-2) | (df['per1i7']==-2) | (df['per1i13']==-2)),['atts']] = np.nan    
        
    if 'pa18i2' in df.columns:
        df['atto'] = df[['pa18i2', 'pa18i7', 'pa18i10', 'pa18i12', 'pa18i15']].mean(axis=1) # column for model of self
    return df    

In [23]:
def combine_variables(df):
    
    # Calculate measure for 'attachment anxiety
    if 'per1i2' in df.columns:
        df['att_anx'] = df[['per1i2', 'per1i7', 'per1i13', 'pa18i7', 'pa18i10', 'pa18i2', 'pa18i12', 'pa18i15']].mean(axis=1)
    df.loc[((df['per1i2']==-2) | (df['per1i7']==-2) | (df['per1i13']==-2)),['atts']] = np.nan    
     
    # Calculate measure for 'attachment avoidance'
    if 'pa18i4' in df.columns:
        df['att_avd'] = df[['bce1i5', 'bce1i6', 'bce1i12', 'bce1i10', 'pa17i1', 'pa17i8', 'pa18i4', 'pa18i14']].mean(axis=1) 
    return df    

In [24]:
def join_anchor_partner(df_anchor, df_partner):
    df = df_anchor.merge(df_partner, left_on='pid', right_on='id', how='left', suffixes=['', '_y'])
    df.drop(['id_y', 'pid_y', 'pageExact', 'page', 'pint'], axis=1, inplace=True, errors='ignore')
    return df

In [25]:
subsets = {}

for i in range(1,12):
    
    # Process anchor datasets
    df_a = data_anchor["wave"+str(i)]
    df_a = extract_columns(df_a, variables) # Keep only the relevant columns
    df_a = recode_values(df_a) # Recodes the values ("1" instead of "1 Trifft überhaupt nicht zu")
    df_a = invert_scales(df_a, variables)
    df_a = combine_variables(df_a) # Creates combined measures for attachment model of self and model of others

    # Process partner datasets 
    df_p = data_partner["wave"+str(i)]
    df_p = rename_columns(df_p, rem_prfx='p') # Removes the 'p' before the field names
    df_p = extract_columns(df_p, variables) # Keep only the relevant columns
    df_p = recode_values(df_p) # Recodes the values ("1" instead of "1 Trifft überhaupt nicht zu")
    df_p = invert_scales(df_p, variables)
    df_p = combine_variables(df_p) # Creates combined measures for attachment model of self and model of others
    df_p = df_p.rename(columns={'pid': 'id', 'id': 'pid'}) # switch IDs, because identifer of partner subjects is pid and not id

    df1 = join_anchor_partner(df_a, rename_columns(df_p, add_prfx='p'))
    df2 = join_anchor_partner(df_p, rename_columns(df_a, add_prfx='p'))

    df = pd.concat([df1, df2])
    
    subsets["wave"+str(i)] = df

#### Merge all subsets into one dataframe with a Long Format

In [26]:
df_long = {}

# Calculate age
for i in range(1,12):
    df_temp = subjects.copy().merge(subsets['wave'+str(i)], how='left', on='id')
    df_temp['age'] = 2009-df_temp['doby']+i
    df_temp['wave'] = i
    df_long['wave'+str(i)] = df_temp

# Concat subsets
df_long = pd.concat([df_long['wave1'], df_long['wave2'], df_long['wave3'],
                     df_long['wave4'], df_long['wave5'], df_long['wave6'],
                     df_long['wave7'], df_long['wave8'], df_long['wave9'],
                     df_long['wave10'], df_long['wave11']])

# Drop NANs
#df_long.drop(df_long[df_long['age'].isnull()].index, inplace=True)
df_long.dropna(thresh=len(df_long.columns)-len(variables)*2, inplace=True) # thresh = number of non-variable columns

# Drop time points when subjects wasn't in a relationship
df.dropna(subset=['pid'], inplace=True)

# Normalize variables
columns = ['p'+var for var in variables] + list(variables)
df_long[columns] = (df_long[columns]-df_long[columns].mean()) / df_long[columns].std()

# Export
df_long.to_csv('../../data/samples/data6.csv', index=False)
df_long

KeyError: "['patt_avd', 'att_avd', 'att_anx', 'patt_anx'] not in index"

#### Analyze descriptives:

In [181]:
descriptives = pd.DataFrame(columns=['subj_type', 'm', 'sd', 'corr', 'wave1', 'wave2', 'wave3', 'wave4', 'wave5', 'wave6', 'wave7', 'wave8', 'wave9', 'wave10', 'wave11'])

for subj_type in ('anchor', 'partner'):
    temp = df_long[df_long['subj_type'] == subj_type]
    for var in df_long.columns:
        if var in variables or var[1:] in variables:
            N = temp[[var, 'wave']].groupby('wave').count().T.add_prefix('wave')
            m = np.round(df_long[var].mean(),4)
            sd = temp[var].std()
            if var=='atts':    corr = temp[['atts', 'per1i2', 'per1i7', 'per1i13']].corr()['atts'][1:].mean()
            elif var=='atto':  corr = temp[['atto', 'pa18i2', 'pa18i7', 'pa18i10', 'pa18i12', 'pa18i15']].corr()['atto'][1:].mean()
            elif var=='patts': corr = temp[['patts', 'pper1i2', 'pper1i7', 'pper1i13']].corr()['patts'][1:].mean()
            elif var=='patto': corr = temp[['patto', 'ppa18i2', 'ppa18i7', 'ppa18i10', 'ppa18i12', 'ppa18i15']].corr()['patto'][1:].mean()
            else:              corr = np.nan
            descriptives = descriptives.append(N.join(pd.DataFrame({'subj_type': subj_type, 'm': m, 'sd': sd, 'corr': corr}, index=[var])))   

descriptives[descriptives.index.isin(['sat6', 'per1i6', 'atto', 'atts', 'psat6', 'pper1i6', 'patto', 'patts'])]

Unnamed: 0,subj_type,m,sd,corr,wave1,wave2,wave3,wave4,wave5,wave6,wave7,wave8,wave9,wave10,wave11
sat6,anchor,0.0,0.971583,,7231,5076,4376,3954,3559,3230,2931,2712,2542,2380,1990
per1i6,anchor,-0.0,1.015695,,7203,0,0,3934,3549,0,2916,2712,2537,2371,1983
atts,anchor,-0.0,1.001066,0.821334,7213,5061,4363,3944,3553,3225,2918,2714,2537,2372,1983
atto,anchor,-0.0,0.988507,0.71253,7166,2156,3812,0,3128,0,2590,0,2239,0,1986
psat6,anchor,-0.0,1.065495,,3668,2446,2084,1878,1708,1589,1437,1358,1231,1123,1056
pper1i6,anchor,-0.0,1.027463,,3660,0,2087,1860,1710,1571,372,1344,335,1111,76
patts,anchor,0.0,1.021127,0.820607,3696,2425,2094,1877,1715,1588,374,1361,337,1119,75
patto,anchor,-0.0,1.046002,0.723189,3699,0,2094,0,1727,0,1444,0,1234,0,1060
sat6,partner,0.0,1.069616,,3668,2039,1645,1440,1260,1149,1029,917,825,754,703
per1i6,partner,-0.0,0.956276,,3660,0,1645,1426,1261,1142,206,918,185,746,15


#### Convert to Wide Format

In [15]:
df_wide = subjects.copy()

for i in range(0,11):
    df_temp = subsets["wave"+str(i+1)].copy()
    df_temp = rename_columns(df_temp, add_sffx='_T'+str(i))
    df_wide = df_wide.merge(df_temp, how='left', on='id')

# Keep only the variables relevant for the export
cols_rel = []
for variable in variables:
    if variables[variable]['export'] == 'yes':
        for i in range(0,11):
            var_name = variable+'_T'+str(i)
            cols_rel.append(var_name)
            if var_name not in df_wide.columns:
                df_wide[var_name] = np.nan          
df_wide = df_wide.loc[:, cols_rel]

# Add df columns
for i in range(1,11):
    df_wide['dT'+str(i)] = 1

        
# Export
df_wide.to_csv('../../data/samples/sample_wide.csv', index=False)
df_wide.head(1000).to_csv('../../data/samples/sample_wide_small.csv', index=False)
df_wide.head(100).to_csv('../../data/samples/sample_wide_extrasmall.csv', index=False)

df_wide

Unnamed: 0,per1i6_T0,per1i6_T1,per1i6_T2,per1i6_T3,per1i6_T4,per1i6_T5,per1i6_T6,per1i6_T7,per1i6_T8,per1i6_T9,...,dT1,dT2,dT3,dT4,dT5,dT6,dT7,dT8,dT9,dT10
0,1.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1
1,2.0,,,1.0,1.0,,1.0,2.0,5.0,5.0,...,1,1,1,1,1,1,1,1,1,1
2,2.0,,,3.0,2.0,,4.0,2.0,2.0,2.0,...,1,1,1,1,1,1,1,1,1,1
3,1.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1
4,1.0,,,3.0,3.0,,1.0,1.0,2.0,2.0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10972,1.0,,,,3.0,,,,,,...,1,1,1,1,1,1,1,1,1,1
10973,1.0,,1.0,1.0,1.0,2.0,,1.0,,,...,1,1,1,1,1,1,1,1,1,1
10974,5.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1
10975,3.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1
