In [613]:
import math
import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols

In [638]:
# Folder holding the raw SOEP Stata files. The location is machine specific, so it can
# be overridden with the SOEP_DATA_DIR environment variable; the default is the
# original network-drive path.
data_folder = Path(os.environ.get("SOEP_DATA_DIR", "Z:\\SOEP-CORE.v36eu_STATA\\Stata\\raw"))

# File stems of the person-level wave files, in the order they are loaded below.
file_names = ['vp', 'bap', 'bfp', 'bgp', 'bhp', 'bip', 'bjp', 'bkp']

# <stem>.dta for every wave.
file_paths = [data_folder / f"{file_name}.dta" for file_name in file_names]

# <stem>gen.dta for every wave.
file_paths_2 = [data_folder / f"{file_name}gen.dta" for file_name in file_names]

In [678]:
# Common names given to the three negative-reciprocity items in every wave.
ITEM_NAMES = ["take_revenge", "similar_problems", "insult_back"]

# Wave-specific Stata variable names, listed in ITEM_NAMES order.
# Entry i belongs to file_paths[i].
WAVE_ITEM_CODES = [
    ["vp12602", "vp12603", "vp12605"],
    ["bap12402", "bap12403", "bap12405"],
    ["bfp0602", "bfp0603", "bfp0605"],
    ["bgpm185a02", "bgpm185a03", "bgpm185a05"],
    ["bhp_447_q56", "bhp_448_q56", "bhp_450_q56"],
    ["bip_412_q107", "bip_413_q107", "bip_415_q107"],
    ["bjp_446_q155", "bjp_447_q155", "bjp_449_q155"],
    # The eighth wave (file_paths[7]) is currently not loaded:
    # ["bkp_06_02", "bkp_06_03", "bkp_06_05"],
]


def load_wave(path, item_codes):
    """Read one wave file and return ``(raw, renamed)``.

    Parameters
    ----------
    path : path-like
        Stata file to read.
    item_codes : list of str
        The wave's three item variable names, in ITEM_NAMES order.

    Returns
    -------
    tuple of DataFrame
        ``raw`` holds the item columns under their original names, indexed by
        (pid, hid); ``renamed`` is the same data with the columns renamed to
        ITEM_NAMES.
    """
    raw = pd.read_stata(path, columns=["pid", "hid", *item_codes]).set_index(["pid", "hid"])
    renamed = raw.rename(columns=dict(zip(item_codes, ITEM_NAMES)))
    return raw, renamed


# zip() stops after the seven configured waves, so the eighth path is ignored.
(
    (data0, df0),
    (data1, df1),
    (data2, df2),
    (data3, df3),
    (data4, df4),
    (data5, df5),
    (data6, df6),
) = (load_wave(path, codes) for path, codes in zip(file_paths, WAVE_ITEM_CODES))

In [686]:
def _scale_labels(lowest, middle, highest):
    """Build the label -> numeric code dictionary for one wording of the 7-point scale.

    ``lowest`` and ``highest`` are the texts following ``'[1] '`` and ``'[7] '``.
    ``middle`` is the text following ``'[k] '`` for k = 2..6; a ``{k}`` placeholder
    in it is replaced by the scale point. Every dictionary also maps
    ``'[-1] keine Angabe'`` to -1.
    """
    labels = {f"[1] {lowest}": 1}
    for k in range(2, 7):
        labels[f"[{k}] {middle.format(k=k)}"] = k
    labels[f"[7] {highest}"] = 7
    labels['[-1] keine Angabe'] = -1
    return labels


# The value labels are worded differently from file to file, hence one dictionary per wording.
neg_rec_mapping = _scale_labels('Trifft ueberhaupt nicht zu', 'Skala 1-7', 'Trifft voll zu')

neg_rec_mapping6 = _scale_labels('1=Trifft überhaupt nicht zu', 'Skala 1-7', '7=Trifft voll zu')

neg_rec_mapping2 = _scale_labels(
    '1 Trifft ueberhaupt nicht zu, (Skala 1-7)',
    '{k} auf Skala 1-7',
    '7 Trifft voll zu, (Skala 1-7)',
)

neg_rec_mapping1 = _scale_labels('Trifft ueberhaupt nicht zu', '{k} auf Skala 1-7', 'Trifft voll zu')

neg_rec_mapping4 = _scale_labels('1=Trifft ueberhaupt nicht zu', 'auf einer Skala 1-7', '7=Trifft voll zu')

neg_rec_mapping4_1 = _scale_labels('Trifft ueberhaupt nicht zu', 'auf einer Skala 1-7', 'Trifft voll zu')

neg_rec_mapping5 = _scale_labels('1=Trifft überhaupt nicht zu', '{k}', '7=Trifft voll zu')

# One row per wave: the frame, then the dictionary used for
# take_revenge, similar_problems and insult_back (in that order).
# df4 uses a different wording for take_revenge than for its other two items.
_wave_mappings = [
    (df0, neg_rec_mapping, neg_rec_mapping, neg_rec_mapping),
    (df1, neg_rec_mapping1, neg_rec_mapping1, neg_rec_mapping1),
    (df2, neg_rec_mapping2, neg_rec_mapping2, neg_rec_mapping2),
    (df3, neg_rec_mapping2, neg_rec_mapping2, neg_rec_mapping2),
    (df4, neg_rec_mapping4, neg_rec_mapping4_1, neg_rec_mapping4_1),
    (df5, neg_rec_mapping5, neg_rec_mapping5, neg_rec_mapping5),
    (df6, neg_rec_mapping6, neg_rec_mapping6, neg_rec_mapping6),
]

# Replace the text labels by their numeric codes; labels missing from a dictionary become NaN.
for _frame, *_item_mappings in _wave_mappings:
    for _item, _mapping in zip(('take_revenge', 'similar_problems', 'insult_back'), _item_mappings):
        _frame[_item] = _frame[_item].map(_mapping)

In [687]:
# Negative codes (e.g. -1 for 'keine Angabe') are not answers: turn them into NaN in every wave.
df0, df1, df2, df3, df4, df5, df6 = (
    wave.mask(wave < 0, np.nan) for wave in (df0, df1, df2, df3, df4, df5, df6)
)

In [695]:
# Per-person negative-reciprocity score for each wave: the row mean of the three items.
for wave in (df0, df1, df2, df3, df4, df5, df6):
    wave['avg_rec'] = wave[['take_revenge', 'similar_problems', 'insult_back']].mean(axis=1)

In [696]:
# Combine the per-wave scores into one wide table with one column per wave.
#
# Every wave calls its score 'avg_rec'. Merging those columns unrenamed makes the
# automatic '_x'/'_y' suffixes collide from the third merge on (duplicate column names
# in older pandas, MergeError in pandas >= 2.0), so each column is given a
# wave-specific name first.
#
# NOTE(review): rows are matched on both pid and hid, so a person is only linked
# across waves when hid is the same in both files -- confirm that this is intended.
wave_frames = {
    "vp": df0,
    "bap": df1,
    "bfp": df2,
    "bgp": df3,
    "bhp": df4,
    "bip": df5,
    "bjp": df6,
}

df = None
for wave, frame in wave_frames.items():
    wave_avg = frame[["avg_rec"]].rename(columns={"avg_rec": f"avg_rec_{wave}"})
    if df is None:
        df = wave_avg
    else:
        # Outer merge keeps everyone who appears in at least one wave.
        df = pd.merge(df, wave_avg, how="outer", on=["pid", "hid"])

In [703]:
# Discard rows that consist solely of NaN, i.e. people without a score in any wave.
df = df.dropna(axis="index", how="all")

In [841]:
# Each person's mean score across the waves (NaN entries are skipped by default).
means = df.mean(axis="columns")

In [842]:
# c counts how many times someone answered the questions about negative reciprocity,
# i.e. the number of non-missing wave scores in each row.
#
# This is computed in one vectorised step. The previous row-by-row loop incremented
# `df_new.iloc[x]["count"]`, which writes to a temporary row object rather than to
# df_new itself and therefore is not guaranteed to change the table.
c = df.count(axis=1)

# Wave scores plus the two per-person summaries; all three share df's index.
df_new = df.assign(count=c, mean=means)

In [845]:
# Remove everyone who answered in fewer than two waves.
too_few_answers = df_new.index[df_new['count'] < 2]
df_new.drop(index=too_few_answers, inplace=True)

In [847]:
# `a` is the half-width of the band around a person's mean score inside which a
# wave score still counts as stable.
a = 1
# One flag per person, all starting at zero.
counter = df_new["mean"] * 0
lower_bound, upper_bound = df_new["mean"] - a, df_new["mean"] + a

In [848]:
# Flag a person with 1 as soon as one of their wave scores lies outside
# [lower_bound, upper_bound]; missing scores never count as outside.
#
# Vectorised on purpose: the former loop wrote `counter[x] = 1` with an integer x,
# which on a (pid, hid)-indexed Series can be interpreted as a pid label instead of
# a row position.
wave_scores = df_new.drop(columns=["count", "mean"])
below = wave_scores.lt(lower_bound, axis=0)
above = wave_scores.gt(upper_bound, axis=0)
counter = (below | above).any(axis=1).astype(float)

In [850]:
# Share of individuals whose average reciprocity is not stable.
n_unstable = np.count_nonzero(counter)
n_unstable / len(df_new)

0.2498673036093418

The problem here is that there are at most 3 values of negative reciprocity for each individual, which is too few observations to judge whether a variable is stable over time.