### First Regression (Turnover intention ~ unfair treatment x neg. reciprocity)

In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols

### 1.  Read in SOEP Data:
- vp : 2005 data : main variables of interest: questions on negative reciprocity
- wp: 2006 data : main variables of interest: question on perceived recognition for work
- xp: 2007 data : main variables of interest: turnover intentions, controls

In [63]:
# define path: insert the path where the SOEP data is stored on your computer here
from pathlib import Path
# Path Max: 
# Path Maxie: /Volumes/dohmen_soep/SOEP-CORE.v36eu_STATA/Stata/raw

data_folder = Path("C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/SOEP-Data/Stata/raw")
# define relevant subsets of SOEP-data
file_names = ['vp', 'wp', 'xp']

file_paths = [data_folder / f"{file_name}.dta" for file_name in file_names]
# some controls are in gen data
file_paths_2 = [data_folder / f"{file_name}gen.dta" for file_name in file_names]

In [64]:
# read in 2005 data for the reciprocity measures
data05 = pd.read_stata(file_paths[0], columns=["pid","hid", "syear","vp12602", "vp12603", "vp12605"]).set_index(['pid', 'hid'])
df_2005 = data05.rename(columns={ 'vp12602': 'take_revenge', 'vp12603': 'similar_problems', 'vp12605': 'insult_back'})
# create dummies for take_revenge question
# Create dummy variables
dummies = pd.get_dummies(df_2005['similar_problems'])

# Join the dummy variables to the original dataframe
df_2005 = pd.concat([df_2005, dummies], axis=1)
# rename dummy
df_05 = df_2005.rename(columns = {'[1] Trifft ueberhaupt nicht zu' : 'rec1' , '[2] Skala 1-7' : 'rec2' , '[3] Skala 1-7' : 'rec3' ,'[4] Skala 1-7' : 'rec4' ,'[5] Skala 1-7' : 'rec5' ,'[6] Skala 1-7' : 'rec6' ,'[7] Trifft voll zu' : 'rec7'})
df_05 = df_05.drop(columns = ["[-1] keine Angabe"])

In [65]:
# read in 2006 data
# personal = personal advancement
# still includes all unfair treat
data06 = pd.read_stata(file_paths[1], columns=["pid", "hid", "syear", 'wp43b01', 'wp43b02', 'wp43b03', 'wp43b04', 'wp43b05', 'wp43b06', 'wp43b07','wp43b08', 'wp43a11']).set_index(['pid', 'hid'])
df_06 = data06.rename(columns={ 'wp43b01': 'recog_sup', 'wp43b02': 'felt_recog_sup',"wp43b03": "recog_effort",  'wp43b04': 'felt_recog_effort', "wp43b05": "recog_personal", "wp43b06" :"felt_recog_personal" ,"wp43b07": "recog_pay",'wp43b08': 'felt_recog_pay' , 'wp43a11': 'jobatrisk'})

In [66]:
#read in 2007 data
# here left out 'xp8601' for school degree since we have it in another module also 'xp0102' : 'work_satisfaction' for the beginning
#for outcome and all controls
data3= pd.read_stata(file_paths[2], columns=["pid", "hid", "syear", 'xp13101' , 'xp13102', 'xp2701', 'xp7302','xp7202','xp28', 'xp3001' ,  'xp5701' , 'xp0102' , 'xp2702', 'xp149' , 'xp40']).set_index(['pid', 'hid'])
df_07 = data3.rename(columns= {'xp13101':'gender','xp13102': 'year_birth' ,'xp2701': 'turnover_intention' , 'xp7302': 'wage_lastmonth','xp7202': 'overtime','xp28': 'new_job', 'xp3001': 'reason_new_job',  'xp5701' : 'commute_distance' , "xp0102" : "work_satisfaction" , 'xp2702' : "fear_losingjob" , 'xp149' : 'life_satisfaction' , 'xp40' : 'beamte'})

In [67]:
# read in 2007 data from work module

# adapt path and merge
hours07 = pd.read_stata(file_paths_2[2], columns=["pid","hid", "syear", "xvebzeit", "xpsbil","nace07" ,"betr07", "xerwzeit", "xbilzeit"]).set_index(['pid', 'hid'])
work07 = hours07.rename(columns={'xvebzeit': 'working_hours', "xpsbil": "school_degree", "nace07": "sector","betr07": "firmsize", "xerwzeit": "tenure" , "xbilzeit" : "years_educ"})

 ### 2. Define Functions and mappings for cleaning data

In [68]:
# mapping for reciprocity questions: same scale for all
reciprocity_questions_mapping = {
    '[1] Trifft ueberhaupt nicht zu': 1,
    '[2] Skala 1-7': 2,
    '[3] Skala 1-7': 3,
    '[4] Skala 1-7': 4,
    '[5] Skala 1-7': 5,
    '[6] Skala 1-7': 6,
    '[7] Trifft voll zu': 7,
    '[-1] keine Angabe': -1,
}
## mapping for recognition questions: binary -> binary -> unfair treatment: No -> later 1 fair treatment: Yes -> 0
recog_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe': -1,
    '[1] Ja': 2,
    '[2] Nein': 1,
}
# felt recog mapping
felt_recog_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe': -1,
    '[1] Gar nicht': 1,
    '[2] Maessig': 2,
    '[3] Stark': 3,
    '[4] Sehr stark': 4,
}
# mapping for firmsize -> we need to recode this in a sensible way: jumps are the same: first change: selbstständig to 0
firmsize_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe': -1,
    '[1] Unter  5': 1,
    '[2] 5 bis 10': 2,
    '[3] 11 bis unter 20': 3,
    '[4] bis 90: unter 20': 4,
    '[5] 91-04: 5 bis unter 20': 5,
    '[6] 20 bis unter 100': 6,
    '[7] 100 bis unter 200': 7,
    '[8] bis 98: 20 bis unter 200': 8,
    '[9] 200 bis unter 2000': 9,
    '[10] 2000 und mehr': 10,
    '[11] Selbstaendig-ohne Mitarb.': 0,
}
# mapping new job into binary variable
new_job_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe': -1, 
    '[1] Ja': 1, 
    '[2] Nein': 2,
    '[3] Ja, nach Datenpruefung': 1,
}
# mapping for job satisfaction: split up into binary with roughly equal value counts for simplicity: might change that later to categories,
satisfaction_mapping = {
    '[0] 0 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[1] 1 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[2] 2 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[3] 3 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[4] 4 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[5] 5 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 2,
    '[6] 6 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 1,
    '[7] 7 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 1,
    '[8] 8 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 1,
    '[9] 9 Zufrieden: Skala 0-Niedrig bis 10-Hoch': 1,
    '[10] 10 Zufrieden: Skala 0-Niedrig bis 10-Hoc': 1,
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe': -1,
}
turnover_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe':-1,
    '[0] 0% wahrscheinlich': 2,
    '[10] 10% wahrscheinlich': 1,
    '[20] 20% wahrscheinlich': 1,
    '[30] 30% wahrscheinlich': 1,
    '[40] 40% wahrscheinlich': 1, 
    '[50] 50% wahrscheinlich': 1,
    '[60] 60% wahrscheinlich': 1,
    '[70] 70% wahrscheinlich': 1,
    '[80] 80% wahrscheinlich': 1,
    '[90] 90% wahrscheinlich': 1,
    '[100] 100% wahrscheinlich': 1,
}
# mapping for turnover intention robustness check: Cardinal
turnover_mapping_cardinal = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe':-1,
    '[0] 0% wahrscheinlich': 0,
    '[10] 10% wahrscheinlich': 10,
    '[20] 20% wahrscheinlich': 20,
    '[30] 30% wahrscheinlich': 30,
    '[40] 40% wahrscheinlich': 40, 
    '[50] 50% wahrscheinlich': 50,
    '[60] 60% wahrscheinlich': 60,
    '[70] 70% wahrscheinlich': 70,
    '[80] 80% wahrscheinlich': 80,
    '[90] 90% wahrscheinlich': 90,
    '[100] 100% wahrscheinlich': 100,
}
# mapping for new job to easier remove negatives
reason_new_job_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe':-1, 
    '[1] Erstmals erwerbstaetig': 1,
    '[2] Wieder erwerbstaetig': 2,
    '[3] Stelle bei neuen Arbeitgeber': 3,  
    '[4] Uerbnommen von Betrieb': 4,
    '[5] Stellenwechsel im Betrieb': 5, 
    '[6] Selbstaendig geworden': 6,
}
# mapping for school degree: to easier remove negatives
school_degree_mapping = {
    '[-2] trifft nicht zu': -2,
    '[-1] keine Angabe':-1,
    '[1] Hauptschulabschluss': 1,
    '[2] Realschulabschluss': 2,
    '[3] Fachhochschulreife': 3,
    '[4] Abitur': 4,
    '[5] Anderer Abschluss': 5,
    '[6] Ohne Abschluss verlassen': 6,
    '[7] Noch kein Abschluss': 7,
    '[8] Keine Schule besucht': 8,
}
# sector_map
sector_map = {
    "[1] Landwirtschaft und  Jagd": 1,
    "[2] Forstwirtschaft": 2,
    "[5] Fischerei und Fischzucht": 5,
    "[10] Kohlenbergbau, Torfgewinnung": 10,
    "[11] Gewinnung von Erdöl und Erdgas, Erbringung damit verbundener Dienstleistungen": 11,
    "[12] Bergbau auf Uran- und Thoriumerze": 12,
    "[13] Erzbergbau": 13,
    "[14] Gewinnung von Steinen und Erden, sonstiger Bergbau": 14,
    "[15] Herstellung von Nahrungs- und Futtermitteln sowie Getränken": 15,
    "[16] Tabakverarbeitung": 16,
    "[17] Herstellung von Textilien": 17,
    "[18] Herstellung von Bekleidung": 18,
    "[19] Herstellung von Leder und Lederwaren": 19,
    "[20] Herstellung von Holz sowie Holz-, Kork- und Flechtwaren (ohne Herstellung von Möbeln)": 20,
    "[21] Herstellung von Papier, Pappe und Waren daraus": 21,
    '[22] Herstellung von Verlags- und Druckerzeugnissen,  Vervielfältigung von bespielten Ton-, Bild- und Datenträgern': 22,
    "[23] Kokerei, Mineralölverarbeitung, Herstellung und Verarbeitung von Spalt- und Brutstoffen": 23,
    "[24] Herstellung von chemischen Erzeugnissen": 24,
    "[25] Herstellung von Gummi- und Kunststoffwaren": 25,
    "[26] Herstellung von Glas und Glaswaren, Keramik, Verarbeitung von Steinen und Erden": 26,
    "[27] Metallerzeugung und -bearbeitung": 27,
    "[28] Herstellung von Metallerzeugnissen": 28,
    "[29] Maschinenbau": 29,
    "[31] Herstellung von Geräten der Elektrizitätserzeugung, -verteilung u. Ä.": 31,
    "[30] Herstellung von Büromaschinen, Datenverarbeitungsgeräten und -einrichtungen": 30,
    "[32] Rundfunk- und Nachrichtentechnik": 32,
    "[33] Medizin-, Mess-, Steuer- und Regelungstechnik, Optik, Herstellung von Uhren": 33,
    "[34] Herstellung von Kraftwagen und Kraftwagenteilen": 34,
    "[35] Sonstiger Fahrzeugbau": 35,
    "[36] Herstellung von Möbeln, Schmuck, Musikinstrumenten, Sportgeräten, Spielwaren und sonstigen Erzeugnissen": 36,
    "[37] Rückgewinnung": 37,
    "[40] Energieversorgung": 40,
    "[41] Wasserversorgung": 41,
    "[45] Bau": 45,
    "[50] Kraftfahrzeughandel; Instandhaltung und Reparatur von Kraftfahrzeugen; Tankstellen": 50,
    "[51] Handelsvermittlung und Großhandel (ohne Handel mit Kraftfahrzeugen)": 51,
    "[52] Einzelhandel (ohne Handel mit Kraftfahrzeugen und ohne Tankstellen); Reparatur von Gebrauchsgütern": 52,
    "[55] Beherbergungs- und Gaststätten": 55,
    "[60] Landverkehr; Transport in Rohrfernleitungen": 60,
    "[61] Schifffahrt": 61,
    "[62] Luftfahrt": 62,
    "[63] Hilfs- und Nebentätigkeiten für den Verkehr; Verkehrsvermittlung": 63,
    "[64] Nachrichtenübermittlung": 64,
    "[65] Kreditinstitute": 65,
    "[66] Versicherungen (ohne Sozialversicherung)": 66,
    "[67] Mit den Kreditinstituten und Versicherungen verbundene Tätigkeiten": 67,
    "[70] Grundstücks- und Wohnungswesen": 70,
    "[71] Vermietung beweglicher Sachen ohne Bedienungspersonal": 71,
    "[72] Datenverarbeitung und Datenbanken": 72,
    "[73] Forschung und Entwicklung": 73,
    "[74] Erbringung von unternehmensbezogenen Dienstleistungen": 74,
    "[75] Öffentliche Verwaltung, Verteidigung, Sozialversicherung": 75,
    "[80] Erziehung und Unterricht": 80,
    "[85] Gesundheits-, Veterinär- und Sozialwesen": 85,
    "[90] Abwasser- und Abfallbeseitigung und sonstige Entsorgung": 90,
    "[91] Interessenvertretungen sowie kirchliche und sonstige Vereinigungen (ohne Sozialwesen, Kultur und Sport)": 91,
    "[92] Kultur, Sport und Unterhaltung": 92,
    "[93] Erbringung von sonstigen Dienstleistungen": 93,
    "[95] Private Haushalte mit Hauspersonal": 95,					
    "[96] Industrie - ohne weitere Zuordnung": 96,					
    "[97] Handwerk - ohne weitere Zuordnung": 97,					
    "[98] Dienstleistungen ohne weitere Zuordnung": 98,					
    "[99] Exterritoriale Organisationen und Körperschaften": 99,				
    "[100] Produzierendes Gewerbe ohne w.Zuordnung": 100,
    "[-1] keine Angabe": 3,
    '[-2] trifft nicht zu': 0, 
    "[-3] unplausibler Wert": -3,
    "[-4] unzulaessige Mehrfachantwort": -4, 
    "[-5] in Fragebogenversion nicht enthalten": -5,
    "[-6] Fragebogenversion mit geaenderter Filterfuehrung": -6, 
    "[-7] nur in weniger eingeschraenkter Edition verfuegbar": -7,
    "[-8] Frage in diesem Jahr nicht Teil des Frageprogramms": -8,
}
# reversed mapping to redo changes
reversed_mapping_reason = {v: k for k, v in reason_new_job_mapping.items()}
reversed_mapping_schoold = {v: k for k, v in school_degree_mapping.items()}
reversed_mapping_sector = {v: k for k, v in sector_map.items()}

## function for recoding values and dropping missing

def recode_categoricals(inputdf,rc_cardinal = 0):
    """
        Applies recoding to categoricals to easier drop n.a.ns

        Input:
            - inputdf : merged dataframe
            - rc_cardinal: optional argument: if == 1 turnover intentions will be coded as cardinal instead of binary 
            
        Output:
            - df : cleaned Dataframe

    """
    merged = inputdf  
    
    # recode Gender variable
    merged['gender'].replace('[2] Weiblich', 2,inplace=True)
    merged['gender'].replace('[1] Maennlich', 1,inplace=True)
    # recode reciprocity variables
    merged[["similar_problems","take_revenge","insult_back"]] = merged[["similar_problems","take_revenge","insult_back"]].apply(lambda x: x.map(reciprocity_questions_mapping))
    # recode recognition variables
    merged[["recog_sup","recog_effort","recog_personal","recog_pay","jobatrisk"]] = merged[["recog_sup","recog_effort","recog_personal","recog_pay","jobatrisk"]].apply(lambda x: x.map(recog_mapping))
    # recode felt recognition variables
    merged[["felt_recog_sup","felt_recog_effort","felt_recog_personal","felt_recog_pay"]] = merged[["felt_recog_sup","felt_recog_effort","felt_recog_personal","felt_recog_pay"]].apply(lambda x: x.map(felt_recog_mapping))
    # recode firm size
    merged['firmsize'] = merged['firmsize'].map(firmsize_mapping)
    # recode new job reason variable
    merged['reason_new_job'] = merged['reason_new_job'].map(reason_new_job_mapping)
    # recode job change variable
    merged['new_job']= merged['new_job'].map(new_job_mapping)
    # recode sector
    merged['sector'] = merged['sector'].map(sector_map)
    merged['beamte'] = merged['beamte'].map(recog_mapping)
    # recode job satisfaction
    merged['work_satisfaction']= merged['work_satisfaction'].map(satisfaction_mapping)
    merged['life_satisfaction'] = merged['life_satisfaction'].map(satisfaction_mapping)
    # recode turnover intention variable
    merged['fear_losingjob'] = merged['fear_losingjob'].map(turnover_mapping_cardinal)
    if rc_cardinal == 1:
        merged['turnover_intention'] = merged['turnover_intention'].map(turnover_mapping_cardinal)
    else:
        merged['turnover_intention'] = merged['turnover_intention'].map(turnover_mapping)
    
    # recode school degree
    merged['school_degree'] = merged['school_degree'].map(school_degree_mapping)
    
    output = merged
    return output

# Merge dataframes: a bit tough to read as its nested, merges 4 dataframes: 2005,2006,2007 and 2007gen

def merge_and_clean(df_05,df_06,df_07,work07,rc_cardinal = 0,rc_rec_binary=0):
    """
    merges data from different years, applies recoding to categoricals and constructs additional variables.

    Input:
        - df_05 : Pd.Dataframe contains reciprocity measures
        - df_06 : Pd.Dataframe contains unfair treatment measures
        - df_07 : Pd.Dataframe contains outcome and controls
        - work07 : Pd.Dataframe contains additional controls

    Output:
        - df : cleaned Dataframe

    """
    allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
    recoded = recode_categoricals(allmerged_df,rc_cardinal).astype('int')
    # replaces negative values with n.a.n 
    recoded = recoded.mask(recoded < 0, np.nan)
    # recode sector back intro category
    
    recoded["wage_lastmonth"] = np.log(recoded["wage_lastmonth"])
    recoded = recoded[recoded["wage_lastmonth"] != -np.inf] 
    # construct avg reciprocity measure
    recoded['avg_rec'] = recoded[['take_revenge', 'similar_problems', 'insult_back']].mean(axis=1)
    # For robustness check: option to construct binary reciprocity measure.
    if rc_rec_binary == 1:
        recoded['binary_rec'] = recoded['avg_rec'].apply(binary_reciprocity)
    
    # construct age, potential experience and age^2
    recoded['age'] = 2007 - recoded['year_birth']
    recoded["potential_experience"] = pow((recoded["age"] - 18), 2)
    recoded["age_squared"] = (recoded["age"] ** 2) / 100
    recoded["tenure_squared"] = (recoded["tenure"] ** 2) / 100
    # recode categoricals back to make it better readable
    recoded["reason_new_job"] = recoded["reason_new_job"].map(reversed_mapping_reason)
    recoded["school_degree"] = recoded["school_degree"].map(reversed_mapping_schoold)
    recoded["sector"] = recoded["sector"].map(reversed_mapping_sector)
    # transform binary variables with 1 and 2 into 1 and 0
    columns_to_transform = ["recog_sup","recog_effort", "recog_pay", "recog_personal" ,"gender", "turnover_intention", "new_job" , "jobatrisk" , "beamte"]

    # Iterate over the columns and replace the values 2 with 0 
    for col in columns_to_transform:
        recoded[col] = recoded[col].replace({2: 0})

    # save df somewhere so its not muted when repeatedly executing this cell: Can later transform that into functions
    df = recoded
    
    return df

# Add Mincer Wage Regression and adds its residuals to the Dataframe

def add_mincer_residuals(cleaneddata):
    """
    Performs a Minzer-wage Regression on the cleaned dataset and adds
    the residuals to the dataframe. This will be used as the wage controls

        Input: 
            - cleaneddata(pd.DataFrame) : merged and recoded dataset

        Output: 
            - df_cleaned(pd.DataFrame)  : samedataset with Minzer-residuals
    
    """



    df_cleaned = cleaneddata
    # specify which columns to drop from our dataframe
    df_mincer = cleaneddata.drop(columns=['syear_x', 'similar_problems', 'take_revenge', 'insult_back','syear_y', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'syear_y', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_sup', 'age', 'commute_distance', 'recog_effort', 'working_hours', 'turnover_intention', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' , 'work_satisfaction' , 'sector' , 'tenure_squared' , 'fear_losingjob' , 'jobatrisk' , 'life_satisfaction'])
    # Convert 'gender' and 'sector' columns to categorical data type
    for col in ['gender']:
        df_mincer[col] = df_mincer[col].astype('category')
    df_mincer = df_mincer.dropna()
    # Define the dependent variable
    y = df_mincer['wage_lastmonth']
    # Define the independent variables
    X = df_mincer[['gender', 'firmsize', 'tenure', 'years_educ', 'potential_experience', 'age_squared']]

    # Add a constant term to the independent variables
    X = sm.add_constant(X)

    # Fit the Mincer wage regression model
    mincer_model = sm.OLS(y, X).fit()
    
    # Create a new column in the dataframe with the same name as the residuals array/ delete relative wage entries
    df_cleaned['mincer_residuals'] = None
    # Match the rows of the dataframe with the values in the residuals array using the index
    df_cleaned.loc[df_cleaned.index, 'mincer_residuals'] = mincer_model.resid

    return df_cleaned

### Inlcude People who switched their jobs in 2006-2007 with 2006 controls

In [69]:
def include_jobchangers(data07,onlynewemployer=True,worksatisfaction=False,rc_cardinal = False):
    """
    Replaces controls of subjects which switched their jobs-from 2006-2007 with 
    controls from their job at the time.
    
    Input: 
        - data07(Pd.Dataframe)  : finished 2007 dataframe
        - onlynewemployer (optional argument) : If True only includes those who switched jobs to a new employer
        - worksatisfaction (optional argument): If True drops observations where job satisfaction between 2006 and 2007 changed 

    Output:
        - dfconcat: Dataframe where controls for people who switched their jobs are from 2006
    
    """
    df = data07
    # read in cleaned 2006 dataset:
    #  change path here : Maxie: '/Users/maxieschulze/Documents/Dokumente - MacBook Pro von Maxie/5. Semester/Research Module/ResearchModule/src/data_management/2006jobchange.csv'
    #                     Max 'C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/2006jobchange.csv'  
    observations_2006 = pd.read_csv('C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/2006jobchange.csv')
    # some initial datamanagement
    observations_2006.reset_index(inplace=True)
    observations_2006.set_index(["pid","hid"], inplace=True)
    observations_2006.drop(columns=observations_2006.filter(regex='^syear').columns, inplace=True)
    observations_2006.drop("index",axis=1 ,inplace =True)
    # If cardinal turnover intentions are coded to 100
    if rc_cardinal == True:
        observations_2006["turnover_intention"] = 100
    
    # drop s year columns from both dataframes
    df.drop(columns=df.filter(regex='^syear').columns, inplace=True)
    
    # optional worksatisfaction robustness check
    if worksatisfaction == True:
        satisfaction = pd.DataFrame(df['work_satisfaction']).join(pd.DataFrame(observations_2006['work_satisfaction']), on = ["pid", "hid"], lsuffix = "_07" , rsuffix ="_06")
        # list of IDs where job satisfaction changed over the past year
        ID_keep_satis = satisfaction[satisfaction["work_satisfaction_07"] == satisfaction["work_satisfaction_06"]].index
        # drops all persons from 2007 dataframe where job satisfaction changed
        df.drop(df[~df.index.isin(ID_keep_satis)].index, inplace=True)
    
    # create list of IDs of people who switched to a new employer in the last year in 2007
    if onlynewemployer == True:
        IDs_tokeep = df[(df["new_job"] == 1) & (df["reason_new_job"] == '[3] Stelle bei neuen Arbeitgeber')].index
    # if True drops people whose work satisfaction changed between 2006 and 2007
    else:
        IDs_tokeep = df[(df["new_job"] == 1)].index
    
    # drop all who changed their job in 2007 dataframe and replace 2007 controls with 2006 controls
    df.drop(df[df["new_job"] == 1].index, inplace = True)
    observations_2006.drop(observations_2006[~observations_2006.index.isin(IDs_tokeep)].index, inplace=True)
    #concat both dataframes
    dfconcat = pd.concat([df,observations_2006])
    
    return dfconcat

### 3. Analysis


In [70]:
df_OLS = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
# add interaction term
df_OLS["recXrecog_sup"] = df_OLS["recog_sup"] * df_OLS["avg_rec"]

## Drop n.a.n.s
df_OLS.drop(columns=['similar_problems', 'take_revenge', 'insult_back', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' , 'work_satisfaction'], inplace=True)
df_OLS.dropna(inplace=True)
# drop missing data in regression dataframe


# Specify model
formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'

# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=df_OLS).fit(cov_type='cluster', cov_kwds={'groups': df_OLS['sector']})

# Print the regression results
summary = reg.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.205
Model:,OLS,Adj. R-squared:,0.203
Method:,Least Squares,F-statistic:,880.4
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,4.12e-62
Time:,13:48:30,Log-Likelihood:,-3255.5
No. Observations:,5332,AIC:,6543.0
Df Residuals:,5316,BIC:,6648.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0322,0.009,3.464,0.001,0.014,0.050
recog_sup,0.1239,0.032,3.822,0.000,0.060,0.187
avg_rec,0.0206,0.006,3.384,0.001,0.009,0.033
recXrecog_sup,-0.0112,0.008,-1.353,0.176,-0.027,0.005
working_hours,0.0009,0.001,0.708,0.479,-0.002,0.003
firmsize,-0.0089,0.003,-2.993,0.003,-0.015,-0.003
tenure,-0.0149,0.002,-8.736,0.000,-0.018,-0.012
tenure_squared,0.0205,0.004,5.107,0.000,0.013,0.028
age,0.0373,0.007,5.129,0.000,0.023,0.052

0,1,2,3
Omnibus:,8560.938,Durbin-Watson:,1.86
Prob(Omnibus):,0.0,Jarque-Bera (JB):,352.616
Skew:,-0.033,Prob(JB):,2.6900000000000002e-77
Kurtosis:,1.742,Cond. No.,1.16e+18


### Robustness Checks


#### 1. Recode turnover variable into cardinal variable: Optional argument in recode_categoricals

In [71]:
# optional argument ==1
df_card = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07,rc_cardinal=1)),rc_cardinal=True)
## drop n.a.ns
df_card.drop(columns=['similar_problems', 'take_revenge', 'insult_back','felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth','rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' ,'work_satisfaction'], inplace = True)
# add interaction term
df_card["recXrecog_sup"] = df_card["recog_sup"] * df_card["avg_rec"]
# drop missing data in regression dataframe
df_card.dropna(inplace=True)

import statsmodels.formula.api as smf

# Specify model
formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'

# Fit the regression and cluster on the sector variable
reg_cardinal = smf.ols(formula_main, data=df_card).fit(cov_type='cluster', cov_kwds={'groups': df_card['sector']})

# Print the regression results
summary = reg_cardinal.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.227
Model:,OLS,Adj. R-squared:,0.225
Method:,Least Squares,F-statistic:,253.6
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,7.4e-47
Time:,13:48:33,Log-Likelihood:,-24792.0
No. Observations:,5332,AIC:,49620.0
Df Residuals:,5316,BIC:,49720.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,5.2540,0.632,8.308,0.000,4.015,6.493
recog_sup,8.3945,1.442,5.823,0.000,5.569,11.220
avg_rec,0.7447,0.235,3.165,0.002,0.284,1.206
recXrecog_sup,-0.4949,0.389,-1.272,0.204,-1.258,0.268
working_hours,0.0404,0.076,0.532,0.595,-0.108,0.189
firmsize,-0.2727,0.135,-2.025,0.043,-0.537,-0.009
tenure,-1.4831,0.134,-11.101,0.000,-1.745,-1.221
tenure_squared,2.7050,0.289,9.366,0.000,2.139,3.271
age,4.6526,0.527,8.836,0.000,3.621,5.685

0,1,2,3
Omnibus:,823.98,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1288.478
Skew:,1.081,Prob(JB):,1.62e-280
Kurtosis:,4.062,Cond. No.,1.16e+18


### 2. Creating dummies and include 7 interaction terms : Decide for one of the 3 questions for simplicity

When treating the categories as numerical: we are making assumptions about the differences between the scale items. If those distances can be considered equal at all levels, then it is reasonable to treat reciprocity as numerical. (i.e a one unit change from 1 to 2 is equivalent to a one unit change from 6 to 7)


For dummy coding we need to exclude one of the categories in the dataframe and make it the reference category: This will be the lowest level of reciprocity 1 and will be coded as zero.  so rec2 rec3 rec4 , ... and their interaction terms with unfair treatment stay in the regression.

rec2 is then interpreted as the mean of turnover intentions in the rec2 group - the mean of turnover intentions in the rec1 group (reference group) holding everything else constant

In [72]:
# 1. optional argument:  rc_binary ==1
dfrc_dummy = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
# specify columns which we need for regression by dropping everything else
dfrc_dummy.drop(columns=['similar_problems', 'take_revenge', 'insult_back','felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay','year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'avg_rec', 'work_satisfaction'], inplace = True)
# add interaction terms
for col in ['rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7']:
    dfrc_dummy = dfrc_dummy.assign(**{col + '_X_recog_sup': dfrc_dummy[col] * dfrc_dummy['recog_sup']})
# drop missing data in regression dataframe
dfrc_dummy = dfrc_dummy.dropna()


# Specify model
formula_main = 'turnover_intention ~ recog_sup  + rec2 + rec3 + rec4 + rec5 + rec6 + rec7 + rec2_X_recog_sup + rec3_X_recog_sup + rec4_X_recog_sup + rec5_X_recog_sup + rec6_X_recog_sup + rec7_X_recog_sup+ working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'

# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=dfrc_dummy).fit(cov_type='cluster', cov_kwds={'groups': dfrc_dummy['sector']})

# Print the regression results
summary = reg.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.218
Model:,OLS,Adj. R-squared:,0.215
Method:,Least Squares,F-statistic:,1087.0
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,9.33e-68
Time:,13:48:36,Log-Likelihood:,-3211.8
No. Observations:,5332,AIC:,6476.0
Df Residuals:,5306,BIC:,6647.0
Df Model:,25,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0297,0.009,3.442,0.001,0.013,0.047
recog_sup,0.1165,0.026,4.475,0.000,0.065,0.168
rec2,0.1648,0.013,12.326,0.000,0.139,0.191
rec3,0.1521,0.019,7.818,0.000,0.114,0.190
rec4,0.0983,0.024,4.082,0.000,0.051,0.146
rec5,0.1598,0.028,5.764,0.000,0.105,0.214
rec6,0.1795,0.046,3.897,0.000,0.089,0.270
rec7,0.0473,0.046,1.021,0.307,-0.044,0.138
rec2_X_recog_sup,-0.0691,0.029,-2.392,0.017,-0.126,-0.012

0,1,2,3
Omnibus:,5654.663,Durbin-Watson:,1.861
Prob(Omnibus):,0.0,Jarque-Bera (JB):,333.619
Skew:,-0.042,Prob(JB):,3.59e-73
Kurtosis:,1.777,Cond. No.,1.57e+17


##### Robustness Check : Drop people whose job satisfaction level changed

In [73]:
dfworksatisfation = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)),worksatisfaction = True)
# specify columns which we need for regression by dropping everything else

df_rc_ws = dfworksatisfation.drop(columns=['similar_problems', 'take_revenge', 'insult_back', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' ,'work_satisfaction'])
#  included 'recog_effort'again , excluded overtime due to sample size and wage last month """
# add interaction term
df_rc_ws["recXrecog_sup"] = df_rc_ws["recog_sup"] * df_rc_ws["avg_rec"]
# drop missing data in regression dataframe
df_rc_ws.dropna(inplace=True)

formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'
# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=df_rc_ws).fit(cov_type='cluster', cov_kwds={'groups': df_rc_ws['sector']})

# Print the regression results
summary = reg.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.198
Model:,OLS,Adj. R-squared:,0.195
Method:,Least Squares,F-statistic:,580.8
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,3.78e-56
Time:,13:48:40,Log-Likelihood:,-2637.4
No. Observations:,4294,AIC:,5307.0
Df Residuals:,4278,BIC:,5409.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0333,0.009,3.608,0.000,0.015,0.051
recog_sup,0.1467,0.032,4.584,0.000,0.084,0.209
avg_rec,0.0192,0.006,3.057,0.002,0.007,0.032
recXrecog_sup,-0.0100,0.008,-1.246,0.213,-0.026,0.006
working_hours,0.0006,0.001,0.526,0.599,-0.002,0.003
firmsize,-0.0085,0.003,-2.799,0.005,-0.014,-0.003
tenure,-0.0140,0.002,-7.386,0.000,-0.018,-0.010
tenure_squared,0.0204,0.004,4.565,0.000,0.012,0.029
age,0.0367,0.007,5.200,0.000,0.023,0.051

0,1,2,3
Omnibus:,10001.187,Durbin-Watson:,1.861
Prob(Omnibus):,0.0,Jarque-Bera (JB):,295.418
Skew:,0.032,Prob(JB):,7.0900000000000004e-65
Kurtosis:,1.717,Cond. No.,5.1e+17


### Robustness Check 

Avg reciprocity measure over years



In [74]:
df_OLS_avg = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
# specify columns which we need for regression by dropping everything else
df_OLS_avg.drop(columns=['similar_problems', 'take_revenge', 'insult_back', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' , 'work_satisfaction','avg_rec'], inplace = True)
# Load avg reciprocity measures over the years


# Max 'C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/rec_avgyears.csv'
# Maxie '/Users/maxieschulze/Documents/Dokumente - MacBook Pro von Maxie/5. Semester/Research Module/ResearchModule/src/data_management/rec_avgyears.csv'
avg_reciprocity = pd.read_csv('C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/rec_avgyears.csv')
avg_reciprocity.reset_index(inplace=True)
avg_reciprocity.set_index(["pid","hid"], inplace=True)
avg_reciprocity.drop("index",axis=1 ,inplace =True)

df_avg_years = pd.merge(df_OLS_avg,avg_reciprocity, on=["pid","hid"])

# add interaction term
df_avg_years["recXrecog_sup"] = df_avg_years["recog_sup"] * df_avg_years["avg_rec"]
# drop missing data in regression dataframe
df_avg_years.dropna(inplace=True)

formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'
# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=df_avg_years).fit(cov_type='cluster', cov_kwds={'groups': df_avg_years['sector']})

# Print the regression results
summary = reg.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.206
Model:,OLS,Adj. R-squared:,0.203
Method:,Least Squares,F-statistic:,798.6
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,6.549999999999999e-61
Time:,13:48:45,Log-Likelihood:,-3254.8
No. Observations:,5332,AIC:,6542.0
Df Residuals:,5316,BIC:,6647.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0314,0.009,3.544,0.000,0.014,0.049
recog_sup,0.1111,0.036,3.056,0.002,0.040,0.182
avg_rec,0.0227,0.006,3.673,0.000,0.011,0.035
recXrecog_sup,-0.0074,0.010,-0.743,0.457,-0.027,0.012
working_hours,0.0008,0.001,0.681,0.496,-0.002,0.003
firmsize,-0.0089,0.003,-2.926,0.003,-0.015,-0.003
tenure,-0.0150,0.002,-8.805,0.000,-0.018,-0.012
tenure_squared,0.0209,0.004,5.177,0.000,0.013,0.029
age,0.0365,0.007,5.229,0.000,0.023,0.050

0,1,2,3
Omnibus:,8340.089,Durbin-Watson:,1.859
Prob(Omnibus):,0.0,Jarque-Bera (JB):,351.48
Skew:,-0.033,Prob(JB):,4.75e-77
Kurtosis:,1.744,Cond. No.,6.4e+17


Also does not change much

In [75]:
df = add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07))

df["insult_back"].mean()

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


3.2100646079070594

##### Realized Turnover in 2007 -> turnover intentions = 1 else 0 with 2006 controls

In [76]:
def realizedturnover(df_07):
    # gets people who realized turnover in 2006-2007
    jobchangers = df_07[df_07["new_job"] == '[1] Ja'].index

    observations_2006 = pd.read_csv('C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/2006jobchange.csv')
    # Max 'C:/Users/max-admin/Desktop/Masterstudium/WiSe_22_23/Research_Module/finalproj/src/data_management/2006jobchange.csv'
    # Maxie '/Users/maxieschulze/Documents/Dokumente - MacBook Pro von Maxie/5. Semester/Research Module/ResearchModule/src/data_management/2006jobchange.csv'
    # some initial datamanagement
    observations_2006.reset_index(inplace=True)
    observations_2006.set_index(["pid","hid"], inplace=True)
    observations_2006.drop(columns=observations_2006.filter(regex='^syear').columns, inplace=True)
    observations_2006.drop("index",axis=1 ,inplace =True)
    # define people who changed their jobs
    jobstayers = observations_2006[~observations_2006.index.isin(jobchangers)]
    jobstayers["turnover_intention"] = 0
    # drop all to concatenate both: not elegant all which did not change job
    observations_2006.drop(observations_2006[~observations_2006.index.isin(jobchangers)].index, inplace=True)

    subset_2006 = pd.concat([jobstayers,observations_2006])

    return subset_2006

In [77]:
df_real = realizedturnover(df_07)
df_real.drop(columns=['similar_problems', 'take_revenge', 'insult_back', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' , 'work_satisfaction'], inplace = True)
#  included 'recog_effort'again , excluded overtime due to sample size and wage last month """
# add interaction term
df_real["recXrecog_sup"] = df_real["recog_sup"] * df_real["avg_rec"]
# drop missing data in regression dataframe
df_real.dropna(inplace=True)

formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'
# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=df_real).fit(cov_type='cluster', cov_kwds={'groups': df_real['sector']})

# Print the regression results
reg.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jobstayers["turnover_intention"] = 0


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,59.8
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,1.5e-26
Time:,13:48:49,Log-Likelihood:,-499.32
No. Observations:,861,AIC:,1031.0
Df Residuals:,845,BIC:,1107.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0168,0.015,-1.107,0.268,-0.047,0.013
recog_sup,0.0088,0.082,0.106,0.915,-0.153,0.170
avg_rec,0.0008,0.012,0.065,0.948,-0.023,0.025
recXrecog_sup,0.0116,0.024,0.484,0.628,-0.035,0.059
working_hours,-0.0024,0.002,-1.283,0.199,-0.006,0.001
firmsize,-0.0054,0.005,-1.171,0.242,-0.014,0.004
tenure,-0.0186,0.009,-2.047,0.041,-0.036,-0.001
tenure_squared,0.0426,0.030,1.406,0.160,-0.017,0.102
age,-0.0041,0.011,-0.360,0.719,-0.026,0.018

0,1,2,3
Omnibus:,175.272,Durbin-Watson:,0.095
Prob(Omnibus):,0.0,Jarque-Bera (JB):,166.107
Skew:,0.997,Prob(JB):,8.520000000000001e-37
Kurtosis:,2.194,Cond. No.,5.65e+17


### Other measure for unfair treatment : take additional information into account how much they are bothered by it

In [78]:
df_OLS = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
# add interaction term
idstostayat1 = df_OLS[ (df_OLS["felt_recog_sup"] == 3.0) | (df_OLS["felt_recog_sup"] == 4.0) | (df_OLS["felt_recog_sup"] == 2.0)].index
df_OLS['recog_sup'][~df_OLS.index.isin(idstostayat1)] = 0
df_OLS["recXrecog_sup"] = df_OLS["recog_sup"] * df_OLS["avg_rec"]
## Drop n.a.n.s
df_OLS.drop(columns=['similar_problems', 'take_revenge', 'insult_back', 'felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth', 'rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' , 'work_satisfaction'], inplace=True)
df_OLS.dropna(inplace=True)
## change all which are unbothered to unfair treatment 0

 
## add interaction term and reestimate





# Specify model
formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + beamte + gender'

# Fit the regression and cluster on the sector variable
reg = smf.ols(formula_main, data=df_OLS).fit(cov_type='cluster', cov_kwds={'groups': df_OLS['sector']})

# Print the regression results
summary = reg.summary()
#print(summary.as_latex())
summary


  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_OLS['recog_sup'][~df_OLS.index.isin(idstostayat1)] = 0


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.206
Model:,OLS,Adj. R-squared:,0.204
Method:,Least Squares,F-statistic:,850.8
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,1.0899999999999999e-61
Time:,13:48:53,Log-Likelihood:,-3266.8
No. Observations:,5354,AIC:,6566.0
Df Residuals:,5338,BIC:,6671.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0326,0.010,3.420,0.001,0.014,0.051
recog_sup,0.1446,0.035,4.167,0.000,0.077,0.213
avg_rec,0.0203,0.006,3.379,0.001,0.009,0.032
recXrecog_sup,-0.0137,0.009,-1.449,0.147,-0.032,0.005
working_hours,0.0008,0.001,0.665,0.506,-0.002,0.003
firmsize,-0.0088,0.003,-2.896,0.004,-0.015,-0.003
tenure,-0.0147,0.002,-8.854,0.000,-0.018,-0.011
tenure_squared,0.0202,0.004,5.284,0.000,0.013,0.028
age,0.0378,0.007,5.060,0.000,0.023,0.053

0,1,2,3
Omnibus:,8430.255,Durbin-Watson:,1.858
Prob(Omnibus):,0.0,Jarque-Bera (JB):,353.264
Skew:,-0.034,Prob(JB):,1.95e-77
Kurtosis:,1.743,Cond. No.,8.61e+17


In [79]:
df_OLS = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
df_OLS["recog_sup"].value_counts()

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0.0    5524
1.0    2998
Name: recog_sup, dtype: int64

In [80]:
# optional argument ==1
df_card = include_jobchangers(add_mincer_residuals(merge_and_clean(df_05,df_06,df_07,work07)))
## drop n.a.ns
df_card.drop(columns=['similar_problems', 'take_revenge', 'insult_back','felt_recog_sup', 'felt_recog_effort',
       'recog_personal', 'felt_recog_personal', 'recog_pay', 'felt_recog_pay', 'year_birth', 'new_job',
       'reason_new_job', 'school_degree','overtime', 'recog_effort','wage_lastmonth','rec1' , 'rec2' , 'rec3' , 'rec4', 'rec5' , 'rec6' , 'rec7' ,'work_satisfaction'], inplace = True)
# add interaction term
df_card["recXrecog_sup"] = df_card["recog_sup"] * df_card["avg_rec"]
# drop missing data in regression dataframe
df_card.dropna(inplace=True)

import statsmodels.formula.api as smf

# Specify model
formula_main = 'turnover_intention ~ recog_sup + avg_rec + recXrecog_sup + working_hours + firmsize + tenure + tenure_squared + age + age_squared + years_educ + commute_distance + potential_experience + mincer_residuals + jobatrisk + sector + beamte + gender'

# Fit the regression and cluster on the sector variable
reg_cardinal = smf.ols(formula_main, data=df_card).fit(cov_type='HC3')

# Print the regression results
summary = reg_cardinal.summary()
#print(summary.as_latex())
summary

  allmerged_df = pd.merge(pd.merge(pd.merge(df_05,df_06,on=["pid", "hid"]),work07,on=["pid","hid"]),df_07,on=["pid", "hid"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[2] Weiblich', 2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['gender'].replace('[1] Maennlich', 1,inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,turnover_intention,R-squared:,0.22
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,100.3
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,0.0
Time:,13:48:59,Log-Likelihood:,-3205.2
No. Observations:,5332,AIC:,6556.0
Df Residuals:,5259,BIC:,7037.0
Df Model:,72,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0388,0.013,2.874,0.004,0.012,0.065
sector[T.[-2] trifft nicht zu],0.0982,0.165,0.596,0.551,-0.225,0.421
"sector[T.[10] Kohlenbergbau, Torfgewinnung]",-0.0590,0.167,-0.352,0.725,-0.387,0.269
"sector[T.[14] Gewinnung von Steinen und Erden, sonstiger Bergbau]",0.4045,1.592,0.254,0.799,-2.716,3.525
sector[T.[15] Herstellung von Nahrungs- und Futtermitteln sowie Getränken],-0.0243,0.119,-0.205,0.838,-0.258,0.209
sector[T.[16] Tabakverarbeitung],-0.5433,0.110,-4.937,0.000,-0.759,-0.328
sector[T.[17] Herstellung von Textilien],0.0053,0.142,0.037,0.970,-0.273,0.283
sector[T.[18] Herstellung von Bekleidung],0.0105,0.163,0.065,0.949,-0.308,0.329
sector[T.[19] Herstellung von Leder und Lederwaren],-0.3878,0.246,-1.578,0.115,-0.870,0.094

0,1,2,3
Omnibus:,4920.541,Durbin-Watson:,1.862
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324.801
Skew:,-0.021,Prob(JB):,2.95e-71
Kurtosis:,1.792,Cond. No.,4.36e+17
