In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from scipy.stats import zscore
from scipy.stats import ttest_rel
#import warnings
#warnings.filterwarnings('ignore')

In [7]:
# Remove the embedded label row, i.e. every record whose 'StartDate' cell is the literal text "Start Date"
df = df.loc[df["StartDate"].ne("Start Date")]

In [8]:
# Keep only participants who completed the experiment ('Finished' is compared against the string "0")
df = df.loc[df["Finished"].ne("0")]

In [9]:
# Store the current row index as an explicit subject identifier column
df = df.assign(id=df.index)

In [56]:
# Display the subject identifiers
df.loc[:, "id"]

In [11]:
# Encode the experimental arm as string codes: "Control" -> "0", "Treatment" -> "1"
condition_codes = {"Control": "0", "Treatment": "1"}
df["Condition"] = df["Condition"].replace(condition_codes)

# Peek at the first 20 encoded values (the trailing semicolon suppresses the cell output)
df.loc[:, ["Condition"]].head(20);

In [12]:
# Encode the chosen fund in every WTP_*_Fund column: "Fund_A" -> "1", "Fund_B" -> "2"
fund_codes = {"Fund_A": "1", "Fund_B": "2"}
for fund_col in (
    "WTP_Poverty_Fund",
    "WTP_Climate_Fund",
    "WTP_DecentWork_Fund",
    "WTP_Gender_Fund",
):
    df[fund_col] = df[fund_col].replace(fund_codes)

In [58]:
# List the distinct free-text nationalities that participants entered
pd.unique(df["ifother_Nationality"])

In [14]:
# Flag every participant as coming from a developing country (1) by default;
# exceptions are set to 0 in the next step
df = df.assign(developing_country=1)

In [15]:
# Set 'developing_country' to 0 for the nationalities listed below (the placeholder "-" is included)
non_developing_nationalities = ["German", "American", "Canadian", "-"]
is_non_developing = df["ifother_Nationality"].isin(non_developing_nationalities)
df.loc[is_non_developing, "developing_country"] = 0

In [16]:
# Spot-check the flag on 50 randomly drawn rows (the trailing semicolon suppresses the cell output)
df.loc[:, ["ifother_Nationality", "developing_country"]].sample(n=50);

In [59]:
# Show the participants that were flagged as non-developing
df.loc[df["developing_country"].eq(0), ["ifother_Nationality", "developing_country"]]

In [18]:
# Keep only participants whose 'developing_country' flag is not 0
df = df.loc[df["developing_country"].ne(0)]

In [19]:
# Exclude the participant with id 95, who had to retake the experiment
df = df.loc[df["id"].ne(95)]

In [21]:
# Recode raw answer codes into the indicator variables used in later analysis.

# Gender: answer codes 2 and 3 are both mapped to 0
df.loc[df["Gender"].isin([2, 3]), "Gender"] = 0

# Awareness of sustainable investing: answer code 2 is mapped to 0
df.loc[df["Awareness_SustainInv"].eq(2), "Awareness_SustainInv"] = 0

# Frequency table of the education codes (a mid-cell expression, so nothing is displayed)
df["Education"].value_counts()

# High_Education = 1 when the education code lies strictly between 1 and 5, else 0
df["High_Education"] = 0
df.loc[df["Education"].gt(1) & df["Education"].lt(5), "High_Education"] = 1

# Low_Income = 1 for income codes 1-2; High_Income = 1 for income codes 3-5
df["Low_Income"] = 0
df.loc[df["Income"].isin([1, 2]), "Low_Income"] = 1

df["High_Income"] = 0
df.loc[df["Income"].isin([3, 4, 5]), "High_Income"] = 1

# High_Share_SRI = 1 for non-missing share codes of at least 4, except code 8;
# every other row (including missing answers and code 8) keeps the default 0
df["High_Share_SRI"] = 0
high_share = (
    df["ShareSustainInvest"].ge(4)
    & df["ShareSustainInvest"].notna()
    & df["ShareSustainInvest"].ne(8)
)
df.loc[high_share, "High_Share_SRI"] = 1

# Share_SRI_smaller20 = 1 for non-missing share codes of at most 2; all other rows keep 0
df["Share_SRI_smaller20"] = 0
small_share = df["ShareSustainInvest"].le(2) & df["ShareSustainInvest"].notna()
df.loc[small_share, "Share_SRI_smaller20"] = 1


In [22]:
# Spot-check the income indicators against the raw code on 50 random rows (output suppressed)
df.loc[:, ["Low_Income", "High_Income", "Income"]].sample(n=50);

In [23]:
# Compare the small-share indicator with the raw share code (output suppressed)
df.loc[:, ["Share_SRI_smaller20", "ShareSustainInvest"]];

In [24]:
# Spot-check the education indicator against the raw code on 100 random rows (output suppressed)
df.loc[:, ["High_Education", "Education"]].sample(n=100);

In [62]:
# Number of remaining participants (output suppressed)
len(df);

In [26]:
# Give the two auto-numbered posterior climate questions descriptive names
posterior_climate_names = {
    "Q463": "ClimateMortality_Post",
    "Q465": "ClimateEmissions_Post",
}
df = df.rename(columns=posterior_climate_names)

In [29]:
# Winsorize prior and posterior severity beliefs data
# Define the function to winsorize
def winsorize_column(series, lower=5, upper=95):
    """Winsorize one column at the given lower/upper percentiles.

    Values below the ``lower`` percentile are replaced by the value at that
    percentile, and values above the ``upper`` percentile by the value at
    that percentile (via ``scipy.stats.mstats.winsorize``).

    Parameters
    ----------
    series : pandas.Series or array-like
        Values to winsorize. Entries that cannot be parsed as numbers are
        treated as missing.
    lower : float, default 5
        Lower percentile (0-100); this share of the smallest values is clipped.
    upper : float, default 95
        Upper percentile (0-100); the share ``100 - upper`` of the largest
        values is clipped.

    Returns
    -------
    pandas.Series
        Float series with the same index and name as the input. Missing
        values stay missing and are ignored when the cut-offs are computed.
        Returning an index-aligned Series (instead of a bare masked array)
        lets the result be assigned back to a DataFrame or combined with
        ``groupby(...).apply`` / ``transform`` without losing row alignment.
    """
    numeric = pd.to_numeric(pd.Series(series), errors="coerce")
    values = numeric.to_numpy(dtype=float, copy=True)

    # Winsorize only the observed values: winsorize() would otherwise rank
    # NaNs as the largest entries and let them distort the upper tail.
    observed = ~np.isnan(values)
    if observed.any():
        clipped = winsorize(
            values[observed],
            limits=(lower / 100, (100 - upper) / 100),
        )
        values[observed] = np.asarray(clipped)

    return pd.Series(values, index=numeric.index, name=numeric.name)

# Prior severity belief columns; each gets a winsorized copy named '<column>_w'
cols_to_winsorize = [
    "Poverty_Food",
    "Poverty_Line",
    "Q_Economic_Informal",
    "Q_Economic_Umemploy",
    "Q_Gender_Parliament",
    "Q_Gender_WageGap",
    "Q_Climate_Mortality",
    "Q_Climate_Emissions",
]

# Winsorize over the whole sample (default 5th/95th percentile cut-offs)
for col in cols_to_winsorize:
    df[f"{col}_w"] = winsorize_column(df[col])
 

# List of posterior severity belief data columns to winsorize by 'Condition'
cols_to_winsorize_by_condition = [
    "Poverty_Food_Post", "Poverty_Line_Post", "DecentWork_IEmp_Post", "DecentWork_Unem_Post",
    "Gender_Parl_Post", "Gender_Gap_Post", "ClimateMortality_Post", "ClimateEmissions_Post"
]

# Apply winsorization within each 'Condition' group.
# transform() is used instead of apply(): it maps each group's array-like result
# back onto the rows of that group, so the new '<column>_w' values stay aligned
# with the original row index. With apply(), a function that returns a plain
# array yields one array per group (indexed by the group key), which does not
# align with the DataFrame rows on assignment.
for col in cols_to_winsorize_by_condition:
    df[col + "_w"] = df.groupby("Condition")[col].transform(winsorize_column)


In [31]:
# Create standardized measures for social, risk and time preferences.
# All z-scores below use nan_policy="omit": missing answers are ignored when the
# mean and standard deviation are computed and stay NaN in the output, so the
# result always has one value per row. (Dropping NaNs before calling zscore would
# return a shorter array that cannot be assigned back to the DataFrame, and the
# default nan_policy turns the whole column into NaN if a single value is missing.)
# NOTE(review): these z-scores use scipy's default ddof=0, whereas the
# standardized propensity/belief measures later in the notebook use ddof=1 -
# confirm which convention is intended.
df["SocialPreferences1_1"] = pd.to_numeric(df["SocialPreferences1_1"], errors="coerce")
df["SocialPreferenceDona_1"] = pd.to_numeric(df["SocialPreferenceDona_1"], errors="coerce")

# z-score normalization of the two social-preference items
df["SocialPreferences1_1_std"] = zscore(
    df["SocialPreferences1_1"].to_numpy(dtype=float), nan_policy="omit"
)
df["SocialPreferenceDona_1_std"] = zscore(
    df["SocialPreferenceDona_1"].to_numpy(dtype=float), nan_policy="omit"
)

# Define Social Preferences as a weighted measure according to Falk et al. (2023);
# the result is NaN for a participant if either item is missing
df["social_preferences"] = (0.635 * df["SocialPreferences1_1_std"] +
                            0.365 * df["SocialPreferenceDona_1_std"])

# Standardize the weighted Social Preferences measure
df["Social_Preferences_std"] = zscore(
    df["social_preferences"].to_numpy(dtype=float), nan_policy="omit"
)

# Standardize Risk Preferences
df["RiskPreferences_1"] = pd.to_numeric(df["RiskPreferences_1"], errors="coerce")
df["RiskPreferences_1_std"] = zscore(
    df["RiskPreferences_1"].to_numpy(dtype=float), nan_policy="omit"
)

# Standardize Time Preferences
df["Timepreferences_1"] = pd.to_numeric(df["Timepreferences_1"], errors="coerce")
df["TimePreferences_1_std"] = zscore(
    df["Timepreferences_1"].to_numpy(dtype=float), nan_policy="omit"
)


In [65]:
# Distinct answer codes given to financial literacy question 1
pd.unique(df["FinancialLiteracy1"])

In [67]:
# Distinct answer codes given to financial literacy question 2
pd.unique(df["FinancialLiteracy2"])

In [68]:
# Distinct answer codes given to financial literacy question 3
pd.unique(df["FinancialLiteracy3"])

In [35]:
# Financial literacy score.
# Missing answers to question 2 are filled with code 4 (equivalent to 'I do not know')
df["FinancialLiteracy2"] = df["FinancialLiteracy2"].fillna(4)

# One 0/1 indicator per question: 1 when the answer equals the correct code
correct_answer_codes = {
    "FL_1": ("FinancialLiteracy1", 1),
    "FL_2": ("FinancialLiteracy2", 3),
    "FL_3": ("FinancialLiteracy3", 2),
}
for indicator_col, (question_col, correct_code) in correct_answer_codes.items():
    df[indicator_col] = df[question_col].eq(correct_code).astype(int)

# Total score = number of correctly answered questions (0-3)
df["FL_all"] = df["FL_1"].add(df["FL_2"]).add(df["FL_3"])

In [36]:
# current_investor = 1 unless 'CurrentInvestor' equals code 7.
# NOTE(review): a missing 'CurrentInvestor' value is also "not equal to 7" and
# therefore ends up as 1 - confirm this is intended.
df["current_investor"] = 0
df.loc[df["CurrentInvestor"].ne(7), "current_investor"] = 1

# investor = 1 when 'Investor' equals 1 or the participant is a current investor
df["investor"] = 0
is_investor = df["Investor"].eq(1) | df["current_investor"].eq(1)
df.loc[is_investor, "investor"] = 1


In [37]:
# Remove leading/trailing whitespace from every column name
df = df.set_axis(df.columns.str.strip(), axis=1)

In [38]:
# Variable definitions for deviation and belief updating.
# True severity value of each item, stored as a constant column
true_severity_values = {
    "True_Poverty_Food": 30,
    "True_Poverty_Line": 47,
    "True_Economic_Informal": 55,
    "True_Economic_Umemploy": 29,
    "True_Gender_Parliament": 26,
    "True_Gender_WageGap": 20,
    "True_Climate_Mortality": 15,
    "True_Climate_Emissions": 50,
}
for true_col, true_value in true_severity_values.items():
    df[true_col] = true_value

In [39]:
# Make sure the winsorized poverty-line prior is numeric (unparseable entries become NaN)
df["Poverty_Line_w"] = df["Poverty_Line_w"].pipe(pd.to_numeric, errors="coerce")

In [40]:
# Signal minus prior: true severity value minus the winsorized prior belief.
# Each entry is (new column, true-value column, winsorized prior column).
signal_minus_prior_spec = [
    ("SignalminusPrior_Poverty_Food", "True_Poverty_Food", "Poverty_Food_w"),
    ("SignalminusPrior_Poverty_Line", "True_Poverty_Line", "Poverty_Line_w"),
    ("SignalminusPrior_Unemployment", "True_Economic_Umemploy", "Q_Economic_Umemploy_w"),
    ("SignalminusPrior_Informal", "True_Economic_Informal", "Q_Economic_Informal_w"),
    ("SignalminusPrior_Climate_Mortal", "True_Climate_Mortality", "Q_Climate_Mortality_w"),
    ("SignalminusPrior_Climate_Emiss", "True_Climate_Emissions", "Q_Climate_Emissions_w"),
    ("SignalminusPrior_Gender_Wage_Gap", "True_Gender_WageGap", "Q_Gender_WageGap_w"),
    ("SignalminusPrior_Gender_Parl", "True_Gender_Parliament", "Q_Gender_Parliament_w"),
]
for gap_col, true_col, prior_col in signal_minus_prior_spec:
    df[gap_col] = df[true_col].sub(df[prior_col])


In [41]:
# Deviation of prior beliefs: winsorized prior minus true value.
# Each entry is (new column, winsorized prior column, true-value column).
prior_deviation_spec = [
    ("Dev_Poverty_Food", "Poverty_Food_w", "True_Poverty_Food"),
    ("Dev_Poverty_Line", "Poverty_Line_w", "True_Poverty_Line"),
    ("Dev_Unemployment", "Q_Economic_Umemploy_w", "True_Economic_Umemploy"),
    ("Dev_Informal", "Q_Economic_Informal_w", "True_Economic_Informal"),
    ("Dev_Climate_Mortality", "Q_Climate_Mortality_w", "True_Climate_Mortality"),
    ("Dev_Climate_Emissions", "Q_Climate_Emissions_w", "True_Climate_Emissions"),
    ("Dev_Gender_Wage_Gap", "Q_Gender_WageGap_w", "True_Gender_WageGap"),
]
for dev_col, prior_col, true_col in prior_deviation_spec:
    df[dev_col] = df[prior_col].sub(df[true_col])

# The parliament item uses the opposite sign (true value minus prior).
# NOTE(review): presumably because a higher share means lower severity - confirm.
df["Dev_Gender_Parliament"] = df["True_Gender_Parliament"].sub(df["Q_Gender_Parliament_w"])

In [42]:
# Prior deviation columns that must be numeric
dev_columns = [
    "Dev_Climate_Mortality",
    "Dev_Climate_Emissions",
    "Dev_Poverty_Food",
    "Dev_Poverty_Line",
    "Dev_Gender_Parliament",
    "Dev_Gender_Wage_Gap",
    "Dev_Informal",
    "Dev_Unemployment",
]

# Coerce the deviation columns first, then every column of the frame;
# any cell that cannot be parsed as a number becomes NaN
df[dev_columns] = df[dev_columns].apply(lambda column: pd.to_numeric(column, errors="coerce"))
df = df.apply(lambda column: pd.to_numeric(column, errors="coerce"))

In [43]:
# ------------------------------------------------------------------
# Average prior deviation per topic: mean of the two signed item
# deviations (no absolute value is taken)
# ------------------------------------------------------------------
prior_average_spec = [
    ("Dev_Climate_avg", "Dev_Climate_Mortality", "Dev_Climate_Emissions"),
    ("Dev_Poverty_avg", "Dev_Poverty_Food", "Dev_Poverty_Line"),
    ("Dev_Gender_avg", "Dev_Gender_Parliament", "Dev_Gender_Wage_Gap"),
    ("Dev_Work_avg", "Dev_Informal", "Dev_Unemployment"),
]
for avg_col, first_item, second_item in prior_average_spec:
    df[avg_col] = df[first_item].add(df[second_item]).div(2)


In [44]:
# ------------------------------------------------------------------
# Deviation of posterior beliefs: winsorized posterior minus true value.
# Each entry is (new column, winsorized posterior column, true-value column).
# ------------------------------------------------------------------
posterior_deviation_spec = [
    ("Dev_Poverty_Food_Post", "Poverty_Food_Post_w", "True_Poverty_Food"),
    ("Dev_Poverty_Line_Post", "Poverty_Line_Post_w", "True_Poverty_Line"),
    ("Dev_Unemployment_Post", "DecentWork_Unem_Post_w", "True_Economic_Umemploy"),
    ("Dev_Informal_Post", "DecentWork_IEmp_Post_w", "True_Economic_Informal"),
    ("Dev_Climate_Mortality_Post", "ClimateMortality_Post_w", "True_Climate_Mortality"),
    ("Dev_Climate_Emissions_Post", "ClimateEmissions_Post_w", "True_Climate_Emissions"),
    ("Dev_Gender_Wage_Gap_Post", "Gender_Gap_Post_w", "True_Gender_WageGap"),
]
for dev_col, posterior_col, true_col in posterior_deviation_spec:
    df[dev_col] = df[posterior_col].sub(df[true_col])

# The parliament item uses the opposite sign (true value minus posterior).
# NOTE(review): presumably because a higher share means lower severity - confirm.
df["Dev_Gender_Parl_Post"] = df["True_Gender_Parliament"].sub(df["Gender_Parl_Post_w"])


In [45]:
# Posterior deviation columns that must be numeric
dev_columns = [
    "Dev_Climate_Mortality_Post",
    "Dev_Climate_Emissions_Post",
    "Dev_Poverty_Food_Post",
    "Dev_Poverty_Line_Post",
    "Dev_Gender_Parl_Post",
    "Dev_Gender_Wage_Gap_Post",
    "Dev_Informal_Post",
    "Dev_Unemployment_Post",
]

# Coerce them to numbers; unparseable cells become NaN
df[dev_columns] = df[dev_columns].apply(lambda column: pd.to_numeric(column, errors="coerce"))

In [46]:
# ------------------------------------------------------------------
# Average posterior deviation per topic: mean of the two signed item
# deviations (no absolute value is taken)
# ------------------------------------------------------------------
posterior_average_spec = [
    ("Dev_Climate_avg_post", "Dev_Climate_Mortality_Post", "Dev_Climate_Emissions_Post"),
    ("Dev_Poverty_avg_post", "Dev_Poverty_Food_Post", "Dev_Poverty_Line_Post"),
    ("Dev_Gender_avg_post", "Dev_Gender_Parl_Post", "Dev_Gender_Wage_Gap_Post"),
    ("Dev_Work_avg_post", "Dev_Informal_Post", "Dev_Unemployment_Post"),
]
for avg_col, first_item, second_item in posterior_average_spec:
    df[avg_col] = df[first_item].add(df[second_item]).div(2)

In [47]:
# -----------------------------
# Belief Updating Severity
# -----------------------------
# Updating = winsorized posterior belief minus winsorized prior belief.
# Every measure uses the winsorized ('_w') version on both sides, including the
# parliament item (which previously subtracted the raw, unwinsorized prior).
# Each entry is (new column, winsorized posterior column, winsorized prior column).
belief_updating_spec = [
    ("Updating_Poverty_Food", "Poverty_Food_Post_w", "Poverty_Food_w"),
    ("Updating_Poverty_Line", "Poverty_Line_Post_w", "Poverty_Line_w"),
    ("Updating_Unemploy", "DecentWork_Unem_Post_w", "Q_Economic_Umemploy_w"),
    ("Updating_Informal", "DecentWork_IEmp_Post_w", "Q_Economic_Informal_w"),
    ("Updating_Gender_WageGap", "Gender_Gap_Post_w", "Q_Gender_WageGap_w"),
    ("Updating_Gender_Parliament", "Gender_Parl_Post_w", "Q_Gender_Parliament_w"),
    ("Updating_Climate_Mortality", "ClimateMortality_Post_w", "Q_Climate_Mortality_w"),
    ("Updating_Climate_Emissions", "ClimateEmissions_Post_w", "Q_Climate_Emissions_w"),
]
for updating_col, posterior_col, prior_col in belief_updating_spec:
    df[updating_col] = df[posterior_col] - df[prior_col]


In [48]:
# ---------------------------------------
# Generate standardized measures
# ---------------------------------------
# Every column listed below gets a z-scored copy named '<column>_std', computed
# with the sample standard deviation (ddof=1). nan_policy="omit" ignores missing
# answers when the mean and standard deviation are computed and leaves them NaN
# in the output; with the default policy a single missing answer would turn the
# whole standardized column into NaN.

# Propensities
invest_cols = ["Q_Willingness_Invest_1", "Q_Willingness_Invest_2",
               "Q_Willingness_Invest_3", "Q_Willingness_Invest_4"]

# Prior effectiveness beliefs
prior_invest_cols = ["Q_Prior_InvestImpact_1", "Q_Prior_InvestImpact_2",
                     "Q_Prior_InvestImpact_3", "Q_Prior_InvestImpact_4"]

# Posterior effectiveness beliefs
effectiveness_post_cols = ["Effectiveness_Post_1", "Effectiveness_Post_2",
                           "Effectiveness_Post_3", "Effectiveness_Post_4"]

# Posterior severity beliefs
severity_cols = ["Q_Severity_Assess_1", "Q_Severity_Assess_2",
                 "Q_Severity_Assess_3", "Q_Severity_Assess_4"]

for col_group in (invest_cols, prior_invest_cols, effectiveness_post_cols, severity_cols):
    for col in col_group:
        df[f"{col}_std"] = zscore(
            df[col].to_numpy(dtype=float), ddof=1, nan_policy="omit"
        )

# Policy support (single item; note the output name drops the '_1' suffix)
df["Q_Policy_Support_std"] = zscore(
    df["Q_Policy_Support_1"].to_numpy(dtype=float), ddof=1, nan_policy="omit"
)


In [49]:
# Human-readable labels for the analysis variables, built up group by group
variable_labels = {}

# Treatment indicator and demographics
variable_labels.update({
    "Condition": "Treatment",
    "Gender": "Female",
    "High_Education": "High Education",
    "High_Income": "High Income",
})

# Social, risk and time preferences
variable_labels.update({
    "social_preferences": "Social Preferences",
    "SocialPreferences1_1_std": "Social Preferences 1",
    "SocialPreferenceDona_1_std": "Social Preferences 2",
    "Social_Preferences_std": "Social Preferences",
    "RiskPreferences_1_std": "Risk Preferences",
    "TimePreferences_1_std": "Time Preferences",
})

# Investment background
variable_labels.update({
    "Awareness_SustainInv": "Awareness SRI",
    "investor": "Investor",
    "FL_all": "Financial Literacy",
})

# Standardized prior effectiveness beliefs
variable_labels.update({
    "Q_Prior_InvestImpact_1_std": "Prior Effectiveness Poverty",
    "Q_Prior_InvestImpact_2_std": "Prior Effectiveness Decent Work",
    "Q_Prior_InvestImpact_3_std": "Prior Effectiveness Gender Equality",
    "Q_Prior_InvestImpact_4_std": "Prior Effectiveness Climate Change",
})

# Average prior severity deviations
variable_labels.update({
    "Dev_Poverty_avg": "Prior Poverty Severity",
    "Dev_Work_avg": "Prior Decent Work Severity",
    "Dev_Gender_avg": "Prior Gender Equality Severity",
    "Dev_Climate_avg": "Prior Climate Change Severity",
})

# General attitude items
variable_labels.update({
    "General_Attitude_SRI_1": "'Immunity' towards Issues in Own Country",
    "General_Attitude_SRI_2": "Own Country Suffers from Selected Issues",
    "General_Attitude_SRI_3": "Responsibility of Developed Countries/Development Organizations",
    "General_Attitude_SRI_4": "Feel Responsible for Sustainable Development",
    "General_Attitude_SRI_5": "Requirement for Immediate Action",
})

In [50]:
# Write the transformed data set to disk without the row index
df.to_csv(path_or_buf="transformed_data.csv", index=False)

In [69]:
# Summary statistics table (one row per variable; columns N, Mean, SD, Min, Max)

# Variables to summarize
summary_vars = [
    "Gender",
    "Age",
    "High_Income",
    "SocialPreferences1_1",
    "SocialPreferenceDona_1",
    "RiskPreferences_1",
    "Timepreferences_1",
    "FL_all",
    "Awareness_SustainInv",
    "investor",
    "ShareSustainInvest",
]

# Row labels for the table; variables without an entry keep their column name
var_labels = {
    "Age": "Age",
    "Gender": "Female",
    "High_Income": "High Income",
    "SocialPreferences1_1": "Social Preferences",
    "SocialPreferenceDona_1": "Hypothetical Donation",
    "RiskPreferences_1": "Risk Preferences",
    "Timepreferences_1": "Time Preferences",
    "FL_all": "Financial Literacy",
    "Awareness_SustainInv": "Awareness SRI",
    "investor": "Investor",
    "ShareSustainInvest": "Share SRI",
}

summary_table = (
    df[summary_vars]
    .describe()
    .T                                                   # one row per variable
    .assign(count=df[summary_vars].count())              # non-missing observations per variable
    .loc[:, ["count", "mean", "std", "min", "max"]]      # keep and order the relevant statistics
    .rename(columns={"count": "N", "mean": "Mean", "std": "SD", "min": "Min", "max": "Max"})
    .round({"Mean": 2, "SD": 2, "Min": 0, "Max": 0})     # 2 decimals for Mean/SD, 0 for Min/Max
    .rename(index=var_labels)                            # apply the row labels
)

In [71]:
# Render the summary table as LaTeX
latex_options = dict(
    index=True,                                           # keep the variable labels as first column
    caption="Summary Statistics\\label{tabsummarystat}",  # caption with embedded label
    column_format="lccccc",                               # left-aligned labels, centered numbers
    escape=False,                                         # leave LaTeX special characters untouched
    bold_rows=True,                                       # bold variable names
    longtable=True,                                       # use the longtable environment
)
latex_table = summary_table.to_latex(**latex_options)

# Write the LaTeX source to disk
with open("sumtable.tex", mode="w", encoding="utf-8") as tex_file:
    tex_file.write(latex_table)