### Step 1
Analyse the data in the raw dataset answering the following questions (note, you are free to choose the suitable methods ofthe analysis yourself, based on your knowledge of e.g. applied statistics or other courses of your study)

In [16]:
import pandas as pd
from scipy.stats import chisquare
import numpy as np 
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency
import statsmodels.api as sm

In [2]:
# reading in the data

survey_data = pd.read_excel("data/private_dataE.xlsx")
results_data = pd.read_excel("data/public_data_resultsE.xlsx")
pub_data_register = pd.read_excel('data/public_data_registerE.xlsx')

#### Data cleanup

In [7]:
# naming the unnamed column to "where voting" as it refers to polling stations and evotes
results_data = results_data.rename(columns = {"Unnamed: 0":"where_voting"})

# creating a transformed version of the results data for evotes vs polling stations
def rename(x): 
    if "Polling station" in x: 
        return "Polling station"
    else: 
        return "E-votes"
transformed_results_data = results_data.copy()
transformed_results_data["where_voting"]= transformed_results_data["where_voting"].apply(lambda x: rename(x))
agg_functions = {'Red': 'sum','Green': 'sum', 'Invalid ballots': 'sum' ,'Total': 'sum'  }
df_new = transformed_results_data.groupby(transformed_results_data["where_voting"]).aggregate(agg_functions).reset_index()

### (A) 

Is there a significant difference between the political preferences as expressed in the survey and the election results for bothelectronic and polling station votes?

In [10]:
# Count votes by political preference within each voting type
preference_counts = survey_data.groupby(["evote", "party"]).size().unstack(fill_value=0).reset_index()
preference_counts["total"]= preference_counts[['Green', 'Red', "Invalid vote"]].sum(axis=1)
# Calculate proportions for each political preference within each voting type
preference_counts["prop_green"]=preference_counts["Green"]/preference_counts["total"]
preference_counts["prop_red"]=preference_counts["Red"]/preference_counts["total"]
preference_counts["prop_inv"]=preference_counts["Invalid vote"]/preference_counts["total"]
preference_counts.head()

party,evote,Green,Invalid vote,Red,total,prop_green,prop_red,prop_inv
0,0,88,3,47,138,0.637681,0.34058,0.021739
1,1,43,0,19,62,0.693548,0.306452,0.0


In [None]:
# Calculating proportions for political preferences in the public results data
df_new["prop_green"]=df_new["Green"]/df_new["Total"]
df_new["prop_red"]=df_new["Red"]/df_new["Total"]
df_new["prop_inv"]=df_new["Invalid ballots"]/df_new["Total"]
df_new.head()

Unnamed: 0,where_voting,Red,Green,Invalid ballots,Total,prop_green,prop_red,prop_inv
0,E-votes,489,913,28,1430,0.638462,0.341958,0.01958
1,Polling station,273,475,12,760,0.625,0.359211,0.015789


#### Two sample proportion z-test

In [None]:
# z-test for polling station 
z_score_p, p_value_p = proportions_ztest([475, 88], [760, 138], alternative='two-sided')

print("Z-score:", z_score_p)
print("P-value:", p_value_p)
print(p_value_p < 0.05)

Z-score: -0.28337848165736645
P-value: 0.7768867217758352
False


In [13]:
# z-test for evotes 
z_score_e, p_value_e = proportions_ztest([913, 43], [1430, 62], alternative='two-sided')

print("Z-score:", z_score_e)
print("P-value:", p_value_e)
print(p_value_e < 0.05)

Z-score: -0.8850847590713444
P-value: 0.37611091887805415
False


#### Chi-squared test

In [None]:
e_votes_r = np.array(df_new[df_new["where_voting"]=="E-votes"][["Red","Green", "Invalid ballots"]]/1430).tolist()[0]
p_votes_r = np.array(df_new[df_new["where_voting"]=="Polling station"][["Red","Green", "Invalid ballots"]]).tolist()[0]

e_votes_s = np.array(preference_counts[preference_counts["evote"]==1][["Red","Green", "Invalid vote"]]/62).tolist()[0]
p_votes_s = np.array(preference_counts[preference_counts["evote"]==0][["Red","Green", "Invalid vote"]]).tolist()[0]

# chi square test for evotes 
chisquare(f_obs=e_votes_s, f_exp=e_votes_r)
# data = pd.DataFrame({
#     "Evotes_results": e_votes_r,
#     "Evotes_survey": e_votes_s
# })

Power_divergenceResult(statistic=0.02802007491159843, pvalue=0.9860876464020224)

### (B) 

Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded inthe survey (that is, age, gender, education level...)?

In [17]:
contingency_table_gender = pd.crosstab(survey_data['sex'], survey_data['party'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_gender)
print("Chi-square Test Results for Gender vs Political Preference")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

contingency_table_education = pd.crosstab(survey_data['education'], survey_data['party'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_education)
print("\nChi-square Test Results for Education vs Political Preference")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

#transform date of birth to age
survey_data['age'] = 2024 - survey_data['dob'].dt.year
contingency_table_age = pd.crosstab(survey_data['age'], survey_data['party'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_age)
print("\nChi-square Test Results for Age vs Political Preference")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)


Chi-square Test Results for Gender vs Political Preference
Chi-square statistic: 1.0948595723439902
p-value: 0.578434602059867

Chi-square Test Results for Education vs Political Preference
Chi-square statistic: 34.70051433812302
p-value: 0.0043666219362705665

Chi-square Test Results for Age vs Political Preference
Chi-square statistic: 130.03690118193933
p-value: 0.5806961048843412


In [18]:
# Prepare the data
survey_data['Political_Preference_Binary'] = (survey_data['party'] == 'Green').astype(int)

# Define independent variables (e.g., age, gender, education)
X = survey_data[['age', 'sex', 'education']]
X = pd.get_dummies(X, drop_first=True)  # Convert categorical vars to binary (dummy variables)
X = sm.add_constant(X)  # Add constant for the intercept

# Define dependent variable
y = survey_data['Political_Preference_Binary']

# Fit logistic regression
model = sm.Logit(y, X).fit()
print(model.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

### (C) 

Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or inperson) depending on their demographic attributes recorded in the survey?

In [19]:

# Example: Chi-square test for voting channel by gender
contingency_table_channel_gender = pd.crosstab(survey_data['sex'], survey_data['evote'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_channel_gender)
print("Chi-square Test Results for Gender vs Voting Channel")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

contingency_table_channel_education = pd.crosstab(survey_data['education'], survey_data['evote'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_channel_education)
print("\nChi-square Test Results for Education vs Voting Channel")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

contingency_table_channel_age = pd.crosstab(survey_data['age'], survey_data['evote'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table_channel_age)
print("\nChi-square Test Results for Age vs Voting Channel")
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

Chi-square Test Results for Gender vs Voting Channel
Chi-square statistic: 2.6379263431645796
p-value: 0.10433965288985117

Chi-square Test Results for Education vs Voting Channel
Chi-square statistic: 8.784052225187445
p-value: 0.36083942749998765

Chi-square Test Results for Age vs Voting Channel
Chi-square statistic: 63.97738150893831
p-value: 0.5821109942082197


In [20]:
# Define independent variables (demographics)
X = survey_data[['age', 'sex', 'education']]
X = pd.get_dummies(X, drop_first=True)  # Create dummy variables for categorical data
X = sm.add_constant(X)

# Define the dependent variable
y = survey_data['evote']

# Fit the logistic regression model
model = sm.Logit(y, X).fit()
print(model.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).