#### Loading in the data 
Data attributes: 
- sex:
  - Female
  - Male
- dob: Date of birth (DD/MM/YYYY)
- zip: ZIP code of the voter’s address (2100, 2200, 2300, or 2400)
- evote: Whether the voter cast their vote electronically
  - 0: Vote cast on paper (polling station)
  - 1: Vote cast electronically
- party: How the voter has voted
  - Red party
  - Green party
  - Invalid vote (spoiled ballot)
- marital_status:
  - Never married
  - Married/separated
  - Divorced
  - Widowed
- education:
  - Primary education
  - Upper secondary education
  - Vocational Education and Training (VET)
  - Short cycle higher education
  - Vocational bachelor’s education
  - Bachelor’s programs
  - Master’s programs
  - PhD programs
  - Not stated
- citizenship: Name of the country

In [1]:
import pandas as pd 
survey_data = pd.read_excel("data/private_dataE.xlsx")
results_data = pd.read_excel("data/public_data_resultsE.xlsx")

survey_data.head()

Unnamed: 0,name,sex,evote,dob,zip,education,citizenship,marital_status,party
0,"Morris, Emily",Female,0,1977-12-19,2400,Vocational bachelors educations,Denmark,Never married,Red
1,"Freda, Michael",Male,1,1986-12-01,2200,Masters programmes,Denmark,Married/separated,Green
2,"Goosby, Emil",Male,1,1997-08-10,2200,Vocational bachelors educations,Denmark,Never married,Green
3,"Alcantar, Amanda",Female,0,1965-01-18,2200,Vocational Education and Training (VET),Denmark,Divorced,Green
4,"Havick, Justin",Male,0,1958-10-13,2200,Vocational Education and Training (VET),Denmark,Divorced,Green


In [2]:
results_data.head()

Unnamed: 0.1,Unnamed: 0,Red,Green,Invalid ballots,Total
0,Polling station: ZIP 2100,32,86,1,119
1,Polling station: ZIP 2200,58,138,5,201
2,Polling station: ZIP 2300,105,105,5,215
3,Polling station: ZIP 2400,78,146,1,225
4,E-votes,108,219,8,335


In [3]:
results_data = results_data.rename(columns = {"Unnamed: 0":"where_voting"})
results_data.head()

Unnamed: 0,where_voting,Red,Green,Invalid ballots,Total
0,Polling station: ZIP 2100,32,86,1,119
1,Polling station: ZIP 2200,58,138,5,201
2,Polling station: ZIP 2300,105,105,5,215
3,Polling station: ZIP 2400,78,146,1,225
4,E-votes,108,219,8,335


##### (A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

In [4]:
# Count votes by political preference within each voting type
preference_counts = survey_data.groupby(["evote", "party"]).size().unstack(fill_value=0).reset_index()
preference_counts["total"]= preference_counts[['Green', 'Red', "Invalid vote"]].sum(axis=1)
# Calculate proportions for each political preference within each voting type
preference_counts["prop_green"]=preference_counts["Green"]/preference_counts["total"]
preference_counts["prop_red"]=preference_counts["Red"]/preference_counts["total"]
preference_counts["prop_inv"]=preference_counts["Invalid vote"]/preference_counts["total"]
preference_counts.head()

party,evote,Green,Invalid vote,Red,total,prop_green,prop_red,prop_inv
0,0,88,3,47,138,0.637681,0.34058,0.021739
1,1,43,0,19,62,0.693548,0.306452,0.0


In [5]:
def rename(x): 
    if "Polling station" in x: 
        return "Polling station"
    else: 
        return "E-votes"
transformed_results_data = results_data.copy()
transformed_results_data["where_voting"]= transformed_results_data["where_voting"].apply(lambda x: rename(x))
agg_functions = {'Red': 'sum','Green': 'sum', 'Invalid ballots': 'sum' ,'Total': 'sum'  }
df_new = transformed_results_data.groupby(transformed_results_data["where_voting"]).aggregate(agg_functions).reset_index()
df_new["prop_green"]=df_new["Green"]/df_new["Total"]
df_new["prop_red"]=df_new["Red"]/df_new["Total"]
df_new["prop_inv"]=df_new["Invalid ballots"]/df_new["Total"]
df_new.head()

Unnamed: 0,where_voting,Red,Green,Invalid ballots,Total,prop_green,prop_red,prop_inv
0,E-votes,489,913,28,1430,0.638462,0.341958,0.01958
1,Polling station,273,475,12,760,0.625,0.359211,0.015789


Two sample proportion z-test - one for e-votes and one for polling station to see if the proportion of green votes differs between the survey data and the result data 

In [6]:
# z-test for polling station 
from statsmodels.stats.proportion import proportions_ztest
z_score_p, p_value_p = proportions_ztest([475, 88], [760, 138], alternative='two-sided')

print("Z-score:", z_score_p)
print("P-value:", p_value_p)
print(p_value_p < 0.05)

Z-score: -0.28337848165736645
P-value: 0.7768867217758352
False


In [7]:
# z-test for evotes 
z_score_e, p_value_e = proportions_ztest([913, 43], [1430, 62], alternative='two-sided')

print("Z-score:", z_score_e)
print("P-value:", p_value_e)
print(p_value_e < 0.05)

Z-score: -0.8850847590713444
P-value: 0.37611091887805415
False


Chi-squared test 

In [11]:
from scipy.stats import chisquare
import numpy as np 
e_votes_r = np.array(df_new[df_new["where_voting"]=="E-votes"][["Red","Green", "Invalid ballots"]]/1430).tolist()[0]
p_votes_r = np.array(df_new[df_new["where_voting"]=="Polling station"][["Red","Green", "Invalid ballots"]]).tolist()[0]

e_votes_s = np.array(preference_counts[preference_counts["evote"]==1][["Red","Green", "Invalid vote"]]/62).tolist()[0]
p_votes_s = np.array(preference_counts[preference_counts["evote"]==0][["Red","Green", "Invalid vote"]]).tolist()[0]

# chi square test for evotes 
chisquare(f_obs=e_votes_s, f_exp=e_votes_r)
# data = pd.DataFrame({
#     "Evotes_results": e_votes_r,
#     "Evotes_survey": e_votes_s
# })

Power_divergenceResult(statistic=0.02802007491159843, pvalue=0.9860876464020224)

##### (B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

In [None]:
survey_data.columns 

Index(['name', 'sex', 'evote', 'dob', 'zip', 'education', 'citizenship',
       'marital_status', 'party'],
      dtype='object')

##### (C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?