In [1]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import collect_list
from pyspark.sql.types import StructField, StructType, StringType, LongType
import pandas as pd
import numpy as np

In [2]:
# Load the Iowa voter file into a Spark DataFrame.
# The file is tab-delimited with a header row; "NA" entries are read as nulls
# and column types are inferred from the data.
Iowa = (
    spark.read
    .format("csv")
    .options(
        header="true",
        nullValue="NA",
        delimiter="\t",
        inferSchema="true",
    )
    .load("gs://pstat135-mh-project/data/VM2Uniform--IA--2021-03-04/VM2Uniform--IA--2021-03-04.tab")
)


                                                                                

In [None]:
# Optional debugging aid: uncomment to print the schema of the Spark DataFrame `Iowa`.
#Iowa.printSchema()

In [27]:
# Columns kept for the analysis: county, party, property sizes, gender,
# education, and the income / housing-value fields.
selected_columns = [
    'County',
    'Parties_Description',
    "Residence_Addresses_Property_LandSq_Footage",
    "Residence_Addresses_Property_HomeSq_Footage",
    "Voters_Gender",
    'CommercialData_Education',
    'CommercialData_EstimatedAreaMedianHHIncome',
    'CommercialData_EstimatedHHIncome',
    "CommercialData_AreaMedianHousingValue",
]

# Project the Spark DataFrame onto those columns and collect the result
# into a pandas DataFrame on the driver.
Iowa_df = Iowa.select(*selected_columns).toPandas()


                                                                                

In [28]:
# Shorter column names: drop the "Residence_Addresses_" / "CommercialData_" prefixes.
# NOTE(review): "CommercialData_AreaMedianEducationYears" is not among the columns
# selected in the previous cell, so that entry presumably has no effect (rename
# ignores keys that match no column by default) - confirm whether the column was
# meant to be selected.
column_renames = {
    "Residence_Addresses_Property_LandSq_Footage": "Property_LandSq_Footage",
    "Residence_Addresses_Property_HomeSq_Footage": "Property_HomeSq_Footage",
    "CommercialData_Education": "Education",
    "CommercialData_AreaMedianEducationYears": "AreaMedianEducationYears",
    "CommercialData_EstimatedAreaMedianHHIncome": "EstimatedAreaMedianHHIncome",
    "CommercialData_EstimatedHHIncome": "EstimatedHHIncome",
    "CommercialData_AreaMedianHousingValue": "AreaMedianHousingValue",
}
Iowa_df = Iowa_df.rename(columns=column_renames)

# Preview the first rows of the renamed frame.
Iowa_df.head()

Unnamed: 0,County,Parties_Description,Property_LandSq_Footage,Property_HomeSq_Footage,Voters_Gender,Education,EstimatedAreaMedianHHIncome,EstimatedHHIncome,AreaMedianHousingValue
0,ADAIR,Democratic,342000.0,3500.0,F,,$66266,$50000-74999,$104166
1,ADAIR,Non-Partisan,342000.0,3500.0,M,,$66266,$50000-74999,$104166
2,ADAIR,Non-Partisan,1698000.0,1600.0,F,Some College - Likely,$66266,$50000-74999,$104166
3,ADAIR,Non-Partisan,50000.0,2300.0,F,Bach Degree - Extremely Likely,$66266,$15000-24999,$104166
4,ADAIR,Democratic,1555000.0,2800.0,F,HS Diploma - Likely,$69948,$75000-99999,$91666


## Data Cleaning

In [29]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so in-place changes made through Iowa_clean are also visible through Iowa_df.
Iowa_clean = Iowa_df

# Share of missing values per column, expressed in percent.
missing_counts = Iowa_df.isnull().sum()
percent_missing = missing_counts * 100 / len(Iowa_df)

percent_missing


County                          0.000000
Parties_Description             0.000000
Property_LandSq_Footage        16.660255
Property_HomeSq_Footage        18.341540
Voters_Gender                   0.000000
Education                      29.684130
EstimatedAreaMedianHHIncome     3.614241
EstimatedHHIncome               2.586416
AreaMedianHousingValue          3.619193
dtype: float64

    The EstimatedAreaMedianHHIncome, EstimatedHHIncome and AreaMedianHousingValue variables have a very low proportion of missing values. We proceed by converting these variables from object type to numeric (float) or ordinal codes, and then fill in the remaining missing values using the fillna method.

In [30]:

# Clean the currency-formatted columns and encode the two ordinal variables.
#
# Work on a copy of Iowa_df so that Iowa_df itself is never modified; this keeps
# the cell idempotent (it can be re-run without failing on already-converted columns).
Iowa_clean = Iowa_df.copy()

# Remove the dollar sign from estimated area median household income and area
# median housing value, then convert to numeric.
# regex=False makes "$" a literal character: interpreted as a regular expression
# "$" is the end-of-string anchor, and relying on the default triggers a
# FutureWarning in pandas 1.x.
for currency_col in ["EstimatedAreaMedianHHIncome", "AreaMedianHousingValue"]:
    Iowa_clean[currency_col] = pd.to_numeric(
        Iowa_clean[currency_col].str.replace("$", "", regex=False)
    )

# Ordinal codes for Education and EstimatedHHIncome.
# Values not listed here are left unchanged by DataFrame.replace.
CleanUp = {
    'Education': {
        # Missing education is given its own lowest code rather than being imputed.
        np.nan: 1,
        'Less than HS Diploma - Ex Like': 2,
        'Less than HS Diploma - Likely': 3,
        'HS Diploma - Likely': 4,
        'HS Diploma - Extremely Likely': 5,
        # NOTE(review): shares code 5 with 'HS Diploma - Extremely Likely' -
        # presumably intentional; confirm.
        'Vocational Technical Degree - Extremely Likely': 5,
        'Some College - Likely': 6,
        'Some College -Extremely Likely': 7,
        'Bach Degree - Likely': 8,
        'Bach Degree - Extremely Likely': 9,
        'Grad Degree - Likely': 10,
        'Grad Degree - Extremely Likely': 11,
    },
    'EstimatedHHIncome': {
        # Income brackets in ascending order; missing values stay NaN here.
        '$1000-14999': 1,
        '$15000-24999': 2,
        '$25000-34999': 3,
        '$35000-49999': 4,
        '$50000-74999': 5,
        '$75000-99999': 6,
        '$100000-124999': 7,
        '$125000-149999': 8,
        '$150000-174999': 9,
        '$175000-199999': 10,
        '$200000-249999': 11,
        '$250000+': 12,
    },
}

# Apply the encoding to the cleaned copy (not to the raw Iowa_df).
Iowa_clean = Iowa_clean.replace(CleanUp)

Iowa_clean.head()


  Iowa_clean["EstimatedAreaMedianHHIncome"] = Iowa_clean["EstimatedAreaMedianHHIncome"].str.replace("$"," ")
  Iowa_clean["AreaMedianHousingValue"] = Iowa_clean["AreaMedianHousingValue"].str.replace("$"," ")


Unnamed: 0,County,Parties_Description,Property_LandSq_Footage,Property_HomeSq_Footage,Voters_Gender,Education,EstimatedAreaMedianHHIncome,EstimatedHHIncome,AreaMedianHousingValue
0,ADAIR,Democratic,342000.0,3500.0,F,1,66266.0,5.0,104166.0
1,ADAIR,Non-Partisan,342000.0,3500.0,M,1,66266.0,5.0,104166.0
2,ADAIR,Non-Partisan,1698000.0,1600.0,F,6,66266.0,5.0,104166.0
3,ADAIR,Non-Partisan,50000.0,2300.0,F,9,66266.0,2.0,104166.0
4,ADAIR,Democratic,1555000.0,2800.0,F,4,69948.0,6.0,91666.0


In [31]:
# Impute the two continuous columns with their column mean.
for mean_col in ["EstimatedAreaMedianHHIncome", "AreaMedianHousingValue"]:
    column_mean = Iowa_clean[mean_col].mean()
    Iowa_clean[mean_col] = Iowa_clean[mean_col].fillna(column_mean)

# EstimatedHHIncome holds ordinal bracket codes, so impute it with the median instead.
hh_income_median = Iowa_clean["EstimatedHHIncome"].median()
Iowa_clean["EstimatedHHIncome"] = Iowa_clean["EstimatedHHIncome"].fillna(hh_income_median)


In [32]:
# Percentage of missing values per column after imputation.
Iowa_clean.isnull().sum().mul(100).div(len(Iowa_clean))

County                          0.000000
Parties_Description             0.000000
Property_LandSq_Footage        16.660255
Property_HomeSq_Footage        18.341540
Voters_Gender                   0.000000
Education                       0.000000
EstimatedAreaMedianHHIncome     0.000000
EstimatedHHIncome               0.000000
AreaMedianHousingValue          0.000000
dtype: float64

In [33]:
# Keep only rows with at least 8 non-NA values, i.e. rows missing at most one
# of the 9 selected columns.
min_non_null = 8
Iowa_clean = Iowa_clean.dropna(thresh=min_non_null)

# Percentage of missing values per column among the remaining rows.
Iowa_clean.isnull().sum() * 100 / len(Iowa_clean)


County                         0.000000
Parties_Description            0.000000
Property_LandSq_Footage        1.104475
Property_HomeSq_Footage        3.099580
Voters_Gender                  0.000000
Education                      0.000000
EstimatedAreaMedianHHIncome    0.000000
EstimatedHHIncome              0.000000
AreaMedianHousingValue         0.000000
dtype: float64

In [35]:
# Only about 1.1% (land) and 3.1% (home) of the square-footage values are still
# missing, so fill each column with its mean.
for footage_col in ["Property_LandSq_Footage", "Property_HomeSq_Footage"]:
    footage_mean = Iowa_clean[footage_col].mean()
    Iowa_clean[footage_col] = Iowa_clean[footage_col].fillna(footage_mean)

# Confirm that no missing values remain.
Iowa_clean.isnull().sum() * 100 / len(Iowa_clean)

County                         0.0
Parties_Description            0.0
Property_LandSq_Footage        0.0
Property_HomeSq_Footage        0.0
Voters_Gender                  0.0
Education                      0.0
EstimatedAreaMedianHHIncome    0.0
EstimatedHHIncome              0.0
AreaMedianHousingValue         0.0
dtype: float64

    Our dataset is now clean 

## Comparing Counties - Property Size

    In the following part of our analysis we compare counties. The purpose of this comparison is to identify different voting patterns in counties where voters have larger properties versus counties where voters have smaller properties. As stated in our introduction, it is our assumption that individuals with larger properties, like farmers, tend to lean toward the Republican party. The comparison is based on the median property land square footage for each county.

In [36]:
# Median land square footage per county, largest first, with County restored
# as a regular column.
Iowa_grouped = (
    Iowa_clean
    .groupby("County")
    .agg({'Property_LandSq_Footage': 'median'})
    .sort_values(by='Property_LandSq_Footage', ascending=False)
    .reset_index()
)

Iowa_grouped

Unnamed: 0,County,Property_LandSq_Footage
0,RINGGOLD,489000.0
1,MADISON,126000.0
2,DAVIS,98000.0
3,ADAMS,87000.0
4,VAN BUREN,82000.0
...,...,...
94,POLK,10000.0
95,BLACK HAWK,10000.0
96,SCOTT,9000.0
97,WOODBURY,9000.0


In [37]:
# County names for the two comparison groups by median land square footage.
# NOTE(review): these lists are hard-coded, presumably transcribed from the
# ranking table above - re-check them if the data or the cleaning steps change.
top = ["RINGGOLD", "MADISON", "VAN BUREN", "DAVIS", "ADAMS"]
bottom = ["POLK","BLACK HAWK","SCOTT", "CERRO GORDO","WOODBURY"]

# Select the voter rows belonging to each group; numeric columns are rounded
# to whole numbers.
in_top_counties = Iowa_clean["County"].isin(top)
in_bottom_counties = Iowa_clean["County"].isin(bottom)

top5_df = Iowa_clean.loc[in_top_counties].round(0)
bottom5_df = Iowa_clean.loc[in_bottom_counties].round(0)


    We have now extracted the data for the top and bottom 5 counties in terms of property land square footage per voter. We proceed by creating a table containing the voting pattern for each group, including the proportion of votes each party has accumulated.

In [38]:
# Voting pattern for the top 5 counties by land square footage.
#
# Count rows per party and name the count column explicitly. This avoids relying
# on the column name produced by value_counts().to_frame(), which is the original
# column name in pandas 1.x but "count" in pandas >= 2.0.
top5_voting_pattern = (
    top5_df['Parties_Description']
    .value_counts()
    .to_frame(name="Number of Votes")
    .rename_axis(None)  # keep the index unnamed regardless of pandas version
)

# Proportion of the group's total accounted for by each party.
top5_voting_pattern["Proportion"] = (
    top5_voting_pattern["Number of Votes"] / top5_voting_pattern["Number of Votes"].sum()
)

# The name is rebound to a Styler so the table is displayed with a caption.
top5_voting_pattern = top5_voting_pattern.style.set_caption("Top 5 Counties - Vote Counts")

top5_voting_pattern


Unnamed: 0,Number of Votes,Proportion
Republican,11163,0.468286
Non-Partisan,7059,0.296124
Democratic,5454,0.228794
Libertarian,132,0.005537
Green,30,0.001258


In [39]:
# Voting pattern for the bottom 5 counties by land square footage.
#
# Count rows per party and name the count column explicitly. This avoids relying
# on the column name produced by value_counts().to_frame(), which is the original
# column name in pandas 1.x but "count" in pandas >= 2.0.
bottom5_voting_pattern = (
    bottom5_df['Parties_Description']
    .value_counts()
    .to_frame(name="Number of Votes")
    .rename_axis(None)  # keep the index unnamed regardless of pandas version
)

# Proportion of the group's total accounted for by each party.
bottom5_voting_pattern["Proportion"] = (
    bottom5_voting_pattern["Number of Votes"] / bottom5_voting_pattern["Number of Votes"].sum()
)

# The name is rebound to a Styler so the table is displayed with a caption.
bottom5_voting_pattern = bottom5_voting_pattern.style.set_caption("Bottom 5 Counties - Vote Counts")

bottom5_voting_pattern

Unnamed: 0,Number of Votes,Proportion
Democratic,190742,0.378363
Non-Partisan,157846,0.313109
Republican,150854,0.299239
Libertarian,3749,0.007437
Green,934,0.001853


    When comparing the proportion of voters for the 5 counties with the largest median land square footage with the 5 counties with the smallest, we see a difference in the prevailing party. For the top 5 counties, about 47% of total votes went to the Republican party and about 23% went to the Democratic party, whereas for the bottom 5 counties about 38% went to the Democratic party and about 30% went to the Republican party. This falls in line with our assumption that individuals with larger properties, like farmers, tend to lean toward the Republican party. In addition, we notice that the top 5 counties have a significantly lower number of total votes than the bottom 5 counties. This makes intuitive sense, as smaller property sizes may point toward more densely populated regions and, as a result, more votes. The data above suggest that people with larger properties may be more inclined to vote Red (Republican).

 ## Comparing Counties - Home size

    The relation of property size to voting patterns raises an interesting question: does home size have a relation to voting patterns as well? To examine this question we carry out an analysis similar to the one above, using median home size per county as our measure. We compare the voting habits of the top 5 and bottom 5 counties in terms of home size in square feet. It is our expectation that people with larger homes will tend to lean toward the Republican party, as larger homes are an indication of greater wealth.

In [40]:
# Median home square footage per county, largest first, with County restored
# as a regular column.
Iowa_home = (
    Iowa_clean
    .groupby("County")
    .agg({'Property_HomeSq_Footage': 'median'})
    .sort_values(by='Property_HomeSq_Footage', ascending=False)
    .reset_index()
)

Iowa_home

Unnamed: 0,County,Property_HomeSq_Footage
0,DELAWARE,3100.0
1,LYON,3000.0
2,DUBUQUE,3000.0
3,PLYMOUTH,3000.0
4,CHICKASAW,2900.0
...,...,...
94,HANCOCK,1500.0
95,TAYLOR,1500.0
96,MONROE,1400.0
97,POTTAWATTAMIE,1300.0


In [41]:

# County names for the two comparison groups by median home square footage.
# NOTE(review): these lists are hard-coded, presumably transcribed from the
# ranking table above - re-check them if the data or the cleaning steps change.
top_home = ["LYON","DELAWARE","PLYMOUTH", "CHICKASAW", "DUBUQUE"]
bottom_home = ["HANCOCK", "TAYLOR", "MONROE", "RINGGOLD", "POTTAWATTAMIE"]

# Select the voter rows belonging to each group; numeric columns are rounded
# to whole numbers.
in_top_home_counties = Iowa_clean["County"].isin(top_home)
in_bottom_home_counties = Iowa_clean["County"].isin(bottom_home)

top5_home = Iowa_clean.loc[in_top_home_counties].round(0)
bottom5_home = Iowa_clean.loc[in_bottom_home_counties].round(0)


    Now that we have extracted the top and bottom 5 counties in terms of median home size in square feet, we proceed by creating a table containing the voting pattern for each group, including the proportion of votes each party has accumulated.

In [42]:
# Voting pattern for the top 5 counties by home square footage.
#
# Count rows per party and name the count column explicitly. This avoids relying
# on the column name produced by value_counts().to_frame(), which is the original
# column name in pandas 1.x but "count" in pandas >= 2.0.
top5_votes = (
    top5_home['Parties_Description']
    .value_counts()
    .to_frame(name="Number of Votes")
    .rename_axis(None)  # keep the index unnamed regardless of pandas version
)

# Proportion of the group's total accounted for by each party.
top5_votes["Proportion"] = top5_votes["Number of Votes"] / top5_votes["Number of Votes"].sum()

# The name is rebound to a Styler so the table is displayed with a caption.
top5_votes = top5_votes.style.set_caption("Top 5 Counties - Vote Counts")

top5_votes

Unnamed: 0,Number of Votes,Proportion
Republican,37336,0.389761
Democratic,29486,0.307813
Non-Partisan,28333,0.295776
Libertarian,493,0.005147
Green,144,0.001503


In [43]:
# Voting pattern for the bottom 5 counties by home square footage.
#
# Count rows per party and name the count column explicitly. This avoids relying
# on the column name produced by value_counts().to_frame(), which is the original
# column name in pandas 1.x but "count" in pandas >= 2.0.
bottom5_votes = (
    bottom5_home['Parties_Description']
    .value_counts()
    .to_frame(name="Number of Votes")
    .rename_axis(None)  # keep the index unnamed regardless of pandas version
)

# Proportion of the group's total accounted for by each party.
bottom5_votes["Proportion"] = bottom5_votes["Number of Votes"] / bottom5_votes["Number of Votes"].sum()

# The name is rebound to a Styler so the table is displayed with a caption.
bottom5_votes = bottom5_votes.style.set_caption("Bottom 5 Counties - Vote Counts")

bottom5_votes

Unnamed: 0,Number of Votes,Proportion
Republican,28786,0.43022
Non-Partisan,20118,0.300673
Democratic,17346,0.259244
Libertarian,510,0.007622
Green,150,0.002242


    Contrary to our initial assumption, home size does not seem to have the expected relationship to voting patterns. From the results above, we can see that the bottom 5 counties in terms of median home square footage actually have a larger proportion of their population voting Republican (about 43%) than the top 5 counties (about 39%).