In [1]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

In [2]:
# Read in the presidential elections data set.
# The path is relative, so the notebook must be run from the directory that
# contains the 'Input' folder.
pres_electionsdata = pd.read_csv('Input/presidential_elections.csv')
# Preview the first rows to confirm the file parsed as expected
# (the preview shows one row per candidate within a state and election year).
pres_electionsdata.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,candidate,party_detailed,writein,candidatevotes,totalvotes,version,notes,party_simplified
0,1976,ALABAMA,AL,1,63,41,US PRESIDENT,"CARTER, JIMMY",DEMOCRAT,False,659170,1182850,20210113,,DEMOCRAT
1,1976,ALABAMA,AL,1,63,41,US PRESIDENT,"FORD, GERALD",REPUBLICAN,False,504070,1182850,20210113,,REPUBLICAN
2,1976,ALABAMA,AL,1,63,41,US PRESIDENT,"MADDOX, LESTER",AMERICAN INDEPENDENT PARTY,False,9198,1182850,20210113,,OTHER
3,1976,ALABAMA,AL,1,63,41,US PRESIDENT,"BUBAR, BENJAMIN """"BEN""""",PROHIBITION,False,6669,1182850,20210113,,OTHER
4,1976,ALABAMA,AL,1,63,41,US PRESIDENT,"HALL, GUS",COMMUNIST PARTY USE,False,1954,1182850,20210113,,OTHER


In [3]:
# Get the information on the DataFrame: column dtypes, non-null counts and
# memory usage. Used here to spot columns with missing values before analysis.
pres_electionsdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4287 entries, 0 to 4286
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              4287 non-null   int64  
 1   state             4287 non-null   object 
 2   state_po          4287 non-null   object 
 3   state_fips        4287 non-null   int64  
 4   state_cen         4287 non-null   int64  
 5   state_ic          4287 non-null   int64  
 6   office            4287 non-null   object 
 7   candidate         4000 non-null   object 
 8   party_detailed    3831 non-null   object 
 9   writein           4284 non-null   object 
 10  candidatevotes    4287 non-null   int64  
 11  totalvotes        4287 non-null   int64  
 12  version           4287 non-null   int64  
 13  notes             0 non-null      float64
 14  party_simplified  4287 non-null   object 
dtypes: float64(1), int64(7), object(7)
memory usage: 502.5+ KB


In [4]:
# Determine the winning party of every (year, state) election.
#
# groupby(...)['candidatevotes'].idxmax() returns, for each election, the row
# label of the candidate with the most votes (the first such row if several rows
# share the maximum). Looking those labels up with .loc replaces the previous
# per-group `apply(lambda ...)`, which ran a Python function once per group to
# obtain the same rows.
#
# The result has one row per election, ordered by year and then state, with a
# fresh 0..n-1 index and the columns: state, year, party_simplified.
winners = (
    pres_electionsdata
    .loc[
        pres_electionsdata.groupby(['year', 'state'])['candidatevotes'].idxmax(),
        ['state', 'year', 'party_simplified'],
    ]
    .reset_index(drop=True)
)
winners.head()

Unnamed: 0,state,year,party_simplified
0,ALABAMA,1976,DEMOCRAT
1,ALASKA,1976,REPUBLICAN
2,ARIZONA,1976,REPUBLICAN
3,ARKANSAS,1976,DEMOCRAT
4,CALIFORNIA,1976,REPUBLICAN


In [5]:
# Build the per-state win summary in a single chain:
#   1. cross-tabulate state against winning party (one row per state,
#      one column per party, cells = number of elections won),
#   2. relabel the party columns in title case,
#   3. turn 'state' from the index into a regular column,
#   4. add 'Most Wins': 'Neutral' when the Democrat and Republican win counts
#      differ by fewer than 3, otherwise the party with more wins.
state_party_counts = (
    pd.crosstab(winners['state'], winners['party_simplified'])
    .rename(columns={'DEMOCRAT': 'Democrat', 'REPUBLICAN': 'Republican'})
    .reset_index()
    .assign(**{
        # np.select picks the first condition that holds for each row and
        # falls back to `default` when none does.
        'Most Wins': lambda counts: np.select(
            [
                (counts['Democrat'] - counts['Republican']).abs() < 3,
                counts['Democrat'] > counts['Republican'],
            ],
            ['Neutral', 'Democrat'],
            default='Republican',
        )
    })
)

# Save the summary (without the row index) for use outside the notebook
state_party_counts.to_csv('Resources/state_party_counts.csv', index=False)

# Display the updated DataFrame
state_party_counts

party_simplified,state,Democrat,Republican,Most Wins
0,ALABAMA,1,11,Republican
1,ALASKA,0,12,Republican
2,ARIZONA,2,10,Republican
3,ARKANSAS,3,9,Republican
4,CALIFORNIA,8,4,Democrat
5,COLORADO,5,7,Neutral
6,CONNECTICUT,8,4,Democrat
7,DELAWARE,9,3,Democrat
8,DISTRICT OF COLUMBIA,12,0,Democrat
9,FLORIDA,4,8,Republican


In [6]:
# Keep only the elections held in 2000 or later.
# The mask is deliberately not called `filter`: that name would shadow Python's
# built-in filter() for the rest of the kernel session.
is_2000_or_later = winners['year'] >= 2000
winners_2000 = winners[is_2000_or_later]

# Create a new DataFrame with a row for each state and a column for each party,
# holding the number of elections (2000 onwards) each party won in that state
state_party_counts_2000 = pd.crosstab(winners_2000['state'], winners_2000['party_simplified'])

# Rename the columns to match the desired output
state_party_counts_2000 = state_party_counts_2000.rename(columns={'DEMOCRAT': 'Democrat', 'REPUBLICAN': 'Republican'})

# Reset the index to make 'state' a column
state_party_counts_2000 = state_party_counts_2000.reset_index()

# Add a new column to indicate the party with the most wins in each state:
# 'Neutral' when the two win counts differ by fewer than 3, otherwise the
# party with more wins
state_party_counts_2000['Most Wins'] = np.where((state_party_counts_2000['Democrat'] - state_party_counts_2000['Republican']).abs() < 3,
                                                'Neutral',
                                                np.where(state_party_counts_2000['Democrat'] > state_party_counts_2000['Republican'],
                                                         'Democrat',
                                                         'Republican'))

# Save the 2000-onwards summary to its own file. Writing it to
# 'Resources/state_party_counts.csv' would overwrite the all-years summary
# that is saved under that name.
state_party_counts_2000.to_csv('Resources/state_party_counts_2000.csv', index=False)

# Display the updated DataFrame
state_party_counts_2000


party_simplified,state,Democrat,Republican,Most Wins
0,ALABAMA,0,6,Republican
1,ALASKA,0,6,Republican
2,ARIZONA,1,5,Republican
3,ARKANSAS,0,6,Republican
4,CALIFORNIA,6,0,Democrat
5,COLORADO,4,2,Neutral
6,CONNECTICUT,6,0,Democrat
7,DELAWARE,6,0,Democrat
8,DISTRICT OF COLUMBIA,6,0,Democrat
9,FLORIDA,2,4,Neutral
