In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import importlib
import nbimporter
import data_scraping
import data_cleaning


In [7]:
# Re-import both helper modules so that edits made to them after the first
# import take effect without restarting the kernel.
importlib.reload(data_scraping)
importlib.reload(data_cleaning)

<module 'data_cleaning' from 'data_cleaning.ipynb'>

# 1. Scraping Data
We scrape the results of:

- Presidential elections for each state from 1789 to 2020

- Senator elections for each state from 1789 to 2020

- House representatives for each state from 1789 to 2022

- Governors for each state from 1789 to 2020, although the period covered varies from state to state depending on whether elections were held.

We use the module contained in scraping_module.py. This module contains exactly the same code as the Jupyter notebook 01_scraping_data.ipynb.

In the cell below, uncomment the two lines corresponding to each election whose results you would like to scrape.

In [4]:
## Every scraper call below is commented out; uncomment the two lines of a
## section to run that scraper.
## NOTE(review): the output names used here ('senate_by_state.csv',
## 'house_by_state2.csv', 'governor_by_state.csv') differ from the files read
## in the cleaning section ('senators_by_state.csv',
## 'representatives_by_state.csv', 'governors_by_state.csv') -- confirm which
## names the scrapers are meant to write.

## ------------- president elections
# file_name = 'presidential_by_state.csv'
# data_scraping.president_elections(file_name)

## ------------- senate elections
# file_name = 'senate_by_state.csv'
# data_scraping.senate_elections(file_name)

## ------------- house representatives elections
## Warning: takes a long time!
# file_name = 'house_by_state2.csv'
# data_scraping.house_elections(file_name)

## ------------- governor elections
# file_name = 'governor_by_state.csv'
# data_scraping.governor_elections(file_name)

All done!


# 2. Cleaning Data

We clean the data for each election by removing unwanted strings, unwanted columns, etc. We use the module in clean_module.py, which contains the same code as the notebook 02_cleaning_data.ipynb. That notebook presents further information.

In [2]:
import os

# Directory holding the scraped CSV files. The default is the location used
# originally; set the USA_ELECTIONS_DIR environment variable to point the
# notebook at another copy of the data without editing this cell.
path = os.path.join(
    os.environ.get('USA_ELECTIONS_DIR', '/Users/Angeles/Desktop/USA_elections/'),
    '',  # joining with '' guarantees the trailing separator that `path + name` needs
)

# Raw scraped tables (dfp, dfs, dfh, dfg) and their cleaned counterparts (*3).
dfp = pd.read_csv(path + 'presidential_by_state.csv')
dfp3 = data_cleaning.clean_president(dfp)

dfs = pd.read_csv(path + 'senators_by_state.csv')
dfs3 = data_cleaning.clean_senate(dfs)

dfh = pd.read_csv(path + 'representatives_by_state.csv')
dfh3 = data_cleaning.clean_house(dfh)

dfg = pd.read_csv(path + 'governors_by_state.csv')
dfg3 = data_cleaning.clean_governor(dfg)

In [4]:
# Raw New Jersey governor rows, skipping the first 40 entries for that state.
dfg.loc[dfg['State'] == 'New_Jersey'].iloc[40:]

Unnamed: 0,State,Governor,Term,Party
1553,New_Jersey,Edward I. Edwards,"January 20, 1920–January 15, 1923\n",Democratic\n
1554,New_Jersey,George Sebastian Silzer,"January 15, 1923–January 19, 1926\n",Democratic\n
1555,New_Jersey,A. Harry Moore,"January 19, 1926–January 15, 1929\n",Democratic\n
1556,New_Jersey,Morgan Foster Larson,"January 15, 1929–January 19, 1932\n",Republican\n
1557,New_Jersey,A. Harry Moore,"January 19, 1932–January 3, 1935\n",Democratic\n
1558,New_Jersey,Clifford Ross Powell,"January 3, 1935–January 8, 1935\n",Republican\n
1559,New_Jersey,Harold G. Hoffman,"January 15, 1935–January 18, 1938\n",Republican\n
1560,New_Jersey,A. Harry Moore,"January 18, 1938–January 21, 1941\n",Democratic\n
1561,New_Jersey,Charles Edison,"January 21, 1941–January 18, 1944\n",Democratic\n
1562,New_Jersey,Walter Evans Edge,"January 18, 1944–January 21, 1947\n",Republican\n


In [5]:
# Locate the raw row(s) whose Party field contains this stray text.
dfg.loc[dfg['Party'].str.contains('Somerest County Board')]

Unnamed: 0,State,Governor,Term,Party
1570,New_Jersey,Christine Todd Whitman,"January 18, 1994–January 31, 2001\n","Somerest County Board,\nNew Jersey Board of Pu..."


In [7]:
# Print every distinct raw Party label together with its length; the length
# makes trailing newlines and footnote markers such as "[l]" easy to spot.
# unique() is evaluated once instead of three times per loop iteration.
for party_label in dfg.Party.unique():
    print(party_label, len(party_label))

Democratic-Republican
 22
JacksonDemocrat
 16
Democratic
 11
Independent[l]
 15
Whig[p]
 8
Pre-War Whig[t]
 16
Militaryoccupation[u]
 22
Republican
 11
Independent
 12
Independent Democratic[44]
 27
Independent[44]
 16
American
 9
Republican[o]
 14
Populist
 9
Federalist
 11
TolerationRepublican
 21
National Republican
 20
Whig
 5
A Connecticut Party
 20
No parties
 11
Democratic[j]
 14
Republican[t]
 14
None
 5
Union (Democratic)
 19
State Rights (Whig)
 20
Constitutional Union
 21
Militaryoccupation[w]
 22
Democratic[n]
 14
Adams-ClayRepublican
 21
NationalRepublican
 19
NationalRepublican[f]
 22
Know Nothing
 13
Military
 9
Democratic–Republican
 22
Greenback / Democratic
 23
Anti-Jacksonian
 16
Jacksonian
 11
American / Union
 17
Union
 6
Union / Democratic
 19
Know-Nothing
 13
Democratic[5]
 14
Farmer-Labor
 13
Democratic-Farmer-Labor
 24
Independent-Republican
 23
Independent-Republican/Republican
 34
Union Democratic
 17
Liberal Republican
 19
Silver
 7
Silver-Democratic
 18
Fed

# 3. Data Analysis

We start by cleaning the party names.

In [38]:
# Pivot the cleaned House table to one row per (Year, State) with one column
# per House_Result category; combinations that do not occur become 0.
_house_indexed = dfh3.set_index(['Year', 'State', 'House_Result'])
_house_wide = _house_indexed.unstack('House_Result').replace(np.nan, 0)

# Bring Year/State back as columns and shorten the party labels.
dfh2_clean = _house_wide.reset_index().rename(
    columns={'Democratic': 'D', 'Republican': 'R', 'Other': 'O'}
)

In [39]:
# Flatten the two-level column index left behind by unstack().
# The nlevels guard makes the cell safe to re-run: droplevel() raises once the
# column index is already flat.
if dfh2_clean.columns.nlevels > 1:
    inner_labels = list(dfh2_clean.columns.droplevel(level=0))
    # reset_index() placed Year and State first (their inner label is empty);
    # the remaining labels are the party codes in the order unstack() emitted
    # them, so read them from the frame instead of assuming D, O, R.
    party_labels = inner_labels[2:]
    if sorted(party_labels) != ['D', 'O', 'R']:
        raise ValueError(f'unexpected House_Result columns: {party_labels}')
    # Assigning a plain list also clears the column-axis name.
    dfh2_clean.columns = ['Year', 'State'] + party_labels

In [40]:
# Not used in this cell; kept because later cells may refer to it.
cols=['Gov_Result','Presi_Result','Sen_Result']

# Outer-join the cleaned governor, president and senate tables on
# (Year, State), then melt to long form: one row per (Year, State, value).
dfgps2 = (dfg3.set_index(['Year','State'])
       .merge(dfp3.set_index(['Year','State'])
               , how='outer', on=['Year', 'State'],)
       .merge(dfs3.set_index(['Year','State'])
               , how='outer', on=['Year', 'State'],)
       .reset_index()
       .drop_duplicates()
       .melt(id_vars=['Year','State'])
       .drop(columns=['variable'])
      )

# One 0/1 indicator per party, computed on the whole column at once instead of
# a Python-level apply over every row.
# NOTE(review): these are substring matches, so a label containing both words
# (e.g. "Democratic-Republican") would count for both D and R -- presumably
# the cleaning step has already reduced labels to single parties; verify.
dfgps2['D'] = dfgps2['value'].str.contains("Democratic", na=False).astype(int)
dfgps2['R'] = dfgps2['value'].str.contains("Republican", na=False).astype(int)
dfgps2['O'] = dfgps2['value'].str.contains("Other", na=False).astype(int)

# Missing results are real NaN values after the outer merge, which a text
# search for "NaN" never matches; count them with isna(). The literal-text
# match is kept so any "NaN" strings are still counted as before.
dfgps2['NA'] = (
    dfgps2['value'].isna() | dfgps2['value'].str.contains("NaN", na=False)
).astype(int)

In [41]:
# Collapse to one row per (Year, State) by summing the party indicators.
# errors='ignore' lets the cell be re-run: after the first run 'value' is
# already gone, and summing the already-aggregated rows leaves them unchanged.
dfgps2 = (
    dfgps2
    .drop(columns=['value'], errors='ignore')
    .groupby(['Year', 'State'])
    .sum()
    .reset_index()
)

In [42]:
# Add the House tallies to the governor/president/senate tallies, aligned on
# (Year, State); a value missing on one side only is treated as 0.
_gps_by_key = dfgps2.set_index(['Year', 'State'])
_house_by_key = dfh2_clean.set_index(['Year', 'State'])
dfall = _gps_by_key.add(_house_by_key, fill_value=0).reset_index()

In [43]:
# Number of elections per (Year, State) row: the D, R and O tallies added up.
dfall['num_elections'] = dfall[['D', 'R', 'O']].sum(axis=1)

# Each party's share of those elections, in percent.
for party in ('D', 'R', 'O'):
    dfall['%' + party] = dfall[party] / dfall['num_elections'] * 100

In [44]:
# Keep only the rows whose Year is not odd, as an independent copy.
is_odd_year = dfall['Year'] % 2 == 1
dfalle = dfall.loc[~is_odd_year].copy()


In [74]:
# state='Florida'
# dfall[(dfall['Year']<=1980) & (dfall['Year']>=1900) & (dfall['State']==state)]

In [46]:
%matplotlib notebook

In [123]:
state='Alabama'
year0=1900

# Rows for the chosen state from year0 onwards; filtered once and reused
# instead of rebuilding the same boolean mask for every series.
_state_rows = dfall[(dfall['State']==state)&(dfall['Year']>=year0)]

# 3-row rolling means of the party percentages. The deprecated `axis`
# argument of rolling() is omitted; a Series has a single axis anyway.
yD = _state_rows['%D'].rolling(3).mean()
yR = _state_rows['%R'].rolling(3).mean()
yO = _state_rows['%O'].rolling(3).mean()
# The first two windows are incomplete and yield NaN; show them as 0.
num_elect = _state_rows['num_elections'].rolling(3).mean().replace(np.nan,0)

years = _state_rows['Year']

In [124]:
from matplotlib import animation

pause = False


def onClick(event):
    """Mouse-click callback: flip the module-level ``pause`` flag."""
    global pause
    pause ^= True


fig, ax = plt.subplots(1, 1, figsize = (6, 4))


def animate(i):
    """Redraw the axes with every point up to and including frame ``i``."""
    ax.cla() # clear the previous image

    # Slice up to i + 1 so the points drawn include the year named in the title.
    shown = slice(None, i + 1)
    ax.plot(years.iloc[shown], yD.iloc[shown], '.', label='D')
    ax.plot(years.iloc[shown], yR.iloc[shown], '.', label='R')
    ax.plot(years.iloc[shown], yO.iloc[shown], '.', label='O')
    ax.set_xlim([1895, 2025]) # fix the x axis
    ax.set_ylim([-0.8,105]) # fix the y axis
    ax.set_title('n = '+ str(np.round(num_elect.iloc[i])) +', year ='+str(years.iloc[i]) )
    ax.set_xlabel('Year')
    ax.set_ylabel('%')
    ax.legend(bbox_to_anchor=(1.13, 1))


# The click handler is currently not connected to the figure.
#fig.canvas.mpl_connect('button_press_event', onClick)

# blit must be off: animate() clears and redraws the whole axes and returns no
# artists, whereas blitting requires the callback to return the artists it drew.
anim = animation.FuncAnimation(fig, animate, frames = len(years) , interval = 10, blit = False, repeat=False)
plt.show()

<IPython.core.display.Javascript object>

In [105]:

    # The 50 U.S. state names in alphabetical order, five per line; multi-word
    # names are written with underscores (e.g. 'New_Jersey').
    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
            'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
            'Hawaii','Idaho', 'Illinois', 'Indiana', 'Iowa',
            'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
            'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
            'Montana', 'Nebraska', 'Nevada', 'New_Hampshire', 'New_Jersey',
            'New_Mexico', 'New_York', 'North_Carolina', 'North_Dakota', 'Ohio',
            'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode_Island', 'South_Carolina',
            'South_Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
            'Virginia', 'Washington', 'West_Virginia', 'Wisconsin', 'Wyoming'
             ]


In [None]:
# Small multiples: one panel per state showing the 3-row rolling mean of the
# D / R / O percentages from year0 onwards.
# plt.subplots returns (figure, axes) in that order. Separate names are used
# so the `fig` / `ax` / `years` / `y*` globals that the animation cell reads
# are not overwritten.
fig_states, axs = plt.subplots(5, 10, figsize=(30, 14), sharex=True, sharey=True)
year0 = 1900

for panel, state_name in zip(axs.flat, states):
    state_rows = dfall[(dfall['State'] == state_name) & (dfall['Year'] >= year0)]
    for party in ('D', 'R', 'O'):
        panel.plot(
            state_rows['Year'],
            state_rows['%' + party].rolling(3).mean(),
            '.',
            label=party,
        )
    panel.set_title(state_name, fontsize=8)
    panel.set_xlim([1895, 2025])
    panel.set_ylim([-0.8, 105])

# Label only the outer edge of the grid; the axes are shared.
for panel in axs[-1, :]:
    panel.set_xlabel('Year')
for panel in axs[:, 0]:
    panel.set_ylabel('%')

# A single legend for the whole figure, taken from the first panel.
handles, labels = axs.flat[0].get_legend_handles_labels()
fig_states.legend(handles, labels, loc='upper right')
plt.show()

In [109]:
# Scratch check: print the indices below 50 that are multiples of five.
for i in range(50):
    if i % 5:
        continue
    print('a', i)

a 0
a 5
a 10
a 15
a 20
a 25
a 30
a 35
a 40
a 45
