In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Data

In [None]:
_path = '/content/drive/MyDrive/Data Science/TAMIDS 2021/Competition/Data/FEC_Processed'
_files = [f for f in os.listdir(_path) if 'clean' in f]
names = ['comcom', 'comcand', 'induv']
fec = {names[i]: pd.read_csv(os.path.join(_path, _files[i])) for i in range(len(names))}

for n in names:
  fec[n].loc[:,'datetime'] = pd.to_datetime(fec[n]['datetime'])
  fec[n]['year'] = fec[n]['datetime'].apply(lambda x: x.year)
  fec[n]['month'] = fec[n]['datetime'].apply(lambda x: x.month)
_files


Columns (4) have mixed types.Specify dtype option on import or set low_memory=False.



['com_cand_clean.csv', 'com_com_clean.csv', 'induv_com_clean.csv']

In [None]:
_path = '/content/drive/MyDrive/Data Science/TAMIDS 2021/Competition/Data/PollDemo_Processed'
_files = [f for f in os.listdir(_path) if 'clean' in f]
names = ['polls', 'demo']
polldem = {names[i]: pd.read_csv(os.path.join(_path, _files[i])) for i in range(len(names))}

polldem['polls'].loc[:,'polldate'] = pd.to_datetime(polldem['polls']['polldate'])
polldem['polls']['year'] = polldem['polls']['polldate'].apply(lambda x: x.year)
_files

['polls_clean.csv', 'demo_clean.csv']

In [2]:
_path = '/content/drive/MyDrive/Data Science/TAMIDS 2021/Competition/Data/Given'
_files = os.listdir(_path)
names = ['returns', 'countylevel'] # county level has same data, just worse formatting
given = {names[i]: pd.read_csv(os.path.join(_path, _files[i])) for i in range(len(names))}

given['returns']['percentage'] = given['returns']['candidatevotes'] / given['returns']['totalvotes']
given['returns'] = given['returns'].query("party in ['democrat', 'republican']")
given['returns'].dropna(inplace = True)
given['returns'].loc[:,'FIPS'] = given['returns']['FIPS'].apply(lambda x : str(int(x)) if len(str(int(x))) == 5 else f'0{str(int(x))}')
diff = given['returns'].groupby(['year', 'FIPS'])['percentage'].diff().ffill()
diff = list(diff) + [diff.iloc[-1]]

# + diff = republicans | - diff = democrats
given['returns']['diff'] = diff[1:]
_files

['countypres.csv', 'county_level.csv']

# Votes

In [None]:
import plotly.graph_objects as go
from urllib.request import urlopen
import seaborn as sns
def plot_votes(df, year):

  df = df.query('year == @year')[['FIPS', 'county', 'diff', 'candidate', 'party', 'state']].drop_duplicates()
  rep = df.query("party == 'republican'")['candidate'].iloc[0]
  dem = df.query("party == 'democrat'")['candidate'].iloc[1]

  with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

  fig = go.Figure()
  fig.add_trace(
    go.Choroplethmapbox(
      geojson=counties, 
      locations=df['FIPS'], 
      z= df['diff'],
      text = df['county']+', ' + df['state'],
      colorscale='RdBu', 
      showscale=False,
      marker_opacity=0.8, 
      marker_line_width=.2,
      marker_line_color = 'rgb(150, 150, 150)',
      reversescale=True,
    )
  )

  fig.update_layout(
    title_text = f' {year} Election Results - {dem} vs {rep}',
    title_x=0.5,
    mapbox_style="carto-darkmatter",
    mapbox_zoom=3.8, 
    width=1400+200,
    height=850+200,
    geo_scope='usa',
    mapbox_center = {"lat": 38.0902, "lon": -97.5129},
    paper_bgcolor='rgb(0, 0, 0)',
    plot_bgcolor='rgb(0, 0, 0)',
    font=dict(
        family="Courier New, monospace",
        size=23,
        color="rgb(243, 243, 243)"
    ),
    margin={"r":50,"t":120,"l":50,"b":50}
  )

  fig.show()

In [None]:
plot_votes(given['returns'], 2016)

Output hidden; open in https://colab.research.google.com to view.

In [12]:
v_year = given['returns'].groupby(['year', 'state','party'])[['candidatevotes', 'totalvotes']].sum().reset_index()
v_year['percentage'] = v_year['candidatevotes']/v_year['totalvotes']
diff = v_year.groupby(['year', 'state'])['percentage'].diff().ffill()
diff = list(diff) + [diff.iloc[-1]]
v_year['diff'] = diff[1:]
v_year = v_year[['year', 'state', 'diff']].drop_duplicates()

M = []
states = v_year['state'].unique()
for y in v_year['year'].unique():
  sub = v_year.query('year == @y')
  M.append(sub['diff'].values)
corr_matr = np.corrcoef(np.array(M).T)[::-1]

In [16]:
state_corr_pairs = list(zip(states, np.mean(corr_matr, axis = 0)/np.min(corr_matr, axis = 0)))
state_corr_pairs.sort(key = lambda x: x[1], reverse=  False)
new_states = [i[0] for i in state_corr_pairs]

M = []
for y in v_year['year'].unique():
  vals = []
  for s in new_states:
    v = v_year.query('year == @y & state == @s')['diff'].iloc[0]
    vals.append(v)
  M.append(vals)
corr_matr = np.corrcoef(np.array(M).T)[::1][:,::-1]

In [17]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
new_states = [us_state_abbrev[s]+'  ' for s in new_states]

In [18]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(
                   z=corr_matr,
                   x=new_states[::-1],
                   y=new_states,
                   colorscale='geyser', 
                   reversescale=False,
                   hoverongaps = False))

fig.update_layout(
    title_text = f'2000-2016 State Level Correlations',
    title_x=0.5,
    width=1600,
    height=1600,
    paper_bgcolor='rgb(0, 0, 0)',
    plot_bgcolor='rgb(0, 0, 0)',
    font=dict(
        family="Courier New, monospace",
        size=20,
        color="rgb(243, 243, 243)"
    ),
  )

fig.show()

In [None]:
induv_contribs = fec['induv'].sort_values(by = 'datetime')
induv_contribs.head()

Unnamed: 0,CMTE_ID,OTHER_ID,CITY,STATE,ZIP_CODE,datetime,TRANSACTION_TP,TRANSACTION_AMT,year,month
0,C00346098,C00383653,BOSTON,MA,2116.0,2003-03-12,24I,1000,2003,3
1,C00121368,C00360982,HOUSTON,TX,77019.0,2003-12-31,24I,-200,2003,12
6,C00306670,C00383653,HOUSTON,TX,77056.0,2004-02-17,24I,2000,2004,2
7,C00306670,C00383653,HOUSTON,TX,77056.0,2004-02-17,24I,2000,2004,2
8,C00306670,C00383653,HOUSTON,TX,77096.0,2004-02-27,24I,250,2004,2


In [None]:
x = induv_contribs.query("year >= 2012 and year <= 2012 & OTHER_ID == 'C00431445' & month < 11")
x['cumsum'] = x['TRANSACTION_AMT'].cumsum()
y = induv_contribs.query("year >= 2012 and year <= 2012 & OTHER_ID == 'C00431171' & month < 11")
y['cumsum'] = y['TRANSACTION_AMT'].cumsum()

fig = go.Figure()
fig.add_trace(go.Scatter(x=x['datetime'], y=x['cumsum'], name='Obama',
                         line=dict(color='#0015BC', width=4)))
fig.add_trace(go.Scatter(x=y['datetime'], y=y['cumsum'], name='Romney',
                         line=dict(color='#FF0000', width=4, )))
# Edit the layout
fig.update_layout(
    title_text = f'Cumulative Contributions (Induvidual)',
    title_x=0.5,

    width=900,
    height=750,
    
    paper_bgcolor='rgb(0, 0, 0)',
    plot_bgcolor='rgb(0, 0, 0)',
    font=dict(
        family="Courier New, monospace",
        size=20,
        color="rgb(243, 243, 243)"
    ),
  )

fig.update_xaxes(gridcolor='rgb(60,60,60)')
fig.update_yaxes(gridcolor='rgb(60,60,60)')

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
x = induv_contribs.query("year == 2008 & month < 11 & OTHER_ID == 'C00430470' ")
x['cumsum'] = x['TRANSACTION_AMT'].cumsum()
y = induv_contribs.query("year == 2008 &  month < 11 & OTHER_ID == 'C00431445' ")
y['cumsum'] = y['TRANSACTION_AMT'].cumsum()

fig = go.Figure()
fig.add_trace(go.Scatter(x=x['datetime'], y=x['cumsum'], name='Obama',
                         line=dict(color='#0015BC', width=4)))
fig.add_trace(go.Scatter(x=y['datetime'], y=y['cumsum'], name='McCain',
                         line=dict(color='#FF0000', width=4, )))
# Edit the layout
fig.update_layout(
    title_text = f'Cumulative Contributions (Induvidual)',
    title_x=0.5,

    width=900,
    height=750,
    
    paper_bgcolor='rgb(0, 0, 0)',
    plot_bgcolor='rgb(0, 0, 0)',
    font=dict(
        family="Courier New, monospace",
        size=20,
        color="rgb(243, 243, 243)"
    ),
  )

fig.update_xaxes(gridcolor='rgb(60,60,60)')
fig.update_yaxes(gridcolor='rgb(60,60,60)')

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
demo = polldem['demo']
cols = list(demo.columns[1:8])
demo = demo[cols]

In [None]:
M = np.corrcoef(demo.T)[::-1]
names = ['White %', 'Black %', 'Hisp/Other %', 'College %', 'Whte Work Class %', 'Median Age', 'Pop Dnsty']
names = [n + ' ' for n in names]

In [None]:
fig = go.Figure(data=go.Heatmap(
                   z=M,
                   x=names[::-1],
                   y=names,
                   colorscale='balance', 
                   reversescale=False,
                   hoverongaps = False))

fig.update_layout(
    title_text = f'Demographic Covariates',
    title_x=0.5,
    width=1400,
    height=1000,
    paper_bgcolor='rgb(0, 0, 0)',
    plot_bgcolor='rgb(0, 0, 0)',
    font=dict(
        family="Courier New, monospace",
        size=25,
        color="rgb(243, 243, 243)"
    ),
  )

fig.show()

In [None]:
poll = polldem['polls'].query("year == 2016 ")
poll

Unnamed: 0,pollster_rating_id,location,polldate,samplesize,cand1_name,cand1_pct,cand2_name,cand2_pct,electiondate,year
1159,124,IA,2016-01-12,461.0,Clinton,57.0,Sanders,36.0,2016-02-01,2016
1160,124,IA,2016-01-12,422.0,Cruz,28.0,Trump,34.0,2016-02-01,2016
1161,153,IA,2016-01-14,356.0,Clinton,47.4,Sanders,45.0,2016-02-01,2016
1162,153,IA,2016-01-14,283.0,Cruz,25.8,Trump,18.9,2016-02-01,2016
1163,171,IA,2016-01-16,500.0,Clinton,59.0,Sanders,30.0,2016-02-01,2016
...,...,...,...,...,...,...,...,...,...,...
1611,127,CA,2016-05-25,412.0,Clinton,49.0,Sanders,39.0,2016-06-07,2016
1612,94,CA,2016-05-29,571.0,Clinton,45.0,Sanders,43.0,2016-06-07,2016
1613,183,CA,2016-05-30,557.0,Clinton,49.0,Sanders,47.0,2016-06-07,2016
1614,9,CA,2016-06-01,400.0,Clinton,48.0,Sanders,47.0,2016-06-07,2016
