In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup as BS

%matplotlib inline

In [None]:
# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

In [None]:
# Load the 2006 precinct-detail Stata file; the path is relative to the notebook's working directory
county = pd.read_stata('../Data/2006_precinct_detail.dta')

In [None]:
# Preview the first 50 rows of the freshly loaded data
county.head(50)

In [None]:
# Swap the "(R)" party tag in BNAME1 for the plain token "Repub", which later cells look for.
# regex=False matches the parentheses literally: the previous "\(R\)" pattern only worked
# under the old regex=True default (pandas >= 2.0 treats it as a literal and replaces
# nothing), and "\(" is an invalid escape sequence in a normal string literal.
county['BNAME1'] = county['BNAME1'].str.replace("(R)", "Repub", regex=False)

In [None]:
# Swap the "(D)" party tag in BNAME1 for the plain token "Democ", which later cells look for.
# regex=False matches the parentheses literally: the previous "\(D\)" pattern only worked
# under the old regex=True default (pandas >= 2.0 treats it as a literal and replaces
# nothing), and "\(" is an invalid escape sequence in a normal string literal.
county['BNAME1'] = county['BNAME1'].str.replace("(D)", "Democ", regex=False)

In [None]:
# Swap the "(R)" party tag in BNAME2 for the plain token "Repub", which later cells look for.
# regex=False matches the parentheses literally: the previous "\(R\)" pattern only worked
# under the old regex=True default (pandas >= 2.0 treats it as a literal and replaces
# nothing), and "\(" is an invalid escape sequence in a normal string literal.
county['BNAME2'] = county['BNAME2'].str.replace("(R)", "Repub", regex=False)

In [None]:
# Swap the "(D)" party tag in BNAME2 for the plain token "Democ", which later cells look for.
# regex=False matches the parentheses literally: the previous "\(D\)" pattern only worked
# under the old regex=True default (pandas >= 2.0 treats it as a literal and replaces
# nothing), and "\(" is an invalid escape sequence in a normal string literal.
county['BNAME2'] = county['BNAME2'].str.replace("(D)", "Democ", regex=False)

In [None]:
# Sanity check: how many distinct county labels does the raw file contain?

len(pd.unique(county['COUNTY']))

In [None]:
# Cast every float64 column to int64, treating missing values as 0

float_columns = [name for name in county.columns if county[name].dtype == 'float64']
for name in float_columns:
    filled = county[name].fillna(0)
    county[name] = filled.astype('int64')

In [None]:
# all county names need to be uniform
# Title-case every name so capitalisation variants of the same county compare equal
# in the later groupby on COUNTY.

county['COUNTY'] = county['COUNTY'].str.title()

In [None]:
# create column for Rep vote count
# The Republican candidate can sit in ballot slot 1 or slot 2: take the tally from the
# first slot whose name carries the 'Repub' marker, or 0 when neither slot matches.
# astype(str) mirrors the original str(...) guard so missing names simply never match.
# Vectorised with np.select (first true condition wins, like if/elif) instead of an
# iterrows() loop with per-cell .loc writes, which is far slower and depends on a
# unique index.
rep_in_slot1 = county['BNAME1'].astype(str).str.contains('Repub', regex=False)
rep_in_slot2 = county['BNAME2'].astype(str).str.contains('Repub', regex=False)
county['Rep'] = np.select(
    [rep_in_slot1, rep_in_slot2],
    [county['TALLY1'], county['TALLY2']],
    default=0,
)

In [None]:
# create column for Dem vote count
# The Democratic candidate can sit in ballot slot 1 or slot 2: take the tally from the
# first slot whose name carries the 'Democ' marker, or 0 when neither slot matches.
# astype(str) mirrors the original str(...) guard so missing names simply never match.
# Vectorised with np.select (first true condition wins, like if/elif) instead of an
# iterrows() loop with per-cell .loc writes, which is far slower and depends on a
# unique index.
dem_in_slot1 = county['BNAME1'].astype(str).str.contains('Democ', regex=False)
dem_in_slot2 = county['BNAME2'].astype(str).str.contains('Democ', regex=False)
county['Dem'] = np.select(
    [dem_in_slot1, dem_in_slot2],
    [county['TALLY1'], county['TALLY2']],
    default=0,
)

In [None]:
# Spot-check the first rows now that the Rep and Dem columns exist
county.head()

In [None]:
# Frequency of each distinct value in COL1
# NOTE(review): what COL1 encodes is not evident from this notebook — confirm against the data dictionary
county['COL1'].value_counts()

In [None]:
# create column for Other vote count
#
# "Other" is every vote that was not credited to the Rep or Dem column:
#     Other = (TALLY1 + ... + TALLY10) - Rep - Dem
# so that Rep + Dem + Other always equals the total ballots cast in the row.
#
# The previous logic only asked whether slot 2 was Republican.  That double-counted
# TALLY2 whenever slot 1 was Republican and slot 2 Democratic (TALLY2 landed in both
# Dem and Other), and dropped TALLY1 whenever slot 1 was neither major party.
# Computed column-wise instead of with a row-by-row iterrows() loop.

tally_cols = ['TALLY1', 'TALLY2', 'TALLY3', 'TALLY4', 'TALLY5',
              'TALLY6', 'TALLY7', 'TALLY8', 'TALLY9', 'TALLY10']

# Names kept only so any other cell that still refers to them keeps working.
cols_1 = tally_cols[2:]
cols_11 = tally_cols[1:]

all_tallies = county[tally_cols].sum(axis=1)
county['Other'] = all_tallies - county['Rep'] - county['Dem']
        

In [None]:
# Spot-check the first 10 rows after adding the Other column
county.head(10)

In [None]:
# Store the three vote-count columns as pandas' nullable integer dtype

for vote_col in ('Other', 'Rep', 'Dem'):
    county[vote_col] = county[vote_col].astype('Int64')

In [None]:
# Confirm the vote-count columns now display as integers
county.head(3)

In [None]:
# Derive the election year from the ELECTDATE column
election_dates = pd.DatetimeIndex(county['ELECTDATE'])
county['year'] = election_dates.year

In [None]:
# Total ballots per row = Republican + Democratic + everything else
county['total_votes'] = county['Rep'].add(county['Dem']).add(county['Other'])

In [None]:
# Spot-check the new year and total_votes columns
county.head()

In [None]:
# List the distinct OfficeID codes and how often each occurs (input for the office mapping)
county['OfficeID'].value_counts()

In [None]:
# create column for office
#
# Map each OfficeID code to a readable office label by substring match.  Rules are
# checked top to bottom and the first hit wins (np.select has the same first-match
# semantics as the if/elif chain it replaces); anything unmatched becomes 'Other'.
# Vectorised instead of looping with iterrows(), and astype(str) means a missing
# OfficeID falls through to 'Other' rather than raising TypeError on the `in` test.

office_rules = [
    ('TH', 'State House'),
    ('TS', 'State Senate'),
    ('USH', 'US_House'),
    ('USS', 'US_Senate'),
    ('PRES', 'Pres'),
    ('Governor', 'Governor'),
]

office_id = county['OfficeID'].astype(str)
county['office'] = np.select(
    [office_id.str.contains(token, regex=False) for token, label in office_rules],
    [label for token, label in office_rules],
    default='Other',
)

In [None]:
# Check how many rows landed in each office label
county['office'].value_counts()

In [None]:
# remove constitutional amendment rows (currently disabled: the filter below is commented out)

# county = county[(county.office != 'Other')]

In [None]:
# Roll the precinct rows up to one row per (county, office): vote counts are summed
# and the year is taken from the first row of each group.

vote_aggregations = {
    'Dem': 'sum',
    'Rep': 'sum',
    'Other': 'sum',
    'total_votes': 'sum',
    'year': 'first',
}
by_county_office = county.groupby(['COUNTY', 'office'])
county_df = by_county_office.agg(vote_aggregations).reset_index()

In [None]:
# Inspect the first rows of the county/office aggregate
county_df.head()

In [None]:
# Vote share of each bloc as a percentage of total_votes, rounded to one decimal

for party in ('Rep', 'Dem', 'Other'):
    share = county_df[party] / county_df['total_votes']
    county_df[party + '%'] = (share * 100).round(1)

In [None]:
# Margin in percentage points, from each party's point of view
# (computed from the already-rounded percentage columns)

rep_pct = county_df['Rep%']
dem_pct = county_df['Dem%']
county_df['Rep_margin'] = rep_pct.sub(dem_pct)
county_df['Dem_margin'] = dem_pct.sub(rep_pct)

In [None]:
# Lower-case the COUNTY column name
county_df = county_df.rename({'COUNTY': 'county'}, axis='columns')

In [None]:
# Confirm the rename and the new percentage / margin columns
county_df.head()

In [None]:
# Put the columns in the order used by the other cleaned DataFrames

output_columns = [
    'year', 'county', 'office', 'total_votes',
    'Rep', 'Dem', 'Other',
    'Rep%', 'Dem%', 'Other%',
    'Rep_margin', 'Dem_margin',
]
county_df = county_df[output_columns]

In [None]:
# Check dtypes and non-null counts of the final table
county_df.info()

In [None]:
# Display the final county/office table (pandas truncates long frames in the rendered output)
county_df

In [None]:
# Re-check the number of distinct counties after aggregation

len(pd.unique(county_df['county']))

In [None]:
# county_df.to_csv("clean_county_08.csv", index=False)