In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoplot as gplt
import geopandas as gpd

# --- Scrape the city crime table from the saved HTML page into a numeric DataFrame ---
# Only the read needs the file handle; parsing happens after it is closed.
with open('data/raw/crime.html') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

table = soup.find_all('table')[0]
data_columns = table.find_all('th')
column_data = table.find_all('tr')

cols_to_use = [name.text.strip() for name in data_columns]

# These three header cells are removed so that only per-column names remain
# (presumably they are banner/group headers spanning several columns -- confirm
# against the source page).
for group_header in ('Yearly Crime Rates per 100,000 people',
                     'Violent crime',
                     'Property crime'):
    cols_to_use.remove(group_header)

# Positional renames of the remaining headers. These indices are tied to the
# exact layout of the saved page; re-check them if the HTML is refreshed.
cols_to_use[3] = 'Total (Violent and Property)'
cols_to_use[9] = "Total Violent"
cols_to_use[13] = "Total Property"

# Move the header at position 4 to position 13 so the names line up with the
# order of the <td> cells in each data row.
element = cols_to_use.pop(4)
cols_to_use.insert(13, element)

# Build the frame in one shot instead of growing it row by row. Every <tr>
# after the first three is kept (no filtering) because the City fixes below
# address rows by position.
table_rows = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in column_data[3:]
]
crime_data = pd.DataFrame(table_rows, columns=cols_to_use)

# Hard-coded City overrides for specific rows (presumably the scraped text for
# these cells carries footnote markers or other noise -- confirm).
city_fixes = {
    0: 'Mobile',
    26: 'Stockton',
    73: 'Toledo',
    81: 'Arlington',
    47: 'Louisville Metro',
}
for row_idx, city_name in city_fixes.items():
    crime_data.at[row_idx, 'City'] = city_name

# Replace the scraped header text (run-together words, trailing footnote
# digits) with readable column names.
new_columns = {'Murder andNonnegligentmanslaughter': 'Murder/Manslaughter',
               'Rape1': 'Rape',
               'Aggravatedassault': 'Assault',
               'Larceny-theft': 'Larceny/Theft',
               'Motorvehicletheft': 'Vehicle Theft',
               'Arson2': 'Arson'}
crime_data = crime_data.rename(columns=new_columns)

# Convert every measure column to numbers. Thousands separators are stripped
# for ALL of them: previously three columns skipped that step, so any value
# containing a comma would have been silently coerced to NaN.
numeric_columns = [
    'Population',
    'Total (Violent and Property)',
    'Murder/Manslaughter',
    'Rape',
    'Robbery',
    'Assault',
    'Total Violent',
    'Burglary',
    'Larceny/Theft',
    'Vehicle Theft',
    'Total Property',
    'Arson',
]
for col in numeric_columns:
    crime_data[col] = pd.to_numeric(
        crime_data[col].replace(',', '', regex=True),
        errors='coerce',  # unparseable cells (e.g. blanks) become NaN
    )

# index=False: the default RangeIndex carries no information, and writing it
# makes a plain pd.read_csv() of this file produce a stray 'Unnamed: 0' column.
crime_data.to_csv('data/processed/crime_data.csv', index=False)

In [11]:
# --- Fill missing values in selected crime columns with the column mean ---
crime_data = pd.read_csv('data/processed/crime_data.csv')

# If the file was written with its index, read_csv surfaces it as 'Unnamed: N'.
# Drop any such columns so re-running this cell (which overwrites the same
# file) does not accumulate one more junk column per run.
stale_index_cols = [col for col in crime_data.columns
                    if str(col).startswith('Unnamed')]
crime_data = crime_data.drop(columns=stale_index_cols)

missingval_col = ['Rape', 'Total Violent', 'Arson']

# Mean of each listed column (NaNs are skipped by DataFrame.mean)
mean_values = crime_data[missingval_col].mean()

# Mean imputation: each NaN is replaced by its own column's mean
crime_data[missingval_col] = crime_data[missingval_col].fillna(mean_values)

# index=False keeps the file free of a redundant index column on the next read.
crime_data.to_csv('data/processed/crime_data.csv', index=False)

In [10]:
# --- Scrape the electoral-vote table from the saved HTML page ---
# Only the read needs the file handle; parsing happens after it is closed.
with open('data/raw/election.html') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

# The second <table> on the page is the one that is parsed.
table = soup.find_all('table')[1]
data_columns = [name.text.strip() for name in table.find_all('th')]
column_data = table.find_all('tr')

election_columns = ['State', 'ElectoralVotes', 'DemPresident',
                    'RepPresident', 'DemVP', 'RepVP']

# Collect one record per <tr> (after the first two rows) and build the frame
# once. DataFrame.append, used previously, was removed in pandas 2.0.
# Rows with fewer cells than columns are left-aligned and padded with NaN;
# rows with no <td> at all become all-NaN rows. Nothing is filtered out here
# because the drop below addresses rows by position.
records = []
for row in column_data[2:]:
    cells = [data.text.strip() for data in row.find_all('td')]
    if len(cells) > len(election_columns):
        # Same failure mode as before (a length mismatch raised ValueError).
        raise ValueError(
            f"Row has {len(cells)} cells but only "
            f"{len(election_columns)} columns are defined: {cells!r}"
        )
    records.append(dict(zip(election_columns, cells)))

election_data = pd.DataFrame(records, columns=election_columns)

# Remove the rows at positions 51 and 52 (presumably totals/footnote rows
# trailing the per-state entries -- confirm against the source page).
election_data = election_data.drop(election_data.index[[51, 52]])

# index=False: the index is positional only, and writing it makes a plain
# pd.read_csv() of this file produce a stray 'Unnamed: 0' column.
election_data.to_csv('data/processed/election_data.csv', index=False)

In [14]:
# --- Label each crime_data row 'Red' or 'Blue' from the election table ---
# NOTE: relies on `crime_data` already being defined by an earlier cell.
election_data = pd.read_csv('data/processed/election_data.csv')

# States whose DemPresident cell is '-' are classified as red.
no_dem_votes = election_data['DemPresident'] == '-'
red_states = election_data.loc[no_dem_votes, 'State'].tolist()

# Nebraska is also classified as red. The earlier version looked up the row
# with DemPresident == '1', appended str(<Series>) -- i.e. the Series' printed
# repr, not a state name -- and then overwrote list position 24 with the
# literal 'Nebraska'. That only worked when exactly 24 states matched '-'.
# Appending the literal directly gives the same list in that case and stays
# correct otherwise. (Presumably the scraped State text for that row is not a
# clean 'Nebraska', hence the literal -- confirm against election_data.)
if 'Nebraska' not in red_states:
    red_states.append('Nebraska')

# The column is named 'blue' but holds the strings 'Red' / 'Blue'; the name is
# kept unchanged because it is written to the CSV outputs.
crime_data['blue'] = (
    crime_data['State']
    .isin(red_states)
    .map({True: 'Red', False: 'Blue'})
)

In [15]:
# Persist the crime table, now carrying the 'blue' party label, for later use.
crime_and_election_csv = 'data/processed/crime_and_election_data.csv'
crime_data.to_csv(crime_and_election_csv)

In [16]:
# --- Join the crime/election table with the "contiguous_usa" geo dataset ---
# NOTE: relies on `crime_data` from earlier cells and on the geoplot/geopandas
# imports at the top of the notebook having been executed in this kernel.
path = gplt.datasets.get_path("contiguous_usa")

# Load the dataset and rename its key column so it matches crime_data's 'State'.
contiguous_usa = gpd.read_file(path).rename(columns={"state": "State"})

# Default merge is an inner join: only rows whose State appears in both
# tables are kept.
contiguous_usa_new = crime_data.merge(contiguous_usa, on='State')

contiguous_usa_new.to_csv('data/processed/crime_election_geo_data.csv')

NameError: name 'gplt' is not defined