In [204]:
import pandas as pd

# Load the raw spill-incident records; the CSV is expected to sit next to
# this notebook (relative path).
spill_csv_path = 'Spill_Incidents.csv'
data = pd.read_csv(spill_csv_path)

In [205]:
# Keep only the location and quantity columns, working on a copy so the raw
# `data` frame is never modified.
location_data = data[['Locality', 'County', 'Quantity', 'Units']].copy()

# Express pound-denominated quantities in gallons. Rows whose Units value is
# anything other than 'Pounds' (including missing units) are left untouched.
# NOTE(review): 7.21 is presumably a pounds-per-gallon density for the
# spilled material -- confirm the source of this constant.
pounds_mask = location_data['Units'] == 'Pounds'
quantity_in_gallons = location_data.loc[pounds_mask, 'Quantity'] / 7.21
location_data.loc[pounds_mask, 'Quantity'] = quantity_in_gallons
location_data.loc[pounds_mask, 'Units'] = 'Gallons'

location_data.head()

Unnamed: 0,Locality,County,Quantity,Units
0,ELMSFORD,Westchester,10.0,Gallons
1,QUEENS,Queens,0.0,Gallons
2,QUEENS,Queens,0.0,
3,BRONX,Bronx,1.0,Gallons
4,LIBERTY,Sullivan,6.0,Gallons


In [206]:
# Total quantity spilled per (Locality, County) pair.
#
# The per-group sum is broadcast onto every row with `transform`, then the
# frame is collapsed to one row per pair. Rows with a missing Locality or
# County belong to no group, so their total is NaN and they are dropped.
# Finally the table is ordered from largest to smallest total, the column is
# renamed to state its unit, and the index is renumbered from 0.
pair_keys = ['Locality', 'County']
total_per_row = location_data.groupby(pair_keys)['Quantity'].transform('sum')

location_total = (
    location_data
    .assign(**{'Total Spilled': total_per_row})
    .drop(columns=['Quantity', 'Units'])
    .drop_duplicates(subset=pair_keys)
    .dropna(subset=['Total Spilled'])
    .sort_values('Total Spilled', ascending=False)
    .rename(columns={'Total Spilled': 'Total Spilled (Gallons)'})
    .reset_index(drop=True)
)
location_total.head(10)

Unnamed: 0,Locality,County,Total Spilled (Gallons)
0,COLONIE COHOES LATHAM WATERFOR,Albany,13869630000.0
1,BALDWINSVILLE,Onondaga,200409700.0
2,BRONX,Bronx,153712600.0
3,NEW YORK CITY,Queens,103959300.0
4,EAST SYRACUSE,Onondaga,100068000.0
5,JAMAICA,Queens,64019000.0
6,YONKERS,Westchester,43008180.0
7,BROOKLYN,Kings,33953500.0
8,OSWEGO,Oswego,17063090.0
9,NEW ROCHELLE,Westchester,15910200.0


In [207]:
# Number of recorded spills per (Locality, County) pair, regardless of the
# quantity spilled. `value_counts` returns the pairs ordered from most to
# least frequent; `reset_index` turns the pair levels back into columns.
location_freq = (
    location_data
    .loc[:, ['Locality', 'County']]
    .value_counts()
    .reset_index()
)
location_freq.head(10)

Unnamed: 0,Locality,County,count
0,BROOKLYN,Kings,21649
1,MANHATTAN,New York,15279
2,BRONX,Bronx,13128
3,QUEENS,Queens,11214
4,ROCHESTER,Monroe,9047
5,BUFFALO,Erie,7240
6,STATEN ISLAND,Richmond,6590
7,YONKERS,Westchester,5608
8,WHITE PLAINS,Westchester,4783
9,SYRACUSE,Onondaga,4657


In [208]:
# Spill frequency per (Locality, County), restricted to the largest 75% of
# spills that have a strictly positive quantity.
#
# The filter is `> 0`, not `!= 0`: `NaN != 0` evaluates to True, so `!= 0`
# would keep rows with a missing quantity (and any negative quantities),
# inflating the row count used for the 75% cutoff and potentially letting
# NaN rows into the frequency table.
positive_spills = location_data[location_data['Quantity'] > 0]

# Largest spills first, with a fresh 0..n-1 index.
# NOTE(review): many rows may share the same quantity, so which of the tied
# rows fall inside the cutoff depends on the sort's tie order.
positive_spills = (
    positive_spills
    .sort_values('Quantity', ascending=False)
    .reset_index(drop=True)
)

# Keep three quarters of the rows (integer arithmetic, rounded down to a
# multiple of 3).
top_spill_count = 3 * (len(positive_spills) // 4)
top_spills = positive_spills.head(top_spill_count)

# frequency of top 75% oil spills
top_location_freq = (
    top_spills[['Locality', 'County']]
    .value_counts()
    .reset_index()
)
top_location_freq.head(10)

Unnamed: 0,Locality,County,count
0,MANHATTAN,New York,7040
1,BROOKLYN,Kings,6753
2,BRONX,Bronx,4816
3,QUEENS,Queens,4528
4,ROCHESTER,Monroe,3671
5,BUFFALO,Erie,2256
6,SYRACUSE,Onondaga,1881
7,STATEN ISLAND,Richmond,1817
8,JAMAICA,Queens,1791
9,WHITE PLAINS,Westchester,1625
