In [1]:
import pandas as pd
import ast

In [2]:
# Load the 2015 building energy benchmarking data set into a DataFrame
sea_2015 = pd.read_csv(filepath_or_buffer='data/2015-building-energy-benchmarking.csv')

In [3]:
# Show the loaded frame to inspect its columns and rows
sea_2015

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,Location,CouncilDistrictCode,Neighborhood,YearBuilt,...,GHGEmissionsIntensity(kgCO2e/ft2),DefaultData,Comment,ComplianceStatus,Outlier,2010 Census Tracts,Seattle Police Department Micro Community Policing Plan Areas,City Council Districts,SPD Beats,Zip Codes
0,1,2015,NonResidential,Hotel,MAYFLOWER PARK HOTEL,659000030,"{'latitude': '47.61219025', 'longitude': '-122...",7,DOWNTOWN,1927,...,2.64,No,,Compliant,,,14.0,,31.0,18081
1,2,2015,NonResidential,Hotel,PARAMOUNT HOTEL,659000220,"{'latitude': '47.61310583', 'longitude': '-122...",7,DOWNTOWN,1996,...,2.38,No,,Compliant,,,14.0,,31.0,18081
2,3,2015,NonResidential,Hotel,WESTIN HOTEL,659000475,"{'latitude': '47.61334897', 'longitude': '-122...",7,DOWNTOWN,1969,...,1.92,Yes,,Compliant,,,56.0,,31.0,18081
3,5,2015,NonResidential,Hotel,HOTEL MAX,659000640,"{'latitude': '47.61421585', 'longitude': '-122...",7,DOWNTOWN,1926,...,31.38,No,,Compliant,High Outlier,,56.0,,31.0,18081
4,8,2015,NonResidential,Hotel,WARWICK SEATTLE HOTEL,659000970,"{'latitude': '47.6137544', 'longitude': '-122....",7,DOWNTOWN,1980,...,4.02,No,,Compliant,,,56.0,,31.0,19576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,50049,2015,Multifamily LR (1-4),Low-Rise Multifamily,PACIFIC CENTER CONDOMINIUM,6599950000,"{'latitude': '47.59950256', 'longitude': '-122...",2,DOWNTOWN,2000,...,0.07,No,,Compliant,,,26.0,,16.0,18379
3336,50055,2015,Multifamily MR (5-9),Mid-Rise Multifamily,IDENTITY APTS 4123,1142001670,"{'latitude': '47.65752471', 'longitude': '-122...",4,NORTHEAST,2014,...,0.61,No,,Compliant,,,60.0,,38.0,18383
3337,50057,2015,Multifamily HR (10+),High-Rise Multifamily,CIRRUS,660000575,"{'latitude': '47.61649845', 'longitude': '-122...",7,DOWNTOWN,2015,...,1.89,No,,Compliant,,,56.0,,7.0,19576
3338,50058,2015,Multifamily LR (1-4),Low-Rise Multifamily,WEDGEWOOD ESTATES BLDG A,6392001040,"{'latitude': '47.68396954', 'longitude': '-122...",4,NORTHEAST,1981,...,0.04,No,,Compliant,,,55.0,,48.0,18792


### Deleting unneeded columns

In [4]:
# Columns that are removed from the frame (in place) before further processing
columns_to_drop = [
    'TaxParcelIdentificationNumber',
    'PropertyGFATotal',
    '2010 Census Tracts',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'City Council Districts',
    'SPD Beats',
    'Zip Codes',
]

sea_2015.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview the energy-use columns. Several of the requested kWh columns are only
# created by later cells, so select just the ones that exist at this point
# instead of raising a KeyError when the notebook is run top to bottom.
energy_columns = ['SiteEnergyUse(kBtu)', 'SiteEnergyUse(kWh)', 'SiteEnergyUseWN(kBtu)',
                  'SiteEnergyUseWN(kWh)', 'SteamUse(kBtu)', 'SteamUse(kWh)',
                  'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)']
sea_2015[[column for column in energy_columns if column in sea_2015.columns]]

### Extracting address data
We need to extract the latitude, longitude, address and zip code from the dict-like `Location` column.

In [None]:
# Look at the raw "Location" value of the row labelled 0
print(sea_2015.loc[0, 'Location'])

In [None]:
# Each "Location" entry is a string holding a Python dict literal; parse every one into a real dict
sea_2015['Location'] = sea_2015['Location'].map(ast.literal_eval)

In [None]:
# Inspect the "Location" value of the row labelled 0 after parsing
sea_2015.loc[0, 'Location']

In [None]:
# Flatten the "Location" dicts into a separate frame with one column per key;
# later cells read its 'latitude', 'longitude' and 'human_address' columns.
location = pd.json_normalize(sea_2015['Location'])

In [None]:
# First five rows of the flattened location frame
location.head(n=5)

In [None]:
# Repeat the two steps for the "human_address" column:
# parse each string into a dict, then flatten the dicts into their own frame
location['human_address'] = location['human_address'].map(ast.literal_eval)
address = pd.json_normalize(data=location['human_address'])

In [None]:
# First five rows of the flattened address frame
address.head(n=5)

In [None]:
# Remove the "Location" column, then add the extracted fields at fixed positions.
# Each entry is (insert position, new column name, values); the inserts run in this
# order, so every position refers to the frame as it looks after the previous insert.
sea_2015.drop(columns="Location", inplace=True)
for position, new_column, values in [
    (5, 'Address', address['address']),
    (6, 'ZipCode', address['zip']),
    (9, 'Latitude', location['latitude']),
    (10, 'Longitude', location['longitude']),
]:
    sea_2015.insert(loc=position, column=new_column, value=values)

In [None]:
# Cast the extracted coordinate and zip code columns to float
for numeric_column in ('Latitude', 'Longitude', 'ZipCode'):
    sea_2015[numeric_column] = sea_2015[numeric_column].astype(float)

### Converting kBtu units into kWh

In [None]:
# Conversion factor between the two units: 1 kWh equals 3.4121416 kBtu
conventer = 3.4121416

# List every column together with its positional index
for i, column_name in enumerate(sea_2015.columns):
    print(i, column_name)

In [None]:
# `conventer` is the number of kBtu in one kWh, so a kBtu value has to be DIVIDED
# by it to obtain kWh (multiplying would inflate every value by roughly 11.6x).
# Each entry is (insert position, new kWh column, source kBtu column); the positions
# are hard-coded, presumably read off the printed column listing - re-check them
# if the column layout changes.
for position, kwh_column, kbtu_column in [
    (30, 'SiteEnergyUse(kWh)', 'SiteEnergyUse(kBtu)'),
    (32, 'SiteEnergyUseWN(kWh)', 'SiteEnergyUseWN(kBtu)'),
    (34, 'SteamUse(kWh)', 'SteamUse(kBtu)'),
    (39, 'NaturalGas(kWh)', 'NaturalGas(kBtu)'),
]:
    sea_2015.insert(loc=position, column=kwh_column, value=sea_2015[kbtu_column] / conventer)


### Dropping NaN and zeros, renaming columns

In [None]:
# Drop every row that is missing any of the listed energy-use measurements
sea_2015.dropna(subset=['SiteEnergyUse(kBtu)', 'SiteEnergyUse(kWh)', 'SiteEnergyUseWN(kBtu)',
                        'SiteEnergyUseWN(kWh)', 'SteamUse(kBtu)', 'SteamUse(kWh)',
                        'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)'],
                inplace=True)

# Give the two emission columns names without the unit suffix
sea_2015.rename(columns={'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
                         'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity'},
                inplace=True)

# astype() returns a new Series and leaves the frame untouched, so the result
# has to be assigned back for the integer cast to actually take effect.
# NOTE(review): the cast truncates fractional values - confirm that is intended.
for column in ['SiteEnergyUse(kWh)', 'SiteEnergyUseWN(kBtu)',
               'SteamUse(kBtu)', 'NaturalGas(kBtu)']:
    sea_2015[column] = sea_2015[column].astype(int)

# Show one of the converted columns to confirm the integer dtype
sea_2015['NaturalGas(kBtu)']

In [None]:
# Preview the kWh site energy use cast to integers (displayed only, nothing is stored)
sea_2015.loc[:, 'SiteEnergyUse(kWh)'].astype(int)