In [1]:
import ast
import json

import pandas as pd

In [2]:
# load the raw 2015 benchmarking export into a DataFrame
sea_2015 = pd.read_csv(filepath_or_buffer='data/2015-building-energy-benchmarking.csv')

In [3]:
# preview the first five rows of the raw frame
sea_2015.head(n=5)

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,Location,CouncilDistrictCode,Neighborhood,YearBuilt,...,GHGEmissionsIntensity(kgCO2e/ft2),DefaultData,Comment,ComplianceStatus,Outlier,2010 Census Tracts,Seattle Police Department Micro Community Policing Plan Areas,City Council Districts,SPD Beats,Zip Codes
0,1,2015,NonResidential,Hotel,MAYFLOWER PARK HOTEL,659000030,"{'latitude': '47.61219025', 'longitude': '-122.33799744', 'human_address': '{""address"": ""405 OLIVE WAY"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}'}",7,DOWNTOWN,1927,...,2.64,No,,Compliant,,,14.0,,31.0,18081
1,2,2015,NonResidential,Hotel,PARAMOUNT HOTEL,659000220,"{'latitude': '47.61310583', 'longitude': '-122.33335756', 'human_address': '{""address"": ""724 PINE ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}'}",7,DOWNTOWN,1996,...,2.38,No,,Compliant,,,14.0,,31.0,18081
2,3,2015,NonResidential,Hotel,WESTIN HOTEL,659000475,"{'latitude': '47.61334897', 'longitude': '-122.33769944', 'human_address': '{""address"": ""1900 5TH AVE"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}'}",7,DOWNTOWN,1969,...,1.92,Yes,,Compliant,,,56.0,,31.0,18081
3,5,2015,NonResidential,Hotel,HOTEL MAX,659000640,"{'latitude': '47.61421585', 'longitude': '-122.33660889', 'human_address': '{""address"": ""620 STEWART ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}'}",7,DOWNTOWN,1926,...,31.38,No,,Compliant,High Outlier,,56.0,,31.0,18081
4,8,2015,NonResidential,Hotel,WARWICK SEATTLE HOTEL,659000970,"{'latitude': '47.6137544', 'longitude': '-122.3409238', 'human_address': '{""address"": ""401 LENORA ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98121""}'}",7,DOWNTOWN,1980,...,4.02,No,,Compliant,,,56.0,,31.0,19576


### Deleting unneeded columns

In [4]:
# columns removed from the frame before any further processing
columns_to_drop = [
    'TaxParcelIdentificationNumber',
    'PropertyGFATotal',
    '2010 Census Tracts',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'City Council Districts',
    'SPD Beats',
    'Zip Codes',
]

sea_2015 = sea_2015.drop(columns=columns_to_drop)

### Extracting address data
We need to extract the latitude, longitude, address and zip code from the dict-like `Location` column.

In [5]:
# show the raw "Location" value of the first row (still a plain string here)
print(sea_2015.loc[0, 'Location'])

{'latitude': '47.61219025', 'longitude': '-122.33799744', 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}


In [6]:
# turn each "Location" string (a Python dict literal) into a real dict
sea_2015['Location'] = sea_2015['Location'].map(ast.literal_eval)

In [7]:
# the first "Location" entry is now a dict rather than a string
sea_2015.loc[0, 'Location']

{'latitude': '47.61219025',
 'longitude': '-122.33799744',
 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}

In [8]:
# flatten the "Location" dicts into one column per key
# (latitude, longitude, human_address)
location = pd.json_normalize(data=sea_2015['Location'])

In [9]:
# preview the flattened location frame
location.head(n=5)

Unnamed: 0,latitude,longitude,human_address
0,47.61219025,-122.33799744,"{""address"": ""405 OLIVE WAY"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
1,47.61310583,-122.33335756,"{""address"": ""724 PINE ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
2,47.61334897,-122.33769944,"{""address"": ""1900 5TH AVE"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
3,47.61421585,-122.33660889,"{""address"": ""620 STEWART ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
4,47.6137544,-122.3409238,"{""address"": ""401 LENORA ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98121""}"


In [10]:
# "human_address" holds JSON text (double-quoted keys and values), so it is
# parsed with the JSON parser rather than ast.literal_eval: the latter only
# accepts Python literals and raises on valid JSON tokens such as
# null / true / false.
location['human_address'] = location['human_address'].apply(json.loads)

# flatten the parsed dicts into address / city / state / zip columns
address = pd.json_normalize(location['human_address'])

In [11]:
# preview the flattened address frame
address.head(n=5)

Unnamed: 0,address,city,state,zip
0,405 OLIVE WAY,SEATTLE,WA,98101
1,724 PINE ST,SEATTLE,WA,98101
2,1900 5TH AVE,SEATTLE,WA,98101
3,620 STEWART ST,SEATTLE,WA,98101
4,401 LENORA ST,SEATTLE,WA,98121


In [12]:
# remove the original "Location" column, then place the extracted values at
# fixed positions; the inserts run in this order, so each `loc` refers to the
# frame as it looks after the previous insert
sea_2015 = sea_2015.drop(columns='Location')

extracted_columns = [
    (5, 'Address', address['address']),
    (6, 'ZipCode', address['zip']),
    (9, 'Latitude', location['latitude']),
    (10, 'Longitude', location['longitude']),
]
for insert_at, column_label, column_values in extracted_columns:
    sea_2015.insert(loc=insert_at, column=column_label, value=column_values)

In [13]:
# the values pulled out of the parsed dictionaries are strings; cast them to floats
sea_2015 = sea_2015.astype({
    'Latitude': float,
    'Longitude': float,
    'ZipCode': float,
})

### Converting kBtu units into kWh

In [14]:
# 1 kWh = 3.4121416 kBtu
conventer = 3.4121416

# print every column together with its positional index
# (handy when picking `loc` values for DataFrame.insert)
for position, column_name in enumerate(sea_2015.columns):
    print(position, column_name)

0 OSEBuildingID
1 DataYear
2 BuildingType
3 PrimaryPropertyType
4 PropertyName
5 Address
6 ZipCode
7 CouncilDistrictCode
8 Neighborhood
9 Latitude
10 Longitude
11 YearBuilt
12 NumberofBuildings
13 NumberofFloors
14 PropertyGFAParking
15 PropertyGFABuilding(s)
16 ListOfAllPropertyUseTypes
17 LargestPropertyUseType
18 LargestPropertyUseTypeGFA
19 SecondLargestPropertyUseType
20 SecondLargestPropertyUseTypeGFA
21 ThirdLargestPropertyUseType
22 ThirdLargestPropertyUseTypeGFA
23 YearsENERGYSTARCertified
24 ENERGYSTARScore
25 SiteEUI(kBtu/sf)
26 SiteEUIWN(kBtu/sf)
27 SourceEUI(kBtu/sf)
28 SourceEUIWN(kBtu/sf)
29 SiteEnergyUse(kBtu)
30 SiteEnergyUseWN(kBtu)
31 SteamUse(kBtu)
32 Electricity(kWh)
33 Electricity(kBtu)
34 NaturalGas(therms)
35 NaturalGas(kBtu)
36 OtherFuelUse(kBtu)
37 GHGEmissions(MetricTonsCO2e)
38 GHGEmissionsIntensity(kgCO2e/ft2)
39 DefaultData
40 Comment
41 ComplianceStatus
42 Outlier


In [15]:
# Add a kWh column right after each kBtu column.
# `conventer` is the number of kBtu in one kWh (1 kWh = 3.4121416 kBtu), so a
# kBtu value has to be DIVIDED by it to obtain kWh; multiplying would inflate
# every result by a factor of conventer ** 2.
# The inserts run in order, so each `loc` already accounts for the columns
# added by the previous inserts.
kwh_targets = [
    (30, 'SiteEnergyUse'),
    (32, 'SiteEnergyUseWN'),
    (34, 'SteamUse'),
    (39, 'NaturalGas'),
]
for insert_at, base_name in kwh_targets:
    sea_2015.insert(
        loc=insert_at,
        column=base_name + '(kWh)',
        value=sea_2015[base_name + '(kBtu)'] / conventer,
    )

### Dropping NaN and zeros, renaming columns

In [16]:
# rows missing any of these energy readings are discarded
required_energy_columns = [
    'SiteEnergyUse(kBtu)', 'SiteEnergyUse(kWh)',
    'SiteEnergyUseWN(kBtu)', 'SiteEnergyUseWN(kWh)',
    'SteamUse(kBtu)', 'SteamUse(kWh)',
    'Electricity(kWh)', 'Electricity(kBtu)',
    'NaturalGas(therms)',
]

# the derived kWh columns are stored as whole numbers; the kBtu source
# columns are left untouched
kwh_columns = [
    'SiteEnergyUse(kWh)',
    'SiteEnergyUseWN(kWh)',
    'SteamUse(kWh)',
    'NaturalGas(kWh)',
]

# One chain that returns a new frame at every step, so nothing is assigned
# into a filtered slice (which would raise SettingWithCopyWarning).
sea_2015 = (
    sea_2015
    .dropna(subset=required_energy_columns)
    # drop buildings that report zero site energy use
    .loc[lambda frame: frame['SiteEnergyUse(kBtu)'] != 0]
    .rename(columns={'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
                     'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity'})
    # explicit int64 so the result does not depend on the platform's default int width
    .astype({column: 'int64' for column in kwh_columns})
)

In [17]:
# write the processed frame to an Excel workbook, leaving out the row index
sea_2015.to_excel(
    excel_writer='data/2015-building-energy-benchmarking-processed.xlsx',
    index=False,
)