In [49]:
import pandas as pd

def convert_columns(df, columns, new_type):
    for column in columns:
        df[column] = df[column].astype(new_type)

def data_processing(path, columns_to_drop, convert_to_float, columns_to_rename,
                    convert_to_int, columns_order, columns_to_add=None):
    #read data
    df = pd.read_csv(path)
    
    # delete columns
    df.drop(columns_to_drop, axis=1, inplace=True)
    
    # convert to float
    convert_columns(df=df, columns=convert_to_float, new_type=float)
    
    # delete Nan and zeros 
    to_zero = df.loc[:, 'SiteEUI(kBtu/sf)':'NaturalGas(kBtu)'].columns.tolist()
    df[to_zero] = df[to_zero].fillna(0)
    df = df[df['SiteEnergyUse(kBtu)'] != 0]
    
    # rename columns
    df.rename(columns=columns_to_rename, inplace=True)
       
    # convert to int
    convert_columns(df=df, columns=convert_to_int, new_type=int)
    
    # sort columns
    if columns_to_add is not None:
        df[columns_to_add] = None
    df = df[columns_order]
    
    return df

# 2019

In [50]:
# fixed order of columns for all data is saved in columns_order.py
from columns_order import fixed_columns_order
columns_order = fixed_columns_order

path_19 = 'data/raw/2020_Building_Energy_Benchmarking.csv'

columns_to_drop_19 = [
    'TaxParcelIdentificationNumber',
    'City', 
    'State', 
    'PropertyGFABuilding(s)', 
    'LargestPropertyUseType', 
    'LargestPropertyUseTypeGFA', 
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA', 
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA', 
    'EPAPropertyType',
    'ComplianceIssue',
    'ComplianceStatus'
]

convert_to_float_19 = [
    'Latitude', 
    'Longitude',
    'ZipCode'
]

columns_to_rename_19 = {
    'BuildingName': 'PropertyName',
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)'
}

convert_to_int_19 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)'
]

In [51]:
sea_2019 = data_processing(
    path=path_19, 
    columns_to_drop=columns_to_drop_19,
    convert_to_float=convert_to_float_19,
    columns_to_rename=columns_to_rename_19,
    convert_to_int=convert_to_int_19,
    columns_order=fixed_columns_order, 
    columns_to_add=['YearsENERGYSTARCertified', 'PrimaryPropertyType']
)

sea_2019.to_csv('data/clean/2019-building-energy-benchmarking-clean.csv', index=False)

In [48]:
sea_2019.head()

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,...,SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kBtu),Electricity(kWh),NaturalGas(therms),NaturalGas(kBtu),GHGEmissions(MetricTonsCO2e),GHGEmissionsIntensity(kgCO2e/ft2)
0,1,2020,NonResidential,,MAYFLOWER PARK HOTEL,405 OLIVE WAY,98101.0,1.0,DOWNTOWN,47.6122,...,116.3,4923562,4824773,1457837,2734351.0,801392,6326,632586.0,169.1,1.9
1,2,2020,NonResidential,,PARAMOUNT HOTEL,724 PINE ST,98101.0,1.0,DOWNTOWN,47.61317,...,81.1,3601694,3601694,0,1940292.0,568667,16614,1661402.0,98.6,1.1
2,3,2020,NonResidential,,WESTIN HOTEL (Parent Building),1900 5TH AVE,98101.0,1.0,DOWNTOWN,47.61367,...,101.7,36772776,36772776,10359896,25517379.0,7478716,8955,895500.0,1043.2,1.4
3,5,2020,NonResidential,,HOTEL MAX,620 STEWART ST,98101.0,1.0,DOWNTOWN,47.61412,...,87.0,2982710,2982710,917724,1177927.0,345231,8871,887059.0,129.6,2.1
4,8,2020,NonResidential,,WARWICK SEATTLE HOTEL,401 LENORA ST,98121.0,1.0,DOWNTOWN,47.61375,...,124.5,8364978,8364978,0,3761566.0,1102452,46034,4603411.0,264.5,2.3


0 OSEBuildingID ok  
1 DataYear ok  
2 BuildingType = EPAPropertyType, kolejnosc(pod koniec)  
3 PrimaryPropertyType kolejnosc (przy GFA)    
4 PropertyName = BuildingName  
5 Address ok  
6 ZipCode ok  
7 CouncilDistrictCode ok  
8 Neighborhood ok  
9 Latitude ok  
10 Longitude ok  
11 YearBuilt ok  
12 NumberofBuildings brak  
13 NumberofFloors ok  
14 PropertyGFATotal ok  
15 PropertyGFAParking ok  
16 ListOfAllPropertyUseTypes brak  
17 LargestPropertyUseType ok  
18 LargestPropertyUseTypeGFA ok  
19 SecondLargestPropertyUseType ok  
20 SecondLargestPropertyUseTypeGFA ok  
21 ThirdLargestPropertyUseType ok  
22 ThirdLargestPropertyUseTypeGFA ok  
23 YearsENERGYSTARCertified ok  
24 ENERGYSTARScore ok  
25 SiteEUI(kBtu/sf) ok  
26 SiteEUIWN(kBtu/sf) ok 
27 SourceEUI(kBtu/sf) ok  
28 SourceEUIWN(kBtu/sf) ok  
29 SiteEnergyUse(kBtu) ok  
30 SiteEnergyUseWN(kBtu) ok  
31 SteamUse(kBtu) ok  
32 Electricity(kWh) ok  
33 Electricity(kBtu) ok  
34 NaturalGas(therms) ok  
35 NaturalGas(kBtu) ok  
36 OtherFuelUse(kBtu) brak  
37 GHGEmissions(MetricTonsCO2e) = TotalGHGEmissions  
38 GHGEmissionsIntensity(kgCO2e/ft2) = GHGEmissionsIntensity  
39 DefaultData
40 Comment
41 ComplianceStatus
42 Outlier

# 2015 

In [5]:
import ast

sea_2015 = pd.read_csv('data/raw/2015-building-energy-benchmarking.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/2015-building-energy-benchmarking.csv'

### Deleting unneeded columns

In [4]:
columns_to_drop = ['TaxParcelIdentificationNumber', 'PropertyGFABuilding(s)','2010 Census Tracts',
                   'Seattle Police Department Micro Community Policing Plan Areas',
                   'City Council Districts', 'SPD Beats', 'Zip Codes']

sea_2015.drop(columns_to_drop, axis=1, inplace=True)

### Extarcting address data
We need to extract latitude, longitude, address and zip from this dict-like column

In [5]:
print(sea_2015['Location'][0])

{'latitude': '47.61219025', 'longitude': '-122.33799744', 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}


In [6]:
# creating dictionary from a string in "Location" column
sea_2015['Location'] = sea_2015['Location'].apply(ast.literal_eval)

In [7]:
sea_2015['Location'][0]

{'latitude': '47.61219025',
 'longitude': '-122.33799744',
 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}

In [8]:
# creating 'latitude' and 'longitude' columns from the dictionary
location = pd.json_normalize(sea_2015['Location'])

In [9]:
location.head()

Unnamed: 0,latitude,longitude,human_address
0,47.61219025,-122.33799744,"{""address"": ""405 OLIVE WAY"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
1,47.61310583,-122.33335756,"{""address"": ""724 PINE ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
2,47.61334897,-122.33769944,"{""address"": ""1900 5TH AVE"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
3,47.61421585,-122.33660889,"{""address"": ""620 STEWART ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
4,47.6137544,-122.3409238,"{""address"": ""401 LENORA ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98121""}"


In [10]:
# do the same operations for "Human_address" column
location['human_address'] = location['human_address'].apply(ast.literal_eval)
address = pd.json_normalize(location['human_address'])

In [11]:
address.head()

Unnamed: 0,address,city,state,zip
0,405 OLIVE WAY,SEATTLE,WA,98101
1,724 PINE ST,SEATTLE,WA,98101
2,1900 5TH AVE,SEATTLE,WA,98101
3,620 STEWART ST,SEATTLE,WA,98101
4,401 LENORA ST,SEATTLE,WA,98121


In [12]:
# droping "Location" column and inserting new columns in a fixed order
sea_2015.drop("Location", axis=1, inplace=True)
sea_2015.insert(loc=5, column='Address', value=address['address'])
sea_2015.insert(loc=6, column='ZipCode', value=address['zip'])
sea_2015.insert(loc=9, column='Latitude', value=location['latitude'])
sea_2015.insert(loc=10, column='Longitude', value=location['longitude'])

In [19]:
def convert_columns(df, columns, new_type):
    for column in columns:
        df[column] = df[column].astype(new_type)

In [14]:
convert_to_float = ['Latitude', 'Longitude', 'ZipCode']

convert_columns(sea_2015, convert_to_float, float)

### Conversion kBtu units into kWh

In [15]:
# 1kWh = 3.4121416 kBtu
conventer = 3.4121416

# columns loc number
for i in range(len(sea_2015.columns)):
    print(i, sea_2015.columns[i])

0 OSEBuildingID
1 DataYear
2 BuildingType
3 PrimaryPropertyType
4 PropertyName
5 Address
6 ZipCode
7 CouncilDistrictCode
8 Neighborhood
9 Latitude
10 Longitude
11 YearBuilt
12 NumberofBuildings
13 NumberofFloors
14 PropertyGFATotal
15 PropertyGFAParking
16 ListOfAllPropertyUseTypes
17 LargestPropertyUseType
18 LargestPropertyUseTypeGFA
19 SecondLargestPropertyUseType
20 SecondLargestPropertyUseTypeGFA
21 ThirdLargestPropertyUseType
22 ThirdLargestPropertyUseTypeGFA
23 YearsENERGYSTARCertified
24 ENERGYSTARScore
25 SiteEUI(kBtu/sf)
26 SiteEUIWN(kBtu/sf)
27 SourceEUI(kBtu/sf)
28 SourceEUIWN(kBtu/sf)
29 SiteEnergyUse(kBtu)
30 SiteEnergyUseWN(kBtu)
31 SteamUse(kBtu)
32 Electricity(kWh)
33 Electricity(kBtu)
34 NaturalGas(therms)
35 NaturalGas(kBtu)
36 OtherFuelUse(kBtu)
37 GHGEmissions(MetricTonsCO2e)
38 GHGEmissionsIntensity(kgCO2e/ft2)
39 DefaultData
40 Comment
41 ComplianceStatus
42 Outlier


In [47]:
columns_to_convert = [
    ['SiteEnergyUse(kBtu)', 'SiteEnergyUse(kWh)'],
    ['SiteEnergyUseWN(kBtu)', 'SiteEnergyUseWN(kWh)']
]
df =sea_2019
conventer = 3.4121416

for column_convert in columns_to_convert:
    df[column_convert[1]] = df[column_convert[0]] * conventer

In [48]:
df

Unnamed: 0,OSEBuildingID,DataYear,PropertyName,BuildingType,Address,ZipCode,Latitude,Longitude,Neighborhood,CouncilDistrictCode,...,Electricity(kWh),SteamUse(kBtu),NaturalGas(therms),BuildingType.1,Electricity(kBtu),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity,SiteEnergyUse(kWh),SiteEnergyUseWN(kWh)
0,1,2019,MAYFLOWER PARK HOTEL,NonResidential,405 OLIVE WAY,98101.0,47.61220,-122.33799,DOWNTOWN,1.0,...,1134817,2159078,13208,Hotel,3871996,1320791,208.8,2.4,2.512434e+07,2.508560e+07
1,2,2019,PARAMOUNT HOTEL,NonResidential,724 PINE ST,98101.0,47.61317,-122.33393,DOWNTOWN,1.0,...,863688,0,50595,Hotel,2946902,5059502,286.9,3.2,2.743583e+07,2.731898e+07
2,3,2019,WESTIN HOTEL (Parent Building),NonResidential,1900 5TH AVE,98101.0,47.61367,-122.33822,DOWNTOWN,1.0,...,12917890,22601024,14264,Hotel,44075841,1426400,1549.0,2.0,2.332389e+08,2.323780e+08
3,5,2019,HOTEL MAX,NonResidential,620 STEWART ST,98101.0,47.61412,-122.33664,DOWNTOWN,1.0,...,645119,2104444,20134,Hotel,2201145,2013415,232.4,3.8,2.165631e+07,2.156134e+07
4,8,2019,WARWICK SEATTLE HOTEL,NonResidential,401 LENORA ST,98121.0,47.61375,-122.34047,DOWNTOWN,1.0,...,1595784,0,104303,Hotel,5444815,10430292,587.5,5.2,5.416811e+07,5.416811e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3569,50531,2019,CREW APARTMENTS,Multifamily LR (1-4),8228 GREEN LAKE DR N,98103.0,47.68899,-122.34363,NORTHWEST,1.0,...,419433,0,0,Multifamily LR (1-4),1431105,0,8.8,0.2,4.858999e+06,4.883133e+06
3577,50542,2019,Aegis Living in Ravenna,NonResidential,8511 15TH AVE NE,98115.0,47.69050,-122.31320,NORTH,1.0,...,655350,0,0,Senior Care Community,2236055,0,13.8,0.2,7.629736e+06,7.629736e+06
3578,50543,2019,AEGIS WEST SEATTLE,NonResidential,4700 SW ADMIRAL WAY,98116.0,47.58099,-122.39305,SOUTHWEST,1.0,...,744809,0,23401,Senior Care Community,2541289,2340118,139.9,2.1,1.663680e+07,1.665605e+07
3579,50633,2019,BROADWAY ESTATES LLC,Multifamily MR (5-9),515 HARVARD AVE E,98102.0,47.62366,-122.32229,EAST,1.0,...,546165,0,4891,Multifamily MR (5-9),1863514,489112,37.5,0.6,8.020860e+06,8.027490e+06


In [39]:
columns_to_convert

{'SiteEnergyUse(kBtu)': 'SiteEnergyUse(kWh)'}

In [16]:
sea_2015.insert(loc=30, column='SiteEnergyUse(kWh)', value=sea_2015['SiteEnergyUse(kBtu)'] * conventer)
sea_2015.insert(loc=32, column='SiteEnergyUseWN(kWh)', value=sea_2015['SiteEnergyUseWN(kBtu)'] * conventer)
sea_2015.insert(loc=34, column='SteamUse(kWh)', value=sea_2015['SteamUse(kBtu)'] * conventer)
sea_2015.insert(loc=39, column='NaturalGas(kWh)', value=sea_2015['NaturalGas(kBtu)'] * conventer)

### Droping Nan and zeros, renaming columns

In [17]:
sea_2015.dropna(subset=['SiteEnergyUse(kBtu)', 'SiteEnergyUse(kWh)', 'SiteEnergyUseWN(kBtu)',
                        'SiteEnergyUseWN(kWh)', 'SteamUse(kBtu)', 'SteamUse(kWh)',
                        'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)'],
                inplace=True)

sea_2015 = sea_2015[sea_2015['SiteEnergyUse(kBtu)'] != 0]

sea_2015.rename(columns={'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
                         'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity'},
                inplace=True)

# columns with kWh to integer

convert_to_int = ["SiteEnergyUse(kWh)", "SiteEnergyUseWN(kWh)", "SteamUse(kWh)", "Electricity(kWh)", "NaturalGas(kWh)"]

convert_columns(sea_2015, convert_to_int, int)

In [18]:
sea_2015.to_csv('data/2015-building-energy-benchmarking-clean.csv', index=False)

In [90]:
sea_2019[[]'YearsENERGYSTARCertified', 'chuj'] = None