In [None]:
import pandas as pd
import ast
from columns_order import fixed_columns_order

# Fixed column order applied to every yearly dataset; the list itself lives in columns_order.py.
columns_order = fixed_columns_order

def convert_columns(df, columns, new_type):
    """Cast each listed column of ``df`` to ``new_type``, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their converted values.
    columns : iterable of column labels
        Columns to convert; they are processed one at a time, in order.
    new_type : type or dtype
        Target type passed to ``Series.astype``.

    Returns
    -------
    None
        The conversion happens on ``df`` itself.
    """
    for name in columns:
        converted = df[name].astype(new_type)
        df[name] = converted
        

def drop_columns(df, columns_to_drop):
    """Drop unused columns, zero-fill missing energy readings and remove empty rows.

    The input frame is left untouched; a new, independent frame is returned so
    that later column assignments on the result do not trigger pandas'
    SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw benchmarking data. It must contain the columns
        'SiteEUI(kBtu/sf)', 'NaturalGas(kBtu)' and 'SiteEnergyUse(kBtu)'.
    columns_to_drop : list of column labels
        Columns removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``columns_to_drop``, with NaN replaced by 0 in
        every column positioned from 'SiteEUI(kBtu/sf)' through
        'NaturalGas(kBtu)' (inclusive), and without the rows whose
        'SiteEnergyUse(kBtu)' is 0 after that fill.

    Raises
    ------
    KeyError
        If a column in ``columns_to_drop`` or one of the required columns is
        missing.
    """
    cleaned = df.drop(columns=columns_to_drop)

    # Label slice: every column located between the two boundary columns.
    energy_columns = cleaned.loc[:, 'SiteEUI(kBtu/sf)':'NaturalGas(kBtu)'].columns.tolist()
    cleaned[energy_columns] = cleaned[energy_columns].fillna(0)

    # Rows with a missing reading were just filled with 0, so they are removed here too.
    has_energy_use = cleaned['SiteEnergyUse(kBtu)'] != 0
    return cleaned.loc[has_energy_use].copy()

def convert(df, convert_to_float, convert_to_int):
    """Cast one group of columns to float and another to int, then return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame converted in place by ``convert_columns``.
    convert_to_float : list of column labels
        Columns cast to ``float`` (handled first).
    convert_to_int : list of column labels
        Columns cast to ``int`` (handled second).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    conversions = ((convert_to_float, float), (convert_to_int, int))
    for column_names, target_type in conversions:
        convert_columns(df=df, columns=column_names, new_type=target_type)
    return df

def rename(df, columns_to_rename):
    """Rename columns of ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are changed.
    columns_to_rename : dict
        Mapping of current column name to new column name; labels that are
        not in the mapping keep their name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df.rename(mapper=columns_to_rename, axis='columns', inplace=True)
    return df

def sort_columns(df, columns_order):
    """Return ``df`` restricted to ``columns_order``, in exactly that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reorder.
    columns_order : list of column labels
        Desired columns; columns of ``df`` that are not listed are left out
        of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns.

    Raises
    ------
    KeyError
        If a label in ``columns_order`` is not a column of ``df``.
    """
    return df[columns_order]

## 2016

In [None]:
# Location of the raw 2016 export; it is read once, after the cleaning configuration below.
path_16 = 'data/raw/2016_Building_Energy_Benchmarking.csv'

# Columns removed from the 2016 data.
columns_to_drop_16 = [
    'City',
    'State',
    'TaxParcelIdentificationNumber',
    'PropertyGFABuilding(s)',
    'ListOfAllPropertyUseTypes',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'DefaultData',
    'Comments',
    'Outlier',
    'ComplianceStatus'
]

convert_to_float_16 = [
    'Latitude',
    'Longitude',
    'ZipCode'
]

convert_to_int_16 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)'
]

# 2016 column names mapped to the names used in the shared column order.
columns_to_rename_16 = {
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)'
}

sea_2016 = pd.read_csv(path_16)
sea_2016 = drop_columns(sea_2016, columns_to_drop_16)
sea_2016 = convert(sea_2016, convert_to_float_16, convert_to_int_16)
sea_2016 = rename(sea_2016, columns_to_rename_16)

# Columns required by the shared order but absent from this year's data are
# added empty, so that the selection in sort_columns cannot fail on them.
columns_to_add_16 = [column for column in columns_order if column not in sea_2016.columns]
sea_2016 = sea_2016.assign(**dict.fromkeys(columns_to_add_16))

sea_2016 = sort_columns(sea_2016, columns_order)

sea_2016.to_csv('data/clean/2016-building-energy-benchmarking-clean.csv', index=False)

## 2017

In [None]:
# Location of the raw 2017 export; it is read once, after the cleaning configuration below.
path_17 = 'data/raw/2017_Building_Energy_Benchmarking.csv'

# Columns removed from the 2017 data.
columns_to_drop_17 = [
    'City',
    'State',
    'TaxParcelIdentificationNumber',
    'PropertyGFABuilding(s)',
    'ListOfAllPropertyUseTypes',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'DefaultData',
    'Outlier',
    'ComplianceStatus'
]

convert_to_float_17 = [
    'Latitude',
    'Longitude',
    'ZipCode'
]

convert_to_int_17 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)'
]

# 2017 column names mapped to the names used in the shared column order.
columns_to_rename_17 = {
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)'
}

sea_2017 = pd.read_csv(path_17)
sea_2017 = drop_columns(sea_2017, columns_to_drop_17)
# Use this year's own float list so the cell does not depend on the 2016 cell having run.
sea_2017 = convert(sea_2017, convert_to_float_17, convert_to_int_17)
sea_2017 = rename(sea_2017, columns_to_rename_17)

# Columns required by the shared order but absent from this year's data are
# added empty, so that the selection in sort_columns cannot fail on them.
columns_to_add_17 = [column for column in columns_order if column not in sea_2017.columns]
sea_2017 = sea_2017.assign(**dict.fromkeys(columns_to_add_17))

sea_2017 = sort_columns(sea_2017, columns_order)

sea_2017.to_csv('data/clean/2017-building-energy-benchmarking-clean.csv', index=False)

## 2015

### Extracting address data
We need to extract latitude, longitude, address and zip from the dict-like `Location` column

In [9]:
# Load the raw 2015 export and show the first raw "Location" value.
path_15 = 'data/raw/2015_Building_Energy_Benchmarking.csv'
sea_2015 = pd.read_csv(path_15)

first_location = sea_2015['Location'][0]
print(first_location)

{'latitude': '47.61219025', 'longitude': '-122.33799744', 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}


In [10]:
# Parse every "Location" string into a real dictionary, then show the first one.
parsed_location = sea_2015['Location'].apply(ast.literal_eval)
sea_2015['Location'] = parsed_location
sea_2015['Location'][0]

{'latitude': '47.61219025',
 'longitude': '-122.33799744',
 'human_address': '{"address": "405 OLIVE WAY", "city": "SEATTLE", "state": "WA", "zip": "98101"}'}

In [11]:
# Flatten the "Location" dictionaries into one column per key.
location_dicts = sea_2015['Location']
location = pd.json_normalize(location_dicts)
location.head()

Unnamed: 0,latitude,longitude,human_address
0,47.61219025,-122.33799744,"{""address"": ""405 OLIVE WAY"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
1,47.61310583,-122.33335756,"{""address"": ""724 PINE ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
2,47.61334897,-122.33769944,"{""address"": ""1900 5TH AVE"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
3,47.61421585,-122.33660889,"{""address"": ""620 STEWART ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98101""}"
4,47.6137544,-122.3409238,"{""address"": ""401 LENORA ST"", ""city"": ""SEATTLE"", ""state"": ""WA"", ""zip"": ""98121""}"


In [12]:
# Same two steps for "human_address": parse the strings, then flatten the dictionaries.
parsed_address = location['human_address'].apply(ast.literal_eval)
location['human_address'] = parsed_address
address = pd.json_normalize(location['human_address'])
address.head()

Unnamed: 0,address,city,state,zip
0,405 OLIVE WAY,SEATTLE,WA,98101
1,724 PINE ST,SEATTLE,WA,98101
2,1900 5TH AVE,SEATTLE,WA,98101
3,620 STEWART ST,SEATTLE,WA,98101
4,401 LENORA ST,SEATTLE,WA,98121


In [14]:
# Insert the extracted address fields at fixed positions.
# The entries are applied top to bottom, so each position refers to the frame
# as it looks after the previous insert.
extracted_columns = [
    (5, 'Address', address['address']),
    (6, 'ZipCode', address['zip']),
    (9, 'Latitude', location['latitude']),
    (10, 'Longitude', location['longitude']),
]
for position, column_name, column_values in extracted_columns:
    sea_2015.insert(loc=position, column=column_name, value=column_values)

# Columns removed from the 2015 data.
columns_to_drop_15 = [
    'TaxParcelIdentificationNumber',
    'Location',
    'PropertyGFABuilding(s)',
    'ListOfAllPropertyUseTypes',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'OtherFuelUse(kBtu)',
    'DefaultData',
    'Comment',
    'ComplianceStatus',
    'Outlier',
    '2010 Census Tracts',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'City Council Districts',
    'SPD Beats',
    'Zip Codes',
]

convert_to_float_15 = ['Latitude', 'Longitude', 'ZipCode']

convert_to_int_15 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'SteamUse(kBtu)',
    'Electricity(kBtu)',
    'Electricity(kWh)',
    'NaturalGas(kBtu)',
    'NaturalGas(therms)',
]

# Clean, convert and reorder, then write the result next to the other cleaned years.
sea_2015 = drop_columns(sea_2015, columns_to_drop_15)
sea_2015 = convert(sea_2015, convert_to_float_15, convert_to_int_15)
sea_2015 = sort_columns(sea_2015, columns_order)

sea_2015.to_csv('data/clean/2015-building-energy-benchmarking-clean.csv', index=False)

## 2020

In [3]:
# fixed order of columns for all data is saved in columns_order.py

path_20 = 'data/raw/2020_Building_Energy_Benchmarking.csv'

# Columns removed from the 2020 data.
columns_to_drop_20 = [
    'TaxParcelIdentificationNumber',
    'City',
    'State',
    'PropertyGFABuilding(s)',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'EPAPropertyType',
    'ComplianceIssue',
    'ComplianceStatus'
]

convert_to_float_20 = [
    'Latitude',
    'Longitude',
    'ZipCode'
]

convert_to_int_20 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)'
]

# 2020 column names mapped to the names used in the shared column order.
columns_to_rename_20 = {
    'BuildingName': 'PropertyName',
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)'
}

# Columns added as empty (None) placeholders before the frame is reordered.
columns_to_add_20 = [
    'YearsENERGYSTARCertified',
    'PrimaryPropertyType'
]

sea_2020 = pd.read_csv(path_20)
sea_2020 = drop_columns(sea_2020, columns_to_drop_20)
sea_2020 = convert(sea_2020, convert_to_float_20, convert_to_int_20)
sea_2020 = rename(sea_2020, columns_to_rename_20)
# One None-filled column per name. The list is passed as plain names rather
# than wrapped in another list, which pandas does not accept in every version.
sea_2020 = sea_2020.assign(**dict.fromkeys(columns_to_add_20))
sea_2020 = sort_columns(sea_2020, columns_order)

sea_2020.to_csv('data/clean/2020-building-energy-benchmarking-clean.csv', index=False)

## 2019

In [5]:
path_19 = 'data/raw/2019_Building_Energy_Benchmarking.csv'

# Columns removed from the 2019 data.
columns_to_drop_19 = [
    'TaxParcelIdentificationNumber',
    'City',
    'State',
    'PropertyGFABuilding(s)',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'EPAPropertyType',
    'ComplianceIssue',
    'ComplianceStatus'
]

convert_to_float_19 = [
    'Latitude',
    'Longitude',
    'ZipCode'
]

convert_to_int_19 = [
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)'
]

# 2019 column names mapped to the names used in the shared column order.
columns_to_rename_19 = {
    'BuildingName': 'PropertyName',
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)'
}

# Columns added as empty (None) placeholders before the frame is reordered.
columns_to_add_19 = [
    'NumberofBuildings',
    'YearsENERGYSTARCertified',
]

sea_2019 = pd.read_csv(path_19)
sea_2019 = drop_columns(sea_2019, columns_to_drop_19)
sea_2019 = convert(sea_2019, convert_to_float_19, convert_to_int_19)
sea_2019 = rename(sea_2019, columns_to_rename_19)
# One None-filled column per name. The list is passed as plain names rather
# than wrapped in another list, which pandas does not accept in every version.
sea_2019 = sea_2019.assign(**dict.fromkeys(columns_to_add_19))
sea_2019 = sort_columns(sea_2019, columns_order)

sea_2019.to_csv('data/clean/2019-building-energy-benchmarking-clean.csv', index=False)

In [5]:
from os import listdir
from os.path import isfile, join

# Raw string: the Windows backslashes are kept literally instead of being read as escapes.
data_path = r'E:\python\projects\seattle\data\clean'

# Full path of every regular file in data_path. The list is sorted because
# listdir returns entries in arbitrary order and later cells pick files by position.
files_full_path = sorted(
    join(data_path, f) for f in listdir(data_path) if isfile(join(data_path, f))
)
files_full_path

['E:\\python\\projects\\seattle\\data\\clean\\2015-building-energy-benchmarking-clean.csv',
 'E:\\python\\projects\\seattle\\data\\clean\\2016-building-energy-benchmarking-clean.csv',
 'E:\\python\\projects\\seattle\\data\\clean\\2017-building-energy-benchmarking-clean.csv',
 'E:\\python\\projects\\seattle\\data\\clean\\2019-building-energy-benchmarking-clean.csv',
 'E:\\python\\projects\\seattle\\data\\clean\\2020-building-energy-benchmarking-clean.csv']

In [35]:
# Read the cleaned 2016 file back and display it.
clean_2016_path = r'E:\python\projects\seattle\data\clean\2016-building-energy-benchmarking-clean.csv'
pd.read_csv(clean_2016_path)

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,...,SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kBtu),Electricity(kWh),NaturalGas(therms),NaturalGas(kBtu),GHGEmissions(MetricTonsCO2e),GHGEmissionsIntensity(kgCO2e/ft2)
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,98101.0,7,DOWNTOWN,47.61220,...,189.000000,7226362,7456910,2003882.00,3946027,1.156514e+06,12764.529300,1276453,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,98101.0,7,DOWNTOWN,47.61317,...,179.399994,8387933,8664479,0.00,3242851,9.504252e+05,51450.816410,5145082,295.86,2.86
2,3,2016,NonResidential,Hotel,5673-The Westin Seattle,1900 5th Avenue,98101.0,7,DOWNTOWN,47.61393,...,244.100006,72587024,73937112,21566554.00,49526664,1.451544e+07,14938.000000,1493800,2089.28,2.19
3,5,2016,NonResidential,Hotel,HOTEL MAX,620 STEWART ST,98101.0,7,DOWNTOWN,47.61412,...,224.000000,6794584,6946800,2214446.25,2768924,8.115253e+05,18112.130860,1811213,286.43,4.67
4,8,2016,NonResidential,Hotel,WARWICK SEATTLE HOTEL (ID8),401 LENORA ST,98121.0,7,DOWNTOWN,47.61375,...,215.600006,14172606,14656503,0.00,5368607,1.573449e+06,88039.984380,8803998,505.01,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3348,50222,2016,Nonresidential COS,Office,Horticulture building,1600 S Dakota St,,2,GREATER DUWAMISH,47.56722,...,176.100006,849745,943003,0.00,524270,1.536550e+05,3254.750244,325475,20.94,1.70
3349,50223,2016,Nonresidential COS,Other,International district/Chinatown CC,719 8th Ave S,,2,DOWNTOWN,47.59625,...,118.900002,950276,1053705,0.00,396546,1.162210e+05,5537.299805,553729,32.17,2.01
3350,50224,2016,Nonresidential COS,Other,Queen Anne Pool,1920 1st Ave W,,7,MAGNOLIA / QUEEN ANNE,47.63644,...,767.799988,5765898,6053764,0.00,1792158,5.252517e+05,39737.390630,3973739,223.54,16.99
3351,50225,2016,Nonresidential COS,Mixed Use Property,South Park Community Center,8319 8th Ave S,,1,GREATER DUWAMISH,47.52832,...,110.800003,719471,782841,0.00,348870,1.022480e+05,3706.010010,370601,22.11,1.57


In [24]:
# Print the path of each cleaned file, then load that file with pd.read_csv.
# `df` is rebound on every pass, so only the last frame read is left afterwards.
# NOTE(review): the saved output of this cell ends in a ParserError raised right
# after the 2017 path was printed — TODO find out which file is malformed.
for file in files_full_path:
    print(file)
    df = pd.read_csv(file)
# Disabled inspection of the column names of each file:
#     df_columns = df.columns
#     print(df_columns)

E:\python\projects\seattle\data\clean\2015-building-energy-benchmarking-clean.csv
E:\python\projects\seattle\data\clean\2016-building-energy-benchmarking-clean.csv
E:\python\projects\seattle\data\clean\2017-building-energy-benchmarking-clean.csv


ParserError: Error tokenizing data. C error: Expected 5 fields in line 499, saw 6


In [14]:
# Load the first two files of the listing into separate frames.
df1, df2 = pd.read_csv(files_full_path[0]), pd.read_csv(files_full_path[1])

In [19]:
# Number of rows once the two frames are stacked.
stacked = pd.concat([df1, df2])
len(stacked)

6681

In [None]:
# Scratch example: build one dictionary per row, then add those rows to `df`.
# NOTE(review): `columns` and `data` are not defined in the visible notebook; the
# values below are inferred from the sample output kept in the comments — confirm.
columns = ['A', 'B', 'C']
data = []

for i in range(4, 10, 3):
    values = [i, i + 1, i + 2]
    a_dictionary = dict(zip(columns, values))
    print(a_dictionary)
    data.append(a_dictionary)
# Expected output of the loop:
# {'A': 4, 'B': 5, 'C': 6}
# {'A': 7, 'B': 8, 'C': 9}

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# replaces the former `df.append(data, True)`.
df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

print(df)