In [1]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os.path
import itertools

In [2]:
colnames = ['borough',
            'neighborhood',
            'building_class_category',
            'tax_class_present', #can appear as "tax class as of final roll date" in some files
            'block',
            'lot',
            'easement',
            'building_class_present', #can appear as "building class as of final roll date" in some files
            'address',
            'apt_number', #can appear as NaN in some files; included in the "address" field
            'zipcode',
            'residential_units',
            'commercial_units',
            'total_units',
            'land_sqft',
            'gross_sqft',
            'year_built',
            'tax_class_at_sale',
            'building_class_at_sale',
            'sale_price',
            'sale_date']

In [3]:
def read_in_data(filename):
    '''
    read in the file as a dataframe, skipping the first 4 rows.
    standardize the column names
    '''
    df = pd.read_excel(filename, skiprows=[0,1,2,3])
    
    #df = df.rename(str.lower, axis='columns')
    #df = df.rename((lambda colname : colname.rstrip()), axis='columns')
    #df = df.rename(columns={ df.columns[3]: "tax_class_present" }, inplace = True)
    
    df.columns = colnames
    return df

In [4]:
# read in all files (all years, all boroughs), concatenate as one df, and export to CSV

path = '../data/'
years = list(range(2011, 2020))
boroughs = ['bronx', 'brooklyn', 'manhattan', 'queens', 'statenisland']
#boroughs = ['manhattan']

frames = []

for year, borough in itertools.product(years, boroughs):
    filename = path + str(year) + '_' + borough + '.xls'
    data = read_in_data(filename)
    frames.append(data)

df = pd.concat(frames)

display(df)
print(df.shape) #confirm 21 columns

Unnamed: 0,borough,neighborhood,building_class_category,tax_class_present,block,lot,easement,building_class_present,address,apt_number,...,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
0,2,BATHGATE,01 ONE FAMILY HOMES,1,3037,42,,A1,4428 PARK AVENUE,,...,1.0,0.0,1.0,3525.0,1340.0,1899.0,1,A1,0,2011-02-08
1,2,BATHGATE,01 ONE FAMILY HOMES,1,3039,64,,A1,467 EAST 185 STREET,,...,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01 ONE FAMILY HOMES,1,3046,34,,A1,2085 BATHGATE AV,,...,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01 ONE FAMILY HOMES,1,3048,28,,A1,540 EAST 182ND STREET,,...,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01 ONE FAMILY HOMES,1,3053,103,,A1,2327 BASSFORD AVENUE,,...,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8366,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7355,95,,B9,115 AMBASSADOR LANE,,...,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,48,,B9,108 BERKSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,49,,B9,35 WILTSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22
8369,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,7349,28,,V0,N/A QUAIL LANE,,...,0.0,0.0,0.0,2640.0,0.0,,1,V0,26000,2019-08-22


In [7]:
exportfile = '../data/sales_combined.csv'
df.to_csv(exportfile, index=False)

# assert that the exported CSV file exists
os.path.isfile(exportfile)

True

In [8]:
# check the newly produced CSV file
combined_data = pd.read_csv(exportfile)

display(combined_data)
print(combined_data.shape)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,borough,neighborhood,building_class_category,tax_class_present,block,lot,easement,building_class_present,address,apt_number,...,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
0,2,BATHGATE,01 ONE FAMILY HOMES,1,3037,42,,A1,4428 PARK AVENUE,,...,1.0,0.0,1.0,3525.0,1340.0,1899.0,1,A1,0,2011-02-08
1,2,BATHGATE,01 ONE FAMILY HOMES,1,3039,64,,A1,467 EAST 185 STREET,,...,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01 ONE FAMILY HOMES,1,3046,34,,A1,2085 BATHGATE AV,,...,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01 ONE FAMILY HOMES,1,3048,28,,A1,540 EAST 182ND STREET,,...,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01 ONE FAMILY HOMES,1,3053,103,,A1,2327 BASSFORD AVENUE,,...,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773876,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7355,95,,B9,115 AMBASSADOR LANE,,...,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
773877,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,48,,B9,108 BERKSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
773878,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,49,,B9,35 WILTSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22
773879,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,7349,28,,V0,N/A QUAIL LANE,,...,0.0,0.0,0.0,2640.0,0.0,,1,V0,26000,2019-08-22


(773881, 21)


**Unused cells**



In [None]:
# df1 = read_in_data('../data/2011_manhattan.xls')
# df2 = read_in_data('../data/2012_manhattan.xls')
# df3 = read_in_data('../data/2013_manhattan.xls')
# df4 = read_in_data('../data/2014_manhattan.xls')
# df5 = read_in_data('../data/2015_manhattan.xls')
# df6 = read_in_data('../data/2016_manhattan.xls')
# df7 = read_in_data('../data/2017_manhattan.xls')
# df8 = read_in_data('../data/2018_manhattan.xls')
# df9 = read_in_data('../data/2019_manhattan.xls')

# frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9]
# df = pd.concat(frames)