# Combine raw data files into one dataframe

In [38]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os.path
import itertools

In [39]:
colnames = ['borough',
            'neighborhood',
            'building_class_category',
            'tax_class_present', #can appear as "tax class as of final roll date" in some files
            'block',
            'lot',
            'easement',
            'building_class_present', #can appear as "building class as of final roll date" in some files
            'address',
            'apt_number', #can appear as NaN in some files -- included in the "address" field
            'zipcode',
            'residential_units',
            'commercial_units',
            'total_units',
            'land_sqft',
            'gross_sqft',
            'year_built',
            'tax_class_at_sale',
            'building_class_at_sale',
            'sale_price',
            'sale_date']

In [40]:
def read_in_data(filename):
    '''
    read in the file as a dataframe, skipping the first 4 rows.
    standardize the column names
    '''
    df = pd.read_excel(filename, skiprows=[0,1,2,3])
    
    #df = df.rename(str.lower, axis='columns')
    #df = df.rename((lambda colname : colname.rstrip()), axis='columns')
    #df = df.rename(columns={ df.columns[3]: "tax_class_present" }, inplace = True)
    
    df.columns = colnames
    return df

In [41]:
# read in all files (all years, all boroughs), concatenate as one df, and export to CSV

path = '../data/'
years = list(range(2011, 2020))
boroughs = ['bronx', 'brooklyn', 'manhattan', 'queens', 'statenisland']

frames = []

for year, borough in itertools.product(years, boroughs):
    filename = path + str(year) + '_' + borough + '.xls'
    data = read_in_data(filename)
    frames.append(data)

df = pd.concat(frames)

display(df)
print(df.shape) #confirm 21 columns

Unnamed: 0,borough,neighborhood,building_class_category,tax_class_present,block,lot,easement,building_class_present,address,apt_number,...,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
0,2,BATHGATE,01 ONE FAMILY HOMES,1,3037,42,,A1,4428 PARK AVENUE,,...,1.0,0.0,1.0,3525.0,1340.0,1899.0,1,A1,0,2011-02-08
1,2,BATHGATE,01 ONE FAMILY HOMES,1,3039,64,,A1,467 EAST 185 STREET,,...,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01 ONE FAMILY HOMES,1,3046,34,,A1,2085 BATHGATE AV,,...,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01 ONE FAMILY HOMES,1,3048,28,,A1,540 EAST 182ND STREET,,...,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01 ONE FAMILY HOMES,1,3053,103,,A1,2327 BASSFORD AVENUE,,...,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8366,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7355,95,,B9,115 AMBASSADOR LANE,,...,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,48,,B9,108 BERKSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,49,,B9,35 WILTSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22
8369,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,7349,28,,V0,N/A QUAIL LANE,,...,0.0,0.0,0.0,2640.0,0.0,,1,V0,26000,2019-08-22


(773881, 21)


# Examining data
[Data dictionary](https://www1.nyc.gov/assets/finance/downloads/pdf/07pdf/glossary_rsf071607.pdf)

In [42]:
building_class_vals = df.building_class_category.unique()
building_class_vals.sort()
print(building_class_vals)

['                                            '
 '01  ONE FAMILY DWELLINGS                    '
 '01  ONE FAMILY HOMES                        ' '01 ONE FAMILY DWELLINGS'
 '01 ONE FAMILY DWELLINGS                    '
 '02  TWO FAMILY DWELLINGS                    '
 '02  TWO FAMILY HOMES                        ' '02 TWO FAMILY DWELLINGS'
 '02 TWO FAMILY DWELLINGS                    '
 '03  THREE FAMILY DWELLINGS                  '
 '03  THREE FAMILY HOMES                      '
 '03 THREE FAMILY DWELLINGS' '03 THREE FAMILY DWELLINGS                  '
 '04  TAX CLASS 1 CONDOS                      ' '04 TAX CLASS 1 CONDOS'
 '04 TAX CLASS 1 CONDOS                      '
 '05  TAX CLASS 1 VACANT LAND                 '
 '05 TAX CLASS 1 VACANT LAND'
 '05 TAX CLASS 1 VACANT LAND                 '
 '06  TAX CLASS 1 - OTHER                     ' '06 TAX CLASS 1 - OTHER'
 '06 TAX CLASS 1 - OTHER                     '
 '07  RENTALS - WALKUP APARTMENTS             '
 '07 RENTALS - WALKUP APARTMENT

# Cleaning data

In [43]:
# make a copy dataframe and clean; drop some unnecessary cols
clean = df

In [44]:
# Changed this to not drop block, lot, easement
dropcols = ['tax_class_present',
            #'block',
            #'lot',
            #'easement',
            'building_class_present'
            #'tax_class_at_sale',
            #'building_class_at_sale'
           ]
clean = clean.drop(dropcols, axis=1)

In [45]:
# dispose rows where sale price is zero (ownership transfer events)
clean = clean[clean.sale_price != 0]
print(f'This removed {773881 - clean.shape[0]} values out of formerly {773881}, now {clean.shape[0]} are left.')

This removed 240263 values out of formerly 773881, now 533618 are left.


In [46]:
# only keep Tax Class 1 and 2 records
clean = clean[clean['tax_class_at_sale'].isin([1, 2])]
clean.tax_class_at_sale.unique()

array([1, 2])

In [47]:
# clean apartment numbers
new = clean['address'].str.split(',', n=1, expand=True)

new[1] = new[1].transform(lambda val : val.str.strip())
new[1].unique()

clean['address'] = new[0]
clean['apt_2'] = new[1]
# TODO: need to merge apt and apt_2 cleanly

In [48]:
# add column for (sale price / total_units)
clean['sale_price_per_unit'] = clean['sale_price'] / clean['total_units']

In [49]:
display(clean)

Unnamed: 0,borough,neighborhood,building_class_category,block,lot,easement,address,apt_number,zipcode,residential_units,...,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date,apt_2,sale_price_per_unit
1,2,BATHGATE,01 ONE FAMILY HOMES,3039,64,,467 EAST 185 STREET,,10458.0,1.0,...,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19,,329000.0
2,2,BATHGATE,01 ONE FAMILY HOMES,3046,34,,2085 BATHGATE AV,,10457.0,1.0,...,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29,,288500.0
3,2,BATHGATE,01 ONE FAMILY HOMES,3048,28,,540 EAST 182ND STREET,,10457.0,1.0,...,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21,,122500.0
4,2,BATHGATE,01 ONE FAMILY HOMES,3053,103,,2327 BASSFORD AVENUE,,10458.0,1.0,...,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25,,10.0
5,2,BATHGATE,02 TWO FAMILY HOMES,2929,128,,559 CLAREMONT PARKWAY,,10457.0,2.0,...,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13,,182500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,5,WOODROW,02 TWO FAMILY DWELLINGS,7355,85,,104 AMBASSADOR LANE,,10309.0,2.0,...,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30,,312500.0
8366,5,WOODROW,02 TWO FAMILY DWELLINGS,7355,95,,115 AMBASSADOR LANE,,10309.0,2.0,...,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20,,318750.0
8367,5,WOODROW,05 TAX CLASS 1 VACANT LAND,7094,48,,108 BERKSHIRE LANE,,10309.0,2.0,...,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24,,299850.5
8368,5,WOODROW,05 TAX CLASS 1 VACANT LAND,7094,49,,35 WILTSHIRE LANE,,10309.0,2.0,...,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22,,448030.0


# Pickle the cleaned dataframe
[pandas.DataFrame.to_pickle documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_pickle.html)

In [50]:
picklefile = '../data/sales_combined.pkl'
clean.to_pickle(picklefile)

In [51]:
# test reading in pickle
pickle_df = pd.read_pickle(picklefile)
#display(pickle_df)

# [Optional] Export combined dataframe to CSV

In [None]:
exportfile = '../data/sales_combined.csv'
df.to_csv(exportfile, index=False)

# assert that the exported CSV file exists
os.path.isfile(exportfile)

In [None]:
# check the newly produced CSV file
combined_data = pd.read_csv(exportfile)

display(combined_data)
print(combined_data.shape)

# TODO
+ standardize building class category
+ clean apt number