# Combine raw data files into one dataframe

In [38]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os.path
import itertools

In [39]:
# Standardized column names, listed in the positional order in which they are
# assigned to the columns of each raw file.
colnames = [
    'borough',
    'neighborhood',
    'building_class_category',
    # can appear as "tax class as of final roll date" in some files
    'tax_class_present',
    'block',
    'lot',
    'easement',
    # can appear as "building class as of final roll date" in some files
    'building_class_present',
    'address',
    # can appear as NaN in some files -- included in the "address" field
    'apt_number',
    'zipcode',
    'residential_units',
    'commercial_units',
    'total_units',
    'land_sqft',
    'gross_sqft',
    'year_built',
    'tax_class_at_sale',
    'building_class_at_sale',
    'sale_price',
    'sale_date',
]

In [40]:
def read_in_data(filename):
    '''
    Load one raw Excel file into a dataframe.

    The first 4 rows of the sheet are skipped when reading, and the
    resulting columns are then renamed positionally to the standardized
    names held in `colnames`.
    '''
    leading_rows = list(range(4))  # row positions 0-3 are not read
    raw = pd.read_excel(filename, skiprows=leading_rows)

    # positional rename: the i-th column takes the i-th name in colnames
    raw.columns = colnames
    return raw

In [None]:
# read in all files (all years, all boroughs) and concatenate as one df

path = '../data/'
years = list(range(2011, 2020))
boroughs = ['bronx', 'brooklyn', 'manhattan', 'queens', 'statenisland']

frames = []

# one file per (year, borough) pair, named '<year>_<borough>.xls'
for year, borough in itertools.product(years, boroughs):
    filename = f'{path}{year}_{borough}.xls'
    data = read_in_data(filename)
    frames.append(data)

# Each frame carries its own row labels, so a plain concat would repeat the
# same labels once per file. ignore_index renumbers the combined rows so
# every row label in df is unique.
df = pd.concat(frames, ignore_index=True)

display(df)
print(df.shape) #confirm 21 columns

# Examining data
[Data dictionary](https://www1.nyc.gov/assets/finance/downloads/pdf/07pdf/glossary_rsf071607.pdf)

In [None]:
# list the distinct building class categories, sorted in place before printing
building_class_vals = df['building_class_category'].unique()
building_class_vals.sort()
print(building_class_vals)

# Cleaning data

In [None]:
# make a copy dataframe and clean; drop some unnecessary cols
# .copy() gives `clean` its own data -- plain assignment would only bind a
# second name to the same object as `df`
clean = df.copy()

In [None]:
# Columns to discard. block, lot and easement are intentionally kept now;
# tax_class_at_sale and building_class_at_sale are kept as well.
dropcols = [
    'tax_class_present',
    'building_class_present',
]
clean = clean.drop(columns=dropcols)

In [None]:
# dispose rows where sale price is zero (ownership transfer events)
# take the row count before filtering so the report matches the data that was
# actually loaded instead of a hard-coded total
rows_before = clean.shape[0]
clean = clean[clean.sale_price != 0]
print(f'This removed {rows_before - clean.shape[0]} values out of formerly {rows_before}, now {clean.shape[0]} are left.')

In [None]:
# restrict to records whose tax class at sale is 1 or 2
clean = clean.loc[clean['tax_class_at_sale'].isin([1, 2])]
# remaining distinct values (last expression of the cell)
clean['tax_class_at_sale'].unique()

In [None]:
# clean apartment numbers
# split each address on its first comma only: the text before it stays the
# address, the text after it (if any) is treated as the apartment
new = clean['address'].str.split(',', n=1, expand=True)

if 1 in new.columns:
    new[1] = new[1].str.strip()
else:
    # no address contained a comma, so the split produced only column 0
    new[1] = None

# assign() returns a new frame instead of setting columns on `clean`, which
# at this point is the result of earlier row filtering
clean = clean.assign(address=new[0], apt_2=new[1])
# TODO: need to merge apt and apt_2 cleanly

In [None]:
# add column for (sale price / total_units)
# dividing by a total_units of 0 would give inf; where() blanks those zeros to
# NaN first so the ratio is NaN (missing) for such rows
clean = clean.assign(
    sale_price_per_unit=clean['sale_price']
    / clean['total_units'].where(clean['total_units'] != 0)
)

In [None]:
# show the cleaned dataframe
display(clean)

# Pickle the cleaned dataframe
[pandas.DataFrame.to_pickle documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_pickle.html)

In [None]:
# write the cleaned dataframe to a pickle file under ../data/
picklefile = '../data/sales_combined.pkl'
clean.to_pickle(picklefile)

In [None]:
# test reading in pickle
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# trust -- here `picklefile` is the path this notebook writes itself
pickle_df = pd.read_pickle(picklefile)
#display(pickle_df)

# [Optional] Export combined dataframe to CSV

In [None]:
# export the combined dataframe `df` (not the cleaned `clean`) without the
# row index
exportfile = '../data/sales_combined.csv'
df.to_csv(exportfile, index=False)

# check that the exported CSV file exists; this only evaluates to True/False
# as the cell's last expression -- nothing is asserted
os.path.isfile(exportfile)

In [None]:
# check the newly produced CSV file by reading it back in
combined_data = pd.read_csv(exportfile)

# show the re-read frame and its (rows, columns) shape
display(combined_data)
print(combined_data.shape)

# TODO
+ standardize building class category
+ clean apt number (merge `apt_number` with the `apt_2` values split out of `address`)