# Process raw sales records

In [1]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import itertools
from IPython.display import display 

# ipython display settings
display.max_columns = None
np.set_printoptions(threshold=np.inf)

In [43]:
# make sure your dir structure looks like this:
# dsga1001/ (git repo) --> dsga1001/ --> ipython/ --> .ipynb files
# dsga1001-data/ --> .pkl files

# path for exporting and importing data files
path = '../../../dsga1001-data/'

## Combine raw data files into one dataframe

In [2]:
colnames = ['borough',
            'neighborhood',
            'building_class_category',
            'tax_class_present', #can appear as "tax class as of final roll date" in some files
            'block',
            'lot',
            'easement',
            'building_class_present', #can appear as "building class as of final roll date" in some files
            'address',
            'apt_number', #can appear as NaN in some files -- included in the "address" field
            'zipcode',
            'residential_units',
            'commercial_units',
            'total_units',
            'land_sqft',
            'gross_sqft',
            'year_built',
            'tax_class_at_sale',
            'building_class_at_sale',
            'sale_price',
            'sale_date']

In [3]:
def read_in_data(filename):
    '''
    read in the file as a dataframe, skipping the first 4 rows.
    standardize the column names
    '''
    df = pd.read_excel(filename, skiprows=[0,1,2,3])
    
    #df = df.rename(str.lower, axis='columns')
    #df = df.rename((lambda colname : colname.rstrip()), axis='columns')
    #df = df.rename(columns={ df.columns[3]: "tax_class_present" }, inplace = True)
    
    df.columns = colnames
    return df

In [6]:
# read in all files (all years, all boroughs), concatenate as one df, and export to CSV

sales_raw_data_dir = '../sales-raw-data/'
years = list(range(2011, 2020))
boroughs = ['bronx', 'brooklyn', 'manhattan', 'queens', 'statenisland']

frames = []

for year, borough in itertools.product(years, boroughs):
    filename = sales_raw_data_dir + str(year) + '_' + borough + '.xls'
    data = read_in_data(filename)
    frames.append(data)

df = pd.concat(frames)

In [45]:
# save combined raw sales data as pickle
df.to_pickle(path + 'raw-sales.pkl')

## Examining data
[Data dictionary](https://www1.nyc.gov/assets/finance/downloads/pdf/07pdf/glossary_rsf071607.pdf)

In [46]:
# read in pickle of combined raw sales data
df = pd.read_pickle(path + 'raw-sales.pkl')
print(len(df))

616275


In [7]:
display(df) #confirm 21 columns

Unnamed: 0,borough,neighborhood,building_class_category,tax_class_present,block,lot,easement,building_class_present,address,apt_number,...,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
0,2,BATHGATE,01 ONE FAMILY HOMES,1,3037,42,,A1,4428 PARK AVENUE,,...,1.0,0.0,1.0,3525.0,1340.0,1899.0,1,A1,0,2011-02-08
1,2,BATHGATE,01 ONE FAMILY HOMES,1,3039,64,,A1,467 EAST 185 STREET,,...,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01 ONE FAMILY HOMES,1,3046,34,,A1,2085 BATHGATE AV,,...,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01 ONE FAMILY HOMES,1,3048,28,,A1,540 EAST 182ND STREET,,...,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01 ONE FAMILY HOMES,1,3053,103,,A1,2327 BASSFORD AVENUE,,...,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8366,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7355,95,,B9,115 AMBASSADOR LANE,,...,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,48,,B9,108 BERKSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1,7094,49,,B9,35 WILTSHIRE LANE,,...,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22
8369,5,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,7349,28,,V0,N/A QUAIL LANE,,...,0.0,0.0,0.0,2640.0,0.0,,1,V0,26000,2019-08-22


## Cleaning data

In [31]:
# make a copy dataframe for cleaning
clean = df

In [32]:
# drop unnecessary cols
'''
the only cols we need to keep in valuation data:
- BBLE
- BLDGCL
- LTFRONT
- LTDEPTH
- STORIES
- FULLVAL
- BLDFRONT
- BLDDEPTH
- YEAR
- Latitude
- Longitude
- NTA
'''
dropcols = ['tax_class_present',
            'easement',
            'building_class_present']

clean = clean.drop(dropcols, axis=1)

In [33]:
# check unique vals on building_class_category col
building_class_vals = clean.building_class_category.unique()
building_class_vals.sort()
print(building_class_vals)

['                                            '
 '01  ONE FAMILY DWELLINGS                    '
 '01  ONE FAMILY HOMES                        ' '01 ONE FAMILY DWELLINGS'
 '01 ONE FAMILY DWELLINGS                    '
 '02  TWO FAMILY DWELLINGS                    '
 '02  TWO FAMILY HOMES                        ' '02 TWO FAMILY DWELLINGS'
 '02 TWO FAMILY DWELLINGS                    '
 '03  THREE FAMILY DWELLINGS                  '
 '03  THREE FAMILY HOMES                      '
 '03 THREE FAMILY DWELLINGS' '03 THREE FAMILY DWELLINGS                  '
 '04  TAX CLASS 1 CONDOS                      ' '04 TAX CLASS 1 CONDOS'
 '04 TAX CLASS 1 CONDOS                      '
 '05  TAX CLASS 1 VACANT LAND                 '
 '05 TAX CLASS 1 VACANT LAND'
 '05 TAX CLASS 1 VACANT LAND                 '
 '06  TAX CLASS 1 - OTHER                     ' '06 TAX CLASS 1 - OTHER'
 '06 TAX CLASS 1 - OTHER                     '
 '07  RENTALS - WALKUP APARTMENTS             '
 '07 RENTALS - WALKUP APARTMENT

In [34]:
# standardize building class values to the first two digits
clean['building_class_category'] = clean['building_class_category'].map(lambda val : val[:2])

In [35]:
# verify unique vals on building_class_category col after strip
building_class_vals = clean.building_class_category.unique()
building_class_vals.sort()
print(building_class_vals)

['  ' '01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13'
 '14' '15' '16' '17' '18' '21' '22' '23' '24' '25' '26' '27' '28' '29'
 '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41' '42' '43'
 '44' '45' '46' '47' '48' '49']


In [36]:
rows_before = clean.shape[0]

# dispose rows where sale price is zero (ownership transfer events)
clean = clean[clean.sale_price != 0]

rows_after = clean.shape[0]

print(f'Removed={rows_before - rows_after}. Before={rows_before}, After={rows_after}')

Removed=240263. Before=773881, After=533618


In [37]:
rows_before = clean.shape[0]

# only keep Tax Class 1 and 2 records
clean = clean[clean['tax_class_at_sale'].isin([1, 2])]

rows_after = clean.shape[0]

print(f'Removed={rows_before - rows_after}. Before={rows_before}, After={rows_after}')

Removed=42314. Before=533618, After=491304


In [38]:
display(clean)

Unnamed: 0,borough,neighborhood,building_class_category,block,lot,address,apt_number,zipcode,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
1,2,BATHGATE,01,3039,64,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01,3046,34,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01,3048,28,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01,3053,103,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
5,2,BATHGATE,02,2929,128,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,5,WOODROW,02,7355,85,104 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30
8366,5,WOODROW,02,7355,95,115 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05,7094,48,108 BERKSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05,7094,49,35 WILTSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22


## Apartment Number cleaning (skip for now -- not important)

TODO: check that each dataframe only has apartment number in this col index

In [None]:
null_apt_number_row_count = clean['apt_number'].isnull().values.sum()
print(f'rows with null apt_number: {null_apt_number_row_count}')

In [None]:
# split apartment number from address
new = clean['address'].str.split(',', n=1, expand=True)

new[1] = new[1].transform(lambda val : val.str.strip())
new[1].unique()

clean['address'] = new[0]
clean['apt_2'] = new[1]

extracted_apt_number_row_count = clean['apt_2'].notnull().values.sum()
print(f'rows with newly extracted apt_number: {extracted_apt_number_row_count}')

In [None]:
display(clean[['address', 'apt_2']].where(clean['apt_number'].notnull()))

In [None]:
# merge apt_number and apt_2 where apt_number is empty
clean['apt_number'] = clean['apt_number'].where(clean['apt_number'].notnull(), clean['apt_2'])

null_apt_number_row_count = clean['apt_number'].isnull().values.sum()
print(f'rows with null apt_number: {null_apt_number_row_count}')

In [None]:
display(clean['apt_2'].unique)

In [None]:
# drop the intermediate apt_2 column
clean = clean.drop(['apt_2'], axis=1)

In [None]:
# fill null apt_number rows with NaN
# clean['apt_number'] = clean['apt_number'].replace(r'^\s*$', np.nan, regex=True)
# clean['apt_number'] = clean['apt_number'].replace(None, np.nan, regex=True)

# strip spaces at the end
clean['apt_number'] = clean['apt_number'].map(lambda x: str(x).rstrip())

In [None]:
# apt_number_vals = clean['apt_number'].unique()
# apt_number_vals.sort()
# print(apt_number_vals)

In [None]:
# add column for (sale price / total_units)
# clean['sale_price_per_unit'] = clean['sale_price'] / clean['total_units']

In [None]:
display(clean)

## Pickle the cleaned dataframe
[pandas.DataFrame.to_pickle documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_pickle.html)

In [49]:
picklefile = 'sales.pkl'
clean.to_pickle(path + picklefile)

In [50]:
# test reading in pickle
pickle_df = pd.read_pickle(path + picklefile)
display(pickle_df)

Unnamed: 0,borough,neighborhood,building_class_category,block,lot,address,apt_number,zipcode,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
1,2,BATHGATE,01,3039,64,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01,3046,34,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01,3048,28,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01,3053,103,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
5,2,BATHGATE,02,2929,128,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,5,WOODROW,02,7355,85,104 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30
8366,5,WOODROW,02,7355,95,115 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05,7094,48,108 BERKSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05,7094,49,35 WILTSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22


## [Optional] Export combined dataframe to CSV

In [None]:
exportfile = 'sales.csv'
df.to_csv(path + exportfile, index=False)

# assert that the exported CSV file exists
os.path.isfile(exportfile)

In [None]:
# check the newly produced CSV file
combined_sales_data = pd.read_csv(path + exportfile)

display(combined_sales_data)
print(combined_sales_data.shape)

## TODO
+ clean apt number