# Combine Sales and Valuation datasets into one dataframe

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import itertools
import datetime

from IPython.display import display
# Show every column when a frame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [2]:
# Expected directory layout (the data directory sits next to the git repo, not inside it):
#   dsga1001/ (git repo) --> dsga1001/ --> ipython/ --> .ipynb files
#   dsga1001-data/ --> .pkl files

# Relative path (three levels up from this notebook) used as the prefix
# for every data file this notebook imports or exports
path = '../../../dsga1001-data/'

## Read in the Sales and Valuation dataframes

In [3]:
# Load the two pickled source frames from the data directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
sales_file = path + 'sales.pkl'
valuation_file = path + 'valuation.pkl'
sales = pd.read_pickle(sales_file)
valuation = pd.read_pickle(valuation_file)

In [4]:
# Preview the raw sales frame (a long frame is abbreviated to its first and last rows)
display(sales)

Unnamed: 0,borough,neighborhood,building_class_category,block,lot,address,apt_number,zipcode,residential_units,commercial_units,total_units,land_sqft,gross_sqft,year_built,tax_class_at_sale,building_class_at_sale,sale_price,sale_date
1,2,BATHGATE,01,3039,64,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19
2,2,BATHGATE,01,3046,34,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29
3,2,BATHGATE,01,3048,28,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21
4,2,BATHGATE,01,3053,103,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25
5,2,BATHGATE,02,2929,128,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,5,WOODROW,02,7355,85,104 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30
8366,5,WOODROW,02,7355,95,115 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20
8367,5,WOODROW,05,7094,48,108 BERKSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24
8368,5,WOODROW,05,7094,49,35 WILTSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22


In [5]:
# Preview the raw valuation frame (a long frame is abbreviated to its first and last rows)
display(valuation)

Unnamed: 0,BBLE,BLDGCL,LTFRONT,LTDEPTH,STORIES,FULLVAL,BLDFRONT,YEAR,Latitude,Longitude,NTA
0,1010401210,R4,0,0,35.0,113465,327,2010/11,40.762988,-73.987791,Clinton
1,1008901282,R4,0,0,19.0,135599,0,2010/11,40.746162,-73.979129,Murray Hill-Kips Bay
2,1009681288,R4,0,0,37.0,222481,0,2010/11,40.745156,-73.972500,Turtle Bay-East Midtown
3,1008701611,R4,244,425,27.0,172555,0,2010/11,40.734072,-73.988713,Gramercy
4,1010381112,R4,0,0,43.0,175784,100,2010/11,40.761262,-73.987572,Clinton
...,...,...,...,...,...,...,...,...,...,...,...
9845849,1006951519,R4,0,0,14.0,131716,0,2010/11,40.748400,-74.005836,Hudson Yards-Chelsea-Flatiron-Union Square
9845852,1007650020,C4,25,103,5.0,782000,92,2010/11,40.739672,-74.000173,Hudson Yards-Chelsea-Flatiron-Union Square
9845854,1007640075,C4,25,93,4.0,3340000,55,2010/11,40.740141,-74.002227,Hudson Yards-Chelsea-Flatiron-Union Square
9845855,1006260047,S4,24,88,4.0,6460000,40,2010/11,40.738143,-74.004471,West Village


## Standardize column names

In [6]:
# Normalise the column names of both frames: lower case, spaces turned into underscores
def _snake_case(column_name):
    """Return the column name lower-cased with spaces replaced by underscores."""
    return column_name.lower().replace(" ", "_")

valuation.rename(columns=_snake_case, inplace=True)
sales.rename(columns=_snake_case, inplace=True)

In [7]:
# Tag every column with its dataset of origin: 's_' for sales, 'v_' for valuation.
# NOTE: the rename happens in place, so re-running this cell prefixes the columns again.
def _prefixer(prefix):
    """Build a column-name mapper that prepends ``prefix`` to a name."""
    def add_prefix(column_name):
        return prefix + column_name
    return add_prefix

sales.rename(columns=_prefixer('s_'), inplace=True)
valuation.rename(columns=_prefixer('v_'), inplace=True)

In [8]:
# Check the sales column names after lower-casing and adding the 's_' prefix
sales.columns

Index(['s_borough', 's_neighborhood', 's_building_class_category', 's_block',
       's_lot', 's_address', 's_apt_number', 's_zipcode',
       's_residential_units', 's_commercial_units', 's_total_units',
       's_land_sqft', 's_gross_sqft', 's_year_built', 's_tax_class_at_sale',
       's_building_class_at_sale', 's_sale_price', 's_sale_date'],
      dtype='object')

In [9]:
# Check the valuation column names after lower-casing and adding the 'v_' prefix
valuation.columns

Index(['v_bble', 'v_bldgcl', 'v_ltfront', 'v_ltdepth', 'v_stories',
       'v_fullval', 'v_bldfront', 'v_year', 'v_latitude', 'v_longitude',
       'v_nta'],
      dtype='object')

## Create Index

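The BBL (Borough, Block, Lot) number is the ideal key for joining the two datasets, because the NYC property databases use it to index properties. Below it is built as a single string: the borough code, followed by the block zero-padded to 5 digits and the lot zero-padded to 4 digits.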

https://www.pincusco.com/data-dictionary/bbl-also-known-as-borough-block-and-lot-or-property-id/
https://www.propertyshark.com/Real-Estate-Reports/2018/12/13/all-you-need-to-know-about-nycs-bbl/#:~:text=BBL%20stands%20for%20borough%2C%20block,1%20%3D%20Manhattan

In [10]:
# In the valuation data BBLE equals BBL for rows without an easement,
# so the column is treated as the BBL key and renamed to 'v_bbl'
valuation.rename({'v_bble': 'v_bbl'}, axis='columns', inplace=True)

In [12]:
# Build the BBL key for the sales data:
# borough code + block zero-padded to 5 digits + lot zero-padded to 4 digits
sales['s_block'] = sales['s_block'].astype(str).str.zfill(5)
sales['s_lot'] = sales['s_lot'].astype(str).str.zfill(4)

# s_block and s_lot were just converted to strings, so only the borough still needs casting
sales['s_bbl'] = sales['s_borough'].astype(str) + sales['s_block'] + sales['s_lot']

In [13]:
# create a binned year col in sales df (cutoff: July 1)

# first, convert string into datetime
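# if the date in a given year N is on or after July 1st, then mark it as 'N/N+1'
# (second year abbreviated to two digits, e.g. 2011-08-25 -> '2011/12')
# otherwise, mark as 'N-1/N' (e.g. 2011-01-19 -> '2010/11')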

def bin_year(sale_date):
    """Map a sale date to its July-to-June year bin, formatted like '2010/11'.

    A date on or after July 1 of year N belongs to 'N/N+1'; an earlier date
    belongs to 'N-1/N'. The second year is abbreviated to its last two digits.

    Parameters
    ----------
    sale_date : datetime.date, datetime.datetime or pandas.Timestamp
        Date of the sale. Missing values (None, NaN, NaT) are accepted.

    Returns
    -------
    str or None
        The year-bin label, or None when ``sale_date`` is missing.
    """
    # A missing date has no usable year/month; return None rather than crash.
    if pd.isna(sale_date):
        return None

    # Decide on the month instead of building datetime.date(year, 7, 1) and
    # comparing against it: ordering a datetime.date against a
    # datetime.datetime raises TypeError, and so does ordering it against a
    # pandas.Timestamp since pandas 2.0.
    start_year = sale_date.year if sale_date.month >= 7 else sale_date.year - 1
    return str(start_year) + '/' + str(start_year + 1)[-2:]

# Parse the sale dates, bin each one, and store the label in a new column 's_year'
# (a merge key -- it has to use the same format as the valuation year column)
sale_dates = pd.to_datetime(sales['s_sale_date'])
sales['s_year'] = sale_dates.map(bin_year)

In [15]:
# Number of distinct BBLs in each dataset: sales on the first line, valuation on the second
print(sales['s_bbl'].nunique(), valuation['v_bbl'].nunique(), sep='\n')

294796
991041


-------------------------------------------------------------------------------

## Merging

In [22]:
# Inner join: keep only the sales rows that have a valuation row
# with the same BBL and the same year bin
inner_merged = sales.merge(
    valuation,
    how='inner',
    left_on=['s_bbl', 's_year'],
    right_on=['v_bbl', 'v_year'],
)

In [23]:
# The valuation-side join keys duplicate 's_bbl' / 's_year' after the inner merge, so remove them
inner_merged = inner_merged.drop(columns=['v_bbl', 'v_year'])

In [24]:
# Preview the merged frame (a long frame is abbreviated to its first and last rows)
display(inner_merged)

Unnamed: 0,s_borough,s_neighborhood,s_building_class_category,s_block,s_lot,s_address,s_apt_number,s_zipcode,s_residential_units,s_commercial_units,s_total_units,s_land_sqft,s_gross_sqft,s_year_built,s_tax_class_at_sale,s_building_class_at_sale,s_sale_price,s_sale_date,s_bbl,s_year,v_bldgcl,v_ltfront,v_ltdepth,v_stories,v_fullval,v_bldfront,v_latitude,v_longitude,v_nta
0,2,BATHGATE,01,03039,0064,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19,2030390064,2010/11,A1,16,100,2.0,365000,44,40.856635,-73.892647,Claremont-Bathgate
1,2,BATHGATE,01,03046,0034,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29,2030460034,2010/11,A1,22,93,2.7,322000,37,40.850613,-73.895064,Claremont-Bathgate
2,2,BATHGATE,01,03048,0028,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21,2030480028,2010/11,A1,15,80,2.0,299000,30,40.853004,-73.892627,East Tremont
3,2,BATHGATE,01,03053,0103,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25,2030530103,2011/12,A1,20,45,2.0,376000,30,40.855847,-73.891755,Claremont-Bathgate
4,2,BATHGATE,02,02929,0128,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13,2029290128,2011/12,B1,0,42,3.0,403000,42,40.837745,-73.900144,Claremont-Bathgate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425729,5,WOODROW,02,07316,0084,57 DARNELL LANE,,10309.0,2.0,0.0,2.0,2311.0,1500.0,1995.0,1,B2,620000,2019-06-20,5073160084,2018/19,B2,26,88,2.0,518000,44,40.536003,-74.223213,Charleston-Richmond Valley-Tottenville
425730,5,WOODROW,02,07317,0119,73 ROBIN COURT,,10309.0,2.0,0.0,2.0,4125.0,2160.0,1994.0,1,B2,625000,2019-05-21,5073170119,2018/19,B2,25,165,3.0,527000,40,40.536140,-74.222012,Charleston-Richmond Valley-Tottenville
425731,5,WOODROW,02,07349,0004,51 PHEASANT LANE,,10309.0,2.0,0.0,2.0,2123.0,1850.0,1998.0,1,B9,670000,2019-02-22,5073490004,2018/19,B9,23,91,2.0,581000,52,40.532829,-74.222422,Charleston-Richmond Valley-Tottenville
425732,5,WOODROW,02,07349,0035,33 QUAIL LANE,,10309.0,2.0,0.0,2.0,2255.0,2377.0,1998.0,1,B9,640000,2019-03-08,5073490035,2018/19,B9,27,83,3.0,517000,40,40.532390,-74.221234,Charleston-Richmond Valley-Tottenville


In [21]:
# Persist the merged frame next to the input pickles.
# NOTE(review): this cell's execution count (21) is lower than the merge cells' (22-24),
# so the pickle on disk may predate the merge shown above -- re-run this cell after the
# merge (ideally Restart & Run All) before relying on the saved file.
pd.to_pickle(inner_merged, path + 'merged-sales-val.pkl')