# Combining datasets into one dataframe

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import itertools
import datetime

from IPython.display import display
pd.options.display.max_columns = None

## Create Index

The BBL number is going to be ideal for indexing as it is used for this purpose by the NYC databases.

https://www.pincusco.com/data-dictionary/bbl-also-known-as-borough-block-and-lot-or-property-id/


https://www.propertyshark.com/Real-Estate-Reports/2018/12/13/all-you-need-to-know-about-nycs-bbl/#:~:text=BBL%20stands%20for%20borough%2C%20block,1%20%3D%20Manhattan

In [42]:
# Import the sales data from pckl
sales_pckl = '../data/sales_combined.pkl'

# This is the path to retrieve files one above the github directory
above = '../../../'
# Import valuation data using the above directory
valuation_pckl = above + 'assessment_og.pkl'

# Read the datasets into dataframes
sales = pd.read_pickle(sales_pckl)
valuation = pd.read_pickle(valuation_pckl)

# Standardizing Columns and cleaning Valuation

In [43]:
# remove 2011-2012 rows
# sales data starts at July 2013
valuation = valuation[~valuation['YEAR'].isin(['2010/11', '2011/12'])]

In [44]:
# remove rows with specific building classes
bldgcodekeep = [
'A0',
'A1',
'A2',
'A3',
'A4',
'A5',
'A6',
'A7',
'A8',
'A9',
'B1',
'B2',
'B3',
'B9',
'C0',
'C1',
'C2',
'C3',
'C4',
'C5',
'C6',
'C7',
'C8',
'C9',
'CM',
'D0',
'D1',
'D2',
'D3',
'D4',
'D5',
'D6',
'D7',
'D8',
'D9',
'R1',
'R2',
'R3',
'R4',
'R6',
'S0',
'S1',
'S2',
'S3',
'S4',
'S5',
'S9'
]

valuation = valuation[valuation['BLDGCL'].isin(bldgcodekeep)]

In [37]:
#len(bldgcodekeep)

47

In [25]:
#valuation['BLDGCL'].unique()

array(['R4', 'A1', 'R1', 'C3', 'A5', 'C7', 'S1', 'S5', 'A4', 'C0', 'C5',
       'B9', 'R6', 'D7', 'R3', 'B1', 'C1', 'B3', 'C2', 'C9', 'S9', 'B2',
       'A2', 'D6', 'S2', 'R2', 'S3', 'D1', 'A3', 'A0', 'A9', 'D2', 'C6',
       'S0', 'S4', 'A6', 'A7', 'C4', 'D3', 'D8', 'D9', 'D4', 'D5', 'D0',
       'C8', 'A8', 'CM'], dtype=object)

In [26]:
#valuation.shape

(6641071, 31)

In [48]:
#pd.to_pickle(valuation, above + 'temp.pkl')

In [70]:
#valuation = pd.read_pickle(above + 'temp.pkl')

In [73]:
# remove all records with a non-NaN easement
valuation = valuation[valuation['EASEMENT'].isna()]

In [74]:
valuation['EASEMENT'].unique()

array([nan], dtype=object)

In [75]:
# There is only one value for PERIOD in the valuation dataset so we can drop it.
print(valuation.PERIOD.unique())
dropcols = ["PERIOD",
            "Borough",
            "EASEMENT",
            "EXLAND",
            "EXTOT",
            "EXCD1",
            "EXMPTCL",
            "EXLAND2",
            "EXTOT2",
            "EXCD2",
            "VALTYPE"]

valuation.drop(dropcols, axis=1, inplace = True)

['FINAL']


In [77]:
valuation.columns

Index(['BBLE', 'BORO', 'BLOCK', 'LOT', 'OWNER', 'BLDGCL', 'TAXCLASS',
       'LTFRONT', 'LTDEPTH', 'EXT', 'STORIES', 'FULLVAL', 'AVLAND', 'AVTOT',
       'STADDR', 'POSTCODE', 'BLDFRONT', 'BLDDEPTH', 'AVLAND2', 'AVTOT2',
       'YEAR', 'Latitude', 'Longitude', 'Community Board', 'Council District',
       'Census Tract', 'BIN', 'NTA', 'New Georeferenced Column', 'bbl'],
      dtype='object')

In [79]:
#display(valuation)

In [76]:
# Add the BBL column for indexing to the valuation data
valuation['bbl'] = valuation['BORO'].astype(str) + valuation['BLOCK'].astype(str) + valuation['LOT'].astype(str)

# Add the BBL column for indexing to the sales data
sales['bbl'] = sales['borough'].astype(str) + sales['block'].astype(str) + sales['lot'].astype(str)

KeyError: 'borough'

In [80]:
# add prefix to all cols depending on dataset
valuation.rename(columns=lambda x: 'V_' + x, inplace=True)
sales.rename(columns=lambda x: 'S_' + x, inplace=True)

In [81]:
# Change the column names to be lower case, and replace white space with underscores
valuation.rename(columns=lambda x: x.lower().replace(" ", "_"), inplace=True)

In [89]:
# rename v_bbl to bbl
valuation.rename(columns={'v_bbl':'bbl'}, inplace=True)

In [100]:
# rename s_bbl to bbl
sales.rename(columns={'S_bbl':'bbl'}, inplace=True)

In [98]:
#sales.rename(columns=lambda x: x[2:], inplace=True)

In [156]:
# rename v_year to year (for joining with sales data)
valuation.rename(columns={'v_year':'year'}, inplace=True)

In [114]:
# Count the number of bbl values that are in the sales data and are also in the valuation
print(sum(sales.bbl.isin(valuation.bbl)))
print(f" There are {sales.shape[0]} total sales and {sales.shape[0] - 486771} missing from appraisal")

472740
 There are 491304 total sales and 4533 missing from appraisal


In [115]:
# drop valuation records that don't have corresponding BBLs in sales data
valuation = valuation[valuation.bbl.isin(sales.bbl)]

In [154]:
# create a binned year col in sales df (cutoff: July 1)

# first, convert string into datetime
# if the date in a given year N is past July 1st, then mark it as 'N/N+1'
# otherwise, mark as 'N-1/N'

def bin_year(sale_date):
    year = sale_date.year
    if datetime.date(year, 7, 1) <= sale_date:
        return str(year) + '/' + str(year + 1)
    else:
        return str(year - 1) + '/' + str(year)

# save that result string into a new col called 'year' (index col -- should be same as valuation col)
sales['year'] = pd.to_datetime(sales['S_sale_date']).map(lambda x : bin_year(x))

In [157]:
display(valuation)

Unnamed: 0,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,year,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column,bbl
2458,1000163859,1,16,3859,"CHEN, QI TOM",R4,2,0,0,,31.0,354180,3310,159381,1 RIVER TERRACE,,0,0,3310.0,148953.0,2018/19,,,,,,,,,1163859
13316,1006011352,1,601,1352,,R4,2,0,0,,12.0,1763939,77064,793773,160 LEROY STREET,10014.0,0,0,,,2018/19,40.730523,-74.010095,102.0,3.0,69.0,1090584.0,West Village,POINT (-74.010095 40.730523),16011352
15932,1002051103,1,205,1103,180 HESTER STREET INV,R1,2C,0,0,,7.0,180462,15288,48024,180 HESTER STREET,10013.0,0,0,,,2018/19,40.718109,-73.997810,102.0,1.0,41.0,1087706.0,SoHo-TriBeCa-Civic Center-Little Italy,POINT (-73.99781 40.718109),12051103
20592,1003730063,1,373,63,BAY PETDRE SIX LLC,C3,2A,21,96,,3.0,1384000,4177,33419,285 EAST 3 STREET,10009.0,22,44,,,2018/19,40.721440,-73.980017,103.0,2.0,2601.0,1004405.0,Lower East Side,POINT (-73.980017 40.72144),137363
34367,1006151006,1,615,1006,"D'ALATRI, ALESSANDRO",R1,2C,0,0,,5.0,865732,9956,92201,274 WEST 12 STREET,10014.0,0,0,,,2018/19,40.737567,-74.003359,102.0,3.0,77.0,1011003.0,West Village,POINT (-74.003359 40.737567),16151006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9810399,5079050097,5,7905,97,BLOCK 7905 LOT 97 COR,A2,1,125,100,G,1.0,465000,18000,27900,565 MAIN STREET,10307.0,24,44,,,2012/13,40.500781,-74.244041,503.0,51.0,24402.0,5088168.0,Charleston-Richmond Valley-Tottenville,POINT (-74.244041 40.500781),5790597
9810460,5079650033,5,7965,33,"ARTALE, ANGELA M",A1,1,42,123,E,2.0,798000,21793,44028,36 TOTTENVILLE PLACE,10307.0,37,25,,,2012/13,40.506937,-74.254301,503.0,51.0,24401.0,5088546.0,Charleston-Richmond Valley-Tottenville,POINT (-74.254301 40.506937),5796533
9810505,5078870021,5,7887,21,"REICHER, JARRETT",B2,1,41,100,,2.0,644000,22680,38640,264 MANHATTAN STREET,10307.0,27,45,,,2012/13,40.503795,-74.242254,503.0,51.0,24402.0,5087948.0,Charleston-Richmond Valley-Tottenville,POINT (-74.242254 40.503795),5788721
9810513,5075960219,5,7596,219,,A5,1,44,57,,2.0,610000,4507,32730,72 PILOT LANE,10309.0,22,45,,,2012/13,40.531203,-74.240580,503.0,51.0,226.0,5163812.0,Charleston-Richmond Valley-Tottenville,POINT (-74.24058 40.531203),57596219


In [165]:
# SAVING INTO TEMP PKL

#pd.to_pickle(valuation, above + 'temp.pkl')
#pd.to_pickle(sales, above + 'temp_sales.pkl')

#valuation = pd.read_pickle(above + 'temp.pkl')
#sales = pd.read_pickle(above + 'temp_sales.pkl')

In [169]:
display(sales)

Unnamed: 0,S_borough,S_neighborhood,S_building_class_category,S_block,S_lot,S_easement,S_address,S_apt_number,S_zipcode,S_residential_units,S_commercial_units,S_total_units,S_land_sqft,S_gross_sqft,S_year_built,S_tax_class_at_sale,S_building_class_at_sale,S_sale_price,S_sale_date,S_apt_2,S_sale_price_per_unit,bbl,year
1,2,BATHGATE,01 ONE FAMILY HOMES,3039,64,,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1,A1,329000,2011-01-19,,329000.0,2303964,2010/2011
2,2,BATHGATE,01 ONE FAMILY HOMES,3046,34,,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1,A1,288500,2011-06-29,,288500.0,2304634,2010/2011
3,2,BATHGATE,01 ONE FAMILY HOMES,3048,28,,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1,A1,122500,2011-03-21,,122500.0,2304828,2010/2011
4,2,BATHGATE,01 ONE FAMILY HOMES,3053,103,,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1,A1,10,2011-08-25,,10.0,23053103,2011/2012
5,2,BATHGATE,02 TWO FAMILY HOMES,2929,128,,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1,B1,365000,2011-07-13,,182500.0,22929128,2011/2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,5,WOODROW,02 TWO FAMILY DWELLINGS,7355,85,,104 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30,,312500.0,5735585,2019/2020
8366,5,WOODROW,02 TWO FAMILY DWELLINGS,7355,95,,115 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20,,318750.0,5735595,2019/2020
8367,5,WOODROW,05 TAX CLASS 1 VACANT LAND,7094,48,,108 BERKSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24,,299850.5,5709448,2018/2019
8368,5,WOODROW,05 TAX CLASS 1 VACANT LAND,7094,49,,35 WILTSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22,,448030.0,5709449,2018/2019


In [175]:
# merge (DB-style join) valuation and sales dataframes into one on bbl col
merged = sales.merge(right=valuation, how='outer', on=['bbl', 'year'])

In [176]:
display(merged)

Unnamed: 0,S_borough,S_neighborhood,S_building_class_category,S_block,S_lot,S_easement,S_address,S_apt_number,S_zipcode,S_residential_units,S_commercial_units,S_total_units,S_land_sqft,S_gross_sqft,S_year_built,S_tax_class_at_sale,S_building_class_at_sale,S_sale_price,S_sale_date,S_apt_2,S_sale_price_per_unit,bbl,year,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column
0,2.0,BATHGATE,01 ONE FAMILY HOMES,3039.0,64.0,,467 EAST 185 STREET,,10458.0,1.0,0.0,1.0,1667.0,1296.0,1910.0,1.0,A1,329000.0,2011-01-19,,329000.0,2303964,2010/2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2.0,BATHGATE,01 ONE FAMILY HOMES,3046.0,34.0,,2085 BATHGATE AV,,10457.0,1.0,0.0,1.0,2060.0,1629.0,1899.0,1.0,A1,288500.0,2011-06-29,,288500.0,2304634,2010/2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2.0,BATHGATE,01 ONE FAMILY HOMES,3048.0,28.0,,540 EAST 182ND STREET,,10457.0,1.0,0.0,1.0,1209.0,1048.0,1901.0,1.0,A1,122500.0,2011-03-21,,122500.0,2304828,2010/2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2.0,BATHGATE,01 ONE FAMILY HOMES,3053.0,103.0,,2327 BASSFORD AVENUE,,10458.0,1.0,0.0,1.0,913.0,1248.0,1901.0,1.0,A1,10.0,2011-08-25,,10.0,23053103,2011/2012,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2.0,BATHGATE,02 TWO FAMILY HOMES,2929.0,128.0,,559 CLAREMONT PARKWAY,,10457.0,2.0,0.0,2.0,2361.0,2394.0,1995.0,1.0,B1,365000.0,2011-07-13,,182500.0,22929128,2011/2012,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2429485,,,,,,,,,,,,,,,,,,,NaT,,,5790597,2012/13,5079050097,5.0,7905.0,97.0,BLOCK 7905 LOT 97 COR,A2,1,125.0,100.0,G,1.0,465000.0,18000.0,27900.0,565 MAIN STREET,10307.0,24.0,44.0,,,40.500781,-74.244041,503.0,51.0,24402.0,5088168.0,Charleston-Richmond Valley-Tottenville,POINT (-74.244041 40.500781)
2429486,,,,,,,,,,,,,,,,,,,NaT,,,5796533,2012/13,5079650033,5.0,7965.0,33.0,"ARTALE, ANGELA M",A1,1,42.0,123.0,E,2.0,798000.0,21793.0,44028.0,36 TOTTENVILLE PLACE,10307.0,37.0,25.0,,,40.506937,-74.254301,503.0,51.0,24401.0,5088546.0,Charleston-Richmond Valley-Tottenville,POINT (-74.254301 40.506937)
2429487,,,,,,,,,,,,,,,,,,,NaT,,,5788721,2012/13,5078870021,5.0,7887.0,21.0,"REICHER, JARRETT",B2,1,41.0,100.0,,2.0,644000.0,22680.0,38640.0,264 MANHATTAN STREET,10307.0,27.0,45.0,,,40.503795,-74.242254,503.0,51.0,24402.0,5087948.0,Charleston-Richmond Valley-Tottenville,POINT (-74.242254 40.503795)
2429488,,,,,,,,,,,,,,,,,,,NaT,,,57596219,2012/13,5075960219,5.0,7596.0,219.0,,A5,1,44.0,57.0,,2.0,610000.0,4507.0,32730.0,72 PILOT LANE,10309.0,22.0,45.0,,,40.531203,-74.240580,503.0,51.0,226.0,5163812.0,Charleston-Richmond Valley-Tottenville,POINT (-74.24058 40.531203)


In [177]:
# SAVING MERGED DF
#pd.to_pickle(merged, above + 'merged.pkl')

Convert b: borough the following are the same: block, lot, easement. Need to figure out when the financial assessment year starts to convert datetime values to a year column in the sales data. Do we need to convert nta and neighberhood in order to have the 

In [None]:
# The amount of vacant lots that we would lose
len(sales[sales["building_class_at_sale"].str.contains('V')])


In [None]:
len(sales[sales["building_class_at_sale"].str.contains('A')]) + len(sales[sales["building_class_at_sale"].str.contains('B')]) + len(sales[sales["building_class_at_sale"].str.contains('C')]) + len(sales[sales["building_class_at_sale"].str.contains('D')]) + len(sales[sales["building_class_at_sale"].str.contains('S')]) + len(sales[sales["building_class_at_sale"].str.contains('R')])

491304 - 486106

In [None]:
colnames = ['borough',
            'neighborhood',
            'building_class_category',
            'tax_class_present', #can appear as "tax class as of final roll date" in some files
            'block',
            'lot',
            'easement',
            'building_class_present', #can appear as "building class as of final roll date" in some files
            'address',
            'apt_number', #can appear as NaN in some files -- included in the "address" field
            'zipcode',
            'residential_units',
            'commercial_units',
            'total_units',
            'land_sqft',
            'gross_sqft',
            'year_built',
            'tax_class_at_sale',
            'building_class_at_sale',
            'sale_price',
            'sale_date']