# Combining datasets into one dataframe

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import itertools
import datetime

from IPython.display import display
pd.options.display.max_columns = None

## Create Index

The BBL number is going to be ideal for indexing as it is used for this purpose by the NYC databases.

https://www.pincusco.com/data-dictionary/bbl-also-known-as-borough-block-and-lot-or-property-id/
https://www.propertyshark.com/Real-Estate-Reports/2018/12/13/all-you-need-to-know-about-nycs-bbl/#:~:text=BBL%20stands%20for%20borough%2C%20block,1%20%3D%20Manhattan

In [6]:
# Relative prefix of the directory holding the pickled datasets.
# NOTE(review): the original note says "one above the github directory",
# while this prefix climbs three levels -- confirm which is intended.
above = '../../../'

# Location of the pickled, combined sales data
# (alternative in-repo location noted by the author: '../data/sales_combined.pkl').
sales_pckl = f'{above}sales_combined.pkl'

In [7]:
# Location of the pickled valuation data, under the same `above` prefix.
valuation_pckl = f'{above}assessment_og.pkl'

# Load both datasets into dataframes.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
valuation = pd.read_pickle(valuation_pckl)
sales = pd.read_pickle(sales_pckl)

# Standardizing Columns and cleaning Valuation

In [8]:
# Drop the '2010/11' and '2011/12' assessment years.
# NOTE(review): the original rationale was "sales data starts at July 2013";
# the merged output further down shows sales dated earlier in 2013 -- confirm.
is_early_year = valuation['YEAR'].isin(['2010/11', '2011/12'])
valuation = valuation[~is_early_year]

In [9]:
# Building class codes to KEEP in the valuation data; rows with any other
# class are filtered out by the statement that follows this list.
bldgcodekeep = [
    'A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9',
    'B1', 'B2', 'B3', 'B9',
    'C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'CM',
    'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9',
    'R1', 'R2', 'R3', 'R4', 'R6',
    'S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S9',
]

# Keep only the rows whose building class is on the keep-list.
valuation = valuation.loc[valuation['BLDGCL'].isin(bldgcodekeep)]

In [None]:
#len(bldgcodekeep)

In [None]:
#valuation['BLDGCL'].unique()

In [None]:
#valuation.shape

In [None]:
#pd.to_pickle(valuation, above + 'temp.pkl')

In [None]:
#valuation = pd.read_pickle(above + 'temp.pkl')

In [10]:
# Keep only the records whose EASEMENT value is missing (NaN).
no_easement = valuation['EASEMENT'].isna()
valuation = valuation[no_easement]

In [11]:
valuation['EASEMENT'].unique()

array([nan], dtype=object)

In [12]:
# PERIOD carries a single value in the valuation dataset (printed below), so
# it is dropped together with the other columns listed in `dropcols`.
print(valuation['PERIOD'].unique())

dropcols = [
    "PERIOD", "Borough", "EASEMENT",
    "EXLAND", "EXTOT", "EXCD1", "EXMPTCL",
    "EXLAND2", "EXTOT2", "EXCD2",
    "VALTYPE",
]

# Mutates `valuation` in place.
valuation.drop(columns=dropcols, inplace=True)

['FINAL']


In [13]:
valuation.columns

Index(['BBLE', 'BORO', 'BLOCK', 'LOT', 'OWNER', 'BLDGCL', 'TAXCLASS',
       'LTFRONT', 'LTDEPTH', 'EXT', 'STORIES', 'FULLVAL', 'AVLAND', 'AVTOT',
       'STADDR', 'POSTCODE', 'BLDFRONT', 'BLDDEPTH', 'AVLAND2', 'AVTOT2',
       'YEAR', 'Latitude', 'Longitude', 'Community Board', 'Council District',
       'Census Tract', 'BIN', 'NTA', 'New Georeferenced Column'],
      dtype='object')

In [51]:
display(valuation)

Unnamed: 0,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,year,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column,bbl
1632,1000163612,1,16,3612,"PHILIP LAVIS HODGES,",R4,2,0,0,,31.0,235880,2205,106146,1 RIVER TERRACE,,0,0,2205.0,99198.0,2018/19,,,,,,,,,1163612
2458,1000163859,1,16,3859,"CHEN, QI TOM",R4,2,0,0,,31.0,354180,3310,159381,1 RIVER TERRACE,,0,0,3310.0,148953.0,2018/19,,,,,,,,,1163859
6668,4079650018,4,7965,18,ACOSTA REBECCA,A1,1,30,100,G,2.5,512000,11553,23288,241-25 87 AVENUE,11426.0,20,28,,,2015/16,40.730577,-73.725073,413.0,23.0,157102.0,4166775.0,Bellerose,POINT (-73.725073 40.730577),4796518
13316,1006011352,1,601,1352,,R4,2,0,0,,12.0,1763939,77064,793773,160 LEROY STREET,10014.0,0,0,,,2018/19,40.730523,-74.010095,102.0,3.0,69.0,1090584.0,West Village,POINT (-74.010095 40.730523),16011352
15932,1002051103,1,205,1103,180 HESTER STREET INV,R1,2C,0,0,,7.0,180462,15288,48024,180 HESTER STREET,10013.0,0,0,,,2018/19,40.718109,-73.997810,102.0,1.0,41.0,1087706.0,SoHo-TriBeCa-Civic Center-Little Italy,POINT (-73.99781 40.718109),12051103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9810545,5077400003,5,7740,3,"CORONATI, SABRINA",A5,1,25,100,,2.0,424000,9216,18519,262 RICHARD AVENUE,10309.0,15,40,,,2012/13,40.511832,-74.225132,503.0,51.0,24401.0,5087335.0,Charleston-Richmond Valley-Tottenville,POINT (-74.225132 40.511832),577403
9810548,5078090007,5,7809,7,AVDUL LLESHI,B2,1,80,161,,2.0,985000,27899,54742,18 GIEGERICH PLACE,10307.0,60,40,,,2012/13,40.506921,-74.234832,503.0,51.0,24401.0,5125209.0,Charleston-Richmond Valley-Tottenville,POINT (-74.234832 40.506921),578097
9810552,5079060046,5,7906,46,HARRY LAURO,A1,1,38,95,G,2.0,551000,17880,33060,610 MAIN STREET,10307.0,24,45,,,2012/13,40.499864,-74.243746,503.0,51.0,24402.0,5095747.0,Charleston-Richmond Valley-Tottenville,POINT (-74.243746 40.499864),5790646
9810556,5077100520,5,7710,520,W WIDMER,A2,1,69,100,,1.5,473000,13810,23001,442 RICHARD AVENUE,10309.0,31,26,,,2012/13,40.507401,-74.223787,503.0,51.0,24402.0,5087326.0,Charleston-Richmond Valley-Tottenville,POINT (-74.223787 40.507401),57710520


In [46]:
# Sorted distinct values of the sales lot column.
# (The variable is called `uniqueblock`, but the column inspected is S_lot.)
uniqueblock = np.sort(sales['S_lot'].unique())
print(uniqueblock)

[    1     3     6 ... 16323 16340 16350]


In [52]:
def _zero_padded(column, width):
    """Return `column` as strings, left-padded with zeros to at least `width` characters."""
    return column.astype(str).map(lambda text: text.zfill(width))


# Build the `bbl` join key for both datasets:
# borough + block padded to 5 characters + lot padded to 4 characters.
# NOTE(review): this cell uses the v_-prefixed valuation column names, while the
# renaming cells appear further down in the file -- confirm the run order.
# NOTE(review): the unique S_lot values printed earlier go up to 16350 (five
# digits); zfill never truncates, so such rows would get an 11-character bbl.
# Only valuation.bbl is length-checked afterwards -- confirm for sales.
valuation['v_block'] = _zero_padded(valuation['v_block'], 5)
valuation['v_lot'] = _zero_padded(valuation['v_lot'], 4)
valuation['bbl'] = (
    valuation['v_boro'].astype(str)
    + valuation['v_block'].astype(str)
    + valuation['v_lot'].astype(str)
)

sales['S_block'] = _zero_padded(sales['S_block'], 5)
sales['S_lot'] = _zero_padded(sales['S_lot'], 4)
sales['bbl'] = (
    sales['S_borough'].astype(str)
    + sales['S_block'].astype(str)
    + sales['S_lot'].astype(str)
)

In [56]:
# Sanity check: distinct string lengths of the valuation bbl key.
valuation['bbl'].map(len).unique()

array([10])

In [15]:
# Tag every column with its source dataset: 'v_' for valuation, 's_' for sales.
# Both frames are modified in place, so re-running this cell prefixes again.
valuation.rename(columns={name: 'v_' + name for name in valuation.columns}, inplace=True)
sales.rename(columns={name: 's_' + name for name in sales.columns}, inplace=True)

In [16]:
# Normalise the valuation column names: lower-case, spaces -> underscores.
valuation.rename(
    columns={name: name.lower().replace(" ", "_") for name in valuation.columns},
    inplace=True,
)

In [17]:
# The join key must carry the same name in both frames: v_bbl -> bbl.
valuation.rename({'v_bbl': 'bbl'}, axis='columns', inplace=True)

In [18]:
# The join key must carry the same name in both frames: s_bbl -> bbl.
sales.rename({'s_bbl': 'bbl'}, axis='columns', inplace=True)

In [None]:
#sales.rename(columns=lambda x: x[2:], inplace=True)

In [19]:
# v_year -> year, so the column can serve as a join key with the sales data.
valuation.rename({'v_year': 'year'}, axis='columns', inplace=True)

In [74]:
# Overlap of the two datasets on the bbl key, printed in this order:
#   1) number of distinct bbl values in sales
#   2) number of distinct bbl values in valuation
#   3) number of sales rows whose bbl also occurs in valuation
print(sales['bbl'].nunique())
print(valuation['bbl'].nunique())
print(sales['bbl'].isin(valuation['bbl']).sum())

247841
971937
381840


In [35]:
display(valuation.bbl)

1632        1163612
2458        1163859
6668        4796518
13316      16011352
15932      12051103
             ...   
9810545      577403
9810548      578097
9810552     5790646
9810556    57710520
9810559    57863169
Name: bbl, Length: 6641062, dtype: object

In [34]:
display(sales.bbl)

0       2302825
1       2303928
2       2303928
3       2304639
4       2304652
         ...   
8364    5735585
8366    5735595
8367    5709448
8368    5709449
8369    5734928
Name: bbl, Length: 397946, dtype: object

In [None]:
# Keep only the valuation rows whose bbl occurs somewhere in the sales data.
valuation = valuation.loc[valuation['bbl'].isin(sales['bbl'])]

In [65]:
# create a binned year col in sales df (cutoff: July 1)

# first, convert string into datetime
# if the date in a given year N is on or after July 1st, mark it as 'N/N+1'
# otherwise, mark it as 'N-1/N'

def bin_year(sale_date):
    """Map a sale date to its binned year label, e.g. '2013/14'.

    The cutoff is July 1: a date on or after July 1 of year N is labelled
    'N/N+1'; an earlier date is labelled 'N-1/N'. The second year is written
    with its last two digits only.

    Parameters
    ----------
    sale_date : datetime.date, datetime.datetime or pandas.Timestamp
        Any object exposing `.year` and `.month`. Missing values (None / NaT)
        are accepted.

    Returns
    -------
    str or None
        The 'YYYY/YY' label, or None when `sale_date` is missing.
    """
    # Missing dates have no usable year/month; keep them missing.
    if pd.isna(sale_date):
        return None
    # Compare on the month rather than against a datetime.date object:
    # ordering a datetime.date against a datetime.datetime (and, in pandas 2.x,
    # a Timestamp) raises TypeError. "On or after July 1" == "month >= 7".
    start_year = sale_date.year if sale_date.month >= 7 else sale_date.year - 1
    return str(start_year) + '/' + str(start_year + 1)[-2:]

# Store the binned label in a new 'year' column of the sales frame; it is a
# join key and should use the same format as the valuation year column.
sale_dates = pd.to_datetime(sales['S_sale_date'])
sales['year'] = sale_dates.map(bin_year)

In [69]:
# Checkpoint: persist both frames as temporary pickles under `above`.
valuation.to_pickle(above + 'temp_valuation.pkl')
sales.to_pickle(above + 'temp_sales.pkl')

In [None]:
# Restore both frames from the temporary checkpoint pickles.
valuation = pd.read_pickle(f'{above}temp_valuation.pkl')
sales = pd.read_pickle(f'{above}temp_sales.pkl')

-------------------------------------------------------------------------------

## Merging

In [72]:
# Left join (DB-style) on bbl AND year: every sale is kept; valuation columns
# are NaN for sales without a matching valuation record.
merged = pd.merge(sales, valuation, how='left', on=['bbl', 'year'])

In [75]:
# Outer join on bbl and year: rows from both frames are kept, matched or not.
outer_merged = pd.merge(sales, valuation, how='outer', on=['bbl', 'year'])

In [77]:
# Inner join on bbl and year: only sales with a matching valuation record.
inner_merged = pd.merge(sales, valuation, how='inner', on=['bbl', 'year'])

In [None]:
# Number of distinct bbl values in the sales data.
print(sales['bbl'].nunique())
# Show the two join-key columns. They must be selected by name with a list of
# labels; the bare identifiers `bbl` and `year` are not variables in this
# notebook and raised NameError.
print(sales[['bbl', 'year']])

In [73]:
display(merged)

Unnamed: 0,S_borough,S_neighborhood,S_building_class_category,S_block,S_lot,S_address,S_apt_number,S_zipcode,S_residential_units,S_commercial_units,S_total_units,S_land_sqft,S_gross_sqft,S_year_built,S_tax_class_at_sale,S_building_class_at_sale,S_sale_price,S_sale_date,bbl,bbl_block,bbl_lot,year,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column
0,2,BATHGATE,01 ONE FAMILY DWELLINGS,03028,0025,412 EAST 179TH STREET,,10457.0,1.0,0.0,1.0,1842.0,2048.0,1901.0,1,A5,355000,2013-07-08,2030280025,03028,0025,2013/14,2030280025,2.0,03028,0025,SANDRA FERNANDEZ,A5,1,18.0,100.0,E,3.0,258000.0,7496.0,8557.0,412 EAST 179 STREET,10457.0,18.0,32.0,,,40.850339,-73.898762,206.0,15.0,379.0,2011104.0,Mount Hope,POINT (-73.898762 40.850339)
1,2,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1,A1,120000,2013-11-14,2030390028,03039,0028,2013/14,2030390028,2.0,03039,0028,"PERRY , EUGENE",A1,1,15.0,73.0,,2.0,303000.0,5249.0,8460.0,2329 WASHINGTON AVENUE,10458.0,15.0,40.0,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
2,2,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1,A1,474819,2013-05-20,2030390028,03039,0028,2012/13,2030390028,2.0,03039,0028,"PERRY , EUGENE",A1,1,15.0,73.0,,2.0,334000.0,4762.0,8460.0,2329 WASHINGTON AVENUE,10458.0,15.0,40.0,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
3,2,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0039,2075 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,1986.0,1344.0,1899.0,1,A1,210000,2013-03-12,2030460039,03046,0039,2012/13,2030460039,2.0,03046,0039,"VELEZ, PABLO",A1,1,21.0,94.0,,2.0,206000.0,7137.0,7862.0,2075 BATHGATE AVENUE,10457.0,18.0,42.0,,,40.850459,-73.895151,206.0,15.0,395.0,2011486.0,Claremont-Bathgate,POINT (-73.895151 40.850459)
4,2,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0052,2047 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,2329.0,1431.0,1901.0,1,A1,343116,2013-07-01,2030460052,03046,0052,2013/14,2030460052,2.0,03046,0052,MICHAEL T DONALD,A1,1,25.0,93.0,E,2.0,200000.0,6979.0,8308.0,2047 BATHGATE AVENUE,10457.0,20.0,30.0,,,40.850031,-73.895394,206.0,15.0,395.0,2011492.0,Claremont-Bathgate,POINT (-73.895394 40.850031)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397941,5,WOODROW,02 TWO FAMILY DWELLINGS,07355,0085,104 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,2730.0,2565.0,2001.0,1,B9,625000,2019-07-30,5073550085,07355,0085,2019/20,,,,,,,,,,,,,,,,,,,,,,,,,,,,
397942,5,WOODROW,02 TWO FAMILY DWELLINGS,07355,0095,115 AMBASSADOR LANE,,10309.0,2.0,0.0,2.0,8846.0,1710.0,2001.0,1,B9,637500,2019-08-20,5073550095,07355,0095,2019/20,,,,,,,,,,,,,,,,,,,,,,,,,,,,
397943,5,WOODROW,05 TAX CLASS 1 VACANT LAND,07094,0048,108 BERKSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,599701,2019-04-24,5070940048,07094,0048,2018/19,,,,,,,,,,,,,,,,,,,,,,,,,,,,
397944,5,WOODROW,05 TAX CLASS 1 VACANT LAND,07094,0049,35 WILTSHIRE LANE,,10309.0,2.0,0.0,2.0,0.0,0.0,2018.0,1,V0,896060,2019-04-22,5070940049,07094,0049,2018/19,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [76]:
display(outer_merged)

Unnamed: 0,S_borough,S_neighborhood,S_building_class_category,S_block,S_lot,S_address,S_apt_number,S_zipcode,S_residential_units,S_commercial_units,S_total_units,S_land_sqft,S_gross_sqft,S_year_built,S_tax_class_at_sale,S_building_class_at_sale,S_sale_price,S_sale_date,bbl,bbl_block,bbl_lot,year,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column
0,2.0,BATHGATE,01 ONE FAMILY DWELLINGS,03028,0025,412 EAST 179TH STREET,,10457.0,1.0,0.0,1.0,1842.0,2048.0,1901.0,1.0,A5,355000.0,2013-07-08,2030280025,03028,0025,2013/14,2030280025,2.0,03028,0025,SANDRA FERNANDEZ,A5,1,18.0,100.0,E,3.0,258000.0,7496.0,8557.0,412 EAST 179 STREET,10457.0,18.0,32.0,,,40.850339,-73.898762,206.0,15.0,379.0,2011104.0,Mount Hope,POINT (-73.898762 40.850339)
1,2.0,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1.0,A1,120000.0,2013-11-14,2030390028,03039,0028,2013/14,2030390028,2.0,03039,0028,"PERRY , EUGENE",A1,1,15.0,73.0,,2.0,303000.0,5249.0,8460.0,2329 WASHINGTON AVENUE,10458.0,15.0,40.0,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
2,2.0,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1.0,A1,474819.0,2013-05-20,2030390028,03039,0028,2012/13,2030390028,2.0,03039,0028,"PERRY , EUGENE",A1,1,15.0,73.0,,2.0,334000.0,4762.0,8460.0,2329 WASHINGTON AVENUE,10458.0,15.0,40.0,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
3,2.0,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0039,2075 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,1986.0,1344.0,1899.0,1.0,A1,210000.0,2013-03-12,2030460039,03046,0039,2012/13,2030460039,2.0,03046,0039,"VELEZ, PABLO",A1,1,21.0,94.0,,2.0,206000.0,7137.0,7862.0,2075 BATHGATE AVENUE,10457.0,18.0,42.0,,,40.850459,-73.895151,206.0,15.0,395.0,2011486.0,Claremont-Bathgate,POINT (-73.895151 40.850459)
4,2.0,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0052,2047 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,2329.0,1431.0,1901.0,1.0,A1,343116.0,2013-07-01,2030460052,03046,0052,2013/14,2030460052,2.0,03046,0052,MICHAEL T DONALD,A1,1,25.0,93.0,E,2.0,200000.0,6979.0,8308.0,2047 BATHGATE AVENUE,10457.0,20.0,30.0,,,40.850031,-73.895394,206.0,15.0,395.0,2011492.0,Claremont-Bathgate,POINT (-73.895394 40.850031)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6777999,,,,,,,,,,,,,,,,,,NaT,5077400003,,,2012/13,5077400003,5.0,07740,0003,"CORONATI, SABRINA",A5,1,25.0,100.0,,2.0,424000.0,9216.0,18519.0,262 RICHARD AVENUE,10309.0,15.0,40.0,,,40.511832,-74.225132,503.0,51.0,24401.0,5087335.0,Charleston-Richmond Valley-Tottenville,POINT (-74.225132 40.511832)
6778000,,,,,,,,,,,,,,,,,,NaT,5078090007,,,2012/13,5078090007,5.0,07809,0007,AVDUL LLESHI,B2,1,80.0,161.0,,2.0,985000.0,27899.0,54742.0,18 GIEGERICH PLACE,10307.0,60.0,40.0,,,40.506921,-74.234832,503.0,51.0,24401.0,5125209.0,Charleston-Richmond Valley-Tottenville,POINT (-74.234832 40.506921)
6778001,,,,,,,,,,,,,,,,,,NaT,5079060046,,,2012/13,5079060046,5.0,07906,0046,HARRY LAURO,A1,1,38.0,95.0,G,2.0,551000.0,17880.0,33060.0,610 MAIN STREET,10307.0,24.0,45.0,,,40.499864,-74.243746,503.0,51.0,24402.0,5095747.0,Charleston-Richmond Valley-Tottenville,POINT (-74.243746 40.499864)
6778002,,,,,,,,,,,,,,,,,,NaT,5077100520,,,2012/13,5077100520,5.0,07710,0520,W WIDMER,A2,1,69.0,100.0,,1.5,473000.0,13810.0,23001.0,442 RICHARD AVENUE,10309.0,31.0,26.0,,,40.507401,-74.223787,503.0,51.0,24402.0,5087326.0,Charleston-Richmond Valley-Tottenville,POINT (-74.223787 40.507401)


In [78]:
display(inner_merged)

Unnamed: 0,S_borough,S_neighborhood,S_building_class_category,S_block,S_lot,S_address,S_apt_number,S_zipcode,S_residential_units,S_commercial_units,S_total_units,S_land_sqft,S_gross_sqft,S_year_built,S_tax_class_at_sale,S_building_class_at_sale,S_sale_price,S_sale_date,bbl,bbl_block,bbl_lot,year,v_bble,v_boro,v_block,v_lot,v_owner,v_bldgcl,v_taxclass,v_ltfront,v_ltdepth,v_ext,v_stories,v_fullval,v_avland,v_avtot,v_staddr,v_postcode,v_bldfront,v_blddepth,v_avland2,v_avtot2,v_latitude,v_longitude,v_community_board,v_council_district,v_census_tract,v_bin,v_nta,v_new_georeferenced_column
0,2,BATHGATE,01 ONE FAMILY DWELLINGS,03028,0025,412 EAST 179TH STREET,,10457.0,1.0,0.0,1.0,1842.0,2048.0,1901.0,1,A5,355000,2013-07-08,2030280025,03028,0025,2013/14,2030280025,2,03028,0025,SANDRA FERNANDEZ,A5,1,18,100,E,3.0,258000,7496,8557,412 EAST 179 STREET,10457.0,18,32,,,40.850339,-73.898762,206.0,15.0,379.0,2011104.0,Mount Hope,POINT (-73.898762 40.850339)
1,2,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1,A1,120000,2013-11-14,2030390028,03039,0028,2013/14,2030390028,2,03039,0028,"PERRY , EUGENE",A1,1,15,73,,2.0,303000,5249,8460,2329 WASHINGTON AVENUE,10458.0,15,40,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
2,2,BATHGATE,01 ONE FAMILY DWELLINGS,03039,0028,2329 WASHINGTON AVENUE,,10458.0,1.0,0.0,1.0,1103.0,1290.0,1910.0,1,A1,474819,2013-05-20,2030390028,03039,0028,2012/13,2030390028,2,03039,0028,"PERRY , EUGENE",A1,1,15,73,,2.0,334000,4762,8460,2329 WASHINGTON AVENUE,10458.0,15,40,,,40.855982,-73.892503,206.0,15.0,385.0,2011364.0,Claremont-Bathgate,POINT (-73.892503 40.855982)
3,2,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0039,2075 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,1986.0,1344.0,1899.0,1,A1,210000,2013-03-12,2030460039,03046,0039,2012/13,2030460039,2,03046,0039,"VELEZ, PABLO",A1,1,21,94,,2.0,206000,7137,7862,2075 BATHGATE AVENUE,10457.0,18,42,,,40.850459,-73.895151,206.0,15.0,395.0,2011486.0,Claremont-Bathgate,POINT (-73.895151 40.850459)
4,2,BATHGATE,01 ONE FAMILY DWELLINGS,03046,0052,2047 BATHGATE AVENUE,,10457.0,1.0,0.0,1.0,2329.0,1431.0,1901.0,1,A1,343116,2013-07-01,2030460052,03046,0052,2013/14,2030460052,2,03046,0052,MICHAEL T DONALD,A1,1,25,93,E,2.0,200000,6979,8308,2047 BATHGATE AVENUE,10457.0,20,30,,,40.850031,-73.895394,206.0,15.0,395.0,2011492.0,Claremont-Bathgate,POINT (-73.895394 40.850031)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340535,5,WOODROW,02 TWO FAMILY DWELLINGS,07316,0084,57 DARNELL LANE,,10309.0,2.0,0.0,2.0,2311.0,1500.0,1995.0,1,B2,620000,2019-06-20,5073160084,07316,0084,2018/19,5073160084,5,07316,0084,SALVATORE ANNARUMMA,B2,1,26,88,,2.0,518000,6690,27072,57 DARNELL LANE,10309.0,16,44,,,40.536003,-74.223213,503.0,51.0,226.0,5130720.0,Charleston-Richmond Valley-Tottenville,POINT (-74.223213 40.536003)
340536,5,WOODROW,02 TWO FAMILY DWELLINGS,07317,0119,73 ROBIN COURT,,10309.0,2.0,0.0,2.0,4125.0,2160.0,1994.0,1,B2,625000,2019-05-21,5073170119,07317,0119,2018/19,5073170119,5,07317,0119,JOSEPH DIBENEDETTO,B2,1,25,165,,3.0,527000,9683,29160,73 ROBIN COURT,10309.0,22,40,,,40.536140,-74.222012,503.0,51.0,226.0,5125758.0,Charleston-Richmond Valley-Tottenville,POINT (-74.222012 40.53614)
340537,5,WOODROW,02 TWO FAMILY DWELLINGS,07349,0004,51 PHEASANT LANE,,10309.0,2.0,0.0,2.0,2123.0,1850.0,1998.0,1,B9,670000,2019-02-22,5073490004,07349,0004,2018/19,5073490004,5,07349,0004,"GUITIAN, JOSEPH",B9,1,23,91,,2.0,581000,6988,30528,51 PHEASANT LANE,10309.0,14,52,,,40.532829,-74.222422,503.0,51.0,226.0,5135567.0,Charleston-Richmond Valley-Tottenville,POINT (-74.222422 40.532829)
340538,5,WOODROW,02 TWO FAMILY DWELLINGS,07349,0035,33 QUAIL LANE,,10309.0,2.0,0.0,2.0,2255.0,2377.0,1998.0,1,B9,640000,2019-03-08,5073490035,07349,0035,2018/19,5073490035,5,07349,0035,"PETRONE, AS TRUSTEE,",B9,1,27,83,,3.0,517000,9000,31020,33 QUAIL LANE,10309.0,18,40,,,40.532390,-74.221234,503.0,51.0,226.0,5150660.0,Charleston-Richmond Valley-Tottenville,POINT (-74.221234 40.53239)


In [79]:
# Persist the three merge variants as pickles under `above`.
merged.to_pickle(above + 'left-merged.pkl')
outer_merged.to_pickle(above + 'outer-merged.pkl')
inner_merged.to_pickle(above + 'inner-merged.pkl')

------------------------------------------------------------------

## Unused Cells

Convert b: borough. The following are the same: block, lot, easement. We need to figure out when the financial assessment year starts in order to convert the datetime values into a year column in the sales data. Do we need to convert NTA and neighborhood in order to have the … (TODO: this note was left unfinished.)

In [None]:
# Number of sales rows whose building class at sale contains 'V'
# (the vacant lots that would be lost).
is_vacant = sales["building_class_at_sale"].str.contains('V')
len(sales[is_vacant])


In [None]:
# Total of the per-letter row counts for building classes containing
# A, B, C, D, S or R. A row matching several letters is counted once per letter.
sum(
    len(sales[sales["building_class_at_sale"].str.contains(letter)])
    for letter in ('A', 'B', 'C', 'D', 'S', 'R')
)

491304 - 486106

In [None]:
# Column names for the sales data (how the list is applied is not shown here).
# Notes carried over from the original list:
#   - tax_class_present      can appear as "tax class as of final roll date" in some files
#   - building_class_present can appear as "building class as of final roll date" in some files
#   - apt_number             can appear as NaN in some files -- included in the "address" field
colnames = [
    'borough', 'neighborhood', 'building_class_category', 'tax_class_present',
    'block', 'lot', 'easement', 'building_class_present',
    'address', 'apt_number', 'zipcode',
    'residential_units', 'commercial_units', 'total_units',
    'land_sqft', 'gross_sqft', 'year_built',
    'tax_class_at_sale', 'building_class_at_sale',
    'sale_price', 'sale_date',
]