# Combining datasets into one dataframe

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import itertools

from IPython.display import display
pd.options.display.max_columns = None

## Create Index

The BBL number is going to be ideal for indexing as it is used for this purpose by the NYC databases.

https://www.pincusco.com/data-dictionary/bbl-also-known-as-borough-block-and-lot-or-property-id/


https://www.propertyshark.com/Real-Estate-Reports/2018/12/13/all-you-need-to-know-about-nycs-bbl/#:~:text=BBL%20stands%20for%20borough%2C%20block,1%20%3D%20Manhattan

In [85]:
# Import the sales data from pckl
sales_pckl = '../data/sales_combined.pkl'

# This is the path to retrieve files one above the github directory
above = '../../../'
# Import valuation data using the above directory
valuation_pckl = above + 'assessment_og.pkl'

# Read the datasets into dataframes
sales = pd.read_pickle(sales_pckl)
valuation = pd.read_pickle(valuation_pckl)

In [86]:
# Add the BBL column for indexing to the valuation data
valuation['bbl'] = valuation['BORO'].astype(str) + valuation['BLOCK'].astype(str) + valuation['LOT'].astype(str)

# Add the BBL column for indexing to the sales data
sales['bbl'] = sales['borough'].astype(str) + sales['block'].astype(str) + sales['lot'].astype(str)

# Standardizing Columns and cleaning Valuation

In [87]:
# There is only one value for PERIOD in the valuation dataset so we can drop it.
print(valuation.PERIOD.unique())
valuation.drop(["PERIOD", "Borough"], axis=1, inplace = True)

['FINAL']


In [88]:
# Change the column names to be lower case, and replace white space with underscores
valuation.rename(columns=lambda x: x.lower().replace(" ", "_"), inplace=True)
 
display(valuation)

Unnamed: 0,bble,boro,block,lot,easement,owner,bldgcl,taxclass,ltfront,ltdepth,ext,stories,fullval,avland,avtot,exland,extot,excd1,staddr,postcode,exmptcl,bldfront,blddepth,avland2,avtot2,exland2,extot2,excd2,year,valtype,latitude,longitude,community_board,council_district,census_tract,bin,nta,new_georeferenced_column,bbl
0,1010401210,1,1040,1210,,"H&G WARNER, LLC",R4,2,0,0,,35.0,113465,12707,51059,0,0,,350 WEST 50 STREET,10019.0,,200,327,12707.0,47727.0,,,,2010/11,AC-TR,40.762988,-73.987791,104.0,3.0,127.0,1077841.0,Clinton,POINT (-73.987791 40.762988),110401210
1,1008901282,1,890,1282,,"SPERBER, LORRAINE",R4,2,0,0,,19.0,135599,11877,61020,0,0,,155 EAST 34 STREET,10016.0,,0,0,11877.0,53712.0,,,,2010/11,AC-TR,40.746162,-73.979129,106.0,2.0,72.0,1018549.0,Murray Hill-Kips Bay,POINT (-73.979129 40.746162),18901282
2,1009681288,1,968,1288,,ROOS ESTHER A,R4,2,0,0,,37.0,222481,39824,100116,2090,2090,1017.0,630 1 AVENUE,10016.0,,0,0,39824.0,96085.0,2090.0,2090.0,,2010/11,AC-TR,40.745156,-73.972500,106.0,4.0,8601.0,1022060.0,Turtle Bay-East Midtown,POINT (-73.9725 40.745156),19681288
3,1008701611,1,870,1611,,BARRETO ANITA,R4,2,244,425,,27.0,172555,21306,77650,0,0,,1 IRVING PLACE,10003.0,,0,0,21306.0,75826.0,,,,2010/11,AC-TR,40.734072,-73.988713,105.0,2.0,50.0,1083247.0,Gramercy,POINT (-73.988713 40.734072),18701611
4,1010381112,1,1038,1112,,,R4,2,0,0,,43.0,175784,4698,79103,1270,75675,5110.0,306 WEST 48 STREET,10036.0,,23,100,4120.0,79714.0,692.0,76286.0,,2010/11,AC-TR,40.761262,-73.987572,104.0,3.0,127.0,1087980.0,Clinton,POINT (-73.987572 40.761262),110381112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9845852,1007650020,1,765,20,,EVGER 15TH STREET REA,C4,2,25,103,,5.0,782000,99900,351900,0,0,,229 WEST 15 STREET,10011.0,,25,92,99900.0,324000.0,,,,2010/11,AC-TR,40.739672,-74.000173,104.0,3.0,81.0,1013768.0,Hudson Yards-Chelsea-Flatiron-Union Square,POINT (-74.000173 40.739672),176520
9845853,1008690024,1,869,24,,PARSONAGE HILL VILL.A,O1,4,35,100,E,5.0,3410000,1237500,1534500,0,0,,263 MADISON AVENUE,10016.0,,35,100,1237500.0,1532700.0,,,,2010/11,AC-TR,40.750993,-73.980597,106.0,4.0,82.0,1017591.0,Murray Hill-Kips Bay,POINT (-73.980597 40.750993),186924
9845854,1007640075,1,764,75,,92 EIGHTH AVENUE REAL,C4,2B,25,93,,4.0,3340000,127622,448692,0,0,,92 8 AVENUE,10011.0,,26,55,,,,,,2010/11,AC-TR,40.740141,-74.002227,104.0,3.0,81.0,1000000.0,Hudson Yards-Chelsea-Flatiron-Union Square,POINT (-74.002227 40.740141),176475
9845855,1006260047,1,626,47,,HEALY DONALL B,S4,2A,24,88,,4.0,6460000,178647,309400,0,0,,35 JANE STREET,10014.0,,25,40,,,,,,2010/11,AC-TR,40.738143,-74.004471,102.0,3.0,77.0,1077100.0,West Village,POINT (-74.004471 40.738143),162647


In [89]:
# Count the number of bbl values that are in the sales data and are also in the valuation
print(sum(sales.bbl.isin(valuation.bbl)))
print(f" There are {sales.shape[0]} total sales and {sales.shape[0] - 486771} missing from appraisal")

486771
 There are 491304 total sales and 4533 missing from appraisal


In [90]:
valuation.year.unique()

array(['2010/11', '2018/19', '2011/12', '2015/16', '2017/18', '2016/17',
       '2014/15', '2013/14', '2012/13'], dtype=object)

Convert b: borough the following are the same: block, lot, easement. Need to figure out when the financial assessment year starts to convert datetime values to a year column in the sales data. Do we need to convert nta and neighberhood in order to have the 

In [96]:
# The amount of vacant lots that we would lose
len(sales[sales["building_class_at_sale"].str.contains('V')])


4656

In [97]:
len(sales[sales["building_class_at_sale"].str.contains('A')]) + len(sales[sales["building_class_at_sale"].str.contains('B')]) + len(sales[sales["building_class_at_sale"].str.contains('C')]) + len(sales[sales["building_class_at_sale"].str.contains('D')]) + len(sales[sales["building_class_at_sale"].str.contains('S')]) + len(sales[sales["building_class_at_sale"].str.contains('R')])

486106

In [99]:
491304 - 486106

5198

In [None]:
colnames = ['borough',
            'neighborhood',
            'building_class_category',
            'tax_class_present', #can appear as "tax class as of final roll date" in some files
            'block',
            'lot',
            'easement',
            'building_class_present', #can appear as "building class as of final roll date" in some files
            'address',
            'apt_number', #can appear as NaN in some files -- included in the "address" field
            'zipcode',
            'residential_units',
            'commercial_units',
            'total_units',
            'land_sqft',
            'gross_sqft',
            'year_built',
            'tax_class_at_sale',
            'building_class_at_sale',
            'sale_price',
            'sale_date']