## Outline

Last updated: 02/26/2023


1. [Import modules](#bullet1)
2. [Import NYCOD data, dtype handling](#bullet2)
3. [Import PLUTO data, dtype handling](#bullet3)
4. [Join NYCOD and PLUTO](#bullet4)
5. [Save out csv](#bullet5)

# Import modules <a class="anchor" id="bullet1"></a>

In [1]:
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

# Import NYCOD data <a class="anchor" id="bullet2"></a>

In [2]:
#read in sales data
def _load_sales(path, rows_to_skip):
    """Read one sales export without a header row and discard fully empty rows."""
    return pd.read_csv(path, skiprows=rows_to_skip, header=None).dropna(how='all')

sales17 = _load_sales("2017_manhattan.csv", 5)
sales18 = _load_sales("2018_manhattan.csv", 5)
sales19 = _load_sales("2019_manhattan.csv", 5)
sales20 = _load_sales("2020_manhattan.csv", 7)
sales21 = _load_sales("2021_manhattan.csv", 7)
sales22 = _load_sales("rollingsales_manhattan.csv", 1)

#column names are taken from the header row of the 2021 file, kept as a one-row frame
header_names = pd.read_csv("2021_manhattan.csv", header=6).columns
labels = pd.DataFrame(header_names).transpose()

#check shapes, then concatenate all dfs and keep the first 21 columns
yearly_frames = (sales17, sales18, sales19, sales20, sales21, sales22)
print(*(frame.shape for frame in yearly_frames))
allsales = pd.concat(yearly_frames, axis=0).iloc[:, 0:21]
print(allsales.shape)

#add labels: stack the label row on top, promote it to the header, then drop it
allsales = pd.concat((labels, allsales), axis=0)
allsales.columns = allsales.iloc[0]
allsales = allsales.iloc[1:]
allsales.head()

(18642, 21) (17041, 21) (17583, 21) (12880, 21) (22756, 21) (21324, 24)
(110226, 21)


Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL\nUNITS,COMMERCIAL\nUNITS,TOTAL \nUNITS,LAND \nSQUARE FEET,GROSS \nSQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS\nAT TIME OF SALE,SALE PRICE,SALE DATE
0,1.0,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,375.0,28.0,,C4,738 EAST 6TH STREET,,...,11,0.0,11,1750,6500,1900.0,2.0,C4,3750000,4/3/2017
1,1.0,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,385.0,36.0,,C7,27 AVENUE C,,...,24,1.0,25,2650,9960,1910.0,2.0,C7,5235000,7/11/2017
2,1.0,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392.0,5.0,,C2,151 AVENUE B,,...,5,0.0,5,2139,4416,1900.0,2.0,C2,0,6/16/2017
3,1.0,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392.0,6.0,,C2,153 AVENUE B,,...,5,0.0,5,1633,6440,1900.0,2.0,C2,6625000,7/19/2017
4,1.0,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,400.0,61.0,,C7,193 EAST 4TH STREET,,...,11,1.0,12,2388,8045,1900.0,2.0,C7,7675000,11/13/2017


## NYCOD Dtype handling

In [None]:
#FOR VIZ ONLY
#display the dtype of every column in the combined sales table
allsales.dtypes

In [3]:
#convert objects to numeric where possible
def _to_number(column, strip_chars=(',',), errors='coerce'):
    """Parse a column of numbers and/or number-like text into numeric values.

    Every value is converted to text first, because the .str accessor returns
    NaN for elements that are not strings; without this step, values that are
    already numeric would be silently lost.

    column      -- pandas Series to convert
    strip_chars -- literal substrings removed before parsing
    errors      -- passed to pd.to_numeric ('coerce' turns unparseable text into NaN,
                   'raise' raises on it)
    """
    text = column.astype(str)
    for char in strip_chars:
        # regex=False: characters such as '$' must be removed literally
        text = text.str.replace(char, '', regex=False)
    text = text.str.strip()
    # restore genuinely missing values so they are not parsed from the text 'nan'
    text = text.mask(column.isna().to_numpy())
    return pd.to_numeric(text, errors=errors)

#identifier columns must be whole numbers
for key_col in ('BOROUGH', 'BLOCK', 'LOT'):
    allsales[key_col] = allsales[key_col].astype(np.int64)

#count and area columns may carry thousands separators; unparseable entries become NaN
for measure_col in ('RESIDENTIAL\nUNITS',
                    'COMMERCIAL\nUNITS',
                    'TOTAL \nUNITS',
                    'LAND \nSQUARE FEET',
                    'GROSS \nSQUARE FEET'):
    allsales[measure_col] = _to_number(allsales[measure_col])
allsales['YEAR BUILT'] = pd.to_numeric(allsales['YEAR BUILT'], errors='coerce')

#add price variable
#malformed price text still raises (errors='raise'); missing prices become NaN.
#.to_numpy() assigns by position, as the list-based version did.
allsales['saleprice'] = _to_number(
    allsales['SALE PRICE'], strip_chars=(',', "'", '$'), errors='raise'
).astype(float).to_numpy()

#add sale date in datetime format
#a date not matching month/day/4-digit-year still raises; missing dates become NaT
allsales['saledate'] = pd.to_datetime(allsales['SALE DATE'], format="%m/%d/%Y").to_numpy()

In [4]:
#drop irrelevant and duplicate columns
unused_columns = [
    'BOROUGH',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'SALE PRICE',
    'SALE DATE',
    'EASE-MENT',
]
allsales = allsales.drop(labels=unused_columns, axis=1)

#rename columns for interpretability (original header -> short snake_case name)
col_mapper = {
    'RESIDENTIAL\nUNITS': 'units_residential',
    'COMMERCIAL\nUNITS': 'units_commercial',
    'TOTAL \nUNITS': 'units_total',
    'LAND \nSQUARE FEET': 'area_land',
    'GROSS \nSQUARE FEET': 'area_gross',
    'TAX CLASS AT TIME OF SALE': 'class_at_sale_tax',
    'BUILDING CLASS\nAT TIME OF SALE': 'class_at_sale_bldg',
}

allsales = allsales.rename(columns=col_mapper)

In [None]:
#FOR VIZ ONLY
#display summary statistics for the cleaned sales table
allsales.describe()

## Handling missing data

In [None]:
#FOR VIZ ONLY
#missing and zero data chart: per-column counts, the same counts as a % of rows, and the dtype
n_rows = len(allsales)
missing = allsales.isna().sum().to_frame('missing_count')
zeros = allsales.eq(0).astype(int).sum(axis=0).to_frame('zero_count')
problems = pd.concat([missing, zeros], axis=1)
problems['missing_pct'] = problems['missing_count'] / n_rows * 100
problems['zero_pct'] = problems['zero_count'] / n_rows * 100
dtypes = allsales.dtypes.tolist()
problems['type'] = dtypes
problems.sort_values(by='missing_pct', ascending=False)

In [None]:
#FOR VIZ ONLY
import seaborn as sns

def _plot_log_price_by_date(subset, title, filename):
    """Scatter log(sale price) against sale date for `subset`, save the figure to `filename`, then show it."""
    # log is undefined for prices <= 0, so only positively priced sales are plotted
    priced = subset[subset['saleprice'] > 0]
    # a fresh positional frame avoids any reliance on the (possibly repeated) row index
    points = pd.DataFrame({
        'saledate': priced['saledate'].to_numpy(),
        'log_saleprice': np.log(priced['saleprice'].to_numpy()),
    })
    # x and y are passed by keyword; recent seaborn releases no longer accept them positionally
    sns.scatterplot(x='saledate', y='log_saleprice', data=points)
    plt.title(title)
    plt.savefig(filename)
    plt.show()

#split the sales by number of residential units: 1 or more, exactly 0, and missing
#(the column is named 'units_residential' by the rename step; 'units_res' does not exist)
residential_units = allsales['units_residential']
allsales_nonzero_residentialunits = allsales[residential_units >= 1]
allsales_zero_residentialunits = allsales[residential_units == 0]
allsales_missing_residentialunits = allsales[residential_units.isna()]

print(len(allsales_zero_residentialunits))
print(len(allsales_missing_residentialunits))
#printed explicitly: a bare expression in the middle of a cell is not displayed
print(len(allsales_nonzero_residentialunits))

_plot_log_price_by_date(allsales_nonzero_residentialunits,
                        'Sale date for properties with 1 or more residential units',
                        '1+resunits.png')
_plot_log_price_by_date(allsales_zero_residentialunits,
                        'Sale date for properties with 0 residential units',
                        '0resunits.png')
_plot_log_price_by_date(allsales_missing_residentialunits,
                        'Sale date for properties with missing number of residential units',
                        'missingresunits.png')

In [None]:
#FOR VIZ ONLY
#missing sale price
# NOTE(review): this cell originally read from `allsales_res`, which no cell in this
# notebook defines; the full sales table is used instead - confirm the intended subset.
sale_prices = allsales['saleprice']
print("total observations: {}".format(len(allsales)))
print("missing sale price: {}".format(int(sale_prices.isna().sum())))
print("sale price = 0: {}".format(int((sale_prices == 0).sum())))
#print("sale price = 1: {}".format(int((sale_prices == 1).sum())))

print("\n")
#label matches the condition actually tested (> 0)
print("sale price > 0: {}".format(int((sale_prices > 0).sum())))

#histogram of the sale dates of zero-priced sales
zero_sale_price = allsales[allsales.saleprice == 0]
plt.hist(zero_sale_price.saledate)
plt.show()

In [5]:
#SUBSET DATA
#keep only sales with a positive price; print the row count before and after
has_positive_price = allsales['saleprice'] > 0
allsales_use = allsales[has_positive_price]
for frame in (allsales, allsales_use):
    print(len(frame))

110226
88439


# Import PLUTO data <a class="anchor" id="bullet3"></a>

In [6]:
#read the PLUTO export and discard rows that are empty in every column
pluto = pd.read_csv("pluto_22v3_1.csv", low_memory=False).dropna(how='all')

#subset to include Manhattan only
in_manhattan = pluto['borough'] == 'MN'
man = pluto[in_manhattan]
man.head()

Unnamed: 0,borough,block,lot,cd,bct2020,bctcb2020,ct2010,cb2010,schooldist,council,...,appbbl,appdate,plutomapid,firm07_flag,pfirm15_flag,version,dcpedited,latitude,longitude,notes
10,MN,1917,7502,110.0,1022600.0,10226000000.0,226.0,1002.0,5.0,9.0,...,1019170000.0,04/07/2009,1,,,22v3.1,t,40.812789,-73.943106,
51,MN,1888,7504,107.0,1018700.0,10187000000.0,187.0,2000.0,3.0,6.0,...,1018880000.0,07/22/2016,1,,,22v3.1,,40.797997,-73.973132,
63,MN,434,7502,103.0,1003200.0,10032000000.0,32.0,2000.0,1.0,2.0,...,1004340000.0,12/24/1991,1,,,22v3.1,,40.726073,-73.984147,
72,MN,1253,7502,107.0,1018300.0,10183000000.0,183.0,3000.0,3.0,6.0,...,1012530000.0,10/24/1989,1,,,22v3.1,,40.794874,-73.975838,
111,MN,637,7502,102.0,1007500.0,10075000000.0,75.0,2002.0,2.0,3.0,...,1006371000.0,08/25/1988,1,,1.0,22v3.1,,40.735162,-74.009284,


In [None]:
#FOR VIZ ONLY
#display summary statistics for the Manhattan PLUTO rows
man.describe()

## PLUTO Dtype handling

In [None]:
#make list of PLUTO object columns
#one row per column of `man`, with its dtype stored in column 0
my_list = man.dtypes.to_frame()
is_object = my_list[0] == 'object'
a = my_list[is_object]
print(a)

In [None]:
#FOR VIZ ONLY
#print list to see which columns are categories and what their categories are
for column_name in a.index:
    distinct_values = pd.unique(man[column_name])
    print("Categorical variable: {}".format(column_name))
    print("Unique values ({}): {}".format(len(distinct_values), distinct_values))
    print("\n")

In [None]:
#CREATE DUMMY VARS FOR SELECT PLUTO CODES

binary_cols = ['ltdheight', 'splitzone', 'histdist', 'landmark']

def _presence_flags(column):
    """Return a list holding 1 where `column` has a non-blank value and 0 where it is missing or blank."""
    present = column.notna().to_numpy()
    non_blank = (column.astype(str).str.strip() != '').to_numpy()
    return (present & non_blank).astype(int).tolist()

# pd.read_csv loads empty fields as NaN, and str(NaN) is 'nan' rather than '',
# so comparing the string form against '' would flag every row as 1.
# Missing values are tested for explicitly instead.
ltdheight_list = _presence_flags(man.ltdheight)

#preview the first five flags
ltdheight_list[:5]

#categorical to keep:
## address
## ltdheight   
## splitzone
## histdist
## landmark

# Merge NYCOD and PLUTO data <a class="anchor" id="bullet4"></a>

In [None]:
#MAYBE SKIP THIS STEP SO IT SHOWS UP ON OUTPUTS
# Appends '_nycod' to every column name of `allsales` and '_pluto' to every column name of `man`.
# NOTE(review): the merge cells that follow still reference unsuffixed names on `man`
# ('block', 'lot', 'borough', 'address', 'latitude'), so they look like they would fail
# if this cell is run first - confirm before enabling.

allsales = allsales.add_suffix('_nycod')
man = man.add_suffix('_pluto')

In [13]:
#join based on BBL

#left join keeps every sale; sales without a PLUTO match have a missing `borough`
bbl_test = allsales_use.merge(
    man,
    how='left',
    left_on=['BLOCK', 'LOT'],
    right_on=['block', 'lot'],
)
total_rows = len(bbl_test)
unmatched_rows = int(bbl_test['borough'].isna().sum())
print(total_rows)
print(unmatched_rows)
print("% successfully mapped: {}".format((total_rows - unmatched_rows)/(total_rows)))

88439
44620
% successfully mapped: 0.49547145490111827


In [None]:
#join based on address

#pulling out address before apartment number (probably not useful since addresses don't exist elsewhere)
allsales_test = allsales.copy()
#vectorised split: a missing or non-string address stays NaN instead of raising;
#surrounding whitespace is trimmed so it cannot block an exact-match join
street_part = allsales_test['ADDRESS'].str.split(',').str[0].str.strip()
addresses = street_part.tolist()
allsales_test['add'] = addresses
#preview the first five extracted addresses (last expression, so the cell displays it)
addresses[:5]

In [None]:
#TEST MERGE
#join sales to PLUTO on the extracted street address (merge's default join type),
#then report the joined row count and how many joined rows lack a latitude
add_test = allsales_test.merge(man, left_on='add', right_on='address')
print(len(add_test))
int(add_test['latitude'].isna().sum())

# Save out csv <a class="anchor" id="bullet5"></a>

In [None]:
# Write the final table to 'new_1.csv'.
# NOTE(review): `df` is not assigned in any cell visible in this notebook - presumably it
# should be the merged frame (e.g. `bbl_test`); confirm before running.
df.to_csv('new_1.csv')

# Archive: using PAD as link

In [None]:
#reading in pad data

df_pad = pd.read_csv('bobabbl.txt')

#read the address records and keep only those whose boro code is 1
df_pad_2 = pd.read_csv('bobaadr.txt', low_memory=False).loc[
    lambda records: records['boro'] == 1
]

In [None]:
#preview the first rows of the table read from 'bobabbl.txt'
df_pad.head()