# Note: this is not the analysis notebook.

### Goal: Exporting sample data for *Part 3: Plotting and Data Visualization in Python*

Returning only buildings built between 1990 and 2020 (inclusive).

Mark Bauer

In [1]:
# importing libraries
import pandas as pd 
import numpy as np 
import requests
import os
import seaborn as sns
from io import BytesIO
import zipfile
from zipfile import ZipFile

# displaying matplotlib figures inline in the notebook
%matplotlib inline
# applying seaborn's plot styling for the rest of the notebook
sns.set(color_codes=True)

In [2]:
# Printing verions of Python modules and packages with **watermark** - the IPython magic extension.
%reload_ext watermark
%watermark -v -p numpy,pandas,matplotlib.pyplot,seaborn

Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

numpy            : 1.22.4
pandas           : 1.4.2
matplotlib.pyplot: unknown
seaborn          : 0.11.2



# Merging Datasets from NYC Open Data

## Building Footprints Dataset

In [3]:
# storing the download link of the Building Footprints data (a static csv file) as 'url'
url = 'https://data.cityofnewyork.us/api/views/qb5r-6dgf/rows.csv?accessType=DOWNLOAD'
building_footprints = pd.read_csv(url)

# reporting the dimensions of our dataframe (i.e. rows, columns)
rows = building_footprints.shape[0]
columns = building_footprints.shape[1]
print(f'rows: {rows:,}, columns: {columns}')

building_footprints.head()

rows: 1,084,210, columns: 16


Unnamed: 0,the_geom,NAME,BIN,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE,GLOBALID
0,MULTIPOLYGON (((-73.96664570466969 40.62599676...,,3170958,1925.0,08/22/2017 12:00:00 AM,Constructed,96807,29.749853,2100.0,40.0,0,0,3065220021,3065220021,Photogramm,{31298F86-3088-4F53-B3DB-71A9EFA6FA1F}
1,MULTIPOLYGON (((-74.16790202462265 40.63936048...,,5028452,1965.0,08/22/2017 12:00:00 AM,Constructed,326368,22.63,2100.0,39.0,0,0,5012640036,5012640036,Photogramm,{F5F8CDA5-69E2-46F8-8F69-BA95C025B520}
2,MULTIPOLYGON (((-74.19510813278613 40.55610681...,,5078368,1970.0,08/22/2017 12:00:00 AM,Constructed,746627,35.76,2100.0,51.0,0,0,5060190091,5060190091,Photogramm,{9F644794-F72C-4582-9E5E-B337E2B97068}
3,MULTIPOLYGON (((-73.96113466505085 40.57743931...,,3245111,1928.0,08/22/2017 12:00:00 AM,Constructed,786626,37.5,2100.0,6.0,0,0,3086910048,3086910048,Photogramm,{F916B22D-E25B-44AE-9FA9-2A51191B9CDF}
4,MULTIPOLYGON (((-73.75421559146166 40.75591276...,,4161096,1950.0,08/22/2017 12:00:00 AM,Constructed,746409,18.015113,2100.0,93.0,0,0,4075020005,4075020005,Photogramm,{525F2C24-616B-4F29-98A3-8FEA5D4B1A7D}


In [4]:
# printing the column names, non-null counts, and data types of the
# building footprints dataframe, along with its memory usage
building_footprints.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084210 entries, 0 to 1084209
Data columns (total 16 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   the_geom    1084210 non-null  object 
 1   NAME        1935 non-null     object 
 2   BIN         1084210 non-null  int64  
 3   CNSTRCT_YR  1073454 non-null  float64
 4   LSTMODDATE  1084210 non-null  object 
 5   LSTSTATYPE  1083927 non-null  object 
 6   DOITT_ID    1084210 non-null  int64  
 7   HEIGHTROOF  1081527 non-null  float64
 8   FEAT_CODE   1084200 non-null  float64
 9   GROUNDELEV  1083669 non-null  float64
 10  SHAPE_AREA  1084210 non-null  int64  
 11  SHAPE_LEN   1084210 non-null  int64  
 12  BASE_BBL    1084210 non-null  int64  
 13  MPLUTO_BBL  1084210 non-null  int64  
 14  GEOMSOURCE  1083948 non-null  object 
 15  GLOBALID    1084210 non-null  object 
dtypes: float64(4), int64(6), object(6)
memory usage: 132.3+ MB


In [5]:
# keeping only the buildings constructed from 1990 through 2020 (both years included)
built_1990_to_2020 = building_footprints['CNSTRCT_YR'].between(1990, 2020)
building_footprints = building_footprints.loc[built_1990_to_2020]

# dimensions of the filtered data
rows = building_footprints.shape[0]
columns = building_footprints.shape[1]
print(f'rows: {rows:,}, columns: {columns}')

rows: 95,202, columns: 16


In [6]:
# count the null BBLs, drop those rows, then count again to confirm none remain
bbl_column = 'MPLUTO_BBL'

count_null = building_footprints[bbl_column].isnull().sum()
print(f'count null: {count_null:,}')

print('dropping nulls...\n')
building_footprints = building_footprints.dropna(subset=[bbl_column])

count_null = building_footprints[bbl_column].isnull().sum()
print(f'count null: {count_null:,}')

# dimensions of the data after dropping nulls
rows = building_footprints.shape[0]
columns = building_footprints.shape[1]
print(f'rows: {rows:,}, columns: {columns}')

count null: 0
dropping nulls...

count null: 0
rows: 95,202, columns: 16


## PLUTO Dataset

In [7]:
# assigning the PLUTO dataset link as 'url'
url = 'https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_pluto_20v1_csv.zip'

# downloading the zipfile
content = requests.get(url)

# stopping here with a clear HTTP error if the download failed (4xx/5xx status),
# rather than handing an error page to ZipFile and getting an opaque BadZipFile
content.raise_for_status()

# reading in our zipfile data in-memory
zf = ZipFile(BytesIO(content.content))

# printing files in our zipfile
for item in zf.namelist():
    print("File in zip: "+ item)

File in zip: pluto_20v1.csv
File in zip: PLUTODD20v1.pdf
File in zip: PlutoReadme20v1.pdf


In [8]:
# read our csv data into a dataframe from our zipfile; the 'with' block closes
# the extracted file handle once pandas has finished reading from it
with zf.open('pluto_20v1.csv') as pluto_csv:
    pluto_data = pd.read_csv(pluto_csv, low_memory=False)

# shape of data
rows, columns = pluto_data.shape
print('rows: {:,}, columns: {}'.format(rows, columns))

# previewing the first five rows of our pluto data
pluto_data.head()

rows: 859,172, columns: 99


Unnamed: 0,borough,block,lot,cd,ct2010,cb2010,schooldist,council,zipcode,firecomp,...,dcasdate,zoningdate,landmkdate,basempdate,masdate,polidate,edesigdate,geom,dcpedited,notes
0,BK,834,46,307.0,106.0,2001.0,20.0,38.0,11220.0,L114,...,,,,,,,,0106000020E61000000100000001030000000100000005...,,
1,QN,4042,106,407.0,929.0,3000.0,25.0,19.0,11356.0,E297,...,,,,,,,,0106000020E61000000100000001030000000100000007...,,
2,BK,4679,17,317.0,866.0,3002.0,18.0,41.0,11203.0,L174,...,,,,,,,,0106000020E61000000100000001030000000100000006...,,
3,BK,7831,6,318.0,676.0,1002.0,22.0,46.0,11234.0,L159,...,,,,,,,,0106000020E61000000100000001030000000100000005...,,
4,BK,7831,7,318.0,676.0,1002.0,22.0,46.0,11234.0,L159,...,,,,,,,,0106000020E61000000100000001030000000100000005...,,


In [9]:
# printing the column names, non-null counts, and data types of the PLUTO dataframe
pluto_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859172 entries, 0 to 859171
Data columns (total 99 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   borough               859171 non-null  object 
 1   block                 859172 non-null  int64  
 2   lot                   859172 non-null  int64  
 3   cd                    858186 non-null  float64
 4   ct2010                858186 non-null  float64
 5   cb2010                858186 non-null  float64
 6   schooldist            857481 non-null  float64
 7   council               858185 non-null  float64
 8   zipcode               857449 non-null  float64
 9   firecomp              857464 non-null  object 
 10  policeprct            857473 non-null  float64
 11  healtharea            857474 non-null  float64
 12  sanitboro             857270 non-null  float64
 13  sanitsub              857150 non-null  object 
 14  address               858772 non-null  object 
 15  

In [10]:
# displaying the full list of PLUTO column names
pluto_data.columns

Index(['borough', 'block', 'lot', 'cd', 'ct2010', 'cb2010', 'schooldist',
       'council', 'zipcode', 'firecomp', 'policeprct', 'healtharea',
       'sanitboro', 'sanitsub', 'address', 'zonedist1', 'zonedist2',
       'zonedist3', 'zonedist4', 'overlay1', 'overlay2', 'spdist1', 'spdist2',
       'spdist3', 'ltdheight', 'splitzone', 'bldgclass', 'landuse',
       'easements', 'ownertype', 'ownername', 'lotarea', 'bldgarea', 'comarea',
       'resarea', 'officearea', 'retailarea', 'garagearea', 'strgearea',
       'factryarea', 'otherarea', 'areasource', 'numbldgs', 'numfloors',
       'unitsres', 'unitstotal', 'lotfront', 'lotdepth', 'bldgfront',
       'bldgdepth', 'ext', 'proxcode', 'irrlotcode', 'lottype', 'bsmtcode',
       'assessland', 'assesstot', 'exempttot', 'yearbuilt', 'yearalter1',
       'yearalter2', 'histdist', 'landmark', 'builtfar', 'residfar', 'commfar',
       'facilfar', 'borocode', 'bbl', 'condono', 'tract2010', 'xcoord',
       'ycoord', 'latitude', 'longitude', '

In [11]:
# listing the columns we want to keep from the PLUTO data
columns_to_keep = [
    'borough', 'block', 'lot', 'cd', 'ct2010', 'cb2010',
    'council', 'zipcode', 'bldgclass', 'landuse', 'ownertype',
    'borocode', 'bbl', 'tract2010', 'xcoord', 'ycoord',
    'latitude', 'longitude', 'plutomapid', 'firm07_flag', 'pfirm15_flag',
]

# subsetting our dataframe down to those columns
pluto_data = pluto_data[columns_to_keep]

pluto_data.head()

Unnamed: 0,borough,block,lot,cd,ct2010,cb2010,council,zipcode,bldgclass,landuse,...,borocode,bbl,tract2010,xcoord,ycoord,latitude,longitude,plutomapid,firm07_flag,pfirm15_flag
0,BK,834,46,307.0,106.0,2001.0,38.0,11220.0,S9,4.0,...,3,3008340046,106.0,982211.0,171707.0,40.637972,-74.007347,1,,
1,QN,4042,106,407.0,929.0,3000.0,19.0,11356.0,A1,1.0,...,4,4040420106,929.0,1026895.0,225880.0,40.786562,-73.846003,1,,
2,BK,4679,17,317.0,866.0,3002.0,41.0,11203.0,B1,1.0,...,3,3046790017,866.0,1004527.0,177269.0,40.653216,-73.926923,1,,
3,BK,7831,6,318.0,676.0,1002.0,46.0,11234.0,A9,1.0,...,3,3078310006,676.0,1004804.0,166580.0,40.623876,-73.925958,1,,
4,BK,7831,7,318.0,676.0,1002.0,46.0,11234.0,A9,1.0,...,3,3078310007,676.0,1004784.0,166579.0,40.623874,-73.92603,1,,


In [12]:
# merging the pluto data onto the building footprints data by BBL and saving as a new dataframe.
# how='inner' is the pandas default, spelled out here because it matters: only footprints
# whose MPLUTO_BBL has a matching bbl in PLUTO are kept, so unmatched buildings are dropped.
building_footprints = (
    building_footprints
    .merge(pluto_data, how='inner', left_on='MPLUTO_BBL', right_on='bbl')
    .reset_index(drop=True)
)

# new shape of data, printed on a single line like the other shape reports in this notebook
rows, columns = building_footprints.shape
print('rows: {:,}, columns: {}'.format(rows, columns))

building_footprints.head()

rows: 94,563, columns: 37


Unnamed: 0,the_geom,NAME,BIN,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,...,borocode,bbl,tract2010,xcoord,ycoord,latitude,longitude,plutomapid,firm07_flag,pfirm15_flag
0,MULTIPOLYGON (((-74.14644228489016 40.58885592...,,5122427,1997.0,08/22/2017 12:00:00 AM,Constructed,933599,32.5,2100.0,175.0,...,5,5019900003,27301.0,943549.0,153837.0,40.58883,-74.146542,1,,
1,MULTIPOLYGON (((-74.221228292902 40.5303177260...,,5133580,2001.0,08/22/2017 12:00:00 AM,Constructed,937499,27.31,2100.0,105.0,...,5,5073560090,226.0,922741.0,132534.0,40.530237,-74.221266,1,,
2,MULTIPOLYGON (((-74.15369944497655 40.62375637...,,5127645,1996.0,08/22/2017 12:00:00 AM,Constructed,616143,26.41,2100.0,68.0,...,5,5017010011,30302.0,941573.0,166545.0,40.623701,-74.153737,1,,
3,MULTIPOLYGON (((-74.15800726390819 40.62204540...,,5129621,2000.0,08/22/2017 12:00:00 AM,Constructed,954205,30.04,2100.0,22.0,...,5,5016800094,30302.0,940372.0,165936.0,40.622024,-74.158059,1,,
4,MULTIPOLYGON (((-73.87129515296562 40.65717370...,,3394646,2009.0,08/22/2017 12:00:00 AM,Constructed,1212853,21.608508,2100.0,18.0,...,3,3044520815,1070.0,1019957.0,178731.0,40.65718,-73.871307,1,,


In [13]:
# printing the column names, non-null counts, and data types of the merged dataframe
building_footprints.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94563 entries, 0 to 94562
Data columns (total 37 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   the_geom      94563 non-null  object 
 1   NAME          238 non-null    object 
 2   BIN           94563 non-null  int64  
 3   CNSTRCT_YR    94563 non-null  float64
 4   LSTMODDATE    94563 non-null  object 
 5   LSTSTATYPE    94455 non-null  object 
 6   DOITT_ID      94563 non-null  int64  
 7   HEIGHTROOF    92883 non-null  float64
 8   FEAT_CODE     94561 non-null  float64
 9   GROUNDELEV    94289 non-null  float64
 10  SHAPE_AREA    94563 non-null  int64  
 11  SHAPE_LEN     94563 non-null  int64  
 12  BASE_BBL      94563 non-null  int64  
 13  MPLUTO_BBL    94563 non-null  int64  
 14  GEOMSOURCE    94496 non-null  object 
 15  GLOBALID      94563 non-null  object 
 16  borough       94563 non-null  object 
 17  block         94563 non-null  int64  
 18  lot           94563 non-nu

In [14]:
# replacing nulls with 0 in the two Flood Insurance Rate Map flag fields
for flag_column in ['pfirm15_flag', 'firm07_flag']:
    building_footprints[flag_column] = building_footprints[flag_column].fillna(value=0)

# reporting how many rows the dataframe holds
row_count = building_footprints.shape[0]
print(f'number of rows in new datframe: {row_count}')
building_footprints.head()

number of rows in new datframe: 94563


Unnamed: 0,the_geom,NAME,BIN,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,...,borocode,bbl,tract2010,xcoord,ycoord,latitude,longitude,plutomapid,firm07_flag,pfirm15_flag
0,MULTIPOLYGON (((-74.14644228489016 40.58885592...,,5122427,1997.0,08/22/2017 12:00:00 AM,Constructed,933599,32.5,2100.0,175.0,...,5,5019900003,27301.0,943549.0,153837.0,40.58883,-74.146542,1,0.0,0.0
1,MULTIPOLYGON (((-74.221228292902 40.5303177260...,,5133580,2001.0,08/22/2017 12:00:00 AM,Constructed,937499,27.31,2100.0,105.0,...,5,5073560090,226.0,922741.0,132534.0,40.530237,-74.221266,1,0.0,0.0
2,MULTIPOLYGON (((-74.15369944497655 40.62375637...,,5127645,1996.0,08/22/2017 12:00:00 AM,Constructed,616143,26.41,2100.0,68.0,...,5,5017010011,30302.0,941573.0,166545.0,40.623701,-74.153737,1,0.0,0.0
3,MULTIPOLYGON (((-74.15800726390819 40.62204540...,,5129621,2000.0,08/22/2017 12:00:00 AM,Constructed,954205,30.04,2100.0,22.0,...,5,5016800094,30302.0,940372.0,165936.0,40.622024,-74.158059,1,0.0,0.0
4,MULTIPOLYGON (((-73.87129515296562 40.65717370...,,3394646,2009.0,08/22/2017 12:00:00 AM,Constructed,1212853,21.608508,2100.0,18.0,...,3,3044520815,1070.0,1019957.0,178731.0,40.65718,-73.871307,1,0.0,0.0


# Writing out file

In [15]:
# exporting the merged dataframe to csv, leaving the dataframe index out of the file
output_path = '../data/building-footprints.csv'
building_footprints.to_csv(output_path, index=False)

In [16]:
# listing the contents of the data directory
%ls ../data

README.md                output.dbf               sample-data.geojson
building-footprints.csv  output.json              sample-data.gpkg
nta_shape.cpg            output.prj               sample-data.json
nta_shape.dbf            output.shp               sample-data.prj
nta_shape.geojson        output.shx               sample-data.shp
nta_shape.prj            output.xlsx              sample-data.shx
nta_shape.shp            sample-buildings.zip     sample-data.xlsx
nta_shape.shx            sample-data.cpg          [34munzipped-data[m[m/
output.cpg               sample-data.csv
output.csv               sample-data.dbf
