# Note: this is not the analysis notebook.

### Goal: Exporting sample data for *Part 1: Reading and Writing Files in Python*



Mark Bauer

In [15]:
# importing libraries
import pandas as pd
import numpy as np
import geopandas as gpd
from openpyxl import Workbook
import zipfile
from zipfile import ZipFile
from os.path import basename

Printing versions of Python modules and packages with **watermark** - the IPython magic extension.

Documentation for installing watermark: https://github.com/rasbt/watermark

In [2]:
# load the watermark IPython extension, which provides the %watermark magic
%load_ext watermark

# -v reports the Python and IPython versions; -p reports the versions of the listed packages
# NOTE(review): this package list differs from the notebook's imports (openpyxl is imported
# but not listed; matplotlib, json, requests and sodapy are listed but not imported) - confirm
%watermark -v -p numpy,pandas,geopandas,matplotlib.pyplot,json,requests,sodapy

CPython 3.7.1
IPython 7.20.0

numpy 1.19.2
pandas 1.2.1
geopandas 0.8.1
matplotlib.pyplot 3.3.2
json 2.0.9
requests 2.25.1
sodapy 2.0.0


In [3]:
# reading in data as a url from NYC Open Data
# (pandas reads the CSV directly from the URL, so this cell needs network access)
url = 'https://data.cityofnewyork.us/api/views/qb5r-6dgf/rows.csv?accessType=DOWNLOAD'
df = pd.read_csv(url)

# previewing the first five rows
df.head()

Unnamed: 0,CNSTRCT_YR,BIN,the_geom,NAME,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE
0,1925.0,3170958,MULTIPOLYGON (((-73.96664570466969 40.62599676...,,08/22/2017 12:00:00 AM +0000,Constructed,96807,29.749853,2100.0,40.0,0,0,3065220000.0,3065220000.0,Photogramm
1,1965.0,5028452,MULTIPOLYGON (((-74.16790202462265 40.63936048...,,08/22/2017 12:00:00 AM +0000,Constructed,326368,22.63,2100.0,39.0,0,0,5012640000.0,5012640000.0,Photogramm
2,1970.0,5078368,MULTIPOLYGON (((-74.19510813278613 40.55610681...,,08/22/2017 12:00:00 AM +0000,Constructed,746627,35.76,2100.0,51.0,0,0,5060190000.0,5060190000.0,Photogramm
3,1928.0,3245111,MULTIPOLYGON (((-73.96113466505085 40.57743931...,,08/22/2017 12:00:00 AM +0000,Constructed,786626,37.5,2100.0,6.0,0,0,3086910000.0,3086910000.0,Photogramm
4,1950.0,4161096,MULTIPOLYGON (((-73.75421559146167 40.75591276...,,08/22/2017 12:00:00 AM +0000,Constructed,746409,18.015113,2100.0,93.0,0,0,4075020000.0,4075020000.0,Photogramm


In [4]:
# dataset dimensions: number of records and number of fields
rows = len(df)
columns = len(df.columns)
print('This dataset has {:,} rows and {:,} columns.'.format(rows, columns))

This dataset has 1,084,416 rows and 15 columns.


### For simplicity, we're only exporting buildings built from 2010 to 2020

This pertains to both the building footprints and the PLUTO datasets.

In [5]:
# keep only the buildings constructed from 2010 through 2020,
# then renumber the rows starting at zero
df_sample = (
    df[df['CNSTRCT_YR'].between(2010, 2020)]
    .reset_index(drop=True)
)

# preview the first five rows of the sample
df_sample.head()

Unnamed: 0,CNSTRCT_YR,BIN,the_geom,NAME,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE
0,2013.0,1022662,MULTIPOLYGON (((-73.98406915139554 40.75857096...,,08/22/2017 12:00:00 AM +0000,Constructed,633694,76.93,2100.0,49.0,0,0,1009990000.0,1009990000.0,Photogramm
1,2011.0,3394834,MULTIPOLYGON (((-73.87172426474349 40.65519420...,,08/22/2017 12:00:00 AM +0000,Constructed,1250314,26.0,2100.0,15.0,0,0,3044521000.0,3044521000.0,Other (Man
2,2010.0,4540159,MULTIPOLYGON (((-73.86650099829305 40.74236058...,,08/22/2017 12:00:00 AM +0000,Constructed,201366,28.0,2100.0,37.0,0,0,4018780000.0,4018780000.0,Other (Man
3,2010.0,4260357,MULTIPOLYGON (((-73.7940773567428 40.680625171...,,08/10/2017 12:00:00 AM +0000,Constructed,1184712,29.0,2100.0,20.0,0,0,4120060000.0,4120060000.0,Other (Man
4,2010.0,4540051,MULTIPOLYGON (((-73.87805078807256 40.71475698...,,08/22/2017 12:00:00 AM +0000,Constructed,1171655,28.330225,2100.0,112.0,0,0,4030600000.0,4030600000.0,Photogramm


In [6]:
# sample dimensions: number of records and number of fields
rows = len(df_sample)
columns = len(df_sample.columns)
print('This dataset has {:,} rows and {:,} columns.'.format(rows, columns))

This dataset has 15,182 rows and 15 columns.


In [7]:
# distinct construction years in ascending order
# (sorting just the one column rather than the whole dataframe)
df_sample['CNSTRCT_YR'].sort_values().unique()

array([2010., 2011., 2012., 2013., 2014., 2015., 2016., 2017., 2018.,
       2019., 2020.])

In [8]:
# sanity check: number of buildings per construction year, most frequent year first
df_sample['CNSTRCT_YR'].value_counts(sort=True, ascending=False)

2018.0    2099
2017.0    1911
2016.0    1782
2011.0    1490
2012.0    1447
2010.0    1369
2020.0    1253
2015.0    1077
2019.0     998
2013.0     905
2014.0     851
Name: CNSTRCT_YR, dtype: int64

In [9]:
# sanity check: number of buildings per construction year, ordered by year
# (the frequency ordering is skipped because the result is re-sorted by its index)
df_sample['CNSTRCT_YR'].value_counts(sort=False).sort_index()

2010.0    1369
2011.0    1490
2012.0    1447
2013.0     905
2014.0     851
2015.0    1077
2016.0    1782
2017.0    1911
2018.0    2099
2019.0     998
2020.0    1253
Name: CNSTRCT_YR, dtype: int64

In [10]:
# list items in data folder before exporting, for comparison with the listings after each write
%ls data/

README.md             sample-buildings.zip  [34munzipped-data[m[m/


In [11]:
# writing the sample as a csv
# index=False leaves the dataframe's row index out of the file
df_sample.to_csv('data/sample-data.csv', index=False)

# listing items in data folder
%ls data/

README.md             sample-data.csv
sample-buildings.zip  [34munzipped-data[m[m/


In [12]:
# writing the sample as an excel file
# index=False leaves the dataframe's row index out of the sheet
df_sample.to_excel('data/sample-data.xlsx', index=False)

# listing items in data folder
%ls data/

README.md             sample-data.csv       [34munzipped-data[m[m/
sample-buildings.zip  sample-data.xlsx


In [16]:
# writing the sample as json
# (no orient argument is passed, so pandas' default JSON layout is used)
df_sample.to_json('data/sample-data.json')

# listing items in data folder
%ls data/

README.md             sample-data.csv       sample-data.xlsx
sample-buildings.zip  sample-data.json      [34munzipped-data[m[m/


In [17]:
# reading in data as a geodataframe
# (geopandas reads the Shapefile export directly from the URL, so this cell needs network access)
# note: this reassigns `url`, which previously held the CSV endpoint
url = 'https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=Shapefile'
gdf = gpd.read_file(url)

# printing the first five rows
gdf.head()

Unnamed: 0,base_bbl,bin,cnstrct_yr,doitt_id,feat_code,geomsource,groundelev,heightroof,date_lstmo,time_lstmo,lststatype,mpluto_bbl,name,shape_area,shape_len,geometry
0,3065220021,3170958.0,1925.0,96807.0,2100.0,Photogramm,40.0,29.749853,2017-08-22,00:00:00.000,Constructed,3065220021,,0.0,0.0,"POLYGON ((-73.96665 40.62600, -73.96685 40.625..."
1,5012640036,5028452.0,1965.0,326368.0,2100.0,Photogramm,39.0,22.63,2017-08-22,00:00:00.000,Constructed,5012640036,,0.0,0.0,"POLYGON ((-74.16790 40.63936, -74.16790 40.639..."
2,5060190091,5078368.0,1970.0,746627.0,2100.0,Photogramm,51.0,35.76,2017-08-22,00:00:00.000,Constructed,5060190091,,0.0,0.0,"POLYGON ((-74.19511 40.55611, -74.19520 40.556..."
3,3086910048,3245111.0,1928.0,786626.0,2100.0,Photogramm,6.0,37.5,2017-08-22,00:00:00.000,Constructed,3086910048,,0.0,0.0,"POLYGON ((-73.96113 40.57744, -73.96115 40.577..."
4,4075020005,4161096.0,1950.0,746409.0,2100.0,Photogramm,93.0,18.015113,2017-08-22,00:00:00.000,Constructed,4075020005,,0.0,0.0,"POLYGON ((-73.75422 40.75591, -73.75417 40.755..."


In [18]:
# geodataframe dimensions: number of records and number of fields
rows = len(gdf)
columns = len(gdf.columns)
print('This dataset has {:,} rows and {:,} columns.'.format(rows, columns))

This dataset has 1,084,416 rows and 16 columns.


In [19]:
# keep only the buildings constructed from 2010 through 2020,
# then renumber the rows starting at zero
gdf_sample = (
    gdf[gdf['cnstrct_yr'].between(2010, 2020)]
    .reset_index(drop=True)
)

# preview the first five rows of the sample
gdf_sample.head()

Unnamed: 0,base_bbl,bin,cnstrct_yr,doitt_id,feat_code,geomsource,groundelev,heightroof,date_lstmo,time_lstmo,lststatype,mpluto_bbl,name,shape_area,shape_len,geometry
0,1009990008,1022662.0,2013.0,633694.0,2100.0,Photogramm,49.0,76.93,2017-08-22,00:00:00.000,Constructed,1009990008,,0.0,0.0,"POLYGON ((-73.98407 40.75857, -73.98425 40.758..."
1,3044520924,3394834.0,2011.0,1250314.0,2100.0,Other (Man,15.0,26.0,2017-08-22,00:00:00.000,Constructed,3044520924,,0.0,0.0,"POLYGON ((-73.87172 40.65519, -73.87179 40.655..."
2,4018780115,4540159.0,2010.0,201366.0,2100.0,Other (Man,37.0,28.0,2017-08-22,00:00:00.000,Constructed,4018780115,,0.0,0.0,"POLYGON ((-73.86650 40.74236, -73.86645 40.742..."
3,4120060029,4260357.0,2010.0,1184712.0,2100.0,Other (Man,20.0,29.0,2017-08-10,00:00:00.000,Constructed,4120060029,,0.0,0.0,"POLYGON ((-73.79408 40.68063, -73.79407 40.680..."
4,4030600139,4540051.0,2010.0,1171655.0,2100.0,Photogramm,112.0,28.330225,2017-08-22,00:00:00.000,Constructed,4030600139,,0.0,0.0,"POLYGON ((-73.87805 40.71476, -73.87787 40.714..."


In [20]:
# writing out spatial data in three formats:
# - Shapefile (no driver argument is passed); the listing below also shows the
#   accompanying .cpg, .dbf, .prj and .shx files next to the .shp
# - GeoJSON
# - GeoPackage, with the data stored in a layer named 'buildings'
gdf_sample.to_file('data/sample-data.shp')
gdf_sample.to_file('data/sample-data.geojson', driver='GeoJSON')
gdf_sample.to_file('data/sample-data.gpkg', layer='buildings', driver='GPKG')

# listing items in data folder
%ls data/

README.md             sample-data.geojson   sample-data.shx
sample-buildings.zip  sample-data.gpkg      sample-data.xlsx
sample-data.cpg       sample-data.json      [34munzipped-data[m[m/
sample-data.csv       sample-data.prj
sample-data.dbf       sample-data.shp


### Creating a zip file with our sample data

In [21]:
file_path = 'data/sample-buildings.zip'
csv_path = 'data/sample-data.csv'

# Create the zip archive holding the sample CSV.
# - mode 'w' creates the archive, replacing any existing file at file_path
# - ZIP_DEFLATED compresses the member; the default (ZIP_STORED) would only
#   store it uncompressed
# - the with-block closes the archive once the member has been written
with zipfile.ZipFile(file_path, 'w', compression=zipfile.ZIP_DEFLATED) as file:
    # arcname=basename(...) stores the CSV at the root of the archive
    # rather than under a 'data/' folder
    file.write(csv_path, arcname=basename(csv_path))

In [22]:
# checking that the archive at file_path is recognized as a zip file
print(zipfile.is_zipfile(file_path))

True


In [23]:
# list items in the data folder
%ls data/

README.md             sample-data.geojson   sample-data.shx
sample-buildings.zip  sample-data.gpkg      sample-data.xlsx
sample-data.cpg       sample-data.json      [34munzipped-data[m[m/
sample-data.csv       sample-data.prj
sample-data.dbf       sample-data.shp


In [24]:
# open the archive for reading; the with-block closes the underlying
# file handle as soon as we are done, instead of leaving it open
with zipfile.ZipFile(file_path) as items:
    # names of the files stored in the archive
    print(items.namelist())

['sample-data.csv']
