# Note: this is not the analysis notebook.

### Goal: Exporting sample data for *Part 1: Reading and Writing Files in Python*



Mark Bauer

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import geopandas as gpd
from openpyxl import Workbook
import zipfile
from zipfile import ZipFile
from os.path import basename

Printing versions of Python modules and packages with **watermark** - the IPython magic extension.

Documentation for installing watermark: https://github.com/rasbt/watermark

In [3]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark

# -v reports the Python/IPython versions; -p reports the version of each listed package.
# NOTE(review): matplotlib, json, requests and sodapy are not imported in the
# visible cells of this notebook, while openpyxl (imported in the first cell)
# is not listed — confirm whether the package list should match the imports.
%watermark -v -p numpy,pandas,geopandas,matplotlib,json,requests,sodapy

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

numpy     : 1.23.1
pandas    : 1.4.3
geopandas : 0.11.1
matplotlib: 3.5.2
json      : 2.0.9
requests  : 2.28.1
sodapy    : 2.1.1



In [4]:
# CSV download endpoint on NYC Open Data
url = 'https://data.cityofnewyork.us/api/views/qb5r-6dgf/rows.csv?accessType=DOWNLOAD'

# pull the CSV straight from the URL into a dataframe
df = pd.read_csv(filepath_or_buffer=url)

# preview the first five rows
df.head(n=5)

Unnamed: 0,the_geom,NAME,BIN,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE,GLOBALID
0,MULTIPOLYGON (((-73.96664570466969 40.62599676...,,3170958,1925.0,08/22/2017 12:00:00 AM,Constructed,96807,29.749853,2100,40.0,0,0,3065220021,3065220021,Photogramm,{31298F86-3088-4F53-B3DB-71A9EFA6FA1F}
1,MULTIPOLYGON (((-74.16790202462265 40.63936048...,,5028452,1965.0,08/22/2017 12:00:00 AM,Constructed,326368,22.63,2100,39.0,0,0,5012640036,5012640036,Photogramm,{F5F8CDA5-69E2-46F8-8F69-BA95C025B520}
2,MULTIPOLYGON (((-74.19510813278613 40.55610681...,,5078368,1970.0,08/22/2017 12:00:00 AM,Constructed,746627,35.76,2100,51.0,0,0,5060190091,5060190091,Photogramm,{9F644794-F72C-4582-9E5E-B337E2B97068}
3,MULTIPOLYGON (((-73.96113466505085 40.57743931...,,3245111,1928.0,08/22/2017 12:00:00 AM,Constructed,786626,37.5,2100,6.0,0,0,3086910048,3086910048,Photogramm,{F916B22D-E25B-44AE-9FA9-2A51191B9CDF}
4,MULTIPOLYGON (((-73.75421559146166 40.75591276...,,4161096,1950.0,08/22/2017 12:00:00 AM,Constructed,746409,18.015113,2100,93.0,0,0,4075020005,4075020005,Photogramm,{525F2C24-616B-4F29-98A3-8FEA5D4B1A7D}


In [5]:
# report the dimensions of the full dataset with thousands separators
rows, columns = df.shape
shape_summary = 'This dataset has {:,} rows and {:,} columns.'.format(rows, columns)
print(shape_summary)

This dataset has 1,083,347 rows and 16 columns.


### For simplicity, we're only exporting buildings built from 2010 to 2020

This pertains to both the building footprints and the PLUTO datasets.

In [6]:
# flag the rows whose construction year is 2010 through 2020 (both ends included)
built_2010_to_2020 = df['CNSTRCT_YR'].between(2010, 2020)

# keep only the flagged rows as a new dataframe with a fresh 0..n-1 index
df_sample = df.loc[built_2010_to_2020].reset_index(drop=True)

# show the first five rows of the sample
df_sample.head()

Unnamed: 0,the_geom,NAME,BIN,CNSTRCT_YR,LSTMODDATE,LSTSTATYPE,DOITT_ID,HEIGHTROOF,FEAT_CODE,GROUNDELEV,SHAPE_AREA,SHAPE_LEN,BASE_BBL,MPLUTO_BBL,GEOMSOURCE,GLOBALID
0,MULTIPOLYGON (((-73.87172426474349 40.65519420...,,3394834,2011.0,08/22/2017 12:00:00 AM,Constructed,1250314,26.0,2100,15.0,0,0,3044520924,3044520924,Other (Man,{C045C815-79DB-4644-AD9D-C34AC03D1AB4}
1,MULTIPOLYGON (((-73.86650099829305 40.74236058...,,4540159,2010.0,08/22/2017 12:00:00 AM,Constructed,201366,28.0,2100,37.0,0,0,4018780115,4018780115,Other (Man,{FDF673E7-FF92-4A7A-AF6D-C49D77343C47}
2,MULTIPOLYGON (((-73.87805078807256 40.71475698...,,4540051,2010.0,08/22/2017 12:00:00 AM,Constructed,1171655,28.330225,2100,112.0,0,0,4030600139,4030600139,Photogramm,{788E5D72-46C1-443F-8BC9-6B97F329BFED}
3,MULTIPOLYGON (((-73.81520745135124 40.73053646...,,4545453,2012.0,08/22/2017 12:00:00 AM,Constructed,1118502,16.64,2100,74.0,0,0,4066560052,4066560052,Photogramm,{789A5A51-5B12-46DC-AE85-B06820A3225E}
4,MULTIPOLYGON (((-73.84769179857282 40.87911947...,,2118998,2012.0,08/22/2017 12:00:00 AM,Constructed,1254551,33.0,2100,154.0,0,0,2047220003,2047220003,Other (Man,{BB58FD7B-CC22-4896-901D-F8BAFF4AC129}


In [7]:
# report the dimensions of the 2010-2020 sample with thousands separators
rows, columns = df_sample.shape
sample_shape_summary = 'This dataset has {:,} rows and {:,} columns.'.format(rows, columns)
print(sample_shape_summary)

This dataset has 16,307 rows and 16 columns.


In [8]:
# distinct construction years present in the sample, in ascending order
df_sample['CNSTRCT_YR'].sort_values().unique()

array([2010., 2011., 2012., 2013., 2014., 2015., 2016., 2017., 2018.,
       2019., 2020.])

In [9]:
# sanity check: number of buildings per construction year, most frequent year first
construction_years = df_sample['CNSTRCT_YR']
construction_years.value_counts()

2018.0    2350
2017.0    1962
2016.0    1866
2020.0    1594
2011.0    1491
2012.0    1460
2010.0    1371
2019.0    1331
2015.0    1090
2013.0     918
2014.0     874
Name: CNSTRCT_YR, dtype: int64

In [10]:
# sanity check: number of buildings per construction year, in chronological order
counts_by_year = df_sample['CNSTRCT_YR'].value_counts()
counts_by_year.sort_index()

2010.0    1371
2011.0    1491
2012.0    1460
2013.0     918
2014.0     874
2015.0    1090
2016.0    1866
2017.0    1962
2018.0    2350
2019.0    1331
2020.0    1594
Name: CNSTRCT_YR, dtype: int64

In [11]:
# list the contents of the ../data/ folder (relative to the notebook's working directory)
%ls ../data/

README.md                nta_shape.shx            sample-data.json
building-footprints.csv  sample-buildings.zip     sample-data.prj
nta_shape.cpg            sample-data.cpg          sample-data.shp
nta_shape.dbf            sample-data.csv          sample-data.shx
nta_shape.geojson        sample-data.dbf          sample-data.xlsx
nta_shape.prj            sample-data.geojson      unzipped-data/
nta_shape.shp            sample-data.gpkg


In [12]:
# writing files as a csv
df_sample.to_csv('../data/sample-data.csv', index=False)

# writing files as an excel file
df_sample.to_excel('../data/sample-data.xlsx', index=False)

# writing files as json
df_sample.to_json('../data/sample-data.json')

# listing items in data folder
%ls ../data/

README.md                nta_shape.shx            sample-data.json
building-footprints.csv  sample-buildings.zip     sample-data.prj
nta_shape.cpg            sample-data.cpg          sample-data.shp
nta_shape.dbf            sample-data.csv          sample-data.shx
nta_shape.geojson        sample-data.dbf          sample-data.xlsx
nta_shape.prj            sample-data.geojson      unzipped-data/
nta_shape.shp            sample-data.gpkg


In [13]:
# Shapefile export endpoint on NYC Open Data
url = 'https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=Shapefile'

# load the shapefile from the URL into a geodataframe
gdf = gpd.read_file(filename=url)

# preview the first five rows
gdf.head(n=5)

KeyboardInterrupt: 

In [None]:
# report the dimensions of the geodataframe with thousands separators
rows, columns = gdf.shape
gdf_shape_summary = 'This dataset has {:,} rows and {:,} columns.'.format(rows, columns)
print(gdf_shape_summary)

In [21]:
# flag the rows whose construction year is 2010 through 2020 (both ends included)
gdf_built_2010_to_2020 = gdf['cnstrct_yr'].between(2010, 2020)

# keep only the flagged rows as a new geodataframe with a fresh 0..n-1 index
gdf_sample = gdf.loc[gdf_built_2010_to_2020].reset_index(drop=True)

# show the first five rows of the sample
gdf_sample.head()

Unnamed: 0,base_bbl,bin,cnstrct_yr,doitt_id,feat_code,geomsource,groundelev,heightroof,date_lstmo,time_lstmo,lststatype,mpluto_bbl,name,shape_area,shape_len,geometry
0,3044520924,3394834.0,2011.0,1250314.0,2100.0,Other (Man,15.0,26.0,2017-08-22,00:00:00.000,Constructed,3044520924,,0.0,0.0,"POLYGON ((-73.87172 40.65519, -73.87179 40.655..."
1,4018780115,4540159.0,2010.0,201366.0,2100.0,Other (Man,37.0,28.0,2017-08-22,00:00:00.000,Constructed,4018780115,,0.0,0.0,"POLYGON ((-73.86650 40.74236, -73.86645 40.742..."
2,4120060029,4260357.0,2010.0,1184712.0,2100.0,Other (Man,20.0,29.0,2017-08-10,00:00:00.000,Constructed,4120060029,,0.0,0.0,"POLYGON ((-73.79408 40.68063, -73.79407 40.680..."
3,4030600139,4540051.0,2010.0,1171655.0,2100.0,Photogramm,112.0,28.330225,2017-08-22,00:00:00.000,Constructed,4030600139,,0.0,0.0,"POLYGON ((-73.87805 40.71476, -73.87787 40.714..."
4,4066560052,4545453.0,2012.0,1118502.0,2100.0,Photogramm,74.0,16.64,2017-08-22,00:00:00.000,Constructed,4066560052,,0.0,0.0,"POLYGON ((-73.81521 40.73054, -73.81546 40.730..."


In [22]:
# writing out spatial data
gdf_sample.to_file('../data/sample-data.shp')
gdf_sample.to_file('../data/sample-data.geojson', driver='GeoJSON')
gdf_sample.to_file('../data/sample-data.gpkg', layer='buildings', driver='GPKG')

# listing items in data folder
%ls ../data/

README.md                nta_shape.shx            sample-data.json
building-footprints.csv  sample-buildings.zip     sample-data.prj
nta_shape.cpg            sample-data.cpg          sample-data.shp
nta_shape.dbf            sample-data.csv          sample-data.shx
nta_shape.geojson        sample-data.dbf          sample-data.xlsx
nta_shape.prj            sample-data.geojson      unzipped-data/
nta_shape.shp            sample-data.gpkg


### Creating a ZIP file with our sample data

In [23]:
file_path = '../data/sample-buildings.zip'

# create a zipfile
with zipfile.ZipFile(file_path, 'w') as file:
        # write mode overrides all the existing files in the 'Zip.'
        # you have to create the file which you have to write to the 'Zip.'
        file.write('../data/sample-data.csv', basename('../data/sample-data.csv'))
        
# seeing if a file is a zipfile
print(zipfile.is_zipfile(file_path))  

# list items in this file path
%ls ../data/

In [26]:
# open the archive read-only; the context manager closes the underlying file
# handle as soon as the listing has been printed (the original left it open)
with zipfile.ZipFile(file_path) as items:
    # names of the files stored in the archive
    print(items.namelist())

['sample-data.csv']
