# Preview Metadata

Exploring the NYC Building Elevation and Subgrade (BES) Dataset in Python  

Author: Mark Bauer

In [1]:
import duckdb

In [2]:
# Print the versions of Python, IPython, and the listed packages with watermark, an IPython magic extension.
%reload_ext watermark
%watermark -v -p duckdb

Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

duckdb: 0.10.0



![cover-photo](images/dataset-cover-photo.png)
Screenshot of the dataset on NYC Open Data.

Link to dataset website: https://data.cityofnewyork.us/City-Government/Building-Elevation-and-Subgrade-BES-/bsin-59hv

In [3]:
# list the files in the local data/ directory
%ls data/

bes-data.parquet   nfip-data.parquet


In [4]:
# open a DuckDB connection (no database path is passed to connect())
con = duckdb.connect()

# SQL that materializes the parquet file as a table named bes_data
create_bes_table_sql = """
    CREATE TABLE bes_data AS
    FROM read_parquet('data/bes-data.parquet')
    """
con.sql(create_bes_table_sql)

# print the number of rows now in bes_data
row_count_rel = con.sql("SELECT COUNT(*) AS count_rows FROM bes_data")
row_count_rel.show()

┌────────────┐
│ count_rows │
│   int64    │
├────────────┤
│     861876 │
└────────────┘



In [5]:
# inspect each column's name and type, plus the other metadata DESCRIBE reports.
# no constraints are defined on this table, so every column shows as nullable here;
# the same check is most useful when inspecting a table from an existing database.
describe_df = con.sql("DESCRIBE bes_data").to_df()
describe_df

Unnamed: 0,column_name,column_type,null,key,default,extra
0,the_geom,VARCHAR,YES,,,
1,bin,BIGINT,YES,,,
2,bbl,BIGINT,YES,,,
3,borough,BIGINT,YES,,,
4,block,BIGINT,YES,,,
5,lot,BIGINT,YES,,,
6,address,VARCHAR,YES,,,
7,z_grade,DOUBLE,YES,,,
8,z_floor,DOUBLE,YES,,,
9,subgrade,VARCHAR,YES,,,


In [6]:
# number of columns: DESCRIBE yields one row per column, so count those rows
column_count_sql = "SELECT count(*) AS count_columns FROM (DESCRIBE bes_data)"
con.sql(column_count_sql).to_df()

Unnamed: 0,count_columns
0,26


In [7]:
# pull just five rows of the table into a DataFrame for a quick preview
preview_rel = con.sql("SELECT * FROM bes_data LIMIT 5")
sample_df = preview_rel.to_df()

sample_df

Unnamed: 0,the_geom,bin,bbl,borough,block,lot,address,z_grade,z_floor,subgrade,...,longitude,pluto_bbl,Council,BoroCD,CTLabel,BoroCT2020,NTA2020,NTAName,CDTA2020,CDTAName
0,POINT (-74.22274561187417 40.52134422844183),5128004,5075340353,5,7534,353,78 SAVO LOOP,29.428,32.332,N,...,-74.222746,5075340353,51,503,226.01,5022601,SI0304,Annadale-Huguenot-Prince's Bay-Woodrow,SI03,SI03 South Shore (CD 3 Approximation)
1,POINT (-74.24179250549321 40.52875316810818),5155392,5075960125,5,7596,125,72 CHART LOOP,25.366,26.703,N,...,-74.241793,5075960125,51,503,226.02,5022602,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
2,POINT (-74.24109012652094 40.528883822921635),5148808,5075960131,5,7596,131,40 TIDES LANE,36.172,38.902,N,...,-74.24109,5075960131,51,503,226.02,5022602,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
3,POINT (-74.24649640323727 40.507045708438184),5088274,5079150042,5,7915,42,328 MAIN STREET,69.897,72.459,N,...,-74.246496,5079150042,51,503,244.01,5024401,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
4,POINT (-74.24198032271697 40.51005666426356),5087850,5078680123,5,7868,123,309 SLEIGHT AVENUE,74.907,79.122,N,...,-74.24198,5078680123,51,503,244.01,5024401,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)


In [8]:
# the preview is too wide to display at once, so view it 15 columns at a time;
# this cell shows the first 15 columns
leading_columns = sample_df.iloc[:, :15]
leading_columns

Unnamed: 0,the_geom,bin,bbl,borough,block,lot,address,z_grade,z_floor,subgrade,notes1,notes2,notes3,x,y
0,POINT (-74.22274561187417 40.52134422844183),5128004,5075340353,5,7534,353,78 SAVO LOOP,29.428,32.332,N,Property was Successfully Measured,,,922321.468334,129295.106289
1,POINT (-74.24179250549321 40.52875316810818),5155392,5075960125,5,7596,125,72 CHART LOOP,25.366,26.703,N,Property was Successfully Measured,Attached Garage to Living Space,,917033.446429,132008.386296
2,POINT (-74.24109012652094 40.528883822921635),5148808,5075960131,5,7596,131,40 TIDES LANE,36.172,38.902,N,Property was Successfully Measured,,,917228.833178,132055.448538
3,POINT (-74.24649640323727 40.507045708438184),5088274,5079150042,5,7915,42,328 MAIN STREET,69.897,72.459,N,Property was Successfully Measured,,,915703.546202,124103.533995
4,POINT (-74.24198032271697 40.51005666426356),5087850,5078680123,5,7868,123,309 SLEIGHT AVENUE,74.907,79.122,N,Property was Successfully Measured,,,916962.418356,125196.988173


In [9]:
# the rest of the preview: every column from position 15 onward
trailing_columns = sample_df.iloc[:, 15:]
trailing_columns

Unnamed: 0,latitude,longitude,pluto_bbl,Council,BoroCD,CTLabel,BoroCT2020,NTA2020,NTAName,CDTA2020,CDTAName
0,40.521344,-74.222746,5075340353,51,503,226.01,5022601,SI0304,Annadale-Huguenot-Prince's Bay-Woodrow,SI03,SI03 South Shore (CD 3 Approximation)
1,40.528753,-74.241793,5075960125,51,503,226.02,5022602,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
2,40.528884,-74.24109,5075960131,51,503,226.02,5022602,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
3,40.507046,-74.246496,5079150042,51,503,244.01,5024401,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)
4,40.510057,-74.24198,5078680123,51,503,244.01,5024401,SI0305,Tottenville-Charleston,SI03,SI03 South Shore (CD 3 Approximation)


In [10]:
# per-column summary statistics, as reported by SUMMARIZE
column_stats_df = con.sql("SUMMARIZE bes_data").to_df()
column_stats_df

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,the_geom,VARCHAR,POINT (-73.70028493810709 40.73982670098595),POINT (-74.25526629488128 40.507487398894796),859938,,,,,,861876,0.0
1,bin,BIGINT,1000003,5799523,859098,3610723.244786953,1053306.7928804,3090435.0,4027304.0,4259253.0,861876,0.0
2,bbl,BIGINT,1090961,5999999999,796558,3517132892.094105,1034376481.787392,3029920782.0,4009117546.0,4104760364.0,861876,0.0
3,borough,BIGINT,1,5,5,3.464081218274112,1.0262401210524326,3.0,4.0,4.0,861876,0.0
4,block,BIGINT,1,99999,13737,5310.378501522842,3725.973701412565,2302.0,4856.0,7513.0,861876,0.0
5,lot,BIGINT,0,9999,2228,152.17001401593734,835.862078549265,18.0,38.0,64.0,861876,0.0
6,address,VARCHAR,1 1 PLACE,YORK AVENUE,799297,,,,,,861876,1.97
7,z_grade,DOUBLE,-13.096,402.839,154473,53.91501240781752,42.19289508297004,23.27459448281752,44.31280393267302,72.65348627165316,861876,0.0
8,z_floor,DOUBLE,0.0,404.397,157573,58.11370394813199,42.56067271472593,27.437067619111296,48.37118643622693,77.1645560876327,861876,0.0
9,subgrade,VARCHAR,N,Y,2,,,,,,861876,0.0


In [15]:
# rank columns by null percentage, highest first
# (the reported percentages might be affected by rounding)
summarize_df = con.sql("SUMMARIZE bes_data").to_df()

null_columns = ['column_name', 'null_percentage']
summarize_df = summarize_df[null_columns].sort_values('null_percentage', ascending=False)

summarize_df

Unnamed: 0,column_name,null_percentage
11,notes2,76.51
12,notes3,45.24
6,address,1.97
0,the_geom,0.0
15,latitude,0.0
24,CDTA2020,0.0
23,NTAName,0.0
22,NTA2020,0.0
21,BoroCT2020,0.0
20,CTLabel,0.0


In [15]:
# close the DuckDB connection opened earlier in the notebook
con.close()