In [48]:
# pyarrow is the library that provides support for parquet files
!pip install pyarrow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [49]:
import pyarrow
import pandas as pd
import numpy as np

In [None]:
# For my example, I'm going to generate some random data about locations on a theoretical map
# Don't try to make sense out of the topology or geography

NUMROWS=100
SEED = 42  # fixed seed so re-running the notebook regenerates exactly the same rows

# A dedicated Generator keeps the draws reproducible without touching NumPy's global random state
rng = np.random.default_rng(SEED)

# The dict values are evaluated top to bottom, so each column consumes the next draws from rng.
# Upper bounds passed to rng.integers are exclusive.
raw = {
    'lat': rng.integers(0, 180000, NUMROWS)/1000,   # 0.000-179.999
    'lat_d': rng.choice(['N','S'], NUMROWS),        # N or S
    'lng': rng.integers(0, 180000, NUMROWS)/1000,   # 0.000-179.999
    'lng_d': rng.choice(['E','W'], NUMROWS),        # E or W
    'elevation': rng.integers(0, 10000, NUMROWS),   # 0-9999 (0 to just under 10km)
    'climate': rng.choice(['Tundra','Arid','Polar','Tropical','Alpine','Oceanic'], NUMROWS)
}

# One row per generated location, one column per key of raw
locations = pd.DataFrame(raw)

In [51]:
# Show the generated frame as the cell's output; long frames are abbreviated to their first and last rows
locations

Unnamed: 0,lat,lat_d,lng,lng_d,elevation,climate
0,33.500,S,80.002,W,2304,Arid
1,24.859,S,136.069,W,1737,Polar
2,138.023,S,96.807,W,3637,Tropical
3,79.865,S,165.303,E,3313,Alpine
4,118.190,N,44.093,E,9882,Tropical
...,...,...,...,...,...,...
99995,122.705,N,101.028,W,1605,Polar
99996,143.093,N,27.322,W,371,Alpine
99997,151.102,S,47.549,W,4272,Arid
99998,50.412,N,159.283,E,6909,Tundra


In [52]:
import sys

# Approximate in-memory footprint of the DataFrame, reported in bytes
df_nbytes = sys.getsizeof(locations)
df_nbytes

20300454

In [53]:
# The %timeit magic allows you to time the execution of a command and get the timing
#   -n1  : execute the statement once per run
#   -r10 : repeat the measurement for 10 runs
#   -o   : return the timing result object so it can be assigned (here to t)
# t.average, shown as the cell's value, is the mean time per loop in seconds
t = %timeit -n1 -r10 -o locations.to_csv('locations.csv')
t.average

176 ms ± 12.8 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


0.17630057909991592

In [54]:
# Same measurement (1 loop per run, 10 runs), this time writing to a '.gz' path.
# to_csv's default compression='infer' selects gzip from that extension,
# so the timing includes the cost of compressing the output.
t = %timeit -n1 -r10 -o locations.to_csv('locations.csv.gz')
t.average

456 ms ± 29.3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


0.45555860829772427

In [55]:
# Same measurement (1 loop per run, 10 runs) for the parquet writer.
# No compression argument is passed, so pandas' default codec applies (snappy, per the pandas docs).
t = %timeit -n1 -r10 -o locations.to_parquet('locations.parquet')
t.average

28.7 ms ± 2.59 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


0.028702275117393584

In [56]:
t = %timeit -n1 -r10 -o locations.to_parquet('locations.parquet.gz')
t.average

27.4 ms ± 1.07 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


0.027449595893267544

In [57]:
import os

# os.path.getsize(FILE) reports how many bytes a file occupies on disk;
# collect that figure for each of the four files written above, in the same order
sz_csv, sz_csvgz, sz_pq, sz_pqgz = (
    os.path.getsize(written_file)
    for written_file in (
        'locations.csv',
        'locations.csv.gz',
        'locations.parquet',
        'locations.parquet.gz',
    )
)

(sz_csv, sz_csvgz, sz_pq, sz_pqgz)

(3633761, 1356308, 1502851, 1502851)

In [58]:
# What are the tradeoffs for us when we look at larger and larger file sizes?
# Clean up your code so that you can capture the timing and file sizes for each format
# for multiple numbers of records. Pick some intervals between the 100 we've done here
# and as large as your laptop can handle. Maybe 10M or 100M rows?