# Working with Large Datasets

Practical patterns to handle datasets that do not fit comfortably in memory.


In [2]:
import pandas as pd
import numpy as np

## Reading data in chunks

In [3]:
chunks = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv',
    chunksize=50
)

sizes = []
for chunk in chunks:
    sizes.append(len(chunk))

sizes

[50, 50, 50, 50, 44]

In [4]:
chunks = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv',
    chunksize=50
)

total = 0
for chunk in chunks:
    total += chunk['total_bill'].sum()

total

np.float64(4827.77)

## Sampling strategies

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv')

df.sample(10, random_state=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
67,3.07,1.0,Female,Yes,Sat,Dinner,1
243,18.78,3.0,Female,No,Thur,Dinner,2
206,26.59,3.41,Male,Yes,Sat,Dinner,3
122,14.26,2.5,Male,No,Thur,Lunch,2
89,21.16,3.0,Male,No,Thur,Lunch,2
218,7.74,1.44,Male,Yes,Sat,Dinner,2
58,11.24,1.76,Male,Yes,Sat,Dinner,2
186,20.9,3.5,Female,Yes,Sun,Dinner,3
177,14.48,2.0,Male,Yes,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
df.sample(frac=0.1, random_state=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
67,3.07,1.0,Female,Yes,Sat,Dinner,1
243,18.78,3.0,Female,No,Thur,Dinner,2
206,26.59,3.41,Male,Yes,Sat,Dinner,3
122,14.26,2.5,Male,No,Thur,Lunch,2
89,21.16,3.0,Male,No,Thur,Lunch,2
218,7.74,1.44,Male,Yes,Sat,Dinner,2
58,11.24,1.76,Male,Yes,Sat,Dinner,2
186,20.9,3.5,Female,Yes,Sun,Dinner,3
177,14.48,2.0,Male,Yes,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
df.groupby('day').sample(n=3, random_state=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
93,16.32,4.3,Female,Yes,Fri,Dinner,2
223,15.98,3.0,Female,No,Fri,Lunch,3
96,27.28,4.0,Male,Yes,Fri,Dinner,2
211,25.89,5.16,Male,Yes,Sat,Dinner,4
74,14.73,2.2,Female,No,Sat,Dinner,2
73,25.28,5.0,Female,Yes,Sat,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2
3,23.68,3.31,Male,No,Sun,Dinner,2
47,32.4,6.0,Male,No,Sun,Dinner,4
78,22.76,3.0,Male,No,Thur,Lunch,2


## Downcasting data types

In [8]:
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [9]:
df['size'] = pd.to_numeric(df['size'], downcast='integer')
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size             int8
dtype: object

## Incremental processing

In [10]:
chunks = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv',
    chunksize=50
)

parts = []
for chunk in chunks:
    parts.append(chunk.groupby('day')['total_bill'].mean())

pd.concat(parts).groupby(level=0).mean()

day
Fri     15.711238
Sat     20.544585
Sun     21.930640
Thur    18.172354
Name: total_bill, dtype: float64

## Dask integration (conceptual)

In [12]:
import dask.dataframe as dd

In [13]:
# ddf = dd.read_csv('large_file.csv')
# ddf.groupby('day')['total_bill'].mean().compute()