# Using Large Datasets with Pandas
Tutorial based on "Using large datasets with Pandas" by Miki Tebeka

In [None]:
# Mount Google Drive and load the trip data (parquet by default)
from google.colab import drive
import pandas as pd

# Expose the Drive contents under /content/drive
drive.mount('/content/drive')

file_name = '/content/drive/MyDrive/BigData/yellow_tripdata_2021-02.parquet'
df = pd.read_parquet(file_name)

# CSV fallback: if the parquet file cannot be read, run these two lines instead
# file_name = '/content/drive/MyDrive/BigData/yellow_trip_data.csv'
# df = pd.read_csv(file_name)

# Preview the first rows of the loaded data
df.head()

In [None]:
# In-memory size of the whole DataFrame, in MB
# (deep=True also counts the contents of object columns such as strings)
mb = 1_000_000
per_column_bytes = df.memory_usage(deep=True)
per_column_bytes.sum() / mb

In [None]:
# On-disk size of the source file, in MB
from pathlib import Path

file_stats = Path(file_name).stat()
file_stats.st_size / mb

In [None]:
# Median trip distance for each vendor
trips_by_vendor = df.groupby('VendorID')
trips_by_vendor['trip_distance'].median()

In [None]:
# Load only the columns the analysis needs (vendor and trip distance)
# instead of pulling every column into memory
columns = ['VendorID', 'trip_distance']
df = pd.read_parquet(file_name, columns=columns)
# CSV alternative:
# df = pd.read_csv(file_name, usecols=columns)

# In-memory size of the reduced DataFrame, in MB
reduced_bytes = df.memory_usage(deep=True)
reduced_bytes.sum() / mb

In [None]:
# Same median-distance-per-vendor calculation, now on the reduced DataFrame
trips_by_vendor = df.groupby('VendorID')
trips_by_vendor['trip_distance'].median()

In [None]:
# Reload the full dataset: `df` was narrowed to two columns above, and the
# cells that follow need other columns (e.g. total_amount)
df = pd.read_parquet(file_name)
# CSV alternative:
# df = pd.read_csv(file_name)

In [None]:
# Inspect the data type of every column to spot candidates for a smaller type
df.dtypes

In [None]:
# Summary statistics for one variable (total amount here);
# the min and max rows give the range of values it takes
total_amount = df['total_amount']
total_amount.describe()

In [None]:
# Could this variable be stored as float32 instead of float64 to save memory?
# Show the float32 limits (min, max, resolution) to compare with the range above
import numpy as np

float32_limits = np.finfo(np.float32)
float32_limits

In [None]:
# Memory used by the total_amount column in its current (float64) form, in MB
mb = 1_000_000
total_amount = df['total_amount']
total_amount.memory_usage(deep=True) / mb

In [None]:
# Convert the column to float32 and measure it again, in MB:
# 4 bytes per value instead of 8, so the column takes noticeably less memory
amount = df['total_amount'].astype('float32')
amount.memory_usage(deep=True) / mb

In [None]:
# Add a human-readable vendor name column (easier to read than numeric ids)
names = {1: 'Creative', 2: 'VeriFone'}
vendor_ids = df['VendorID']
df['vendor'] = vendor_ids.map(names)

In [None]:
# Memory used by the numeric id column versus the string name column, in MB
mb = 1_000_000
id_size, name_size = (
    df[column].memory_usage(deep=True) / mb
    for column in ('VendorID', 'vendor')
)
print(f'id size: {id_size}, name size: {name_size}')

In [None]:
# Store the vendor names as a 'category' column to save memory,
# then measure the converted column in MB
vendor_as_category = df['vendor'].astype('category')
df['vendor'] = vendor_as_category
df['vendor'].memory_usage(deep=True) / mb

In [None]:
# The categorical column still reads as names rather than ids,
# while using less memory than plain strings; show the first 10 values
df['vendor'].head(10)