In [1]:
import ray
import pandas as pd

In [None]:
# Root location of the parquet files read throughout this notebook.
DATA_PATH = "s3://anyscale-public-materials/nyc-taxi-cab"

# Columns requested from the parquet files; every read below passes this
# list as `columns=` so only these fields are loaded.
COLUMNS = [
    # trip attributes
    "trip_distance", "passenger_count",
    # pickup / drop-off location identifiers
    "PULocationID", "DOLocationID",
    # payment fields
    "payment_type", "tolls_amount", "tip_amount", "total_amount",
]

In [None]:
# Read the yellow_tripdata_2011-05 parquet file into an in-memory pandas
# DataFrame, restricted to the columns listed in COLUMNS.
df = pd.read_parquet(path=f"{DATA_PATH}/yellow_tripdata_2011-05.parquet", columns=COLUMNS)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# In-memory size of the pandas frame in MiB. memory_usage() returns one value
# per column (plus the index), so a single .sum() already yields the total.
df.memory_usage(deep=True).sum() / 1024**2

In [None]:
!aws s3 ls s3://anyscale-public-materials/nyc-taxi-cab/ --human-readable | wc -l

In [None]:
# Build a Ray dataset over everything under DATA_PATH (not just one file),
# selecting the same columns as the pandas load.
ds = ray.data.read_parquet(DATA_PATH, columns=COLUMNS)

In [None]:
# Show the notebook representation of the Ray dataset object.
ds

In [None]:
def adjust_total_amount(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``adjusted_total_amount`` column equal to total minus tip.

    The input frame is left untouched; a new DataFrame carrying the extra
    column is returned. This keeps the function free of side effects, so it
    can be re-run on the same frame and passed as a batch function without
    altering the caller's data.

    Args:
        df: Frame containing ``total_amount`` and ``tip_amount`` columns.

    Returns:
        A new DataFrame with all original columns plus
        ``adjusted_total_amount``.

    Raises:
        KeyError: If ``total_amount`` or ``tip_amount`` is missing.
    """
    # assign() builds a new frame instead of writing into the argument.
    return df.assign(adjusted_total_amount=df["total_amount"] - df["tip_amount"])

# Run the transformation on the in-memory pandas frame and rebind `df` to the
# result, so later cells see the `adjusted_total_amount` column.
df = adjust_total_amount(df)

In [None]:
# Use the same function on the Ray dataset; batch_format="pandas" asks for
# each batch to be handed to the function as a pandas DataFrame.
ds_adjusted = ds.map_batches(
    adjust_total_amount,
    batch_format="pandas",
)

In [None]:
def compute_tip_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``tip_percentage`` column holding tip divided by total.

    The value is the raw ratio ``tip_amount / total_amount`` (a fraction, not
    multiplied by 100). The input frame is not modified; a new DataFrame
    with the extra column is returned.

    Args:
        df: Frame containing ``tip_amount`` and ``total_amount`` columns.

    Returns:
        A new DataFrame with all original columns plus ``tip_percentage``.

    Raises:
        KeyError: If ``tip_amount`` or ``total_amount`` is missing.
    """
    # assign() builds a new frame instead of writing into the argument.
    return df.assign(tip_percentage=df["tip_amount"] / df["total_amount"])

# Run the tip-ratio transformation on the pandas frame and rebind `df` to the
# result.
df = compute_tip_percentage(df)

In [None]:
# Chain a second batch transformation onto the adjusted dataset, this time
# with an explicit batch_size of 1024.
ds_tip = ds_adjusted.map_batches(
    compute_tip_percentage,
    batch_format="pandas",
    batch_size=1024,
)

In [None]:
# Pull one batch from the untransformed dataset for inspection.
ds.take_batch()

In [None]:
# Pull one batch from the dataset that has both transformations attached.
ds_tip.take_batch()

In [None]:
# Directory that receives the parquet outputs written in the cells below.
storage_folder = "/mnt/cluster_storage"

In [None]:
# Write the pandas frame to a parquet file inside storage_folder.
df.to_parquet(f"{storage_folder}/adjusted_data.parquet")

In [None]:
# List the file written by pandas, with a human-readable size.
!ls -lh {storage_folder}/adjusted_data.parquet

In [None]:
!rm -rf /mnt/cluster_storage/adjusted_data_ray/ 
ds_limited = ds_adjusted.limit(df.shape[0]) 
ds_limited.write_parquet(f"{storage_folder}/adjusted_data_ray/")

In [None]:
# List the contents of the directory written by Ray.
!ls -lh {storage_folder}/adjusted_data_ray/

In [None]:
# Read the same data again, this time passing shuffle="files" to the reader.
ds_file_shuffled = ray.data.read_parquet(
    DATA_PATH,
    columns=COLUMNS,
    shuffle="files",
)

In [None]:
# Show the notebook representation of the file-shuffled dataset.
ds_file_shuffled

In [None]:
# NOTE: this rebinds `ds` to a dataset over a single file (the same one loaded
# into pandas above); the shuffle and groupby cells below use this `ds`.
# The path is derived from DATA_PATH so the bucket location is defined once.
ds = ray.data.read_parquet(
    f"{DATA_PATH}/yellow_tripdata_2011-05.parquet",
    columns=COLUMNS,
)

In [None]:
# Shuffle at block granularity via randomize_block_order(), then convert the
# result to a pandas DataFrame for display.
ds_block_based_shuffle = ds.randomize_block_order()
ds_block_based_shuffle.to_pandas()

In [None]:
# Shuffle via random_shuffle(). No seed is passed here.
ds_row_based_shuffle = ds.random_shuffle()

In [None]:
# Convert the row-shuffled dataset to a pandas DataFrame for display.
ds_row_based_shuffle.to_pandas()

In [None]:
# pandas version: mean trip distance for each payment type.
df.groupby("payment_type")["trip_distance"].mean()

In [None]:
# Ray version of the same aggregation: repartition into `num_cpus` blocks,
# group by payment type, average the trip distance, and collect to pandas.
num_cpus = 8
(
    ds.repartition(num_cpus)
    .groupby("payment_type")
    .mean("trip_distance")
    .to_pandas()
)

In [None]:
!rm {storage_folder}/adjusted_data.parquet
!rm -rf {storage_folder}/adjusted_data_ray/