In [10]:
# util file

In [11]:
# yaml file

In [12]:
# Perform basic validation on the data columns,
# e.g. remove special characters and whitespace from the column names.

In [14]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time

# File path
file_path = "data/parking_tickets_2017.csv"

# Benchmark: elapsed time for each library to load the same CSV.
# Intervals are taken with time.perf_counter(), which is monotonic and uses
# the highest-resolution clock available, so a system clock adjustment during
# one of these long reads cannot distort the measured duration.
# NOTE(review): every loaded frame stays referenced for the rest of the run,
# and the first reader may also pay for a cold OS file cache, so the later
# timings may be skewed by memory pressure / caching — confirm before
# comparing the numbers.

# Using Pandas
start_time = time.perf_counter()
df_pandas = pd.read_csv(file_path)
pandas_time = time.perf_counter() - start_time

# Using Dask
# The two dtype overrides are passed to Dask only; presumably these columns
# hold mixed values that would otherwise break Dask's dtype inference —
# verify against the data.
start_time = time.perf_counter()
df_dask = dd.read_csv(file_path, dtype={'House Number': 'object', 'Time First Observed': 'object'})
df_dask_computed = df_dask.compute()  # This forces the actual read
dask_time = time.perf_counter() - start_time

# Using Modin
# NOTE(review): depending on the configured engine, Modin may start its
# backend on this first call and may execute asynchronously, so this interval
# may not correspond to "time until the data is fully loaded" — check Modin's
# benchmarking guidance.
start_time = time.perf_counter()
df_modin = mpd.read_csv(file_path)
modin_time = time.perf_counter() - start_time

# Using Ray
# Start a Ray runtime only when none is running yet in this process (an
# earlier step may already have started one). The is_initialized() guard
# already rules out a second initialisation, so no ignore_reinit_error flag
# is needed on the call itself.
if not ray.is_initialized():
    ray.init()
@ray.remote
def read_csv(file_path):
    """Load the CSV at ``file_path`` with pandas and return the DataFrame.

    The function is wrapped by ``ray.remote``, so callers invoke it as
    ``read_csv.remote(file_path)`` and fetch the result with ``ray.get``.
    """
    frame = pd.read_csv(file_path)
    return frame

# Time the Ray variant. The interval spans submitting the remote task and
# blocking in ray.get() until the DataFrame it produced has been returned to
# this process. time.perf_counter() is used because it is monotonic, so a
# system clock adjustment during the read cannot distort the duration.
start_time = time.perf_counter()
future = read_csv.remote(file_path)
df = ray.get(future)
ray_time = time.perf_counter() - start_time

# Print the reading times
print(f"Pandas Reading Time: {pandas_time} seconds")
print(f"Dask Reading Time: {dask_time} seconds")
print(f"Modin Reading Time: {modin_time} seconds")
print(f"Ray Reading Time: {ray_time} seconds")

# Pandas Reading Time: 455.1478934288025 seconds
# Dask Reading Time: 108.55801367759705 seconds
# Modin Reading Time: 478.030531167984 seconds
# Ray Reading Time: 707.9684960842133 seconds