### Imports

In [18]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time
import yaml
import testutility as util

### Util File

In [10]:
# util file

### YAML File

In [28]:
%%writefile file.yaml
# Settings loaded later in the notebook with util.read_config_file("file.yaml").
file_type: csv
dataset_name: parking_data
# No extension here: the notebook builds the path as "./<file_path>.<file_type>".
file_path: data/parking_tickets_2017
table_name: parking
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# NOTE(review): these names do not look like columns of the parking-ticket CSV
# (the Dask cell refers to "House Number" and "Time First Observed") - confirm
# the expected column list before using it for validation.
columns: 
    - city
    - price
    - distance

Overwriting file.yaml


In [29]:
# Read config
# Load the settings from file.yaml through the project helper and echo the
# resulting object as the cell output so the parsed values can be checked.
config_data = util.read_config_file("file.yaml")
config_data

{'file_type': 'csv',
 'dataset_name': 'parking_data',
 'file_path': 'data/parking_tickets_2017',
 'table_name': 'parking',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['city', 'price', 'distance']}

### Comparing reading speed of pandas, dask, modin, and ray

In [31]:
# Build the CSV location from the configured base path and file type,
# then echo it as the cell output.
file_path = "./{}.{}".format(config_data["file_path"], config_data["file_type"])
file_path

'./data/parking_tickets_2017.csv'

In [34]:
# Using Pandas
# Baseline: read the whole CSV eagerly with plain pandas.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can be skewed
# by system clock adjustments during a long read.
start_time = time.perf_counter()
df_pandas = pd.read_csv(file_path)
pandas_time = time.perf_counter() - start_time

print(f"Pandas Reading Time: {pandas_time} seconds")



Pandas Reading Time: 61.86948895454407 seconds


In [38]:
# Using Dask
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can be skewed
# by system clock adjustments during a long read.
start_time = time.perf_counter()
df_dask = dd.read_csv(
    file_path,
    # Force object dtype for these two columns instead of relying on
    # dtype inference.
    dtype={"House Number": "object", "Time First Observed": "object"},
)
# dd.read_csv only builds a lazy task graph; compute() forces the actual read,
# so it has to be inside the timed region.
df_dask_computed = df_dask.compute()
dask_time = time.perf_counter() - start_time

print(f"Dask Reading Time: {dask_time} seconds")



Dask Reading Time: 53.03518986701965 seconds


In [40]:
# Using Modin
# Same read as the pandas cell, but through Modin's pandas-compatible API.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can be skewed
# by system clock adjustments during a long read.
start_time = time.perf_counter()
df_modin = mpd.read_csv(file_path)
modin_time = time.perf_counter() - start_time

print(f"Modin Reading Time: {modin_time} seconds")

Modin Reading Time: 74.55038928985596 seconds


In [42]:
# Using Ray
# Start Ray only if it is not already running in this kernel. This happens
# before the timer starts, so start-up cost is not part of the measurement.
if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)


@ray.remote
def read_csv(file_path):
    """Read a CSV file with pandas inside a Ray remote task.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)


# NOTE(review): only one remote task is submitted, so this presumably measures
# a single pandas read plus the cost of handing the result back through
# ray.get, not a parallel read - confirm that this is the intended comparison.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can be skewed
# by system clock adjustments during a long read.
start_time = time.perf_counter()
future = read_csv.remote(file_path)
df = ray.get(future)  # blocks until the remote task has finished
ray_time = time.perf_counter() - start_time

print(f"Ray Reading Time: {ray_time} seconds")

Ray Reading Time: 411.440633058548 seconds


### Data validation

In [12]:
# Perform basic validation on the data columns,
# e.g. remove special characters and whitespace from the column names