### Imports

In [1]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time
import yaml
import testutility as util

### Util File

In [None]:
# util file

### YAML File

In [2]:
%%writefile file.yaml
# Settings for the parking-tickets dataset; the next cell loads this file
# with util.read_config_file("file.yaml").
file_type: csv
dataset_name: parking_data
# No extension here: the notebook builds the path as "./<file_path>.<file_type>".
file_path: data/parking_tickets_2017
table_name: parking
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# NOTE(review): these names do not appear in the header displayed by
# df_pandas.head() (e.g. "Summons Number", "Plate ID") -- confirm the
# expected column list for this dataset.
columns: 
    - city
    - price
    - distance

Overwriting file.yaml


In [3]:
# Read config
# Load the settings from file.yaml (written by the %%writefile cell) and
# echo the parsed result as the cell output so it can be checked by eye.
config_data = util.read_config_file("file.yaml")
config_data

{'file_type': 'csv',
 'dataset_name': 'parking_data',
 'file_path': 'data/parking_tickets_2017',
 'table_name': 'parking',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['city', 'price', 'distance']}

### Comparing reading speed of pandas, dask, modin, and ray

In [4]:
# Build the CSV location from the config: "./<file_path>.<file_type>"
file_path = "./{}.{}".format(config_data["file_path"], config_data["file_type"])
file_path

'./data/parking_tickets_2017.csv'

In [5]:
# Using Pandas
# Baseline measurement: pandas parses the whole CSV in one call.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals, so the result is not skewed if the system clock is
# adjusted while the file is being read.
start_time = time.perf_counter()
df_pandas = pd.read_csv(file_path)
pandas_time = time.perf_counter() - start_time

print(f"Pandas Reading Time: {pandas_time} seconds")
# Recorded on a previous run: Pandas Reading Time: 50.59611439704895 seconds



Pandas Reading Time: 50.59611439704895 seconds


In [None]:
# Using Dask
# dd.read_csv only builds a lazy task graph, so .compute() is kept inside
# the timed region: that call is what actually parses the file.
# time.perf_counter() is used because it is monotonic and meant for
# measuring elapsed intervals.
start_time = time.perf_counter()
df_dask = dd.read_csv(
    file_path,
    # Force these two columns to object dtype instead of letting Dask infer
    # a type for them -- presumably they hold mixed values; TODO confirm.
    dtype={"House Number": "object", "Time First Observed": "object"},
)
df_dask_computed = df_dask.compute()  # This forces the actual read
dask_time = time.perf_counter() - start_time

print(f"Dask Reading Time: {dask_time} seconds")
# Recorded on a previous run: Dask Reading Time: 42.42091226577759 seconds

In [None]:
# Using Modin
# time.perf_counter() is monotonic and meant for measuring elapsed intervals.
# NOTE(review): if this is the first Modin call in the session, the engine
# start-up cost may be included in the measured time -- confirm before
# comparing against the other readers.
start_time = time.perf_counter()
df_modin = mpd.read_csv(file_path)
modin_time = time.perf_counter() - start_time

print(f"Modin Reading Time: {modin_time} seconds")
# Recorded on a previous run: Modin Reading Time: 38.331472635269165 seconds

In [None]:
# Using Ray
# Ray is started before the timer so start-up is not part of the measurement.
if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)


@ray.remote
def read_csv(file_path):
    """Read a CSV file with pandas inside a Ray remote task.

    Args:
        file_path: Path of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``. When invoked through
        ``read_csv.remote(...)`` the caller receives an object reference
        and retrieves the DataFrame with ``ray.get``.
    """
    return pd.read_csv(file_path)


# time.perf_counter() is monotonic and meant for measuring elapsed intervals.
# The timed region covers submitting the task and fetching its result.
start_time = time.perf_counter()
future = read_csv.remote(file_path)
df = ray.get(future)  # blocks until the remote read has finished
ray_time = time.perf_counter() - start_time

print(f"Ray Reading Time: {ray_time} seconds")
# Recorded on a previous run: Ray Reading Time: 168.3240203857422 seconds

In [6]:
# Preview the first rows of the pandas-loaded frame to sanity-check the read.
df_pandas.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,5092469481,GZH7067,NY,PAS,07/10/2016,7,SUBN,TOYOT,V,0,...,GY,,2001,,0,,FAILURE TO STOP AT RED LIGHT,,,
1,5092451658,GZH7067,NY,PAS,07/08/2016,7,SUBN,TOYOT,V,0,...,GY,,2001,,0,,FAILURE TO STOP AT RED LIGHT,,,
2,4006265037,FZX9232,NY,PAS,08/23/2016,5,SUBN,FORD,V,0,...,BK,,2004,,0,,BUS LANE VIOLATION,,,
3,8478629828,66623ME,NY,COM,06/14/2017,47,REFG,MITSU,T,10610,...,WH,,2007,,0,04,47-Double PKG-Midtown,,,
4,7868300310,37033JV,NY,COM,11/21/2016,69,DELV,INTER,T,10510,...,WHITE,,2007,,0,31 6,69-Failure to Disp Muni Recpt,,,


### Data validation

In [None]:
# Perform basic validation on the data columns,
# e.g. remove special characters and whitespace from the column names