### Imports

In [1]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time

### Util File

In [2]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

# Supplied by Data Glacier
def read_config_file(filepath):
    """Load a YAML configuration file and return its parsed content.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file is not valid YAML (the parse error is logged, not raised).
    """
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Malformed YAML is reported through logging; the caller gets None.
            logging.error(exc)
            return None
    return parsed


def num_col_validation(df, table_config) -> bool:
    """Check that the frame has as many columns as the config declares.

    Args:
        df: Object exposing a ``columns`` sequence (e.g. a pandas DataFrame).
        table_config: Mapping with a ``'columns'`` entry listing the expected
            column names.

    Returns:
        True when both column counts are equal, False otherwise.
    """
    # The comparison already yields a bool, so no if/else is needed.
    return len(df.columns) == len(table_config['columns'])


def col_header_val(df, table_config) -> bool:
    """Check that the frame's column names match the configured ones.

    Names are compared after normalisation (converted to ``str``, surrounding
    whitespace stripped, lower-cased, spaces replaced by ``_``) and
    irrespective of column order.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).
        table_config: Mapping with a ``'columns'`` entry listing the expected
            column names.

    Returns:
        True when the normalised names match. Otherwise prints the sorted
        list of normalised names that differ between the two sides and
        returns False.
    """
    # Local import keeps the function self-contained within testutility.py.
    from collections import Counter

    def _normalise(label) -> str:
        # str() guards against non-string labels, e.g. a YAML entry such as
        # `- 2017` that is parsed as an int and has no .strip().
        return str(label).strip().lower().replace(' ', '_')

    df_columns = sorted(_normalise(col) for col in df.columns)
    yaml_columns = sorted(_normalise(col) for col in table_config['columns'])

    if df_columns == yaml_columns:
        return True

    # Multiset difference in both directions: unlike a plain set symmetric
    # difference this also reports names whose *count* differs (duplicates),
    # so the list is never empty when validation fails. Sorting makes the
    # printed order deterministic.
    df_counts = Counter(df_columns)
    yaml_counts = Counter(yaml_columns)
    mismatched_columns = sorted((df_counts - yaml_counts) + (yaml_counts - df_counts))
    print(f"Mismatched columns: {mismatched_columns}")
    return False

def summary(df, file_path) -> None:
    """Print the row count, column count and on-disk size of a dataset.

    Args:
        df: Object supporting ``len()`` and exposing ``columns``
            (e.g. a pandas DataFrame).
        file_path: Path of the file whose size is reported.
    """
    # Size on disk, converted from bytes using 1024 * 1024 bytes per MB.
    bytes_per_mb = 1024 * 1024
    size_mb = os.path.getsize(file_path) / bytes_per_mb

    # Dimensions of the in-memory frame.
    n_rows, n_cols = len(df), len(df.columns)

    report = (
        f"Total number of rows: {n_rows}",
        f"Total number of columns: {n_cols}",
        f"File size: {size_mb:.2f} MB",
    )
    for line in report:
        print(line)

Overwriting testutility.py


### YAML File

In [3]:
%%writefile file.yaml
# Ingestion settings for the parking tickets dataset, read by the notebook
# through testutility.read_config_file.

# Input file is located at "./<file_path>.<file_type>".
file_type: csv
# Export file is written to "<export_file_path>.<export_file_type>".
export_file_type: csv
# NOTE(review): dataset_name, table_name and skip_leading_rows are not read by
# any visible notebook cell - confirm whether they are still needed.
dataset_name: parking_data
file_path: data/parking_tickets_2017
export_file_path: data/parking_tickets_validated
table_name: parking
# Field separator used when reading the input file.
inbound_delimiter: ","
# Field separator used when writing the export file.
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column headers. The validation helpers compare names after
# stripping surrounding whitespace, lower-casing and replacing spaces with
# underscores, so a header padded with trailing spaces in the CSV still
# matches its entry here.
columns:
    - Summons Number
    - Plate ID
    - Registration State
    - Plate Type
    - Issue Date
    - Violation Code
    - Vehicle Body Type
    - Vehicle Make
    - Issuing Agency
    - Street Code1
    - Street Code2
    - Street Code3
    - Vehicle Expiration Date
    - Violation Location
    - Violation Precinct
    - Issuer Precinct
    - Issuer Code
    - Issuer Command
    - Issuer Squad
    - Violation Time
    - Time First Observed
    - Violation County
    - Violation In Front Of Or Opposite
    - House Number
    - Street Name
    - Intersecting Street
    - Date First Observed
    - Law Section
    - Sub Division
    - Violation Legal Code
    - Days Parking In Effect
    - From Hours In Effect
    - To Hours In Effect
    - Vehicle Color
    - Unregistered Vehicle?
    - Vehicle Year
    - Meter Number
    - Feet From Curb
    - Violation Post Code
    - Violation Description
    - No Standing or Stopping Violation
    - Hydrant Violation
    - Double Parking Violation

Overwriting file.yaml


In [4]:
# Read config
# testutility.py is generated by an earlier %%writefile cell, so this import
# only succeeds after that cell has been run.
import testutility as util

# Parse file.yaml into the settings used by the rest of the notebook.
config_data = util.read_config_file("file.yaml")

# Bare last expression: display the parsed configuration as the cell output.
config_data

{'file_type': 'csv',
 'export_file_type': 'csv',
 'dataset_name': 'parking_data',
 'file_path': 'data/parking_tickets_2017',
 'export_file_path': 'data/parking_tickets_validated',
 'table_name': 'parking',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['Summons Number',
  'Plate ID',
  'Registration State',
  'Plate Type',
  'Issue Date',
  'Violation Code',
  'Vehicle Body Type',
  'Vehicle Make',
  'Issuing Agency',
  'Street Code1',
  'Street Code2',
  'Street Code3',
  'Vehicle Expiration Date',
  'Violation Location',
  'Violation Precinct',
  'Issuer Precinct',
  'Issuer Code',
  'Issuer Command',
  'Issuer Squad',
  'Violation Time',
  'Time First Observed',
  'Violation County',
  'Violation In Front Of Or Opposite',
  'House Number',
  'Street Name',
  'Intersecting Street',
  'Date First Observed',
  'Law Section',
  'Sub Division',
  'Violation Legal Code',
  'Days Parking In Effect',
  'From Hours In Effect',
  'To Hours In Effe

### Comparing reading speed of pandas, dask, modin, and ray

In [5]:
# Build the input path as "./<file_path>.<file_type>" from the config values
file_path = "./{}.{}".format(config_data["file_path"], config_data["file_type"])
file_path

'./data/parking_tickets_2017.csv'

In [6]:
# Using Pandas
# time.perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is a wall clock and can shift if the system time is adjusted.
start_time = time.perf_counter()
df_pandas = pd.read_csv(file_path, delimiter=config_data['inbound_delimiter'])
pandas_time = time.perf_counter() - start_time

print(f"Pandas Reading Time: {pandas_time} seconds")
# Result recorded from an earlier run (timings vary between runs):
# Pandas Reading Time: 50.59611439704895 seconds



Pandas Reading Time: 52.94884657859802 seconds


In [7]:
# Using Dask
# start_time = time.time()
# df_dask = dd.read_csv(
#     file_path,
#     delimiter=config_data['inbound_delimiter'],
#     dtype={"House Number": "object", "Time First Observed": "object"}
# )
# df_dask_computed = df_dask.compute()  # This forces the actual read
# dask_time = time.time() - start_time

# print(f"Dask Reading Time: {dask_time} seconds")
# Dask Reading Time: 42.42091226577759 seconds

In [8]:
# Using Modin
# start_time = time.time()
# df_modin = mpd.read_csv(file_path, delimiter=config_data["inbound_delimiter"])
# modin_time = time.time() - start_time

# print(f"Modin Reading Time: {modin_time} seconds")
# Modin Reading Time: 38.331472635269165 seconds

In [9]:
# Using Ray
# if not ray.is_initialized():
#     ray.init(ignore_reinit_error=True)

# @ray.remote
# def read_csv(file_path):
#     return pd.read_csv(file_path, delimiter=config_data["inbound_delimiter"])

# start_time = time.time()
# future = read_csv.remote(file_path)
# df = ray.get(future)
# ray_time = time.time() - start_time

# print(f"Ray Reading Time: {ray_time} seconds")
# Ray Reading Time: 168.3240203857422 seconds

In terms of reading time (each figure comes from a single recorded run, so expect some variation between runs), Modin was the fastest at approximately 38.33 seconds, followed by Dask at around 42.42 seconds and Pandas at about 50.60 seconds; Ray was the slowest, taking approximately 168.32 seconds. Ray's slower performance in this case is likely due to the overhead of setting up and managing the Ray cluster rather than to the read itself: the Ray cell simply runs one `pd.read_csv` call inside a single remote task, so the read is not parallelised, and the resulting DataFrame then has to be fetched back with `ray.get`. Ray is better suited to more complex and computationally intensive tasks, where the advantages of distributed computing are more apparent. For a basic file-reading task like this one, Pandas, Dask, or Modin provide faster and more straightforward solutions.

In [10]:
# Remove pandas' limit on displayed columns so every column of the wide frame is shown.
pd.set_option("display.max_columns", None)
# Print the per-column dtype overview; max_cols=50 caps how many columns are listed.
df_pandas.info(max_cols=50)
# Bare last expression: display the first rows as the cell output.
df_pandas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10803028 entries, 0 to 10803027
Data columns (total 43 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   Summons Number                     int64  
 1   Plate ID                           object 
 2   Registration State                 object 
 3   Plate Type                         object 
 4   Issue Date                         object 
 5   Violation Code                     int64  
 6   Vehicle Body Type                  object 
 7   Vehicle Make                       object 
 8   Issuing Agency                     object 
 9   Street Code1                       int64  
 10  Street Code2                       int64  
 11  Street Code3                       int64  
 12  Vehicle Expiration Date            int64  
 13  Violation Location                 float64
 14  Violation Precinct                 int64  
 15  Issuer Precinct                    int64  
 16  Issuer Code     

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,5092469481,GZH7067,NY,PAS,07/10/2016,7,SUBN,TOYOT,V,0,0,0,0,,0,0,0,,,0143A,,BX,,,ALLERTON AVE (W/B) @,BARNES AVE,0,1111,D,T,,,,GY,,2001,,0,,FAILURE TO STOP AT RED LIGHT,,,
1,5092451658,GZH7067,NY,PAS,07/08/2016,7,SUBN,TOYOT,V,0,0,0,0,,0,0,0,,,0400P,,BX,,,ALLERTON AVE (W/B) @,BARNES AVE,0,1111,D,T,,,,GY,,2001,,0,,FAILURE TO STOP AT RED LIGHT,,,
2,4006265037,FZX9232,NY,PAS,08/23/2016,5,SUBN,FORD,V,0,0,0,0,,0,0,0,,,0233P,,BX,,,SB WEBSTER AVE @ E 1,94TH ST,0,1111,C,T,,,,BK,,2004,,0,,BUS LANE VIOLATION,,,
3,8478629828,66623ME,NY,COM,06/14/2017,47,REFG,MITSU,T,10610,34330,34350,20180630,14.0,14,14,359594,T102,J,1120A,,NY,O,330.0,7th Ave,,0,408,l2,,Y,0700A,0700P,WH,,2007,,0,04,47-Double PKG-Midtown,,,
4,7868300310,37033JV,NY,COM,11/21/2016,69,DELV,INTER,T,10510,34310,34330,20170228,13.0,13,13,364832,T102,M,0555P,,NY,F,799.0,6th Ave,,0,408,h1,,Y,0700A,0700P,WHITE,,2007,,0,31 6,69-Failure to Disp Muni Recpt,,,


### Data validation

In [11]:
# Print the header read from the CSV followed by the column list declared in
# file.yaml, so the two can be compared by eye.
print(f"Columns from {config_data['file_path']}.{config_data['file_type']}:")
print(df_pandas.columns)

print("Columns from file.yaml:")
print(config_data['columns'])

Columns from data/parking_tickets_2017.csv:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parkin

In [12]:
# Validate that the CSV has as many columns as file.yaml declares.
# num_col_validation returns a bool, so it is tested directly instead of
# being compared with `== True`.
if util.num_col_validation(df_pandas, config_data):
    print("Number of columns match!")
else:
    print("Number of columns do not match!")

Number of columns match!


In [13]:
# Validate that the CSV column names match those declared in file.yaml.
# col_header_val returns a bool, so it is tested directly instead of being
# compared with `== True`.
if util.col_header_val(df_pandas, config_data):
    print("Column name validation passed!")
else:
    print("Column name validation failed!")

Column name validation passed!


### Output CSV with | Delimiter

In [14]:
# Build the export path as "<export_file_path>.<export_file_type>" from the config.
to_csv_file_path = "{}.{}".format(
    config_data['export_file_path'], config_data['export_file_type']
)

# Write the frame with the configured outbound delimiter and without the index column.
df_pandas.to_csv(
    to_csv_file_path,
    sep=config_data["outbound_delimiter"],
    index=False,
)

In [15]:
# Testing to see if the CSV was written successfully with the | delimiter:
# print the first five lines of the exported file.
with open(to_csv_file_path, 'r') as file:
    for i, line in enumerate(file):
        if i >= 5:
            break
        # Each line read from the file already ends with a newline; suppress
        # print's own newline so the preview is not double-spaced.
        print(line, end="")

Summons Number|Plate ID|Registration State|Plate Type|Issue Date|Violation Code|Vehicle Body Type|Vehicle Make|Issuing Agency|Street Code1|Street Code2|Street Code3|Vehicle Expiration Date|Violation Location|Violation Precinct|Issuer Precinct|Issuer Code|Issuer Command|Issuer Squad|Violation Time|Time First Observed|Violation County|Violation In Front Of Or Opposite|House Number|Street Name|Intersecting Street|Date First Observed|Law Section|Sub Division|Violation Legal Code|Days Parking In Effect    |From Hours In Effect|To Hours In Effect|Vehicle Color|Unregistered Vehicle?|Vehicle Year|Meter Number|Feet From Curb|Violation Post Code|Violation Description|No Standing or Stopping Violation|Hydrant Violation|Double Parking Violation

5092469481|GZH7067|NY|PAS|07/10/2016|7|SUBN|TOYOT|V|0|0|0|0||0|0|0|||0143A||BX|||ALLERTON AVE (W/B) @|BARNES AVE|0|1111|D|T||||GY||2001||0||FAILURE TO STOP AT RED LIGHT|||

5092451658|GZH7067|NY|PAS|07/08/2016|7|SUBN|TOYOT|V|0|0|0|0||0|0|0|||0400P||BX|||AL

In [16]:
# Report the DataFrame's row/column counts and the size of the exported file.
util.summary(df_pandas, to_csv_file_path)

Total number of rows: 10803028
Total number of columns: 43
File size: 2005.60 MB
