In [1]:
import gc
import os
import threading
import time
import timeit

In [2]:
import multiprocessing
import ray
# Start Ray with one worker slot per physical core: cpu_count() reports logical
# CPUs, so halve it to circumvent SMT (assumes two hardware threads per core).
# - max(1, ...) keeps a single-CPU host from starting Ray with zero CPUs.
# - ignore_reinit_error=True lets this cell be re-run without raising because
#   Ray is already initialised in the kernel.
ray.init(
    num_cpus=max(1, multiprocessing.cpu_count() // 2),
    ignore_reinit_error=True,
)
import modin.pandas as pd
import pandas as realpd

2021-10-28 16:49:13,773	INFO services.py:1263 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [7]:
# Load the small test CSV; `pd` is modin.pandas in this notebook, so `t` is a Modin frame.
t = pd.read_csv("testinput.csv")

In [15]:
# Disabled experiment: t.replace(float("NaN"), "test")
# Group rows by "day_2" and take the per-group mean (label and capacity in the output below).
t.groupby("day_2").mean()

Unnamed: 0_level_0,label,capacity
day_2,Unnamed: 1_level_1,Unnamed: 2_level_1
Friday,,5.0
Thursday,,5.0
Tuesday,,5.0
Wednesday,,5.0


In [None]:
# Event object for cross-thread signalling; nothing in the visible cells sets or waits on it yet.
evt = threading.Event()

In [5]:
def testfun():
    """Demo thread body: announce, block for five seconds, announce again.

    Returns:
        dict: the literal ``{"a": 1}``. When this function is used as a
        ``threading.Thread`` target the return value is discarded by the
        thread machinery, so callers that need it must call it directly.
    """
    nap_seconds = 5
    payload = {"a": 1}

    print("hi i'm a thread, sleeping then returning some object")
    time.sleep(nap_seconds)
    print("now, i awake")
    return payload

In [8]:
def runall():
    """Run ``testfun`` on a worker thread and wait for it to finish.

    The main thread starts the worker, prints a message, sleeps for one
    second, then joins the worker before printing "main thread done".
    Joining keeps the worker from outliving this call, so its final print
    cannot land in the output of whichever cell runs next.

    NOTE(review): despite the printed message, no stop signal is sent;
    ``testfun`` sleeps unconditionally and has no cancellation hook. Wiring
    in a ``threading.Event`` is still an open TODO.
    """
    # Named `worker` rather than `t` so it does not read like the notebook's DataFrame `t`.
    worker = threading.Thread(target=testfun)
    worker.start()
    print("joke's on you, i'm going to signal the child thread to stop in 1 second...")
    time.sleep(1)

    # Block until the worker returns so the thread is not left running.
    worker.join()
    print("main thread done")

In [7]:
# Run the threading demo defined above.
runall()
# TODO: check out the threading.Event() API for signalling the worker thread?

hi i'm a thread, sleeping then returning some object
race con
now, i awake


In [5]:
# The original taxicab CSV is about 70MB; both Modin and Pandas can do ops on a
# 250 GB frame, but Pandas cannot transpose a 6GB frame efficiently.
# A duplication factor of 100 would put us at about 7GB; the factors used below
# stop at 10 (roughly 700MB).
SOURCE_CSV = "fhv_tripdata_2021-07.csv"
dup_counts = [1, 3, 10]
# Factor 1 is the untouched source file; other factors get a "dup_<n>_" prefix.
filenames = [(f"dup_{i}_" if i != 1 else "") + SOURCE_CSV for i in dup_counts]
for i, fn in zip(dup_counts, filenames):
    if not os.path.exists(fn):
        # NOTE(review): assumes repeat_csv.sh writes its result to dup_<n>_<source> — confirm.
        status = os.system(f"./repeat_csv.sh {SOURCE_CSV} {i}")
        # Fail here, with a clear message, instead of letting a later read_csv
        # trip over a missing file.
        if status != 0 or not os.path.exists(fn):
            raise RuntimeError(f"repeat_csv.sh did not produce {fn} (exit status {status})")

# Bind `t` to the trip data so that cells which index it by the trip columns
# work after Restart Kernel -> Run All (an earlier cell binds `t` to testinput.csv).
t = pd.read_csv(SOURCE_CSV)

In [12]:
# Column names of the FHV trip table used in this exploration.
c_DOLocation = "DOLocationID"
c_dropoff_datetime = "dropoff_datetime"
c_PULocation = "PULocationID"

# NOTE(review): make sure `t` was loaded from fhv_tripdata_2021-07.csv before
# running this cell; an earlier cell binds `t` to testinput.csv.
dolocation = t[c_DOLocation]
# Keep only the rows whose drop-off location is present.
t1 = t[dolocation.notna()]
t1

Unnamed: 0,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag,Affiliated_base_number
1,B00037,2021-07-01 00:16:15,2021-07-01 00:24:33,,71.0,,B00037
2,B00037,2021-07-01 00:39:00,2021-07-01 00:45:31,,188.0,,B00037
3,B00037,2021-07-01 00:55:26,2021-07-01 01:09:41,,89.0,,B00037
4,B00037,2021-07-01 00:05:22,2021-07-01 00:27:11,,17.0,,B00037
5,B00037,2021-07-01 00:43:19,2021-07-01 01:05:14,,222.0,,B00037
...,...,...,...,...,...,...,...
1197980,B03285,2021-07-31 23:31:02,2021-08-01 00:24:21,252.0,85.0,,B03285
1197981,B03299,2021-07-31 23:01:18,2021-07-31 23:15:54,,188.0,,B03299
1197982,B03299,2021-07-31 23:19:30,2021-07-31 23:29:43,,61.0,,B03299
1197983,B03299,2021-07-31 23:53:49,2021-08-01 00:08:08,,189.0,,B03299


In [6]:
# Column-name constants read by order_one / order_two below (same values as in
# the exploratory cell above). c_dropoff_datetime is not used in the visible cells.
c_DOLocation = "DOLocationID"
c_dropoff_datetime = "dropoff_datetime"
c_PULocation = "PULocationID"

def order_one(df):
    """Keep rows with both location IDs present, testing drop-off first.

    Args:
        df: frame containing the columns named by ``c_DOLocation`` and
            ``c_PULocation``.

    Returns:
        The rows of ``df`` whose drop-off and pick-up location are both
        non-missing (``notna``), with the original index labels.
    """
    # Drop-off filter goes first here (probably slower?)
    with_dropoff = df[df[c_DOLocation].notna()]
    return with_dropoff[with_dropoff[c_PULocation].notna()]

def order_two(df):
    """Keep rows with both location IDs present, testing pick-up first.

    Args:
        df: frame containing the columns named by ``c_PULocation`` and
            ``c_DOLocation``.

    Returns:
        The rows of ``df`` whose pick-up and drop-off location are both
        non-missing (``notna``), with the original index labels.
    """
    # Pick-up filter goes first here (probably faster?)
    with_pickup = df[df[c_PULocation].notna()]
    return with_pickup[with_pickup[c_DOLocation].notna()]

In [7]:
def make_test_fn(pd_handle, csv_path, test_fn):
    """Build a zero-argument callable that applies ``test_fn`` to a loaded CSV.

    The CSV is read once, immediately, via ``pd_handle.read_csv``; the
    returned callable only invokes ``test_fn`` on that already-loaded frame,
    so timing the callable excludes the read.

    Args:
        pd_handle: object exposing ``read_csv`` (e.g. pandas or modin.pandas).
        csv_path: value passed straight to ``pd_handle.read_csv``.
        test_fn: one-argument function to run on the loaded frame.

    Returns:
        A callable taking no arguments and returning ``test_fn(frame)``.
    """
    loaded = pd_handle.read_csv(csv_path)

    def run_once():
        return test_fn(loaded)

    return run_once

In [None]:
# One entry per measurement: (banner printed before the timing, dataframe
# library handle, filter function under test). Order matches the printed report.
bench_cases = (
    ("Timing order one on naive pandas with input", realpd, order_one),
    ("Timing order one on modin with input", pd, order_one),
    ("Timing order two on naive pandas with input", realpd, order_two),
    ("Timing order two on modin with input", pd, order_two),
)

for fn in filenames:
    for banner, pd_handle, test_fn in bench_cases:
        # A garbage collection runs before every measurement.
        gc.collect()
        print(banner, fn)
        # make_test_fn reads the CSV while the argument is being built, so only
        # the 10 calls of the filter function are inside the timed region.
        print(timeit.timeit(make_test_fn(pd_handle, fn, test_fn), number=10))


Timing order one on naive pandas with input fhv_tripdata_2021-07.csv
0.5835004030000164
Timing order one on modin with input fhv_tripdata_2021-07.csv
4.214692429000024
Timing order two on naive pandas with input fhv_tripdata_2021-07.csv
0.24542260699990948
Timing order two on modin with input fhv_tripdata_2021-07.csv
2.4740077320002456
Timing order one on naive pandas with input dup_3_fhv_tripdata_2021-07.csv
1.6913304360000438
Timing order one on modin with input dup_3_fhv_tripdata_2021-07.csv
12.600575197000126
Timing order two on naive pandas with input dup_3_fhv_tripdata_2021-07.csv
0.8846277739999096
Timing order two on modin with input dup_3_fhv_tripdata_2021-07.csv
7.233240428000045
Timing order one on naive pandas with input dup_10_fhv_tripdata_2021-07.csv
5.910752590999891
Timing order one on modin with input dup_10_fhv_tripdata_2021-07.csv
43.03079221999997
Timing order two on naive pandas with input dup_10_fhv_tripdata_2021-07.csv
2.995915960000275
Timing order two on modin 