## Modin

일주일에 한 번씩 메일로 받아 보는 파이썬 위클리에 이번 주에는 데이터 분석가에게도 흥미로운 내용이 있어서 북마크해 두었다가 오늘 간단히 읽어 보기 시작했다. 그런데 판다스에 관한 제목이 있어 읽어 보니 Modin이라는 분석 라이브러리에 대한 글이었다.


해당 제목(How we parallelized 600+ pandas functions with Modin)이 나의 주의를 끈 이유는 판다스에 대한 이야기였기 때문이다. 판다스는 사실 데이터 분석가에게서 떼려야 뗄 수 없는 라이브러리이다. 그래서 데이터 분석가 채용 공고를 보면 종종 판다스에 대한 언급이 나오기도 한다.

관련 글 링크 (https://www.notion.so/Modin-582de92464514a70a00f8782218fccef)

In [1]:
# One-time setup: uncomment one of the lines below to install Modin from
# within the notebook (the second form also requests the "all" extras).
#%pip install modin
#%pip install "modin[all]"

In [None]:
# Baseline library: plain pandas, kept under its full module name.
import pandas
# Modin's pandas-style API under the usual `pd` alias, so the same calls can
# be issued against both libraries.
import modin.pandas as pd

#############################################
### For the purpose of timing comparisons ###
#############################################
# `time` supplies the timestamps taken around the read_csv calls.
import time
# Ray is imported and initialised explicitly here.
# NOTE(review): presumably Ray serves as Modin's execution engine in this
# notebook -- confirm against the Modin configuration.
import ray
ray.init()
#############################################

In [None]:
# Fetch the January 2021 NYC yellow-taxi trip CSV and keep a local copy.
# This may take a few minutes to download.
import os
import urllib.request

s3_path = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv"
local_csv = "taxi.csv"

# Skip the transfer when an earlier run already saved the file, so re-running
# the cell does not pull the whole dataset again.
if not os.path.exists(local_csv):
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated "taxi.csv" that later runs would trust.
    partial_csv = local_csv + ".part"
    urllib.request.urlretrieve(s3_path, partial_csv)
    os.replace(partial_csv, local_csv)

In [None]:
## Faster Data Loading with read_csv

# Measure how long plain pandas needs to load and parse the taxi trip CSV.
# NOTE(review): the data is read straight from the remote `s3_path` URL, so the
# measured time includes the network transfer; the copy saved as "taxi.csv" by
# the download cell is not used here -- confirm this is intended.
#
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the system clock, which can be
# adjusted while the measurement is running.
start = time.perf_counter()

pandas_df = pandas.read_csv(
    s3_path,
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    quoting=3,  # 3 == csv.QUOTE_NONE
)

end = time.perf_counter()
pandas_duration = end - start  # reused by the Modin cell to compute the ratio
print(f"Time to read with pandas: {round(pandas_duration, 3)} seconds")

In [33]:
# Measure the same read_csv call through Modin and compare it with the pandas
# timing stored in `pandas_duration` by the previous cell.
# NOTE(review): like the pandas cell, this reads from the remote `s3_path` URL,
# so the measured time includes the network transfer -- confirm this is intended.
#
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the system clock, which can be
# adjusted while the measurement is running.
start = time.perf_counter()

modin_df = pd.read_csv(
    s3_path,
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    quoting=3,  # 3 == csv.QUOTE_NONE
)

end = time.perf_counter()
modin_duration = end - start
print(f"Time to read with Modin: {round(modin_duration, 3)} seconds")

# Ratio of the two durations; a value below 1 means Modin was actually slower,
# despite the wording of the message.
print(f"Modin is {round(pandas_duration / modin_duration, 2)}x faster than pandas at `read_csv`!")

2022-03-29 16:00:57,085	INFO services.py:1412 -- View the Ray dashboard at http://127.0.0.1:8265


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': None,
 'object_store_address': '/tmp/ray/session_2022-03-29_16-00-53_266561_20099/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-03-29_16-00-53_266561_20099/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2022-03-29_16-00-53_266561_20099',
 'metrics_export_port': 59855,
 'gcs_address': '127.0.0.1:58368',
 'address': '127.0.0.1:58368',
 'node_id': 'b5be9615bf26a0f0a659e50e3e8bb79a26e08e4dfaf92979ba2e71b7'}

In [40]:
# Show the concrete class of the frame loaded with plain pandas.
type(pandas_df)

pandas.core.frame.DataFrame

In [41]:
# Show the concrete class of the frame loaded through Modin.
type(modin_df)

modin.pandas.dataframe.DataFrame

In [36]:
%%time
# Time fetching the first rows of the pandas frame.
pandas_df.head()

CPU times: user 128 µs, sys: 15 µs, total: 143 µs
Wall time: 138 µs


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1.0,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1.0,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1.0,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1.0,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2.0,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1.0,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [37]:
%%time
# Time fetching the first rows of the Modin frame.
modin_df.head()

CPU times: user 995 µs, sys: 420 µs, total: 1.42 ms
Wall time: 1.04 ms


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1.0,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1.0,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1.0,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1.0,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2.0,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1.0,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [38]:
%%time
# Time sorting the pandas frame by trip distance; the sorted result is only
# displayed, not assigned.
pandas_df.sort_values('trip_distance')

CPU times: user 468 ms, sys: 272 ms, total: 740 ms
Wall time: 729 ms


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1206966,1.0,2021-01-30 07:17:18,2021-01-30 07:17:27,0.0,0.00,1.0,N,112,112,3.0,2.50,0.00,0.5,0.00,0.00,0.3,3.30,0.0
1193726,2.0,2021-01-29 17:32:10,2021-01-29 17:33:09,1.0,0.00,1.0,N,193,193,1.0,0.00,0.00,0.0,0.00,0.00,0.0,0.00,0.0
866803,2.0,2021-01-22 13:35:54,2021-01-22 13:35:59,1.0,0.00,1.0,N,237,264,1.0,2.50,0.00,0.5,0.66,0.00,0.3,3.96,0.0
396162,1.0,2021-01-11 16:03:12,2021-01-11 17:06:12,1.0,0.00,1.0,N,68,117,1.0,64.20,0.00,0.5,0.00,14.53,0.3,79.53,0.0
576604,1.0,2021-01-15 14:57:18,2021-01-15 15:46:27,1.0,0.00,1.0,N,142,72,1.0,32.20,0.00,0.5,0.00,0.00,0.3,33.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348495,,2021-01-28 14:00:00,2021-01-28 14:17:00,,87507.93,,,170,74,,22.19,2.75,0.5,0.00,0.00,0.3,25.74,0.0
1300347,,2021-01-13 08:16:00,2021-01-13 08:37:00,,112257.10,,,213,248,,14.11,0.00,0.5,2.75,0.00,0.3,17.66,0.0
1243982,2.0,2021-01-31 06:54:06,2021-01-31 07:16:38,1.0,114328.20,5.0,N,78,107,2.0,26.30,0.00,0.5,0.00,6.12,0.3,33.22,0.0
1356238,,2021-01-28 10:10:00,2021-01-28 10:18:00,,140549.58,,,78,78,,12.12,2.75,0.5,0.00,0.00,0.3,15.67,0.0


In [39]:
%%time
# Time sorting the Modin frame by trip distance; the sorted result is only
# displayed, not assigned.
modin_df.sort_values('trip_distance')

CPU times: user 1.15 s, sys: 184 ms, total: 1.33 s
Wall time: 2.65 s


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1206966,1.0,2021-01-30 07:17:18,2021-01-30 07:17:27,0.0,0.00,1.0,N,112,112,3.0,2.50,0.00,0.5,0.00,0.00,0.3,3.30,0.0
1193726,2.0,2021-01-29 17:32:10,2021-01-29 17:33:09,1.0,0.00,1.0,N,193,193,1.0,0.00,0.00,0.0,0.00,0.00,0.0,0.00,0.0
866803,2.0,2021-01-22 13:35:54,2021-01-22 13:35:59,1.0,0.00,1.0,N,237,264,1.0,2.50,0.00,0.5,0.66,0.00,0.3,3.96,0.0
396162,1.0,2021-01-11 16:03:12,2021-01-11 17:06:12,1.0,0.00,1.0,N,68,117,1.0,64.20,0.00,0.5,0.00,14.53,0.3,79.53,0.0
576604,1.0,2021-01-15 14:57:18,2021-01-15 15:46:27,1.0,0.00,1.0,N,142,72,1.0,32.20,0.00,0.5,0.00,0.00,0.3,33.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348495,,2021-01-28 14:00:00,2021-01-28 14:17:00,,87507.93,,,170,74,,22.19,2.75,0.5,0.00,0.00,0.3,25.74,0.0
1300347,,2021-01-13 08:16:00,2021-01-13 08:37:00,,112257.10,,,213,248,,14.11,0.00,0.5,2.75,0.00,0.3,17.66,0.0
1243982,2.0,2021-01-31 06:54:06,2021-01-31 07:16:38,1.0,114328.20,5.0,N,78,107,2.0,26.30,0.00,0.5,0.00,6.12,0.3,33.22,0.0
1356238,,2021-01-28 10:10:00,2021-01-28 10:18:00,,140549.58,,,78,78,,12.12,2.75,0.5,0.00,0.00,0.3,15.67,0.0


### 하지만 이상하게도 내 컴퓨터에서는 Modin의 성능이 더 떨어진다. (sort_values 기준 wall time: pandas 729 ms, Modin 2.65 s)