In [2]:
# Import libraries
import pandas as pd
import modin.pandas as mpd
import numpy as np
import time
import os
import warnings

In [None]:
# Create sample data
np.random.seed(42)
n_rows = 100000
n_cols = 20
data = np.random.randn(n_rows, n_cols)

In [None]:
RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0

In [None]:
columns = [f"col_{i}" for i in range(n_cols)]
# Create pandas DataFrame
pdf = pd.DataFrame(data, columns=columns)
print(f"pandas DataFrame: {pdf.shape}")
print(f"Type: {type(pdf)}")
# Create Modin DataFrame
mdf = mpd.DataFrame(data, columns=columns)
print(f"\nModin DataFrame: {mdf.shape}")
print(f"Type: {type(mdf)}")

pandas DataFrame: (100000, 20)
Type: <class 'pandas.core.frame.DataFrame'>

Modin DataFrame: (100000, 20)
Type: <class 'modin.pandas.dataframe.DataFrame'>




In [None]:
# Verify that operations produce identical results
print("Comparing pandas and Modin results:\n")
# Basic statistics
print("Mean of col_0:")
print(f"  pandas: {pdf['col_0'].mean():.10f}")
print(f"  Modin:  {mdf['col_0'].mean():.10f}")
print("\nSum of col_1:")
print(f"  pandas: {pdf['col_1'].sum():.10f}")
print(f"  Modin:  {mdf['col_1'].sum():.10f}")
print("\nStandard deviation of col_2:")
print(f"  pandas: {pdf['col_2'].std():.10f}")
print(f"  Modin:  {mdf['col_2'].std():.10f}")
print("\nResults are identical!")
# More operations - all work the same way
print("Additional operations:\n")
# Filtering
pdf_filtered = pdf[pdf['col_0'] > 0]
mdf_filtered = mdf[mdf['col_0'] > 0]
print(f"Filtered rows (col_0 > 0):")
print(f"  pandas: {len(pdf_filtered):,}")
print(f"  Modin:  {len(mdf_filtered):,}")
# Sorting
pdf_sorted = pdf.sort_values('col_0')
mdf_sorted = mdf.sort_values('col_0')
print(f"\nFirst value after sorting:")
print(f"  pandas: {pdf_sorted['col_0'].iloc[0]:.6f}")
print(f"  Modin:  {mdf_sorted['col_0'].iloc[0]:.6f}")
# GroupBy
pdf['group'] = np.random.choice(['A', 'B', 'C'], len(pdf))
mdf['group'] = np.random.choice(['A', 'B', 'C'], len(mdf))
print(f"\nGroupBy mean (group A, col_0):")
print(f"  pandas: {pdf.groupby('group')['col_0'].mean()['A']:.6f}")
print(f"  Modin:  {mdf.groupby('group')['col_0'].mean()['A']:.6f}")

Comparing pandas and Modin results:

Mean of col_0:
  pandas: 0.0015174473
  Modin:  0.0015174473

Sum of col_1:
  pandas: 218.9499122500
  Modin:  218.9499122500

Standard deviation of col_2:
  pandas: 0.9999137589
  Modin:  0.9999137589

Results are identical!
Additional operations:

Filtered rows (col_0 > 0):
  pandas: 50,136
  Modin:  50,136

First value after sorting:
  pandas: -4.227232
  Modin:  -4.227232

GroupBy mean (group A, col_0):
  pandas: -0.000493
  Modin:  -0.001912


In [None]:
def benchmark(func, name, n_runs=3):
    """Run a function multiple times and return average time."""
    times = []
    for _ in range(n_runs):
        start = time.time()
        result = func()
        times.append(time.time() - start)
    avg_time = sum(times) / len(times)
    return avg_time

In [None]:
def compare_performance(pandas_func, modin_func, operation_name,
n_runs=3):
    """Compare pandas and Modin performance."""
    pandas_time = benchmark(pandas_func, "pandas", n_runs)
    modin_time = benchmark(modin_func, "Modin", n_runs)
    speedup = pandas_time / modin_time if modin_time > 0 else float('inf')
    print(f"{operation_name}:")
    print(f"  pandas: {pandas_time:.4f}s")
    print(f"  Modin:  {modin_time:.4f}s")
    print(f"  Speedup: {speedup:.2f}x")
    print()
    return pandas_time, modin_time

In [None]:
np.random.seed(42)
n_rows = 2000000
n_cols = 20

data = np.random.randn(n_rows,n_cols)

In [None]:
columns = [f"col_{i}" for i in range(n_cols)]
pdf_bench = pd.DataFrame(data, columns=columns)
mdf_bench = mpd.DataFrame(data, columns=columns)



In [None]:
mdf_bench.head(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,-0.908024,-1.412304
1,1.465649,-0.225776,0.067528,-1.424748,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,-0.601707,1.852278,-0.013497,-1.057711,0.822545,-1.220844,0.208864,-1.95967,-1.328186,0.196861
2,0.738467,0.171368,-0.115648,-0.301104,-1.478522,-0.719844,-0.460639,1.057122,0.343618,-1.76304,0.324084,-0.385082,-0.676922,0.611676,1.031,0.93128,-0.839218,-0.309212,0.331263,0.975545
3,-0.479174,-0.185659,-1.106335,-1.196207,0.812526,1.35624,-0.07201,1.003533,0.361636,-0.64512,0.361396,1.538037,-0.035826,1.564644,-2.619745,0.821903,0.087047,-0.299007,0.091761,-1.987569
4,-0.219672,0.357113,1.477894,-0.51827,-0.808494,-0.501757,0.915402,0.328751,-0.52976,0.513267,0.097078,0.968645,-0.702053,-0.327662,-0.392108,-1.463515,0.29612,0.261055,0.005113,-0.234587
5,-1.415371,-0.420645,-0.342715,-0.802277,-0.161286,0.404051,1.886186,0.174578,0.25755,-0.074446,-1.918771,-0.026514,0.06023,2.463242,-0.192361,0.301547,-0.034712,-1.168678,1.142823,0.751933
6,0.791032,-0.909387,1.402794,-1.401851,0.586857,2.190456,-0.990536,-0.566298,0.099651,-0.503476,-1.550663,0.068563,-1.062304,0.473592,-0.919424,1.549934,-0.783253,-0.322062,0.813517,-1.230864
7,0.22746,1.307143,-1.607483,0.184634,0.259883,0.781823,-1.236951,-1.320457,0.521942,0.296985,0.250493,0.346448,-0.680025,0.232254,0.293072,-0.714351,1.865775,0.473833,-1.191303,0.656554
8,-0.974682,0.787085,1.158596,-0.820682,0.963376,0.412781,0.82206,1.896793,-0.245388,-0.753736,-0.889514,-0.81581,-0.077102,0.341152,0.276691,0.827183,0.013002,1.453534,-0.264657,2.720169
9,0.625667,-0.857158,-1.070892,0.482472,-0.223463,0.714,0.473238,-0.072829,-0.846794,-1.514847,-0.446515,0.856399,0.214094,-1.245739,0.173181,0.385317,-0.883857,0.153725,0.058209,-1.14297


In [None]:
# Add categorical column for groupby tests
categories = np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows)
pdf_bench['category'] = categories
mdf_bench['category'] = categories

In [None]:
print(f"DataFrames created: {n_rows:,} rows x {n_cols + 1} columns\n")
print("="*50)
print("PERFORMANCE BENCHMARKS")
print("="*50 + "\n")

DataFrames created: 2,000,000 rows x 21 columns

PERFORMANCE BENCHMARKS



In [None]:
# Benchmark 1: Column statistics
compare_performance(
    lambda: pdf_bench.mean(numeric_only=True),
    lambda: mdf_bench.mean(numeric_only=True),
    "Column-wise mean (all columns)"
)
# Benchmark 2: Row-wise operations
compare_performance(
    lambda: pdf_bench.sum(axis=1, numeric_only=True),
    lambda: mdf_bench.sum(axis=1, numeric_only=True),
    "Row-wise sum"
)
# Benchmark 3: Boolean filtering
compare_performance(
    lambda: pdf_bench[pdf_bench['col_0'] > 0],
    lambda: mdf_bench[mdf_bench['col_0'] > 0],
    "Boolean filtering (col_0 > 0)"
)
# Benchmark 4: GroupBy aggregation
compare_performance(
    lambda: pdf_bench.groupby('category').mean(),
    lambda: mdf_bench.groupby('category').mean(),
    "GroupBy mean"
)
# Benchmark 5: Sorting
compare_performance(
    lambda: pdf_bench.sort_values('col_0'),
    lambda: mdf_bench.sort_values('col_0'),
    "Sort by column"
)
# Benchmark 6: Apply function
compare_performance(
    lambda: pdf_bench['col_0'].apply(lambda x: x ** 2),
    lambda: mdf_bench['col_0'].apply(lambda x: x ** 2),
    "Apply function (square values)"
)
# Benchmark 7: Multiple aggregations
compare_performance(
    lambda: pdf_bench.groupby('category').agg({'col_0': 'mean',
'col_1': 'sum', 'col_2': 'std'}),
    lambda: mdf_bench.groupby('category').agg({'col_0': 'mean',
'col_1': 'sum', 'col_2': 'std'}),
    "GroupBy with multiple aggregations"
)

Column-wise mean (all columns):
  pandas: 0.2582s
  Modin:  0.0610s
  Speedup: 4.23x

Row-wise sum:
  pandas: 0.3395s
  Modin:  0.0688s
  Speedup: 4.94x

Boolean filtering (col_0 > 0):
  pandas: 0.3024s
  Modin:  0.0947s
  Speedup: 3.19x

GroupBy mean:
  pandas: 0.2382s
  Modin:  0.0696s
  Speedup: 3.42x

Sort by column:
  pandas: 4.0960s
  Modin:  0.5043s
  Speedup: 8.12x

Apply function (square values):
  pandas: 0.8969s
  Modin:  0.2154s
  Speedup: 4.16x

GroupBy with multiple aggregations:
  pandas: 0.2458s
  Modin:  1.4467s
  Speedup: 0.17x



(0.24579366048177084, 1.4466920693715413)

Basic Modin Operations

In [None]:
mdf_bench.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14',
       'col_15', 'col_16', 'col_17', 'col_18', 'col_19', 'category'],
      dtype='object')

[33m(raylet)[0m [2026-01-14 12:21:45,447 E 7761 7761] (raylet) node_manager.cc:3256: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: cbc0462ca20487927fefd467fe89f55e14c055bf6c3729bbf28ce67e, IP: 172.27.21.86) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.27.21.86`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


In [None]:
mdf_bench.drop(columns='category',inplace=True)

In [None]:
mdf_bench.mean()

col_0     0.000157
col_1    -0.000070
col_2    -0.000802
col_3    -0.000204
col_4    -0.001093
col_5     0.000574
col_6     0.000518
col_7     0.000140
col_8     0.000004
col_9    -0.000244
col_10    0.000152
col_11   -0.000478
col_12   -0.000453
col_13    0.000728
col_14   -0.000293
col_15   -0.000204
col_16    0.000757
col_17   -0.001244
col_18    0.000905
col_19   -0.000587
dtype: float64

[36m(pid=16651)[0m [2026-01-14 12:23:35,015 E 16651 16817] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


In [None]:
# Suppose your DataFrame is called df
filtered_df = mdf_bench[(mdf_bench.iloc[:, 0] >= -1) & (mdf_bench.iloc[:, 0] <= 1)]

filtered_df = pd.DataFrame(filtered_df)
filtered_df.shape


(1365501, 20)

In [None]:
# Sum first three columns for each row
filtered_df['sum_first_three'] = filtered_df.iloc[:, 0:3].sum(axis=1)
filtered_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,sum_first_three
0,0.496714,-0.138264,0.647689,1.523030,-0.234153,-0.234137,1.579213,0.767435,-0.469474,0.542560,...,-0.465730,0.241962,-1.913280,-1.724918,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,1.006138
1,0.738467,0.171368,-0.115648,-0.301104,-1.478522,-0.719844,-0.460639,1.057122,0.343618,-1.763040,...,-0.385082,-0.676922,0.611676,1.031000,0.931280,-0.839218,-0.309212,0.331263,0.975545,0.794187
2,-0.479174,-0.185659,-1.106335,-1.196207,0.812526,1.356240,-0.072010,1.003533,0.361636,-0.645120,...,1.538037,-0.035826,1.564644,-2.619745,0.821903,0.087047,-0.299007,0.091761,-1.987569,-1.771168
3,-0.219672,0.357113,1.477894,-0.518270,-0.808494,-0.501757,0.915402,0.328751,-0.529760,0.513267,...,0.968645,-0.702053,-0.327662,-0.392108,-1.463515,0.296120,0.261055,0.005113,-0.234587,1.615335
4,0.791032,-0.909387,1.402794,-1.401851,0.586857,2.190456,-0.990536,-0.566298,0.099651,-0.503476,...,0.068563,-1.062304,0.473592,-0.919424,1.549934,-0.783253,-0.322062,0.813517,-1.230864,1.284439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365496,-0.906307,-0.722584,0.128298,1.229422,-0.711575,-1.393026,-0.044546,1.258667,-0.011681,0.089427,...,0.380772,1.041399,-0.374421,-2.279727,-0.082153,-0.052350,-0.068528,-0.320585,0.094488,-1.500593
1365497,-0.023997,0.315227,1.240792,-0.115047,0.558494,0.593331,-0.677216,-0.866818,1.838681,-1.340788,...,-0.585858,-0.398092,-1.511263,-0.882979,-0.685403,-0.978754,-2.031599,-0.314595,0.725131,1.532021
1365498,-0.148963,-0.609433,0.309803,0.194813,1.601245,0.751290,0.912611,0.684920,-1.050072,-0.671943,...,0.524062,-0.084313,-1.083159,-0.474963,-0.497360,-0.088565,1.041518,-0.108433,0.607095,-0.448593
1365499,0.072208,0.131953,1.741438,0.047827,-0.367225,1.497256,2.590825,-0.637203,1.661386,1.210174,...,-0.751432,-0.830688,0.646192,-0.441325,2.274215,-0.133746,-0.052080,1.110045,-0.379214,1.945598


[33m(raylet)[0m [2026-01-14 12:26:45,469 E 7761 7761] (raylet) node_manager.cc:3256: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: cbc0462ca20487927fefd467fe89f55e14c055bf6c3729bbf28ce67e, IP: 172.27.21.86) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.27.21.86`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


### NYC Taxi Dataset