In [3]:
# Import libraries
import pandas as pd
import modin.pandas as mpd
import numpy as np
import time
import os
import warnings

In [4]:
# Create sample data
np.random.seed(42)
n_rows = 100000
n_cols = 20
data = np.random.randn(n_rows, n_cols)

In [7]:
RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0

In [8]:
columns = [f"col_{i}" for i in range(n_cols)]
# Create pandas DataFrame
pdf = pd.DataFrame(data, columns=columns)
print(f"pandas DataFrame: {pdf.shape}")
print(f"Type: {type(pdf)}")
# Create Modin DataFrame
mdf = mpd.DataFrame(data, columns=columns)
print(f"\nModin DataFrame: {mdf.shape}")
print(f"Type: {type(mdf)}")

pandas DataFrame: (100000, 20)
Type: <class 'pandas.core.frame.DataFrame'>

Modin DataFrame: (100000, 20)
Type: <class 'modin.pandas.dataframe.DataFrame'>




In [9]:
# Verify that operations produce identical results
print("Comparing pandas and Modin results:\n")
# Basic statistics
print("Mean of col_0:")
print(f"  pandas: {pdf['col_0'].mean():.10f}")
print(f"  Modin:  {mdf['col_0'].mean():.10f}")
print("\nSum of col_1:")
print(f"  pandas: {pdf['col_1'].sum():.10f}")
print(f"  Modin:  {mdf['col_1'].sum():.10f}")
print("\nStandard deviation of col_2:")
print(f"  pandas: {pdf['col_2'].std():.10f}")
print(f"  Modin:  {mdf['col_2'].std():.10f}")
print("\nResults are identical!")
# More operations - all work the same way
print("Additional operations:\n")
# Filtering
pdf_filtered = pdf[pdf['col_0'] > 0]
mdf_filtered = mdf[mdf['col_0'] > 0]
print(f"Filtered rows (col_0 > 0):")
print(f"  pandas: {len(pdf_filtered):,}")
print(f"  Modin:  {len(mdf_filtered):,}")
# Sorting
pdf_sorted = pdf.sort_values('col_0')
mdf_sorted = mdf.sort_values('col_0')
print(f"\nFirst value after sorting:")
print(f"  pandas: {pdf_sorted['col_0'].iloc[0]:.6f}")
print(f"  Modin:  {mdf_sorted['col_0'].iloc[0]:.6f}")
# GroupBy
pdf['group'] = np.random.choice(['A', 'B', 'C'], len(pdf))
mdf['group'] = np.random.choice(['A', 'B', 'C'], len(mdf))
print(f"\nGroupBy mean (group A, col_0):")
print(f"  pandas: {pdf.groupby('group')['col_0'].mean()['A']:.6f}")
print(f"  Modin:  {mdf.groupby('group')['col_0'].mean()['A']:.6f}")

Comparing pandas and Modin results:

Mean of col_0:
  pandas: 0.0015174473
  Modin:  0.0015174473

Sum of col_1:
  pandas: 218.9499122500
  Modin:  218.9499122500

Standard deviation of col_2:
  pandas: 0.9999137589
  Modin:  0.9999137589

Results are identical!
Additional operations:

Filtered rows (col_0 > 0):
  pandas: 50,136
  Modin:  50,136

First value after sorting:
  pandas: -4.227232
  Modin:  -4.227232

GroupBy mean (group A, col_0):
  pandas: -0.000493
  Modin:  -0.001912


In [10]:
def benchmark(func, name, n_runs=3):
    """Run a function multiple times and return average time."""
    times = []
    for _ in range(n_runs):
        start = time.time()
        result = func()
        times.append(time.time() - start)
    avg_time = sum(times) / len(times)
    return avg_time

In [12]:
def compare_performance(pandas_func, modin_func, operation_name, 
n_runs=3):
    """Compare pandas and Modin performance."""
    pandas_time = benchmark(pandas_func, "pandas", n_runs)
    modin_time = benchmark(modin_func, "Modin", n_runs)
    speedup = pandas_time / modin_time if modin_time > 0 else float('inf')
    print(f"{operation_name}:")
    print(f"  pandas: {pandas_time:.4f}s")
    print(f"  Modin:  {modin_time:.4f}s")
    print(f"  Speedup: {speedup:.2f}x")
    print()
    return pandas_time, modin_time

In [14]:
np.random.seed(42)
n_rows = 2000000
n_cols = 20

data = np.random.randn(n_cols,n_rows)