In [1]:
import dask.dataframe as dd
import pandas as pd
import time

# Build the synthetic benchmark file (optional step: only needed when no
# sample data is available yet). Produces 'large_dataset.csv' with an integer
# 'id' column and two float columns scaled from [0, 1) up to [0, 100).
import numpy as np

num_rows = 10**7  # ten million rows
data = {'id': np.arange(num_rows)}
# Fill the value columns in order so the random draws happen value1 -> value2.
data.update(
    (name, np.random.rand(num_rows) * 100) for name in ('value1', 'value2')
)
# index=False keeps the DataFrame's row index out of the file.
pd.DataFrame(data).to_csv('large_dataset.csv', index=False)

# Benchmark Pandas: the measured interval covers both reading the CSV and
# computing the mean of the two value columns.
# perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals; time.time()
# follows the system clock and can jump if that clock is adjusted mid-run.
start_time = time.perf_counter()
pdf = pd.read_csv('large_dataset.csv')
pandas_mean = pdf[['value1', 'value2']].mean()
pandas_time = time.perf_counter() - start_time  # elapsed seconds

# Benchmark Dask: the measured interval covers building the dataframe from the
# CSV and evaluating the mean of the two value columns via .compute().
# perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals; time.time()
# follows the system clock and can jump if that clock is adjusted mid-run.
start_time = time.perf_counter()
ddf = dd.read_csv('large_dataset.csv')
# .compute() materialises the result; it prints like the Pandas mean below.
dask_mean = ddf[['value1', 'value2']].mean().compute()
dask_time = time.perf_counter() - start_time  # elapsed seconds

# Side-by-side report: both timings first, then the column means computed by
# each library so the results can be checked against one another.
for label, value in (
    ("Pandas Computation Time:", pandas_time),
    ("Dask Computation Time:", dask_time),
    ("Pandas Mean:", pandas_mean),
    ("Dask Mean:", dask_mean),
):
    print(label, value)


Pandas Computation Time: 8.732322216033936
Dask Computation Time: 6.572553396224976
Pandas Mean: value1    50.000836
value2    50.002596
dtype: float64
Dask Mean: value1    50.000836
value2    50.002596
dtype: float64
