In [None]:
import pandas as pd
import numpy as np
import time


# Load the dataset

# Read the CSV from the working directory into a DataFrame and echo it
# before any cleaning is applied.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
print("Original Dataset:\n", df)


#  Handle Missing Values

# Fill NaNs with the mean of their own column. numeric_only=True restricts
# the mean computation to numeric columns, so non-numeric columns receive
# no fill value and are left as they are.
df_filled = df.fillna(df.mean(numeric_only=True))
print("\nDataset After Filling Missing Values:\n", df_filled)

# Convert to NumPy for faster operations.
# Export only the numeric columns, as floats: a single text column would
# otherwise turn the whole array into dtype=object and make the
# np.mean / np.median / np.std calls further down fail. For an all-numeric
# dataset the exported values are unchanged.
data = df_filled.select_dtypes(include="number").to_numpy(dtype=float)


#  Column-wise Statistics (Loops)

# Compute mean, median and standard deviation one column at a time and
# time the whole pass.
# time.perf_counter() is used instead of time.time(): it is the
# high-resolution, monotonic clock meant for measuring short durations,
# whereas time.time() is a wall clock that can tick coarsely or be adjusted
# while the measurement is running.
start_time = time.perf_counter()
means_loops = []
medians_loops = []
stds_loops = []

rows, cols = data.shape
for col in range(cols):
    # One 1-D slice per column; each statistic is computed on that slice.
    column_data = data[:, col]
    means_loops.append(np.mean(column_data))
    medians_loops.append(np.median(column_data))
    stds_loops.append(np.std(column_data))

loop_time = time.perf_counter() - start_time

print("\n--- Using Loops ---")
print("Means:", means_loops)
print("Medians:", medians_loops)
print("Std Devs:", stds_loops)
print("Time Taken (loops): {:.6f} seconds".format(loop_time))


# Column-wise Statistics (Vectorized)

# Compute the same three statistics for every column in one call each
# (axis=0 reduces over rows, giving one value per column) and time it.
# time.perf_counter() is used instead of time.time(): it is the
# high-resolution, monotonic clock meant for measuring short durations,
# so a sub-millisecond section does not read as 0.0 on a coarse wall clock.
start_time = time.perf_counter()
means_vec = np.mean(data, axis=0)
medians_vec = np.median(data, axis=0)
stds_vec = np.std(data, axis=0)
vector_time = time.perf_counter() - start_time

print("\n--- Using Vectorization ---")
print("Means:", means_vec)
print("Medians:", medians_vec)
print("Std Devs:", stds_vec)
print("Time Taken (vectorized): {:.6f} seconds".format(vector_time))


# Performance Comparison
# Report how many times longer the loop pass took than the vectorized pass.
# vector_time can be exactly 0.0 when the vectorized section finishes within
# a single clock tick; dividing by it would raise ZeroDivisionError, so that
# case is reported explicitly instead.
if vector_time > 0:
    print("\nPerformance Improvement: {:.2f}x faster".format(loop_time / vector_time))
else:
    print("\nPerformance Improvement: n/a (vectorized run was too fast to time)")


Original Dataset:
       A     B     C     D
0   1.0   2.0   NaN   4.0
1   5.0   NaN   6.0   8.0
2   9.0  10.0  11.0   NaN
3   NaN  14.0  15.0  16.0
4  17.0  18.0  19.0  20.0

Dataset After Filling Missing Values:
       A     B      C     D
0   1.0   2.0  12.75   4.0
1   5.0  11.0   6.00   8.0
2   9.0  10.0  11.00  12.0
3   8.0  14.0  15.00  16.0
4  17.0  18.0  19.00  20.0

--- Using Loops ---
Means: [np.float64(8.0), np.float64(11.0), np.float64(12.75), np.float64(12.0)]
Medians: [np.float64(8.0), np.float64(11.0), np.float64(12.75), np.float64(12.0)]
Std Devs: [np.float64(5.291502622129181), np.float64(5.291502622129181), np.float64(4.306971093471606), np.float64(5.656854249492381)]
Time Taken (loops): 0.000768 seconds

--- Using Vectorization ---
Means: [ 8.   11.   12.75 12.  ]
Medians: [ 8.   11.   12.75 12.  ]
Std Devs: [5.29150262 5.29150262 4.30697109 5.65685425]
Time Taken (vectorized): 0.000383 seconds

Performance Improvement: 2.00x faster
