In [21]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append(r"D:\Vscode\Self\Data Science\Book Notes\IntroDS\23127538\src")


from config import START_ID, END_ID, START_YEAR_MONTH, END_YEAR_MONTH

In [22]:
# Load the per-paper run statistics produced by the pipeline.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative/configurable.
csv_path = r"D:\Vscode\Self\Data Science\Book Notes\IntroDS\23127538\stats\stats.csv"
df = pd.read_csv(csv_path)

# Maps each byte-valued source column to the name of the megabyte-valued
# column that replaces it.
bytes_cols = {
    "mem_before_rss": "mem_before_mb",
    "mem_after_rss": "mem_after_mb",
    "mem_max_rss": "mem_max_mb",
    "mem_avg_rss": "mem_avg_mb",
    "size_before_bytes": "size_before_mb",
    "size_after_bytes": "size_after_mb",
    "disk_max_bytes": "disk_max_mb",
    "final_output_size_bytes": "final_output_size_mb"
}

# "MB" throughout this notebook means 1024 * 1024 bytes.
BYTES_PER_MB = 1024 * 1024

# Convert bytes -> MB in one step instead of mutating `df` column by column
# with inplace=True. Only columns actually present in the CSV are converted;
# the new columns are appended in dict order and the byte columns dropped, so
# the resulting column order is the same as with the per-column loop.
present_cols = {old: new for old, new in bytes_cols.items() if old in df.columns}
df = (
    df.assign(**{new: df[old] / BYTES_PER_MB for old, new in present_cols.items()})
      .drop(columns=list(present_cols))
)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

# Columns for which a one-row summary (count/mean/median/std/min/max) is shown.
cols = [
    "runtime_s", "size_before_mb", "size_after_mb",
    "mem_max_mb", "mem_avg_mb", "disk_max_mb",
    "final_output_size_mb", "references_count"
]

for c in cols:
    # Skip silently when a column is missing from this particular CSV.
    if c in df.columns:
        print(f"\nColumn: {c}")
        print(df[c].agg(['count','mean','median','std','min','max']).to_frame().T)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   arxiv_id              5000 non-null   float64
 1   success               5000 non-null   bool   
 2   runtime_s             5000 non-null   float64
 3   references_count      5000 non-null   int64  
 4   mem_before_mb         5000 non-null   float64
 5   mem_after_mb          5000 non-null   float64
 6   mem_max_mb            5000 non-null   float64
 7   mem_avg_mb            5000 non-null   float64
 8   size_before_mb        5000 non-null   float64
 9   size_after_mb         5000 non-null   float64
 10  disk_max_mb           5000 non-null   float64
 11  final_output_size_mb  5000 non-null   float64
dtypes: bool(1), float64(10), int64(1)
memory usage: 434.7 KB
None

Column: runtime_s
            count       mean  median        std  min    max
runtime_s  5000.0  21.409996  14.

In [23]:
# Summary metrics over every row of `df`. All result names are kept at
# notebook level; each section computes its values, then prints them.

# ========== SUCCESS METRICS ==========
success_flags = df["success"]
total = len(df)
num_success = success_flags.sum()          # True counts as 1
num_failed = total - num_success
success_rate = num_success / total * 100

print("=== SUCCESS METRICS ===")
print("Total papers:", total)
print("Success:", num_success)
print("Failed:", num_failed)
print(f"Success rate: {success_rate:.2f}%\n")


# ========== RUNTIME METRICS ==========
runtime = df["runtime_s"]
total_runtime = runtime.sum()
avg_runtime = runtime.mean()
# Mean runtime restricted to rows whose `success` flag equals True.
avg_runtime_success_only = runtime[success_flags.eq(True)].mean()
min_runtime, max_runtime = runtime.min(), runtime.max()

for line in (
    "=== RUNTIME METRICS ===",
    f"Total runtime (seconds): {total_runtime:.2f}",
    f"Total runtime (hours): {total_runtime/3600:.2f} h",
    f"Average runtime per paper: {avg_runtime:.2f} s",
    f"Average runtime per SUCCESS paper: {avg_runtime_success_only:.2f} s",
    f"Min runtime: {min_runtime:.2f} s",
    f"Max runtime: {max_runtime:.2f} s\n",
):
    print(line)


# ========== MEMORY FOOTPRINT ==========
mem_avg = df["mem_avg_mb"]
disk_max = df["disk_max_mb"]
final_size = df["final_output_size_mb"]

max_ram_used = df["mem_max_mb"].max()
avg_ram_consumption = mem_avg.mean()
min_ram = mem_avg.min()                    # smallest per-row *average* RAM value

max_disk_storage = disk_max.max()
avg_disk_storage = disk_max.mean()

final_output_size_max = final_size.max()
final_output_size_mean = final_size.mean()

for line in (
    "=== MEMORY FOOTPRINT ===",
    f"Maximum RAM used (MB): {max_ram_used:.2f}",
    f"Average RAM consumption (MB): {avg_ram_consumption:.2f}",
    f"Minimum RAM consumption (MB): {min_ram:.2f}",
    f"\nMaximum disk storage required (MB): {max_disk_storage:.2f}",
    f"Average disk storage used (MB): {avg_disk_storage:.2f}",
    f"\nFinal output storage size (MAX MB): {final_output_size_max:.2f}",
    f"Final output storage size (AVG MB): {final_output_size_mean:.2f}\n",
):
    print(line)


# ========== REFERENCES METRICS ==========
refs = df["references_count"]
avg_refs = refs.mean()
min_refs, max_refs = refs.min(), refs.max()

for line in (
    "=== REFERENCES STATISTICS ===",
    f"Average references per paper: {avg_refs:.2f}",
    f"Min references: {min_refs}",
    f"Max references: {max_refs}",
):
    print(line)


=== SUCCESS METRICS ===
Total papers: 5000
Success: 4972
Failed: 28
Success rate: 99.44%

=== RUNTIME METRICS ===
Total runtime (seconds): 107049.98
Total runtime (hours): 29.74 h
Average runtime per paper: 21.41 s
Average runtime per SUCCESS paper: 20.62 s
Min runtime: 9.00 s
Max runtime: 970.30 s

=== MEMORY FOOTPRINT ===
Maximum RAM used (MB): 46.64
Average RAM consumption (MB): 36.45
Minimum RAM consumption (MB): 19.93

Maximum disk storage required (MB): 5068.94
Average disk storage used (MB): 2489.28

Final output storage size (MAX MB): 5059.69
Final output storage size (AVG MB): 2486.06

=== REFERENCES STATISTICS ===
Average references per paper: 18.77
Min references: 0
Max references: 357


In [31]:
# Split the papers inside the configured arXiv-ID range into consecutive
# groups of `percent` of the rows each (ordered by ID), total the runtime of
# every group, and report the group with the smallest total.

# Numeric bounds of the configured ID range, built as "<year-month>.<id>".
START_FLOAT = float(f"{START_YEAR_MONTH}.{START_ID}")
END_FLOAT   = float(f"{END_YEAR_MONTH}.{END_ID}")

percent = 0.1

# Rows whose ID lies inside the configured range (both bounds inclusive),
# sorted by ID so that positional slices correspond to contiguous ID ranges.
df_range = (
    df[(df["arxiv_id"] >= START_FLOAT) & (df["arxiv_id"] <= END_FLOAT)]
    .sort_values("arxiv_id")
    .reset_index(drop=True)
)

# Rows per group and number of groups. Rows beyond group_size * num_groups
# (if any) do not belong to any group.
group_size = int(len(df_range) * percent)
num_groups = int(1 / percent)
print(group_size, num_groups)

# Fail early with a clear message instead of an IndexError on `.iloc[0]`
# of an empty slice further down.
if group_size == 0:
    raise ValueError(
        f"Only {len(df_range)} paper(s) in the selected range; "
        f"cannot form groups of {percent:.0%} each."
    )


def _format_arxiv_id(value):
    """Render a float-encoded arXiv ID with a fixed five-digit suffix.

    `arxiv_id` is stored as a float, so `str()` drops trailing zeros
    (2307.11660 would print as "2307.1166"). Five decimals matches the IDs
    shown by this notebook - assumes every ID uses a 5-digit sequence number;
    TODO confirm for other date ranges.
    """
    return f"{value:.5f}"


group_runtimes = []

print(f"=== TOTAL RUNTIME PER {percent:.0%}-PAPER GROUP (BY ARXIV ID) ===")

for i in range(num_groups):
    start_idx = i * group_size
    group_df = df_range.iloc[start_idx:start_idx + group_size]

    id_start = _format_arxiv_id(group_df["arxiv_id"].iloc[0])
    id_end   = _format_arxiv_id(group_df["arxiv_id"].iloc[-1])

    # Use a dedicated name so the notebook-level `total_runtime` computed by
    # the overall-metrics cell is not overwritten by the last group's total.
    group_total_runtime = group_df["runtime_s"].sum()

    group_runtimes.append({
        "group_index": i + 1,          # 1-based, as printed in the report
        "arxiv_start": id_start,
        "arxiv_end": id_end,
        "total_runtime_s": group_total_runtime
    })

    print(f"{id_start} - {id_end}: {group_total_runtime:.2f} s")


# ====== GROUP WITH SMALLEST TOTAL RUNTIME ======
min_group = min(group_runtimes, key=lambda x: x["total_runtime_s"])

print("\n=== GROUP WITH SMALLEST TOTAL RUNTIME ===")
print(f"Group {min_group['group_index']} "
      f"({min_group['arxiv_start']} - {min_group['arxiv_end']}): "
      f"{min_group['total_runtime_s']:.2f} s")


500 10
=== TOTAL RUNTIME PER 10%-PAPER GROUP (BY ARXIV ID) ===
2307.11657 - 2307.12156: 10422.35 s
2307.12157 - 2307.12656: 12192.32 s
2307.12657 - 2307.13156: 11508.74 s
2307.13157 - 2307.13656: 9634.45 s
2307.13657 - 2307.14156: 9491.83 s
2307.14157 - 2307.14656: 17135.57 s
2307.14657 - 2307.15156: 8643.24 s
2307.15157 - 2307.15656: 9498.67 s
2307.15657 - 2307.16156: 9239.33 s
2307.16157 - 2307.16656: 9283.48 s

=== GROUP WITH SMALLEST TOTAL RUNTIME ===
Group 7 (2307.14657 - 2307.15156): 8643.24 s


Okay, so I will choose the range 14657 - 15156 for the 10% run, per the teacher's new requirement.

Maybe I will try to run on Colab.

For the performance report, however, we can simply take the statistics from the 14657 - 15156 range.

In [33]:
# Numeric ID bounds of the hand-picked window (sequence numbers 14657-15156),
# built as "<year-month>.<id>" from the configured year-month values.
start_year_run_time, end_year_run_time = (
    float(f"{START_YEAR_MONTH}.{14657}"),
    float(f"{END_YEAR_MONTH}.{15156}"),
)

# Independent copy of the rows whose ID falls inside the window
# (both bounds inclusive).
id_in_window = df["arxiv_id"].between(start_year_run_time, end_year_run_time)
df_range_min = df.loc[id_in_window].copy()

In [34]:
# Structure and per-column summary of the selected sub-range.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_range_min.info()

# Columns for which a one-row summary (count/mean/median/std/min/max) is shown.
cols = [
    "runtime_s", "size_before_mb", "size_after_mb",
    "mem_max_mb", "mem_avg_mb", "disk_max_mb",
    "final_output_size_mb", "references_count"
]

for c in cols:
    # Skip silently when a column is missing from the frame.
    if c in df_range_min.columns:
        print(f"\nColumn: {c}")
        print(df_range_min[c].agg(['count','mean','median','std','min','max']).to_frame().T)

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 3000 to 3499
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   arxiv_id              500 non-null    float64
 1   success               500 non-null    bool   
 2   runtime_s             500 non-null    float64
 3   references_count      500 non-null    int64  
 4   mem_before_mb         500 non-null    float64
 5   mem_after_mb          500 non-null    float64
 6   mem_max_mb            500 non-null    float64
 7   mem_avg_mb            500 non-null    float64
 8   size_before_mb        500 non-null    float64
 9   size_after_mb         500 non-null    float64
 10  disk_max_mb           500 non-null    float64
 11  final_output_size_mb  500 non-null    float64
dtypes: bool(1), float64(10), int64(1)
memory usage: 47.4 KB
None

Column: runtime_s
           count      mean  median        std  min     max
runtime_s  500.0  17.28648  13.975  13

In [35]:
# Summary metrics over the selected sub-range `df_range_min`. All result names
# are kept at notebook level; each section computes its values, then prints them.

# ========== SUCCESS METRICS ==========
success_flags = df_range_min["success"]
total = len(df_range_min)
num_success = success_flags.sum()          # True counts as 1
num_failed = total - num_success
success_rate = num_success / total * 100

print("=== SUCCESS METRICS ===")
print("Total papers:", total)
print("Success:", num_success)
print("Failed:", num_failed)
print(f"Success rate: {success_rate:.2f}%\n")


# ========== RUNTIME METRICS ==========
runtime = df_range_min["runtime_s"]
total_runtime = runtime.sum()
avg_runtime = runtime.mean()
# Mean runtime restricted to rows whose `success` flag equals True.
avg_runtime_success_only = runtime[success_flags.eq(True)].mean()
min_runtime, max_runtime = runtime.min(), runtime.max()

for line in (
    "=== RUNTIME METRICS ===",
    f"Total runtime (seconds): {total_runtime:.2f}",
    f"Total runtime (hours): {total_runtime/3600:.2f} h",
    f"Average runtime per paper: {avg_runtime:.2f} s",
    f"Average runtime per SUCCESS paper: {avg_runtime_success_only:.2f} s",
    f"Min runtime: {min_runtime:.2f} s",
    f"Max runtime: {max_runtime:.2f} s\n",
):
    print(line)


# ========== MEMORY FOOTPRINT ==========
mem_avg = df_range_min["mem_avg_mb"]
disk_max = df_range_min["disk_max_mb"]
final_size = df_range_min["final_output_size_mb"]

max_ram_used = df_range_min["mem_max_mb"].max()
avg_ram_consumption = mem_avg.mean()
min_ram = mem_avg.min()                    # smallest per-row *average* RAM value

max_disk_storage = disk_max.max()
avg_disk_storage = disk_max.mean()

final_output_size_max = final_size.max()
final_output_size_mean = final_size.mean()

for line in (
    "=== MEMORY FOOTPRINT ===",
    f"Maximum RAM used (MB): {max_ram_used:.2f}",
    f"Average RAM consumption (MB): {avg_ram_consumption:.2f}",
    f"Minimum RAM consumption (MB): {min_ram:.2f}",
    f"\nMaximum disk storage required (MB): {max_disk_storage:.2f}",
    f"Average disk storage used (MB): {avg_disk_storage:.2f}",
    f"\nFinal output storage size (MAX MB): {final_output_size_max:.2f}",
    f"Final output storage size (AVG MB): {final_output_size_mean:.2f}\n",
):
    print(line)


# ========== REFERENCES METRICS ==========
refs = df_range_min["references_count"]
avg_refs = refs.mean()
min_refs, max_refs = refs.min(), refs.max()

for line in (
    "=== REFERENCES STATISTICS ===",
    f"Average references per paper: {avg_refs:.2f}",
    f"Min references: {min_refs}",
    f"Max references: {max_refs}",
):
    print(line)

=== SUCCESS METRICS ===
Total papers: 500
Success: 500
Failed: 0
Success rate: 100.00%

=== RUNTIME METRICS ===
Total runtime (seconds): 8643.24
Total runtime (hours): 2.40 h
Average runtime per paper: 17.29 s
Average runtime per SUCCESS paper: 17.29 s
Min runtime: 9.00 s
Max runtime: 128.62 s

=== MEMORY FOOTPRINT ===
Maximum RAM used (MB): 41.29
Average RAM consumption (MB): 35.44
Minimum RAM consumption (MB): 28.35

Maximum disk storage required (MB): 3622.80
Average disk storage used (MB): 3245.09

Final output storage size (MAX MB): 3609.85
Final output storage size (AVG MB): 3242.50

=== REFERENCES STATISTICS ===
Average references per paper: 21.07
Min references: 0
Max references: 291
