In [1]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.callback import EarlyStopping
import json


In [2]:
# Columns that are cast to pandas 'category' dtype further down.
CAT_COLS = [
    'type',
    'subtype',
    'size_category',
    'temperature',
    'above_median_access_count',
    'above_median_filesize',
]

# Numeric feature columns as they are named in the raw CSV.
NUM_COLS = [
    # access-date statistics
    'stddev_access_date',
    'dt_last_access_date',
    'dt_second_last_access_date',
    'dt_third_last_access_date',
    'dt_fourth_last_access_date',
    'dt_fifth_last_access_date',
    # normalized volume features
    'normalized_access_count',
    'normalized_filesize',
    # access counts over trailing windows
    'access_count_last_1_day',
    'access_count_last_3_days',
    'access_count_last_7_days',
    'access_count_last_15_days',
    # remaining per-file features
    'lifetime',
    'access_count',
    'read_data_per_second',
]

In [3]:
# Load the raw dataset into memory; every later cell works on this frame.
df = pd.read_csv(
    filepath_or_buffer='/data/astro/scratch/msantama/tfm/data3.csv',
)

In [4]:
# The prediction target is 10 days into the future, so the column that is
# not needed for that horizon is removed here.
df = df.drop(columns=['m_date_window'])

In [5]:
# Cast every categorical feature to pandas 'category' dtype, one column at a time.
for cat_col in CAT_COLS:
    df[cat_col] = df[cat_col].astype('category')

In [6]:
# --- log1p-transform the recency / lifetime columns and drop the raw versions ---
dt_cols = [
    'dt_last_access_date', 'dt_second_last_access_date',
    'dt_third_last_access_date', 'dt_fourth_last_access_date',
    'dt_fifth_last_access_date', 'lifetime'
]
for col in dt_cols:
    df[f'log_{col}'] = np.log1p(df[col])
df.drop(columns=dt_cols, inplace=True, errors="ignore")

# --- trailing-window access counts: missing values are treated as zero accesses ---
access_cols = [
    'access_count_last_1_day', 'access_count_last_3_days',
    'access_count_last_7_days', 'access_count_last_15_days'
]
df[access_cols] = df[access_cols].fillna(0)

# Ratios between a shorter and a longer window; the +1 keeps the denominator non-zero.
df['ratio_1d_15d'] = df['access_count_last_1_day'] / (df['access_count_last_15_days'] + 1)
df['ratio_3d_15d'] = df['access_count_last_3_days'] / (df['access_count_last_15_days'] + 1)
df['ratio_1d_3d'] = df['access_count_last_1_day'] / (df['access_count_last_3_days'] + 1)
df['ratio_3d_7d'] = df['access_count_last_3_days'] / (df['access_count_last_7_days'] + 1)

# NOTE(review): as written this evaluates to a1 + (2/3)*a3 - a7/7, mixing window
# totals with per-day averages. Left unchanged because the intended definition is
# not visible here — confirm the formula.
df['access_trend'] = (
    df['access_count_last_1_day'] - df['access_count_last_3_days']/3 +
    df['access_count_last_3_days'] - df['access_count_last_7_days']/7
)

# Ordered mapping (ensure the order is consistent with your definitions)
temperature_map = {'cold': 0, 'cold-warm': 1, 'warm-hot': 2, 'hot': 3}
size_category_map = {'small': 0, 'medium': 1, 'large': 2, 'xlarge': 3}

# Both source columns were cast to 'category' earlier. Series.map on a categorical
# column returns another categorical that keeps the ORIGINAL codes (lexical category
# order), so the ordinal values above would only end up as category labels. Mapping
# over plain object values yields a genuinely numeric ordinal column instead
# (values missing from the mapping become NaN).
df['temperature_encoded'] = df['temperature'].astype(object).map(temperature_map)
df['size_category_encoded'] = df['size_category'].astype(object).map(size_category_map)

df.drop(columns=['temperature', 'size_category'], inplace=True, errors='ignore')

# Binary flag: 1 where type == 'mc', 0 for every other value.
df['type_binary'] = (df['type'] == 'mc').astype(int)
df.drop(columns='type', inplace=True, errors='ignore')

In [15]:
import numpy as np
import pandas as pd
from collections import Counter, deque

def aggregate_metrics_fast(df: pd.DataFrame, train_len: int = 13, start_min: int = 13):
    """
    Summarise rolling Train / Validation / Test splits along the ``period`` axis.

    For every valid start period ``s`` the splits are:
      - Train: periods [s .. s+train_len]  (inclusive on both ends)
      - Val:   period  (s+train_len)
      - Test:  period  (s+train_len+1)

    NOTE(review): with these definitions the validation period is also the last
    period of the Train window (Train spans ``train_len + 1`` periods). This
    mirrors the original behaviour; confirm it is intended.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``period``, ``pnfsid``, ``size`` and ``y1``.
        Period values are stepped with ``+ 1``, i.e. they are treated as
        unit-spaced numbers. Periods missing from the data contribute zero
        rows and no files instead of raising.
    train_len : int
        Offset from the start period to the validation period.
    start_min : int
        Smallest period value allowed as a start.

    Returns
    -------
    pd.DataFrame
        Indexed by ``Split`` with one row per split plus a total-bytes row.
        Every cell is a ``"mean ± std"`` string computed across all valid
        starts (sample std, ddof=1). The total-bytes value (deduplicated by
        ``pnfsid`` over Train ∪ Val ∪ Test) is stored in the
        ``"Pos ratio (y=1)"`` column of its row.

    Raises
    ------
    ValueError
        If the frame has no periods or no start satisfies the constraints.
    """

    # --- Precompute period-level summaries ---
    # Sorted array of the distinct periods that actually occur in the data
    periods = np.sort(df['period'].unique())
    if periods.size == 0:
        raise ValueError("No valid starts for the given train_len and data span.")
    pmax = periods.max()

    # Per-period rows & positive labels
    per_period = df.groupby('period').agg(
        rows_in_period=('y1', 'size'),
        pos_in_period=('y1', 'sum')
    ).reset_index()
    rows_map = dict(zip(per_period['period'], per_period['rows_in_period']))
    pos_map  = dict(zip(per_period['period'], per_period['pos_in_period']))

    # Per-period set of files
    files_by_period = (
        df.groupby('period')['pnfsid']
          .apply(lambda s: set(s.values))
          .to_dict()
    )

    # Global size per file (max, in case duplicates exist)
    size_by_file = df.groupby('pnfsid')['size'].max().to_dict()

    # Valid starts: s >= start_min and the test period s+train_len+1 must not exceed pmax
    valid_starts = [s for s in periods if (s >= start_min) and (s + train_len + 1 <= pmax)]
    if not valid_starts:
        raise ValueError("No valid starts for the given train_len and data span.")

    # --- Helpers to maintain the rolling union of Train files ---
    def add_period_files(counter: Counter, files: set, size_by_file: dict) -> int:
        """Increment counts for files; return bytes added when a file goes 0->1."""
        added_bytes = 0
        for f in files:
            prev = counter.get(f, 0)
            counter[f] = prev + 1
            if prev == 0:
                added_bytes += size_by_file.get(f, 0)
        return added_bytes

    def remove_period_files(counter: Counter, files: set, size_by_file: dict) -> int:
        """Decrement counts for files; return bytes removed when a file goes 1->0."""
        removed_bytes = 0
        for f in files:
            prev = counter.get(f, 0)
            if prev <= 1:
                # going to 0: drop the key so len(counter) stays the unique-file count
                if prev == 1:
                    removed_bytes += size_by_file.get(f, 0)
                counter.pop(f, None)
            else:
                counter[f] = prev - 1
        return removed_bytes

    # --- Per-start metric collectors ---
    coll = {
        "Train": {"rows": [], "uniq_files": [], "pos_ratio": [], "unique_periods": []},
        "Validation": {"rows": [], "uniq_files": [], "pos_ratio": [], "unique_periods": []},
        "Test": {"rows": [], "uniq_files": [], "pos_ratio": [], "unique_periods": []},
        "TotalBytesUnion": []  # Train ∪ Val ∪ Test bytes per start
    }

    # Prefix sums over the observed periods give Train rows/positives in O(1) per start
    rows_arr = np.array([rows_map.get(p, 0) for p in periods], dtype=np.int64)
    pos_arr  = np.array([pos_map.get(p, 0) for p in periods], dtype=np.int64)
    rows_cum = np.cumsum(rows_arr)
    pos_cum  = np.cumsum(pos_arr)

    def sum_range(cum, i, j):
        # inclusive i..j on the index of `periods`
        if i > j:
            return 0
        return cum[j] - (cum[i-1] if i > 0 else 0)

    # Rolling Train state: reference counts per file, deduplicated bytes, and
    # the period values currently inside the window.
    train_counts = Counter()
    train_union_bytes = 0
    train_window = deque()

    # --- Iterate over starts (rolling) ---
    for s in valid_starts:
        end = s + train_len
        val_p = end
        test_p = end + 1

        # Slide the window to [s .. end]. The same two loops also build the very
        # first window, so every start is handled identically.
        while train_window and train_window[0] < s:
            p_out = train_window.popleft()
            train_union_bytes -= remove_period_files(train_counts, files_by_period.get(p_out, set()), size_by_file)
        while (not train_window) or (train_window[-1] < end):
            p_in = (train_window[-1] + 1) if train_window else s
            train_union_bytes += add_period_files(train_counts, files_by_period.get(p_in, set()), size_by_file)
            train_window.append(p_in)

        # TRAIN metrics (rows/pos via prefix sums; uniq files via counter).
        # searchsorted bounds the observed periods inside [s, end] and, unlike a
        # dict lookup of `end`, does not fail when `end` is absent from the data.
        i = int(np.searchsorted(periods, s, side="left"))
        j = int(np.searchsorted(periods, end, side="right")) - 1
        train_rows = int(sum_range(rows_cum, i, j))
        train_pos  = int(sum_range(pos_cum,  i, j))
        train_pos_ratio = (train_pos / train_rows) if train_rows > 0 else np.nan

        coll["Train"]["rows"].append(train_rows)
        coll["Train"]["uniq_files"].append(len(train_counts))
        coll["Train"]["pos_ratio"].append(train_pos_ratio)
        coll["Train"]["unique_periods"].append(len(train_window))

        # VALIDATION (single period = end)
        val_files = files_by_period.get(val_p, set())
        val_rows = rows_map.get(val_p, 0)
        val_pos  = pos_map.get(val_p, 0)
        val_pos_ratio = (val_pos / val_rows) if val_rows > 0 else np.nan
        coll["Validation"]["rows"].append(val_rows)
        coll["Validation"]["uniq_files"].append(len(val_files))
        coll["Validation"]["pos_ratio"].append(val_pos_ratio)
        coll["Validation"]["unique_periods"].append(1)

        # TEST (single period = end+1)
        test_files = files_by_period.get(test_p, set())
        test_rows = rows_map.get(test_p, 0)
        test_pos  = pos_map.get(test_p, 0)
        test_pos_ratio = (test_pos / test_rows) if test_rows > 0 else np.nan
        coll["Test"]["rows"].append(test_rows)
        coll["Test"]["uniq_files"].append(len(test_files))
        coll["Test"]["pos_ratio"].append(test_pos_ratio)
        coll["Test"]["unique_periods"].append(1)

        # TOTAL BYTES over Train ∪ Val ∪ Test (dedup pnfsid): add the bytes of
        # Val/Test files that Train does not already hold, without mutating the
        # Train counter. A set keeps the "already counted via Val" check O(1).
        val_only = {f for f in val_files if train_counts.get(f, 0) == 0}
        delta_bytes = sum(size_by_file.get(f, 0) for f in val_only)
        for f in test_files:
            if (train_counts.get(f, 0) == 0) and (f not in val_only):
                delta_bytes += size_by_file.get(f, 0)

        coll["TotalBytesUnion"].append(train_union_bytes + delta_bytes)

    # --- Aggregate mean ± std over starts ---
    def mean_std_fmt(a, kind="int"):
        a = np.array(a, dtype=float)
        m = np.nanmean(a)
        s = np.nanstd(a, ddof=1) if len(a) > 1 else 0.0
        if kind == "int":
            return f"{int(round(m))} ± {int(round(s))}"
        if kind == "pct":
            return f"{m:.4f} ± {s:.4f}"
        if kind == "float2":
            return f"{m:.2f} ± {s:.2f}"
        if kind == "bytes":
            return f"{m:.3e} ± {s:.3e}"
        return f"{m:.3f} ± {s:.3f}"

    def pack(split):
        return {
            "Rows (instances)": mean_std_fmt(coll[split]["rows"], "int"),
            "Unique files":     mean_std_fmt(coll[split]["uniq_files"], "int"),
            "Time span":        mean_std_fmt(coll[split]["unique_periods"], "float2"),  # periods
            "Pos ratio (y=1)":  mean_std_fmt(coll[split]["pos_ratio"], "pct"),
        }

    out = pd.DataFrame(
        [
            # Label follows the train_len argument instead of a hard-coded 13
            {"Split": f"Train (rolling {train_len})", **pack("Train")},
            {"Split": "Validation (last)",      **pack("Validation")},
            {"Split": "Test (next)",            **pack("Test")},
            # The bytes summary is kept in the last column, as callers/printouts expect
            {"Split": "Total bytes (Train∪Val∪Test)",
             "Rows (instances)": "",
             "Unique files":     "",
             "Time span":        "",
             "Pos ratio (y=1)":  mean_std_fmt(coll["TotalBytesUnion"], "bytes")}
        ]
    ).set_index("Split")

    return out

# ---- Usage ----
# Summarise the rolling splits on the engineered frame and print the table.
metrics_df = aggregate_metrics_fast(
    df=df,
    train_len=13,
    start_min=13,
)
print(metrics_df)


                               Rows (instances)     Unique files  \
Split                                                              
Train (rolling 13)            4005560 ± 1419392  468342 ± 148563   
Validation (last)               288325 ± 114117  285196 ± 110446   
Test (next)                     288541 ± 114022  285412 ± 110354   
Total bytes (Train∪Val∪Test)                                       

                                 Time span        Pos ratio (y=1)  
Split                                                              
Train (rolling 13)            14.00 ± 0.00        0.0764 ± 0.0212  
Validation (last)              1.00 ± 0.00        0.0791 ± 0.0365  
Test (next)                    1.00 ± 0.00        0.0790 ± 0.0366  
Total bytes (Train∪Val∪Test)                1.679e+15 ± 4.423e+14  
