In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.callback import EarlyStopping
from sklearn.metrics import *

In [3]:
# Columns that a later cell casts to the pandas 'category' dtype.
CAT_COLS = [
    'type',
    'subtype',
    'size_category',
    'temperature',
    'above_median_access_count',
    'above_median_filesize',
]

# Numeric column names. Listed for reference; not used by the visible cells.
NUM_COLS = [
    'stddev_access_date',
    'dt_last_access_date',
    'dt_second_last_access_date',
    'dt_third_last_access_date',
    'dt_fourth_last_access_date',
    'dt_fifth_last_access_date',
    'normalized_access_count',
    'normalized_filesize',
    'access_count_last_1_day',
    'access_count_last_3_days',
    'access_count_last_7_days',
    'access_count_last_15_days',
    'lifetime',
    'access_count',
    'read_data_per_second',
]

In [17]:
# Load the full dataset into memory.
# NOTE(review): absolute, user-specific path; the notebook only runs where this file exists.
df = pd.read_csv(filepath_or_buffer='/data/astro/scratch/msantama/tfm/data.csv')

In [18]:
# Strip a leading 'x.' from column names.
# Slice the prefix off rather than calling str.replace: replace() removes *every*
# occurrence of 'x.' in the name, so e.g. 'x.max.size' would become 'masize'
# instead of 'max.size'.
df.columns = [col[len('x.'):] if col.startswith('x.') else col for col in df.columns]

In [19]:
# Encode each distinct `m_date_window` value as a consecutive integer `period`
# (0 for the smallest value under Python's sort order).
# NOTE(review): assumes the window labels sort chronologically — confirm their format.
all_dates = sorted(df['m_date_window'].unique())
date_to_int = dict(zip(all_dates, range(len(all_dates))))
df['period'] = df['m_date_window'].map(date_to_int).astype(int)

In [21]:
# Remove columns that are not needed from here on: `m_date_window` has already
# been encoded as the integer `period`, and `pnfsid` is not used afterwards.
# (Author's note: predictions are made for 10 days into the future.)
# The frame is modified in place, so this cell fails if it is executed twice.
df.drop(['m_date_window', 'pnfsid'], axis='columns', inplace=True)

In [22]:
# Cast the categorical columns to the pandas 'category' dtype. This is done on
# the whole frame, before any per-period slicing, so every slice taken later
# carries the same set of categories.
df[CAT_COLS] = df[CAT_COLS].astype({name: 'category' for name in CAT_COLS})

In [31]:
# ---- Walk-forward evaluation --------------------------------------------
# For every period p, a model is fitted on the TRAINING_PERIODS periods that
# immediately precede p and then scored twice:
#   * on a random hold-out taken from that same training window ("val_*"), and
#   * on period p itself, which the model has never seen ("unseen_*").
# Note that the hold-out is a random split, not a temporal one, so "val_*"
# metrics do not measure forward-in-time generalisation; "unseen_*" ones do.

TRAINING_PERIODS = 12             # number of consecutive past periods used for fitting
MAXIMUM_PERIOD = 365 // 5 + 10    # last period index evaluated (= 83)
DECISION_THRESHOLD = 0.5          # probability cut-off for the positive class
# Columns that must not reach the model: the target, and the period index.
# `period` is a time stamp, not a descriptor of the file: the unseen period's
# value is always outside the range seen in training, and inside the random
# hold-out it lets the model key on per-period base rates.
NON_FEATURE_COLS = ['y', 'period']
PLOT_SCORE_DISTRIBUTIONS = False  # set True to draw per-period score densities


def plot_score_distributions(period, val_scores, val_labels, unseen_scores, unseen_labels):
    """Plot per-class densities of predicted probabilities, validation vs. unseen.

    Parameters
    ----------
    period : int
        Period index, used only in the titles.
    val_scores, unseen_scores : numpy.ndarray
        Predicted probabilities for the validation and unseen sets.
    val_labels, unseen_labels : array-like
        True 0/1 labels aligned with the corresponding scores.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    panels = [
        (axes[0], val_scores, np.asarray(val_labels), 'Validation Set'),
        (axes[1], unseen_scores, np.asarray(unseen_labels), f'Unseen Period {period}'),
    ]
    for ax, scores, labels, title in panels:
        sns.kdeplot(scores[labels == 0], label='Negative class', color='blue', fill=True, ax=ax)
        sns.kdeplot(scores[labels == 1], label='Positive class', color='red', fill=True, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Predicted Probability')
        ax.set_ylabel('Density')
        ax.legend()
        ax.grid(True)

    plt.suptitle(f'Score Distribution (Period {period})', fontsize=14)
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across iterations


results2 = []

for p in range(TRAINING_PERIODS, MAXIMUM_PERIOD + 1):
    # ---- 1. Split dataset ----
    # Training window: periods p - TRAINING_PERIODS .. p - 1 (both inclusive).
    df_seen = df[df['period'].between(p - TRAINING_PERIODS, p - 1)]
    df_unseen = df[df['period'] == p]

    # MAXIMUM_PERIOD is a fixed number; the data may contain fewer periods.
    if df_seen.empty or df_unseen.empty:
        print(f"\n--- Period {p} --- skipped: no rows in the training window or in the unseen period")
        continue
    # Stratified splitting and under-sampling both need two classes.
    if df_seen['y'].nunique() < 2:
        print(f"\n--- Period {p} --- skipped: training window contains a single class")
        continue

    print(f"\n--- Period {p} ---")
    print(f"Història: {len(df_seen['period'].unique())} períodes")
    print(f"Train set (DF) y=1: {df_seen['y'].mean():.2f}")
    print(f"Unseen (p) y=1: {df_unseen['y'].mean():.2f}")

    # ---- 2. Train/test split and undersample ----
    X = df_seen.drop(columns=NON_FEATURE_COLS)
    y = df_seen['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    # Only the training part is balanced; the hold-out keeps its natural class ratio.
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    print(f"Train balançejat y=1: {y_train_resampled.mean():.2f}")

    # ---- 3. Train model ----
    dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled, enable_categorical=True)
    dval = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.1,
        'max_depth': 5,
        'tree_method': 'auto',
        'seed': 42
    }

    # Early stopping monitors the last entry of `evals` (the hold-out set);
    # save_best=True makes xgb.train return the best iteration's model.
    early_stop = EarlyStopping(rounds=10, min_delta=0.01, save_best=True)
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        callbacks=[early_stop],
        verbose_eval=False
    )

    # ---- 4. Predict on validation and unseen sets ----
    y_pred_proba_val = model.predict(dval)
    y_pred_val = (y_pred_proba_val >= DECISION_THRESHOLD).astype(int)

    X_unseen = df_unseen.drop(columns=NON_FEATURE_COLS)
    y_true_unseen = df_unseen['y']
    dunseen = xgb.DMatrix(X_unseen, enable_categorical=True)

    y_pred_proba_unseen = model.predict(dunseen)
    y_pred_unseen = (y_pred_proba_unseen >= DECISION_THRESHOLD).astype(int)

    # ---- 5. Plot side-by-side (optional) ----
    if PLOT_SCORE_DISTRIBUTIONS:
        plot_score_distributions(p, y_pred_proba_val, y_test, y_pred_proba_unseen, y_true_unseen)

    # ---- 6. Save metrics ----
    # ROC AUC is undefined when only one class is present; record NaN instead of
    # letting roc_auc_score raise and abort the remaining periods.
    unseen_has_both_classes = y_true_unseen.nunique() > 1
    results2.append({
        'period': p,
        'val_auc': roc_auc_score(y_test, y_pred_proba_val),
        'val_recall': recall_score(y_test, y_pred_val),
        'val_accuracy': accuracy_score(y_test, y_pred_val),
        'unseen_auc': roc_auc_score(y_true_unseen, y_pred_proba_unseen) if unseen_has_both_classes else np.nan,
        'unseen_recall': recall_score(y_true_unseen, y_pred_unseen),
        'unseen_accuracy': accuracy_score(y_true_unseen, y_pred_unseen),
        'number of files': len(df_unseen)
    })


--- Period 12 ---
Història: 12 períodes
Train set (DF) y=1: 0.14
Unseen (p) y=1: 0.06
Train balançejat y=1: 0.50

--- Period 13 ---
Història: 12 períodes
Train set (DF) y=1: 0.13
Unseen (p) y=1: 0.13
Train balançejat y=1: 0.50

--- Period 14 ---
Història: 12 períodes
Train set (DF) y=1: 0.14
Unseen (p) y=1: 0.18
Train balançejat y=1: 0.50

--- Period 15 ---
Història: 12 períodes
Train set (DF) y=1: 0.15
Unseen (p) y=1: 0.11
Train balançejat y=1: 0.50

--- Period 16 ---
Història: 12 períodes
Train set (DF) y=1: 0.15
Unseen (p) y=1: 0.08
Train balançejat y=1: 0.50

--- Period 17 ---
Història: 12 períodes
Train set (DF) y=1: 0.15
Unseen (p) y=1: 0.08
Train balançejat y=1: 0.50

--- Period 18 ---
Història: 12 períodes
Train set (DF) y=1: 0.14
Unseen (p) y=1: 0.10
Train balançejat y=1: 0.50

--- Period 19 ---
Història: 12 períodes
Train set (DF) y=1: 0.14
Unseen (p) y=1: 0.18
Train balançejat y=1: 0.50

--- Period 20 ---
Història: 12 períodes
Train set (DF) y=1: 0.15
Unseen (p) y=1: 0.28
T

In [32]:
# Collect the per-period metric dicts into one table (one row per evaluated period).
results_df2 = pd.DataFrame(data=results2)

In [33]:
# Append the metrics to the scores file, without a header row or index column.
# NOTE(review): because of mode='a' and header=False, re-running this cell
# appends the same rows again, and a newly created file has no column names.
results_df2.to_csv(path_or_buf='scoresN20N10N5.csv', mode='a', header=False, index=False)