In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

RANDOM_STATE = 42

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
df = pd.read_csv("/data/astro/scratch/msantama/tfm/data.csv")
df

Unnamed: 0,x.pnfsid,x.type,x.subtype,x.stddev_access_date,x.dt_last_access_date,x.dt_second_last_access_date,x.dt_third_last_access_date,x.dt_fourth_last_access_date,x.dt_fifth_last_access_date,x.normalized_access_count,x.normalized_filesize,x.temperature,x.size_category,x.above_median_access_count,x.above_median_filesize,x.access_count_last_1_day,x.access_count_last_3_days,x.access_count_last_7_days,x.access_count_last_15_days,x.lifetime,x.access_count,x.read_data_per_second,y,m_date_window
0,000000019BAA63584725A864DB1B460EB510,data,AOD,0.000000,254352,,,,,0.000000,0.503096,cold-warm,xlarge,0,1,0,1,1,1,254352,1,16242.631896,1,2021-01-16
1,0000000EF04D47B344FA91F943950384AFAB,mc,AODSIM,0.000000,347232,,,,,0.272727,0.246947,hot,medium,1,0,0,0,1,1,1731415,4,1488.246422,0,2021-01-16
2,00000012A42AE234405DB0FC82389E1C7596,mc,AODSIM,0.000000,364406,,,,,0.000000,0.383230,warm-hot,large,0,1,0,0,1,1,364406,1,10924.823826,0,2021-01-16
3,000000140E53D58F44FD986BFA14AC2ED2C1,data,MINIAOD,1714.017503,1291371,1294799.0,,,,0.003922,0.504987,warm-hot,small,1,0,0,0,0,2,1294799,2,3346.479006,1,2021-01-16
4,00000018DA8A1C2D4A02B336E00D3A308773,mc,MINIAODSIM,1840.041739,485623,486241.0,486707.0,487578.0,488132.0,0.050657,0.071942,warm-hot,small,1,0,0,0,10,10,8433588,28,6.472632,1,2021-01-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14418330,0000FFF85E93AD4A4D26A79151831A5DE80E,data,RAW,0.000000,279173,,,,,0.000000,0.416933,cold-warm,small,0,0,0,0,1,1,8044810,2,756.514313,0,2023-12-23
14418331,0000FFF8F85576294065A64818287318A706,data,ALCARECO,2590.518095,807466,812647.0,,,,0.055556,0.072080,hot,medium,1,0,0,0,0,2,812647,2,371.153404,0,2023-12-23
14418332,0000FFFD364BF9574D6FBF6C7C21508A0CF5,data,RAW,13226.010736,237817,264269.0,,,,0.001555,0.798885,cold-warm,xlarge,0,1,0,1,2,2,7992906,3,1404.442556,0,2023-12-23
14418333,0000FFFDAF37D1204498ABB4F0A688C9E0AA,data,RAW,802.236873,268330,269934.0,,,,0.001555,0.832457,cold-warm,xlarge,0,1,0,0,2,2,8002944,3,1454.809582,0,2023-12-23


In [4]:
df.columns = [col.replace('x.', '') if col.startswith('x.') else col for col in df.columns]

In [5]:
df.drop(columns=['pnfsid'], inplace=True)

In [6]:
all_dates = sorted(df['m_date_window'].unique())

In [7]:
date_to_int = {date: idx for idx, date in enumerate(all_dates)}
df['m_date_window'] = df['m_date_window'].map(date_to_int).astype(int)

In [64]:
df_1_15 = df[df['m_date_window'] < 15]

X = df_1_15.drop(columns=['y'])
y = df_1_15['y']

In [57]:
cat_columns = ['above_median_filesize', 'above_median_access_count', 'size_category', 'temperature', 'type', 'subtype']
num_columns = [col for col in X.columns if col not in cat_columns]

In [85]:
class dt_imputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns and col.startswith('dt'):
                flag_col = f"{col}_flag"
                # Set flag to 1 where NaN, else 0
                X_transformed[flag_col] = X_transformed[col].isna().astype(int)
                # Fill NaN with lifetime
                X_transformed[col] = X_transformed[col].fillna(X_transformed['lifetime'])
        return X_transformed

In [80]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(), cat_columns)
], remainder='passthrough', force_int_remainder_cols=False)

In [81]:
from sklearn.linear_model import LogisticRegression

algorithm = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=RANDOM_STATE)


In [96]:
import imblearn
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
pipe = Pipeline([
    ('dt_imputer', dt_imputer(columns=[col for col in X.columns if col.startswith('dt')])),
    ('column_transformer', column_transformer),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('smote', imblearn.over_sampling.SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)),
    ('alg', algorithm)
])


In [97]:
pipe.fit(X_train, y_train)

In [98]:
for i in range(25):
    df_i = df[df['m_date_window'] == i]

    X_test = df_i.drop(columns=['y'])
    y_test = df_i['y']

    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"Date: {i}, Recall: {recall:.4f}, AUC: {auc:.4f}, y_percent {y_test.mean():.4f}")


Date: 0, Recall: 0.1627, AUC: 0.7353, y_percent 0.2419
Date: 1, Recall: 0.2042, AUC: 0.2030, y_percent 0.0806
Date: 2, Recall: 0.3938, AUC: 0.7952, y_percent 0.0601
Date: 3, Recall: 0.5688, AUC: 0.7647, y_percent 0.0818
Date: 4, Recall: 0.1855, AUC: 0.6470, y_percent 0.1183
Date: 5, Recall: 0.4021, AUC: 0.7538, y_percent 0.1050
Date: 6, Recall: 0.8209, AUC: 0.7987, y_percent 0.1089
Date: 7, Recall: 0.6227, AUC: 0.5717, y_percent 0.0887
Date: 8, Recall: 0.7207, AUC: 0.8568, y_percent 0.0814
Date: 9, Recall: 0.9300, AUC: 0.8259, y_percent 0.2649
Date: 10, Recall: 0.4818, AUC: 0.6641, y_percent 0.3645
Date: 11, Recall: 0.8819, AUC: 0.5727, y_percent 0.0642
Date: 12, Recall: 0.5127, AUC: 0.7789, y_percent 0.0649
Date: 13, Recall: 0.7302, AUC: 0.7258, y_percent 0.1310
Date: 14, Recall: 0.3770, AUC: 0.6017, y_percent 0.1765
Date: 15, Recall: 0.6565, AUC: 0.7066, y_percent 0.1087
Date: 16, Recall: 0.8047, AUC: 0.6617, y_percent 0.0846
Date: 17, Recall: 0.7016, AUC: 0.7421, y_percent 0.0789
Da

In [104]:
importance = pd.Series(np.abs(pipe[-1].coef_[0]))
importance = importance.sort_values(ascending=False)
importance

42    4.010661e-02
40    1.910968e-02
38    1.419121e-02
37    1.012384e-02
36    5.842095e-03
15    3.834373e-03
35    3.766096e-03
9     3.423547e-03
2     3.063240e-03
12    2.772355e-03
47    2.453473e-03
46    2.405561e-03
1     2.378414e-03
45    2.375142e-03
10    2.094379e-03
34    1.632529e-03
7     1.266489e-03
17    1.261543e-03
44    1.188132e-03
8     1.140496e-03
4     1.112511e-03
21    1.069760e-03
0     8.676943e-04
14    8.044651e-04
11    7.764440e-04
16    7.503141e-04
6     5.408173e-04
13    4.737527e-04
24    3.506795e-04
5     3.262904e-04
33    2.740232e-04
25    2.075686e-04
20    1.948318e-04
3     1.828683e-04
22    1.037573e-04
23    7.986982e-05
18    4.909604e-05
19    4.634702e-05
27    1.921168e-06
28    1.552865e-06
39    5.576924e-08
41    5.484277e-08
26    5.243930e-08
30    5.103983e-08
29    3.908657e-08
31    2.752589e-08
32    2.189920e-08
43    0.000000e+00
dtype: float64

In [65]:
from sklearn.model_selection import PredefinedSplit, GridSearchCV


X_train = df_1_15.drop(columns=['y'])
y_train = df_1_15['y']

val_df = df[df['m_date_window'] == 16]


X_val = val_df.drop(columns=['y'])
y_val = val_df['y']

# Combine train and validation for PredefinedSplit
X_combined = pd.concat([X_train, X_val])
y_combined = pd.concat([y_train, y_val])

# Create test_fold: -1 for train, 0 for validation
test_fold = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold)

param_grid = {
    'alg__n_estimators': [200, 250],
    'alg__max_depth': [8, 9, 10],
    'alg__learning_rate': [0.005, 0.01, 0.02],
    'alg__subsample': [0.9, 1.0],
    'alg__colsample_bytree': [1.0],
    'alg__gamma': [2, 3, 4],
    'alg__reg_alpha': [0],
    'alg__reg_lambda': [1]
}

grid = GridSearchCV(pipe, param_grid, cv=ps, scoring='roc_auc', n_jobs=15, verbose=1)
grid.fit(X_combined, y_combined)

print("Best params:", grid.best_params_)
print("Best validation AUC:", grid.best_score_)

Fitting 1 folds for each of 108 candidates, totalling 108 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the dev

Best params: {'alg__colsample_bytree': 1.0, 'alg__gamma': 2, 'alg__learning_rate': 0.02, 'alg__max_depth': 10, 'alg__n_estimators': 250, 'alg__reg_alpha': 0, 'alg__reg_lambda': 1, 'alg__subsample': 0.9}
Best validation AUC: 0.7472234371438952


In [90]:
for i in range(25):
    df_i = df[df['m_date_window'] == i]

    X_test = df_i.drop(columns=['y'])
    y_test = df_i['y']

    y_pred = grid.best_estimator_.predict(X_test)
    y_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"Date: {i}, Recall: {recall:.4f}, AUC: {auc:.4f}, y_percent {y_test.mean():.4f}")


Date: 0, Recall: 0.9648, AUC: 0.9394, y_percent 0.2419
Date: 1, Recall: 0.9353, AUC: 0.9855, y_percent 0.0806
Date: 2, Recall: 0.8559, AUC: 0.9435, y_percent 0.0601
Date: 3, Recall: 0.8343, AUC: 0.9383, y_percent 0.0818
Date: 4, Recall: 0.8458, AUC: 0.9415, y_percent 0.1183
Date: 5, Recall: 0.7885, AUC: 0.9530, y_percent 0.1050
Date: 6, Recall: 0.8983, AUC: 0.9547, y_percent 0.1089
Date: 7, Recall: 0.8965, AUC: 0.9686, y_percent 0.0887
Date: 8, Recall: 0.9492, AUC: 0.9928, y_percent 0.0814
Date: 9, Recall: 0.9363, AUC: 0.9706, y_percent 0.2649
Date: 10, Recall: 0.9827, AUC: 0.9962, y_percent 0.3645
Date: 11, Recall: 0.9027, AUC: 0.9790, y_percent 0.0642
Date: 12, Recall: 0.8848, AUC: 0.9810, y_percent 0.0649
Date: 13, Recall: 0.9027, AUC: 0.9619, y_percent 0.1310
Date: 14, Recall: 0.9053, AUC: 0.9336, y_percent 0.1765
Date: 15, Recall: 0.4532, AUC: 0.6824, y_percent 0.1087
Date: 16, Recall: 0.7331, AUC: 0.9266, y_percent 0.0846
Date: 17, Recall: 0.5791, AUC: 0.7565, y_percent 0.0789
Da