In [61]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import matplotlib

# for interactive plots in Jupyter:
# matplotlib.use('qtagg')

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

from transform_filter_data import transform_filter_data

In [62]:
# history_len: 365, 270, 180, 90 days
# span_threshold: 90, 60, 40, 20 days

# trained on 2022-05-28 cutoff
# validated on 2022-11-19 cutoff
#                                           ACC    f1-0  f1-1
# 1 year:   model always says '0' (testing: 0.902, 0.93, 0.82)
# 9 months: model always says '0' (testing: 0.906, 0.94, 0.82)
# 6 months: model always says '0' (testing: 0.898, 0.94, 0.69)
# 3 months: 0.871, 0.93, 0.00     (testing: 0.886, 0.93, 0.64)

# Shared settings; the same four names are passed again for the validation
# cutoff later in the notebook.
source_csv = 'datasets_popularity_DAOD_PHYS_mc16_13TeV_202308141254.csv'
history_len = pd.Timedelta('365 days')
span_threshold = pd.Timedelta('90 days')
accesses_threshold = 30

# Training cutoff: the horizon date lies 7 days after the finish date.
train_finish_date = pd.Timestamp('2022-05-28')
train_horizon_date = pd.Timestamp('2022-06-04')

# Build the per-object dataframe for the training cutoff. The columns used
# downstream are 'history_ts' (features) and 'y' (target).
objs_df = transform_filter_data(
    source_csv=source_csv,
    finish_date=train_finish_date,
    horizon_date=train_horizon_date,
    span_threshold=span_threshold,
    history_len=history_len,
    accesses_threshold=accesses_threshold,
)

filling 'y' column


100%|██████████████████████████████████| 18143/18143 [00:01<00:00, 12134.05it/s]


after filtering by access number: 593 / 2659
filtering by history presence


100%|█████████████████████████████████████| 2659/2659 [00:00<00:00, 3362.67it/s]


after filtering: 573 / 2108
transforming date arrays to timeseries


100%|██████████████████████████████████████| 2108/2108 [00:05<00:00, 366.81it/s]


In [63]:
# Shuffle every row (frac=1 keeps them all) with a fixed seed, because the
# train/test split below is purely positional (last Ntest rows are held out).
# NOTE(review): this rebinds objs_df, so re-running the cell without re-running
# the loading cell shuffles an already shuffled frame.
shuffle_seed = 42
objs_df = objs_df.sample(frac=1, random_state=shuffle_seed)

In [64]:
# last_dates_relative = objs_df['last_date'] - pd.Timestamp(2010, 1, 1)
# objs_df['last_date'] = last_dates_relative.apply(getattr, args=('days',))

In [65]:
# Stack the per-object 'history_ts' arrays along a new leading axis so that
# X has one row per dataframe row.
history_series = objs_df['history_ts'].to_numpy()
X = np.stack(history_series)

In [66]:
# Binarise the target: 1 where the 'y' column is strictly positive, else 0.
raw_targets = objs_df['y'].to_numpy()
y = (raw_targets > 0).astype('int')

In [67]:
# Positional hold-out: the last Ntest rows (of the already shuffled data) form
# the test set, everything before them is used for fitting.
Ntest = 500

fit_rows = slice(None, -Ntest)
test_rows = slice(-Ntest, None)

Xtrain, Ytrain = X[fit_rows], y[fit_rows]
Xtest, Ytest = X[test_rows], y[test_rows]

In [68]:
# Shallow random forest (trees limited to depth 2) with a fixed seed so that
# repeated runs build the same ensemble.
forest_params = {'max_depth': 2, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(Xtrain, Ytrain)

In [69]:
# Predict labels for the held-out test rows (the last Ntest rows of the shuffled data).
Ypred = clf.predict(Xtest)

In [70]:
# Score the held-out test predictions, then print accuracy, the confusion
# matrix (rows: true class, columns: predicted class) and the per-class report.
test_accuracy = accuracy_score(Ytest, Ypred)
test_confusion = confusion_matrix(Ytest, Ypred)
test_report = classification_report(Ytest, Ypred)

print(f'Accuracy: {test_accuracy}')
print('Confusion Matrix')
print(test_confusion)
print(test_report)

Accuracy: 0.902
Confusion Matrix
[[343  13]
 [ 36 108]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       356
           1       0.89      0.75      0.82       144

    accuracy                           0.90       500
   macro avg       0.90      0.86      0.87       500
weighted avg       0.90      0.90      0.90       500



### Validate the trained model on a later time cutoff

In [71]:
# Validation cutoff: later than the training cutoff, with the horizon date
# again 7 days after the finish date.
val_finish_date = pd.Timestamp('2022-11-19')
val_horizon_date = pd.Timestamp('2022-11-26')

# Same source file and thresholds as for training; only the dates differ.
objs_df_val = transform_filter_data(
    source_csv=source_csv,
    finish_date=val_finish_date,
    horizon_date=val_horizon_date,
    span_threshold=span_threshold,
    history_len=history_len,
    accesses_threshold=accesses_threshold,
)

filling 'y' column


100%|██████████████████████████████████| 22668/22668 [00:01<00:00, 14904.78it/s]


after filtering by access number: 392 / 3652
filtering by history presence


100%|█████████████████████████████████████| 3652/3652 [00:02<00:00, 1443.15it/s]


after filtering: 392 / 2716
transforming date arrays to timeseries


100%|██████████████████████████████████████| 2716/2716 [00:05<00:00, 536.84it/s]


In [72]:
# Build validation features and binary targets the same way as for training:
# stack the 'history_ts' arrays, and mark rows with a strictly positive 'y' as 1.
val_history_series = objs_df_val['history_ts'].to_numpy()
val_raw_targets = objs_df_val['y'].to_numpy()

Xval = np.stack(val_history_series)
yval = (val_raw_targets > 0).astype('int')

In [73]:
# Apply the classifier fitted on the earlier cutoff, without refitting, to the validation features.
ypred_val = clf.predict(Xval)

In [74]:
# Labels are 0/1, so the sum is the number of validation rows predicted as class 1.
ypred_val.sum()

0

In [75]:
# Score the validation predictions: accuracy, confusion matrix (rows: true
# class, columns: predicted class) and the per-class report.
#
# When the model predicts no positives at all (as it does with the current
# settings), precision for class 1 is 0/0. zero_division=0 reports it as 0.0,
# the same value the default prints, but without the undefined-metric warnings
# that otherwise clutter the cell output.
print(f'Accuracy: {accuracy_score(yval, ypred_val)}')
print('Confusion Matrix')
print(confusion_matrix(yval, ypred_val))
print(classification_report(yval, ypred_val, zero_division=0))

Accuracy: 0.8556701030927835
Confusion Matrix
[[2324    0]
 [ 392    0]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.92      2324
           1       0.00      0.00      0.00       392

    accuracy                           0.86      2716
   macro avg       0.43      0.50      0.46      2716
weighted avg       0.73      0.86      0.79      2716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
