In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import pickle
import timm
import torch
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.metrics import log_loss, roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline

from src.data.visiomel_datamodule import VisiomelDatamodule
from src.data.visiomel_datamodule_emb import VisiomelDatamoduleEmb
from src.utils.utils import extract_features_patches, load_embeddings, check_unique_pathes_same, get_X_y_groups
from src.utils.validation_upsampled import cross_validate_upsampled

# Pretrained + SVC

In [3]:
model = timm.create_model(
    'swinv2_large_window12to24_192to384_22kft1k', 
    pretrained=True, 
    num_classes=0
).cuda().eval()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [34]:
datamodule = VisiomelDatamodule(
    task='raw',
    data_dir_train = '/workspace/data/images_page_4/',	
    k = None,
    fold_index = 0,
    data_dir_test = None,
    img_size = 384 * 4,
    shrink_preview_scale = 8,
    batch_size = 1,
    split_seed = 0,
    num_workers = 4,
    pin_memory = False,
    prefetch_factor = None,
    persistent_workers = False,
    sampler = None,
    data_shrinked=False,
    num_workers_saturated=4,
    enable_caching=False,
    train_resize_type='resize',
    train_transform_n_repeats=1,
)
datamodule.setup()
train_dataloader = datamodule.train_dataloader()

In [5]:
X, y = extract_features_patches(model, train_dataloader)

100%|██████████| 1342/1342 [28:28<00:00,  1.27s/it]


In [6]:
X, y = X.numpy(), y.numpy()

In [35]:
pathes = np.array([sample[0] for sample in train_dataloader.dataset.samples])
pathes = pathes[np.array(list(train_dataloader.sampler))]

In [36]:
X.shape, y.shape, pathes.shape

((1342, 110400), (1342,), (1342,))

In [47]:
df = pd.DataFrame(
    {
        'path': pathes, 
        'label': y,
        'features': list(X.reshape(y.shape[0], 25, -1)),
    }
)
df['label'] = df['label'].apply(torch.tensor)
df['features'] = df['features'].apply(torch.tensor)

In [48]:
df.head()

Unnamed: 0,path,label,features
0,/workspace/data/images_page_4/0/8l601uzw.png,tensor(0),"[[tensor(0.0155), tensor(0.0167), tensor(-0.08..."
1,/workspace/data/images_page_4/1/qdgmoqgg.png,tensor(0),"[[tensor(0.0182), tensor(0.0942), tensor(-0.07..."
2,/workspace/data/images_page_4/0/uf0adhgl.png,tensor(0),"[[tensor(0.0146), tensor(0.0736), tensor(-0.08..."
3,/workspace/data/images_page_4/0/6t5gt51a.png,tensor(0),"[[tensor(0.0040), tensor(0.0237), tensor(-0.09..."
4,/workspace/data/images_page_4/0/ltr6slec.png,tensor(0),"[[tensor(0.0189), tensor(0.0700), tensor(-0.07..."


In [49]:
df.to_pickle('/workspace/visiomel-2023/weights/pretrained/embeddings/all.pkl')

In [46]:
X.reshape(y.shape[0], 25, -1).shape

(1342, 25, 4416)

In [18]:
X = X.reshape(y.shape[0], -1)

In [20]:
X.shape

(1342, 110400)

In [None]:
df = pd.DataFrame(X)

In [19]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_index, (train_index, val_index) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf = SVC(kernel='linear', C=1, probability=True)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict_proba(X_val)
    print(
        f'fold: {fold_index}, '
        f'logloss: {log_loss(y_val, y_val_pred, eps=1e-16)}, '
        f'roc_auc: {roc_auc_score(y_val, y_val_pred[:, 1])}, '
        f'f1: {f1_score(y_val, clf.predict(X_val))}'
    )

fold: 0, logloss: 0.42887709268732954, roc_auc: 0.6718460588598477, f1: 0.28169014084507044
fold: 1, logloss: 0.39648552544221144, roc_auc: 0.7126980860259313, f1: 0.3


In [None]:
clf = SVC(probability=True)
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'class_weight': [None, 'balanced'],
    'gamma': ['scale', 'auto'],
}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid = GridSearchCV(clf, param_grid, scoring='neg_log_loss', n_jobs=10, cv=kfold.split(X, y), verbose=10)

# SSUP + SVC + fold 0 train and val

In [14]:
train_path = '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/train.pkl'
val_path = '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/val_aug.pkl'

### Train + val

In [15]:
with open(train_path, 'rb') as f:
    df_train = pickle.load(f)
with open(val_path, 'rb') as f:
    df_val = pickle.load(f)

In [20]:
np.allclose(
    df_train.path.str.split('/').apply(lambda x: x[-2]).astype(int), 
    df_train.label.apply(lambda x: x.item()).astype(int))

True

In [None]:
df_train.head()

Unnamed: 0,path,label,features
0,/workspace/data/images_page_4_shrink/1/hzkjmsp...,tensor(1),"[[[tensor(-0.1717), tensor(0.1906), tensor(-0...."
1,/workspace/data/images_page_4_shrink/1/hzkjmsp...,tensor(1),"[[[tensor(-0.9721), tensor(-0.4099), tensor(-1..."
2,/workspace/data/images_page_4_shrink/1/hzkjmsp...,tensor(1),"[[[tensor(-0.9645), tensor(-0.4061), tensor(-1..."
3,/workspace/data/images_page_4_shrink/1/hzkjmsp...,tensor(1),"[[[tensor(-0.9717), tensor(-0.4100), tensor(-1..."
4,/workspace/data/images_page_4_shrink/1/hzkjmsp...,tensor(1),"[[[tensor(-0.9721), tensor(-0.4099), tensor(-1..."


In [None]:
df_val.head()

Unnamed: 0,path,label,features
0,/workspace/data/images_page_4_shrink/0/09lh5ig...,tensor(0),"[[[tensor(-0.0191), tensor(0.1667), tensor(-1...."
1,/workspace/data/images_page_4_shrink/0/09lh5ig...,tensor(0),"[[[tensor(0.0051), tensor(0.1823), tensor(-1.3..."
2,/workspace/data/images_page_4_shrink/0/09lh5ig...,tensor(0),"[[[tensor(0.0060), tensor(0.1814), tensor(-1.3..."
3,/workspace/data/images_page_4_shrink/0/09lh5ig...,tensor(0),"[[[tensor(-0.0298), tensor(0.1509), tensor(-1...."
4,/workspace/data/images_page_4_shrink/0/09lh5ig...,tensor(0),"[[[tensor(0.0051), tensor(0.1823), tensor(-1.3..."


In [None]:
df_val['features'].iloc[0].shape

torch.Size([1, 4, 1024])

In [None]:
np.mean(df_val['features'].iloc[0].squeeze().numpy(), axis=0)

(1024,)

In [None]:
X, y, groups = get_X_y_groups(pd.concat([df_train, df_val], axis=0))

In [None]:
kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, val_index in kfold.split(X, y, groups):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf = SVC(kernel='linear', C=1, probability=True)
    clf.fit(X_train, y_train)
    print(log_loss(y_val, clf.predict_proba(X_val), eps=1e-16))

0.3619276913635527
0.36976809121450593
0.3678880951683179
0.42685247488728684
0.3686381290056779


### Train only + val as test

In [None]:
X_train, y_train, _ = get_X_y_groups(df_train)
X_val, y_val, _ = get_X_y_groups(df_val)

clf = SVC(kernel='linear', C=1, probability=True)
clf.fit(X_train, y_train)
print(log_loss(y_val, clf.predict_proba(X_val), eps=1e-16))

0.423128091028227


### Pretrained

In [3]:
all_path = '/workspace/visiomel-2023/weights/pretrained/embeddings/all.pkl'
with open(all_path, 'rb') as f:
    df_all = pickle.load(f)

In [21]:
np.allclose(
    df_all.path.str.split('/').apply(lambda x: x[-2]).astype(int), 
    df_all.label.apply(lambda x: x.item()).astype(int))

False

In [25]:
df_all.label.apply(lambda x: x.item()).astype(int).value_counts()

0    1129
1     213
Name: label, dtype: int64

In [22]:
df_all.path.str.split('/').apply(lambda x: x[-2]).astype(int)

0       0
1       1
2       0
3       0
4       0
       ..
1337    1
1338    0
1339    0
1340    0
1341    0
Name: path, Length: 1342, dtype: int64

In [9]:
df_all['features'].iloc[0].shape

torch.Size([25, 4416])

In [6]:
X, y, _ = get_X_y_groups(df_all)

In [7]:
X.shape, y.shape

((1342, 22080), (1342,), (1342,))

In [10]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf = SVC(kernel='linear', C=1, probability=True)
    clf.fit(X_train, y_train)
    print(log_loss(y_val, clf.predict_proba(X_val), eps=1e-16))

0.4502000067627272
0.3465960954837472
0.3939392515828534
0.4786956632502816
0.3685360498213115


# SSUP + SVC + folds 0-4 out-of-fold features

**Note**: fold 3 model is selected by SSUP val metric (possible overfitting)

### Image features

In [5]:
all_pathes = [
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/val.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_1/embeddings/val.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_2/embeddings/val.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_3/embeddings/val.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_4/embeddings/val.pkl',
]
all_pathes_upsampled = [
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/val_aug.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_1/embeddings/val_aug.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_2/embeddings/val_aug.pkl',
    '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_3/embeddings/val_aug.pkl',
    # '/workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_4/embeddings/val_aug.pkl',
]

dfs = load_embeddings(all_pathes)
df_all = pd.concat(list(dfs.values()), axis=0)

dfs_upsampled = load_embeddings(all_pathes_upsampled)
df_upsampled_all = pd.concat(list(dfs_upsampled.values()), axis=0)

for path, path_upsampled in zip(all_pathes, all_pathes_upsampled):
    check_unique_pathes_same(dfs[path], dfs_upsampled[path_upsampled])

path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/val.pkl
	raw shape: (269, 3)
	deduplicated shape: (269, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_1/embeddings/val.pkl
	raw shape: (269, 3)
	deduplicated shape: (269, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_2/embeddings/val.pkl
	raw shape: (268, 3)
	deduplicated shape: (268, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_3/embeddings/val.pkl
	raw shape: (268, 3)
	deduplicated shape: (268, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_4/embeddings/val.pkl
	raw shape: (268, 3)
	deduplicated shape: (268, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_0/embeddings/val_aug.pkl
	raw shape: (1345, 3)
	deduplicated shape: (1180, 3)
path: /workspace/visiomel-2023/weights/val_ssup_patches_aug_fold_1/embeddings/val_aug.pkl
	raw shape: (1345, 3)
	deduplicated shape: (1179, 3)
path: /workspace/visiomel-202

In [6]:
df_upsampled_all['path'].nunique(), df_all['path'].nunique(), df_upsampled_all['path'].shape, df_all['path'].shape

(1074, 1342, (4715,), (1342,))

In [7]:
np.allclose(
    df_all.path.str.split('/').apply(lambda x: x[-2]).astype(int), 
    df_all.label.apply(lambda x: x.item()).astype(int)), \
np.allclose(
    df_upsampled_all.path.str.split('/').apply(lambda x: x[-2]).astype(int),
    df_upsampled_all.label.apply(lambda x: x.item()).astype(int))

(True, True)

In [10]:
path = df_upsampled_all.sample(1)['path'].iloc[0]

In [11]:
df_upsampled_all[df_upsampled_all['path'] == path]

Unnamed: 0,path,label,features
995,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(-0.8763), tensor(-0.2675), tensor(-1..."
996,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(-0.7644), tensor(-0.1519), tensor(-1..."
997,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(0.9339), tensor(0.2271), tensor(1.81..."


In [12]:
df_upsampled_all[df_upsampled_all['path'] == path]

Unnamed: 0,path,label,features
995,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(-0.8763), tensor(-0.2675), tensor(-1..."
996,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(-0.7644), tensor(-0.1519), tensor(-1..."
997,/workspace/data/images_page_4_shrink/0/v52wl5q...,tensor(0),"[[[tensor(0.9339), tensor(0.2271), tensor(1.81..."


In [13]:
df_all.label.apply(lambda x: x.item()).astype(int).value_counts(), \
df_upsampled_all.label.apply(lambda x: x.item()).astype(int).value_counts()

(0    1129
 1     213
 Name: label, dtype: int64,
 0    3968
 1     747
 Name: label, dtype: int64)

In [14]:
df_all.path.str.split('/').apply(lambda x: x[-2]).astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
263    1
264    1
265    1
266    1
267    1
Name: path, Length: 1342, dtype: int64

In [15]:
df_all['features'].iloc[0].shape

torch.Size([1, 4, 1024])

In [16]:
X_upsampled, y_upsampled, groups_upsampled = get_X_y_groups(df_upsampled_all)
X, y, groups = get_X_y_groups(df_all)

In [17]:
np.unique(groups).shape

(1342,)

In [18]:
groups[:10]

array(['/workspace/data/images_page_4_shrink/0/09lh5igz.png',
       '/workspace/data/images_page_4_shrink/0/0ezpjmed.png',
       '/workspace/data/images_page_4_shrink/0/1fzfhtcc.png',
       '/workspace/data/images_page_4_shrink/0/1njoi9iq.png',
       '/workspace/data/images_page_4_shrink/0/1ojip5y7.png',
       '/workspace/data/images_page_4_shrink/0/1qvd8x3f.png',
       '/workspace/data/images_page_4_shrink/0/1vig9enh.png',
       '/workspace/data/images_page_4_shrink/0/22it8737.png',
       '/workspace/data/images_page_4_shrink/0/24xxi0a4.png',
       '/workspace/data/images_page_4_shrink/0/2fee9717.png'],
      dtype=object)

In [19]:
X.shape, y.shape

((1342, 5120), (1342,))

In [20]:
SEED = 0

In [21]:
kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
clf = SVC(kernel='linear', C=0.01, probability=True, class_weight='balanced', random_state=SEED)
cross_validate(clf, X, y, cv=kfold.split(X, y, groups), scoring=('roc_auc', 'f1', 'neg_log_loss'), n_jobs=5)

{'fit_time': array([24.89175677, 24.64705157, 23.77119088, 24.51277971, 24.5971911 ]),
 'score_time': array([2.85961413, 2.93563032, 2.98725462, 2.9428854 , 2.92034864]),
 'test_roc_auc': array([0.75684297, 0.73963824, 0.68531395, 0.70487755, 0.75726928]),
 'test_f1': array([0.43396226, 0.44      , 0.28828829, 0.34862385, 0.42696629]),
 'test_neg_log_loss': array([-0.4044415 , -0.38590334, -0.41210887, -0.39244217, -0.37741521])}

In [22]:
kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
clf = SVC(kernel='linear', C=0.01, probability=True, class_weight='balanced', random_state=SEED)
cross_validate_upsampled(
    clf, 
    X, 
    X_upsampled,
    y, 
    y_upsampled,
    groups=groups, 
    groups_upsampled=groups_upsampled,
    cv=kfold.split(X, y, groups), 
    scoring=('roc_auc', 'f1', 'neg_log_loss'), 
    n_jobs=5
)

{'fit_time': array([311.44477391, 309.35402966, 299.99684381, 306.87008071,
        306.6387558 ]),
 'score_time': array([ 7.13330197,  8.21358943, 10.42922854,  9.74982119,  9.9295752 ]),
 'test_roc_auc': array([0.73441037, 0.72516796, 0.63600927, 0.74315703, 0.77907712]),
 'test_f1': array([0.40776699, 0.41818182, 0.30188679, 0.38461538, 0.45238095]),
 'test_neg_log_loss': array([-0.40407843, -0.39303692, -0.41443596, -0.38737243, -0.37379449])}

### Metadata features

### HPO

In [51]:
kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
clf = Pipeline(
    [
        ('svc', SVC(random_state=SEED, probability=True, class_weight='balanced'))
    ]
)
param_grid = [
    {'svc__C': [0.01, 0.1, 1, 10, 100], 'svc__kernel': ['linear']},
    {'svc__C': [0.01, 0.1, 1, 10, 100], 'svc__gamma': ['scale', 'auto', 0.001, 0.0001], 'svc__kernel': ['rbf', 'sigmoid']},
    {'svc__C': [0.01, 0.1, 1, 10, 100], 'svc__gamma': ['scale', 'auto', 0.001, 0.0001], 'svc__degree': [3, 5], 'svc__kernel': ['poly']},
]
search = GridSearchCV(
    clf, 
    param_grid, 
    refit='roc_auc', 
    scoring=('roc_auc', 'f1', 'neg_log_loss'), 
    cv=kfold.split(X, y, groups), 
    verbose=3, 
    n_jobs=5
)
search.fit(X, y, groups=groups)

Fitting 5 folds for each of 85 candidates, totalling 425 fits
[CV 2/5] END svc__C=0.01, svc__kernel=linear; f1: (test=0.327) neg_log_loss: (test=-0.420) roc_auc: (test=0.642) total time= 3.3min
[CV 3/5] END svc__C=0.01, svc__kernel=linear; f1: (test=0.287) neg_log_loss: (test=-0.420) roc_auc: (test=0.656) total time= 3.3min
[CV 4/5] END svc__C=0.01, svc__kernel=linear; f1: (test=0.358) neg_log_loss: (test=-0.410) roc_auc: (test=0.702) total time= 3.4min
[CV 5/5] END svc__C=0.01, svc__kernel=linear; f1: (test=0.352) neg_log_loss: (test=-0.397) roc_auc: (test=0.711) total time= 3.4min
[CV 1/5] END svc__C=0.01, svc__kernel=linear; f1: (test=0.507) neg_log_loss: (test=-0.393) roc_auc: (test=0.766) total time= 3.4min
[CV 1/5] END svc__C=0.1, svc__kernel=linear; f1: (test=0.516) neg_log_loss: (test=-0.392) roc_auc: (test=0.770) total time= 2.4min
[CV 2/5] END svc__C=0.1, svc__kernel=linear; f1: (test=0.342) neg_log_loss: (test=-0.425) roc_auc: (test=0.637) total time= 2.4min
[CV 3/5] END svc

In [52]:
search.best_params_

{'svc__C': 0.01, 'svc__kernel': 'linear'}