In [2]:
from pathlib import Path
import os

import numpy as np

import torch
from torch import nn
import timm
from skimage import io, transform

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, RegressorMixin

import pandas as pd

from sklearn import metrics

from tqdm.auto import trange, tqdm

import joblib

import warnings
warnings.filterwarnings("ignore")

dev = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
DATA_ROOT   = Path('./data')
WORKING_DIR = Path('./')
STUDY_PATH = WORKING_DIR / 'studies'

In [4]:
X = pd.read_csv(DATA_ROOT / 'X_train.csv')
y = pd.read_csv(DATA_ROOT / 'train_labels.csv')

In [5]:
def clean_tile(t):
    t = t.replace('\'', '_')
    return ''.join(e for e in t if e.isalnum() or e in '- _¢·v)(')

In [6]:
y['title'] = [clean_tile(x) for x in X['movie_title']]

In [7]:
[x for x in y['title'] if 'Before' in x]

['Before Sunrise',
 'The Life Before Her Eyes',
 'Before Sunset',
 'Me Before You',
 'Before I Go to Sleep',
 'Before Midnight']

In [8]:
class PostersDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, genres_csv, root_dir, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.titles = np.array([f.split('.')[0] for f in os.listdir(root_dir)])
        self.genres = genres_csv

    def __len__(self):
        return len(self.genres)

    def __getitem__(self, idx):
        if torch.is_tensor(idx) or isinstance(idx, np.ndarray):
            idx = idx.tolist()

        if not isinstance(idx, list):
            idx = [idx]

        out = []
        for id_ in idx:
            img_name = os.path.join(self.root_dir,
                                    f'{self.titles[id_]}.jpg')
            img = io.imread(img_name)

            if self.transform:
                img = self.transform(img)

            y = self.genres[self.genres['title'] == self.titles[id_]].drop(['imdb_score', 'title'], axis=1).to_numpy()
            
            if len(y) == 0:
                print("============")
                print(self.titles[idx])
                print("============")
            out.append((img, y, self.titles[id_]))
            
        return out

In [9]:
trasf = transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Resize((224, 224)),
                                transforms.RandomHorizontalFlip(),
                            ])

dataset = PostersDataset(y, DATA_ROOT / 'train_posters', transform=trasf)
train_set, val_set = \
    torch.utils.data.random_split(dataset, [int(len(dataset) * .7), int(np.ceil(len(dataset) * .3))])
                                             
train_dtld = DataLoader(train_set, batch_size=64)
test_dtld = DataLoader(val_set, batch_size=64)

In [10]:
genres = dataset.genres.drop(['imdb_score', 'title'], axis=1).columns

In [11]:
a = next(iter(train_dtld))

In [12]:
def imshow(img):
    plt.imshow(torch.movedim(img, 0, 2))

In [13]:
class ImgClass(nn.Module):
    def __init__(self):
        super(ImgClass, self).__init__()
        # self.input = nn.Conv2d(3, 3, 5, 1, 2)
        self.resnet = timm.create_model('resnet34', pretrained=True)
        self.output = nn.Linear(1000, 1)

    def forward(self, x):
        # x = self.input(x)
        x = self.resnet(x)
        return self.output(x).sigmoid()

In [14]:
n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=False)
loss_fn = nn.BCELoss()

In [14]:
N_EPOCHS = 10

train_losses = [[] for _ in range(19)]
train_accs = [[] for _ in range(19)]

test_losses = [[] for _ in range(19)]
test_accs = [[] for _ in range(19)]
trained_models = []

for i, (train_idx, test_idx) in tqdm(enumerate(cv.split(dataset)), total=n_splits):
    train_dtld = DataLoader(dataset[train_idx], batch_size=64, shuffle=False)
    test_dtld = DataLoader(dataset[test_idx], batch_size=64, shuffle=False)
    
    
    models = [ImgClass().to(dev) for _ in range(19)]
    optims = [torch.optim.Adam(models[i].parameters(), lr=1e-3) for i in range(19)]


    for epoch in trange(N_EPOCHS, leave=False):
        ## TRAIN
        accs = [[] for _ in range(19)]
        losses = [[] for _ in range(19)]

        for X, y, _ in tqdm(train_dtld, leave=False):
            for i, model in enumerate(models):
                X, y = X.to(dev), y.to(dev)
                y_pred = model(X)

                loss = loss_fn(y_pred, y[:, :, i].float())

                acc = metrics.accuracy_score(y[:, :, i].cpu().detach().numpy(), y_pred.cpu().detach().numpy() > 0.5)

                model.zero_grad()
                loss.backward()
                optims[i].step()

                losses[i].append(loss.item())
                accs[i].append(acc)
        for acc, loss in zip(accs, losses):
            train_accs[i].append(np.mean(acc))
            train_losses[i].append(np.mean(loss))

    trained_models.append(models)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

In [27]:
oofs = [[] for _ in range(19)]
titles = [[] for _ in range(19)]

accs = [[] for _ in range(19)]
losses = [[] for _ in range(19)]


for i, (train_idx, test_idx) in tqdm(enumerate(cv.split(dataset)), total=n_splits):
    train_dtld = DataLoader(dataset[train_idx], batch_size=64, shuffle=False)
    test_dtld = DataLoader(dataset[test_idx], batch_size=64, shuffle=False)

    ## TEST
    with torch.no_grad():

        for X, y, title in tqdm(test_dtld, leave=False):
            for j in range(19):
                X, y = torch.tensor(X).to(dev), torch.tensor(y).to(dev)
                y_pred = trained_models[i][j](X)

                oofs[j] += y_pred.cpu().detach().tolist()
                titles[j] += title

                loss = loss_fn(y_pred, y[:, :, j].float())
                acc = metrics.accuracy_score(y[:, :, j].cpu().detach().numpy(), y_pred.cpu().detach().numpy() > 0.5)
                losses[j].append(loss.item())
                accs[j].append(acc)
                

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [28]:
[np.mean(acc) for acc in accs]

[0.7427083333333333,
 0.7784375,
 0.9611458333333333,
 0.75375,
 0.8155208333333333,
 0.9790625,
 0.679375,
 0.9025,
 0.888125,
 0.9978125,
 0.9609375,
 0.89125,
 0.955,
 0.8982291666666667,
 0.7982291666666668,
 0.885625,
 0.7203125,
 0.974375,
 0.9878125]

In [33]:
cols = [f'p_img_{genre}' for genre in dataset.genres.drop(['title', 'imdb_score'], axis=1).columns]
oofs_df = pd.DataFrame(np.moveaxis(np.array(oofs).squeeze(), [1, 0], [0, 1]), columns=cols, index=titles[0])

In [34]:
oofs_df

Unnamed: 0,p_img_Action,p_img_Adventure,p_img_Animation,p_img_Comedy,p_img_Crime,p_img_Documentary,p_img_Drama,p_img_Family,p_img_Fantasy,p_img_Foreign,p_img_History,p_img_Horror,p_img_Music,p_img_Mystery,p_img_Romance,p_img_Science Fiction,p_img_Thriller,p_img_War,p_img_Western
Before I Go to Sleep,2.094530e-01,4.418359e-05,1.082134e-05,7.359636e-03,9.999714e-01,4.684079e-06,0.999999,2.394148e-12,5.917024e-08,1.687294e-27,4.076788e-08,2.371799e-09,1.448483e-03,7.992918e-01,8.820374e-07,1.129824e-03,2.347026e-03,9.220045e-08,1.312161e-07
The 5th Wave,6.965966e-01,5.001103e-01,6.815137e-06,1.361879e-04,3.492116e-10,3.178798e-03,0.201886,1.440349e-05,9.942008e-01,1.581902e-26,1.013191e-06,2.537784e-02,1.449423e-07,2.706215e-06,4.047122e-08,8.409939e-04,9.995763e-01,1.849366e-03,4.672410e-05
Shark Tale,1.859718e-01,1.688385e-01,6.149383e-03,9.999229e-01,1.301941e-08,2.205677e-02,0.000002,9.986709e-01,5.563495e-03,2.666445e-25,2.958058e-09,2.526262e-05,3.020452e-07,1.363889e-06,3.055294e-05,3.888128e-05,3.554102e-07,2.730362e-08,7.414762e-05
Cradle 2 the Grave,2.639533e-04,4.246844e-05,3.240485e-07,9.173861e-01,9.731997e-01,1.924754e-03,0.999958,1.593943e-12,8.482793e-09,3.946554e-26,2.649131e-07,2.181222e-07,2.602782e-08,2.564522e-08,7.940095e-05,3.389532e-11,7.318038e-02,6.986314e-10,1.841706e-07
Trainwreck,1.254285e-06,1.535301e-07,1.449123e-06,1.000000e+00,1.523498e-06,4.862097e-05,0.743038,1.338253e-06,2.075179e-07,4.758478e-32,7.064493e-04,3.704129e-07,1.300537e-03,6.072276e-08,9.451264e-01,5.827989e-08,1.118050e-11,1.178447e-09,3.149249e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tumbleweeds,1.967659e-05,1.735477e-03,1.782276e-06,9.999908e-01,1.264812e-05,4.137787e-07,0.001581,6.390149e-05,6.042860e-07,2.762005e-22,1.067728e-07,4.652611e-10,6.229203e-04,3.566561e-06,9.994804e-01,8.156609e-09,1.755155e-12,5.789553e-04,7.557089e-18
The 13th Warrior,5.092020e-02,1.705997e-03,3.548147e-09,5.720665e-07,9.017004e-07,6.577902e-09,0.997621,1.230822e-04,2.833206e-03,2.265297e-25,2.747308e-05,1.330786e-05,1.090782e-11,1.612884e-07,2.501760e-04,2.510036e-02,8.707715e-01,7.625503e-05,6.420456e-08
Madagascar Escape 2 Africa,2.518833e-11,8.738497e-01,7.048868e-03,5.127398e-01,6.144658e-05,7.786108e-08,0.000004,9.999998e-01,1.208938e-04,5.841734e-27,7.945858e-03,6.064271e-06,5.650156e-03,8.027505e-07,1.306850e-03,4.204479e-07,8.306402e-05,2.619223e-10,2.282622e-08
Go for It,1.897840e-07,8.381273e-01,2.515302e-07,5.750727e-02,5.452289e-06,4.684483e-09,0.840371,1.091252e-06,1.230336e-04,5.867333e-25,3.977308e-05,3.156600e-05,1.728630e-06,2.576533e-07,1.437964e-07,7.532151e-05,4.850437e-02,8.949078e-09,6.706995e-07


In [35]:
oofs_df.to_csv(DATA_ROOT / 'oofs_imgs.csv')

In [65]:
oofs_df

Unnamed: 0,p_img_Action,p_img_Adventure,p_img_Animation,p_img_Comedy,p_img_Crime,p_img_Documentary,p_img_Drama,p_img_Family,p_img_Fantasy,p_img_Foreign,p_img_History,p_img_Horror,p_img_Music,p_img_Mystery,p_img_Romance,p_img_Science Fiction,p_img_Thriller,p_img_War,p_img_Western
0,1.968843e-01,1.993689e-05,8.879242e-06,9.054789e-03,9.999534e-01,9.527656e-06,0.999999,3.928550e-12,7.440489e-08,2.541109e-25,1.573689e-08,7.733706e-10,1.583185e-03,7.296594e-01,6.669433e-07,2.736965e-03,2.987275e-03,2.787142e-08,7.378716e-08
1,6.013325e-01,4.630861e-01,2.305611e-06,1.676695e-04,3.830799e-10,6.327938e-03,0.047103,1.557525e-05,9.945295e-01,1.103660e-24,5.598473e-07,1.451819e-02,1.439517e-07,2.931666e-06,1.911390e-08,1.655559e-03,9.997661e-01,1.799241e-03,3.073577e-05
2,5.086008e-03,2.359726e-06,3.646325e-02,9.999930e-01,4.116538e-04,1.543011e-02,0.000971,9.992150e-01,2.298786e-04,1.733378e-22,3.382181e-08,3.788807e-04,1.496798e-07,7.858034e-08,1.240118e-03,3.798546e-06,1.768700e-06,1.678215e-07,2.762499e-05
3,1.388280e-05,7.610676e-03,1.890592e-05,9.839504e-01,5.517299e-02,2.430798e-02,0.999680,9.290728e-08,4.069162e-06,1.810702e-25,4.528638e-08,9.852495e-09,3.685317e-06,1.354828e-04,3.969548e-04,4.676254e-08,7.310975e-02,3.275436e-07,1.043764e-07
4,4.106988e-07,1.246830e-07,1.714886e-06,1.000000e+00,2.149369e-06,1.054680e-04,0.713643,1.720856e-06,1.639125e-07,4.058425e-30,4.863419e-04,2.505092e-07,1.598642e-03,4.327130e-08,9.677393e-01,9.219886e-08,1.264946e-11,1.546674e-09,2.415949e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2889,5.018525e-05,5.303190e-03,7.781357e-06,9.999988e-01,9.056796e-05,7.094293e-09,0.001459,2.257059e-03,1.345413e-12,4.102173e-25,8.928943e-07,3.801621e-09,1.029650e-04,3.512210e-07,8.833754e-01,8.810284e-13,1.341149e-13,1.225216e-04,4.536614e-10
2890,1.334224e-01,3.662193e-03,4.837270e-09,2.174876e-07,1.039623e-06,6.045682e-09,0.998338,1.980763e-04,5.515950e-03,1.247328e-24,4.765255e-05,3.339526e-05,2.202230e-12,4.303270e-07,1.307676e-04,1.300710e-02,8.608657e-01,5.588136e-05,1.619325e-08
2891,2.132113e-09,6.453414e-02,8.688737e-01,8.594906e-01,9.091417e-04,2.217546e-09,0.000001,9.998519e-01,5.083255e-03,3.361246e-28,1.703740e-04,4.822762e-07,1.554362e-05,3.985115e-08,7.243647e-05,5.861467e-08,1.659070e-06,5.427641e-09,4.751224e-09
2892,3.924342e-06,9.841229e-01,8.429642e-07,1.172597e-03,4.349604e-04,4.166642e-08,0.041631,6.436128e-06,4.451708e-02,1.369612e-27,1.602117e-04,1.174433e-05,5.635541e-07,2.219255e-09,1.611820e-06,2.430158e-05,3.058320e-03,1.213406e-08,1.150373e-10


In [19]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        try:
            y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        except AttributeError:
            y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [18]:
voting_model = VotingModel(trained_models)

NameError: name 'trained_models' is not defined

In [22]:
joblib.dump(voting_model, WORKING_DIR / 'models/imgmodel.pkl')

['models/imgmodel.pkl']

In [21]:
voting_model = joblib.load(WORKING_DIR / 'models/imgmodel.pkl')

In [25]:
trained_models = voting_model.estimators