In [4]:
import numpy as np
import cv2
import os
import torch
import spacy
import pandas as pd
import torchvision.transforms as T

from torchvision.io.video import read_video
from tqdm.notebook import tqdm
from PIL import Image
from torch.utils.data import DataLoader,Dataset
from collections import Counter
from IPython.display import clear_output

In [5]:
!python -m spacy download ru_core_news_lg

Collecting ru-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.6.0/ru_core_news_lg-3.6.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.6.0)
  Downloading pymorphy3-1.2.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.6.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt-ng>=0.6 (from pymorphy3>=1.0.0->ru-core-news-lg==3.6.0)
  Downloading docopt_ng-0.9.0-py3-none-any.whl (16 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.6.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp ./drive/MyDrive/rutube_hackathon_novosibirsk.zip ./rutube_hackathon_novosibirsk.zip

In [None]:
!zip -FF ./rutube_hackathon_novosibirsk.zip --out ./pleasework.zip
clear_output()

In [None]:
!unzip -qq ./pleasework.zip

In [8]:
spacy_ru = spacy.load("ru_core_news_lg")

text = "Сергей Чуприн любит hi! ОН - КАРТОфЕЛЕЛЮБ!!!"
[token.text.lower() for token in spacy_ru.tokenizer(text)]

['сергей',
 'чуприн',
 'любит',
 'hi',
 '!',
 'он',
 '-',
 'картофелелюб',
 '!',
 '!',
 '!']

In [None]:
train_csv = pd.read_csv("./rutube_hackathon_novosibirsk/train/train.csv")

In [None]:
train_csv["len"] = train_csv["description"].apply(lambda x: len(x.split()))

In [None]:
train_csv.len.max()

321

In [None]:
train_csv

Unnamed: 0,video_name,stt_name,category_name,title,description,len
0,0.mp4,0.txt,Развлечения,Правильная цена I #3,С вами Макс Климток и это шоу Правильная цена!...,53
1,1.mp4,1.txt,Спорт/Игры,Три лошадиные силы | Выпуск №2,В этом новом выпуске нас ждут не менее новые и...,46
2,2.mp4,2.txt,Блоги,Хашлама | Выпуск 4 | Силиконовый ПРЕСС Давы | ...,"Привет, это Султан и Авет! Мы опять хаваем вку...",58
3,3.mp4,3.txt,Путешествия,Прогулка по стране - Владивосток,Прогулка по Владивостоку. Самому большому горо...,42
4,4.mp4,4.txt,Искусство,Артмеханика. Выпуск 3. Татуировки + Mika Vino,Были ли татуировки на теле Николая II? Почему ...,62
...,...,...,...,...,...,...
495,495.mp4,495.txt,Юмор,Тот самый мент I Выпуск 3I Мента забирают в армию,В начале рабочего дня Максиму Константиновичу ...,56
496,496.mp4,496.txt,Лайфстайл,Сколько Стоит День? Выпуск 7. Сколько тратит в...,"Сегодня мы провели целый день с певицей, актри...",68
497,497.mp4,497.txt,Развлечения,Правильная цена I #4,С вами Макс Климток и это шоу Правильная цена!...,54
498,498.mp4,498.txt,Развлечения,Тру ДЕТЕКТОР I #6,Вы смотрите шоу с детектором лжи «Тру ДЕТЕКТОР...,93


In [9]:
class Vocabulary:
    def __init__(self,freq_threshold):
        self.itos = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>"}

        self.stoi = {v:k for k,v in self.itos.items()}

        self.freq_threshold = freq_threshold

    def __len__(self): return len(self.itos)

    @staticmethod
    def tokenize(text):
        return [token.text.lower() for token in spacy_ru.tokenizer(text)]

    @staticmethod
    def del_timestamps(texts):
        output = ""
        for text in texts.split("\n"):
          output += "".join(text.split("]  ")[1:]) + " "
        return " ".join(text)

    def build_vocab(self, sentence_list, texts_list):
      frequencies = Counter()
      idx = 4

      for sentence in tqdm(sentence_list):
          for word in self.tokenize(sentence):
              frequencies[word] += 1

              if frequencies[word] == self.freq_threshold:
                  self.stoi[word] = idx
                  self.itos[idx] = word
                  idx += 1

      for text in tqdm(texts_list):
          text = self.del_timestamps(text)
          for word in self.tokenize(text):
              frequencies[word] += 1

              if frequencies[word] == self.freq_threshold:
                  self.stoi[word] = idx
                  self.itos[idx] = word
                  idx += 1

    def numericalize(self,text):
        tokenized_text = self.tokenize(text)
        return [ self.stoi[token] if token in self.stoi else self.stoi["<UNK>"] for token in tokenized_text ]


In [10]:
class VideoDataset(Dataset):
    def __init__(self, root_dir, captions, video_files, texts, transform=None,freq_threshold=1, build=True):
        self.root_dir = root_dir
        self.transform = transform

        self.imgs = video_files
        self.captions = captions

        text = []
        for file in tqdm(text):
            with open(self.root_dir + "/test_stt/" + file) as cf:
                    text.append(" ".join(cf.readlines()))

        self.vocab = Vocabulary(freq_threshold)
        if build:
            self.vocab.build_vocab(self.captions, texts)


    def __len__(self):
        return len(self.imgs)

    def _read_video(self, path, frames_num=25, window=30):
        frames = []
        cap = cv2.VideoCapture(self.root_dir + "/test_video/" + path)

        fps = int(cap.get(cv2.CAP_PROP_FPS))

        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        N = length // (frames_num)

        current_frame = 1
        for i in range(length):
            ret, frame = cap.read(current_frame)
            if ret and i == current_frame and len(frames) < frames_num:
                size = 226, 226
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, size)
                frames.append(frame)
                current_frame += N


        cap.release()

        return np.array(frames)

    def __getitem__(self,idx):
        caption = self.captions[idx]
        img = self.imgs[idx]

        img = np.rollaxis(self._read_video(self.imgs[idx]), 3, 0)
        img = np.array(np.array(img) / 255, dtype=np.float32)


        caption_vec = []
        caption_vec += [self.vocab.stoi["<SOS>"]]
        caption_vec += self.vocab.numericalize(caption)
        caption_vec += [self.vocab.stoi["<EOS>"]]
        if len(caption_vec) < 300:
            for i in range(300 - len(caption_vec)):
                caption_vec.append(0)

        return torch.tensor(img), torch.tensor(caption_vec[:300])

In [11]:

data_location =  "./rutube_hackathon_novosibirsk/train"
BATCH_SIZE = 10

transforms = T.Compose([
    T.Resize(226),
    T.RandomCrop(224),
    T.ToTensor(),
    T.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))
])


dataset =  VideoDataset(
    root_dir=data_location,
    captions=train_csv.description.tolist(),
    texts=train_csv.stt_name.tolist(),
    video_files=train_csv.video_name,
    transform=transforms
)

data_loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # batch_first=False
)

vocab_size = len(dataset.vocab)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

NameError: ignored

### 3) Defining the Model Architecture

Model is seq2seq model. In the **encoder** pretrained ResNet model is used to extract the features. Decoder, is the implementation of the Bahdanau Attention Decoder. In the decoder model **LSTM cell**.

In [12]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision.models as models
import torchvision.models.video as video

from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T

In [13]:
class VideoEncoderCNN(nn.Module):
    def __init__(self):
        super(VideoEncoderCNN, self).__init__()
        self.swin_t = video.swin3d_s(pretrained=True)
        for param in self.swin_t.parameters():
            param.requires_grad_(False)

        modules = list(self.swin_t.children())[:-2]
        self.swin_t = nn.Sequential(*modules)


    def forward(self, images):
        features = self.swin_t(images)
        features = features.view(features.size(0), -1, features.size(-1))

        return features


In [14]:
class VideoAttention(nn.Module):
    def __init__(self, encoder_dim,decoder_dim,attention_dim):
        super(VideoAttention, self).__init__()

        self.attention_dim = attention_dim

        self.W = nn.Linear(decoder_dim,attention_dim)
        self.U = nn.Linear(encoder_dim,attention_dim)

        self.A = nn.Linear(attention_dim,1)




    def forward(self, features, hidden_state):
        u_hs = self.U(features)
        w_ah = self.W(hidden_state)

        combined_states = torch.tanh(u_hs + w_ah.unsqueeze(1))

        attention_scores = self.A(combined_states)
        attention_scores = attention_scores.squeeze(2)


        alpha = F.softmax(attention_scores,dim=1)

        attention_weights = features * alpha.unsqueeze(2)
        attention_weights = attention_weights.sum(dim=1)

        return alpha,attention_weights


In [15]:
class VideoDecoderRNN(nn.Module):
    def __init__(self,embed_size, vocab_size, attention_dim,encoder_dim,decoder_dim,drop_prob=0.3):
        super().__init__()

        self.vocab_size = vocab_size
        self.attention_dim = attention_dim
        self.decoder_dim = decoder_dim

        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.attention = VideoAttention(encoder_dim,decoder_dim,attention_dim)


        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        self.lstm_cell = nn.LSTMCell(embed_size+encoder_dim,decoder_dim,bias=True)
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)


        self.fcn = nn.Linear(decoder_dim,vocab_size)
        self.drop = nn.Dropout(drop_prob)



    def forward(self, features, captions):
        embeds = self.embedding(captions)

        h, c = self.init_hidden_state(features)

        seq_length = len(captions[0])-1
        batch_size = captions.size(0)
        num_features = features.size(1)

        preds = torch.zeros(batch_size, seq_length, self.vocab_size).to(device)
        alphas = torch.zeros(batch_size, seq_length,num_features).to(device)

        for s in range(seq_length):
            alpha,context = self.attention(features, h)
            lstm_input = torch.cat((embeds[:, s], context), dim=1)
            h, c = self.lstm_cell(lstm_input, (h, c))

            output = self.fcn(self.drop(h))

            preds[:,s] = output
            alphas[:,s] = alpha


        return preds, alphas

    def generate_caption(self,features, max_len=300 ,vocab=None):

        batch_size = features.size(0)
        h, c = self.init_hidden_state(features)

        alphas = []
        word = torch.tensor(vocab.stoi['<SOS>']).view(1,-1).to(device)
        embeds = self.embedding(word)


        captions = []

        for i in range(max_len):
            alpha,context = self.attention(features, h)

            alphas.append(alpha.cpu().detach().numpy())

            lstm_input = torch.cat((embeds[:, 0], context), dim=1)
            h, c = self.lstm_cell(lstm_input, (h, c))
            output = self.fcn(self.drop(h))
            output = output.view(batch_size,-1)


            predicted_word_idx = output.argmax(dim=1)

            captions.append(predicted_word_idx.item())

            if vocab.itos[predicted_word_idx.item()] == "<EOS>":
                break

            embeds = self.embedding(predicted_word_idx.unsqueeze(0))

        return [vocab.itos[idx] for idx in captions],alphas


    def init_hidden_state(self, encoder_out):
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)
        return h, c


In [16]:
class VideoEncoderDecoder(nn.Module):
    def __init__(self,embed_size, vocab_size, attention_dim,encoder_dim,decoder_dim,drop_prob=0.3):
        super().__init__()
        self.encoder = VideoEncoderCNN()
        self.decoder = VideoDecoderRNN(
            embed_size=embed_size,
            vocab_size = vocab_size,
            attention_dim=attention_dim,
            encoder_dim=encoder_dim,
            decoder_dim=decoder_dim
        )

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs


### 4) Setting Hypperparameter and Init the model

In [17]:
#Hyperparams
embed_size=300
vocab_size = len(dataset.vocab)
attention_dim=256
encoder_dim=768
decoder_dim=512
learning_rate = 3e-4


NameError: ignored

In [None]:
#init model
model = VideoEncoderDecoder(
    embed_size=embed_size,
    vocab_size=vocab_size,
    attention_dim=attention_dim,
    encoder_dim=encoder_dim,
    decoder_dim=decoder_dim
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [None]:
# !pip install av

In [None]:
#helper function to save the model
def save_model(model,num_epochs):
    model_state = {
        'num_epochs':num_epochs,
        'embed_size':embed_size,
        'vocab_size':len(dataset.vocab),
        'attention_dim':attention_dim,
        'encoder_dim':encoder_dim,
        'decoder_dim':decoder_dim,
        'state_dict':model.state_dict()
    }

    torch.save(model_state,'./drive/MyDrive/attention_model_state.pth')

## 5) Training Job from above configs

In [18]:
torch.cuda.empty_cache()

In [19]:
import nltk
from nltk.translate import meteor
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
def score(text, text_sum):
    text = text.replace("<PAD>", "")
    if isinstance(text_sum, str):
        return round(meteor([word_tokenize(text)],word_tokenize(text_sum)), 4)
    else:
        return 0

In [None]:
num_epochs = 25
print_every = 10

for epoch in tqdm(range(1, num_epochs + 1)):
    losses = []
    print(f"Epoch: {epoch}")
    try:
        for idx, (image, captions) in tqdm(enumerate(data_loader), total=len(data_loader)):
            # print(image.shape[2])
            # if image.shape[2] < 50:
            #     continue
            image, captions = image.to(device),captions.to(device)

            optimizer.zero_grad()

            outputs,attentions = model(image, captions)

            loss = criterion(outputs.view(-1, vocab_size), captions[:, 1:].reshape(-1))

            loss.backward()

            optimizer.step()
            losses.append(loss.item())

            if idx % print_every == 0:
                model.eval()
                with torch.no_grad():
                    dataiter = iter(data_loader)
                    img,targ = next(dataiter)
                    features = model.encoder(img[0:1].to(device))
                    caps, alphas = model.decoder.generate_caption(features, vocab=dataset.vocab)
                    caption = ' '.join(caps)
                    target = " ".join([dataset.vocab.itos[i] for i in targ[0:1][0][1:].tolist()])
                    print("true:", target)
                    print("pred:", caption)
                    print("loss:", loss.item(), "score:", score(target, caption))


                model.train()
    except Exception as ex:
        print("Err:", ex)
        raise ex
    if len(losses) != 0:
        print("Epoch loss: {:.5f}".format(sum(losses) / len(losses)))
    else:
        print(losses)
    #save the latest model
    save_model(model,epoch)
    # clear_output(wait=True)

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 8.915692329406738 score: 0.0263
loss: 8.47883129119873 score: 0.0227
loss: 7.248097896575928 score: 0.0176
loss: 7.123484134674072 score: 0.0221
loss: 7.134592533111572 score: 0.0329
Epoch loss: 7.71725
Epoch: 2


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 6.8723602294921875 score: 0.0586
loss: 6.755800724029541 score: 0.0215
loss: 7.180953502655029 score: 0.0404
loss: 6.811089515686035 score: 0.0333
loss: 6.925175666809082 score: 0.0339
Epoch loss: 6.94436
Epoch: 3


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 6.799895286560059 score: 0.0
loss: 6.690213680267334 score: 0.035
loss: 6.404407978057861 score: 0.0714
loss: 6.941572189331055 score: 0.0514
loss: 6.587986469268799 score: 0.0461
Epoch loss: 6.66533
Epoch: 4


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 6.638942718505859 score: 0.0668
loss: 6.679940223693848 score: 0.0705
loss: 6.0935235023498535 score: 0.0299
loss: 5.891200065612793 score: 0.1907
loss: 5.950171947479248 score: 0.0146
Epoch loss: 6.36518
Epoch: 5


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 6.25126314163208 score: 0.0626
loss: 5.922518253326416 score: 0.1466
loss: 5.785062313079834 score: 0.0868
loss: 6.2370781898498535 score: 0.0484
loss: 5.428935527801514 score: 0.0916
Epoch loss: 6.06080
Epoch: 6


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 5.95467472076416 score: 0.1083
loss: 5.955918312072754 score: 0.0415
loss: 5.710024356842041 score: 0.1817
loss: 5.569047927856445 score: 0.0738
loss: 5.560596942901611 score: 0.2745
Epoch loss: 5.78921
Epoch: 7


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 5.892467975616455 score: 0.0702
loss: 5.5397467613220215 score: 0.2647
loss: 5.37299108505249 score: 0.1006
loss: 5.445064544677734 score: 0.3061
loss: 5.559462547302246 score: 0.095
Epoch loss: 5.50988
Epoch: 8


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 5.931407451629639 score: 0.092
loss: 5.735106468200684 score: 0.0357
loss: 4.981560230255127 score: 0.2068
loss: 5.473208427429199 score: 0.1597
loss: 4.724553108215332 score: 0.0064
Epoch loss: 5.25738
Epoch: 9


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 5.153927326202393 score: 0.0727
loss: 5.4027791023254395 score: 0.0525
loss: 4.786442756652832 score: 0.1953
loss: 5.465935230255127 score: 0.1197
loss: 4.611083030700684 score: 0.0737
Epoch loss: 5.00303
Epoch: 10


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 4.671930313110352 score: 0.0754
loss: 4.525778770446777 score: 0.1134
loss: 4.630340576171875 score: 0.1006
loss: 4.911326885223389 score: 0.1403
loss: 4.17954158782959 score: 0.0734
Epoch loss: 4.77379
Epoch: 11


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 4.671368598937988 score: 0.0915
loss: 4.824629783630371 score: 0.2686
loss: 4.054232597351074 score: 0.1709
loss: 4.467138767242432 score: 0.1411
loss: 4.65294075012207 score: 0.1559
Epoch loss: 4.55529
Epoch: 12


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 4.7070698738098145 score: 0.2637
loss: 4.263211727142334 score: 0.1122
loss: 4.448678016662598 score: 0.1097
loss: 3.7467827796936035 score: 0.1396
loss: 4.59494161605835 score: 0.5906
Epoch loss: 4.31986
Epoch: 13


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 3.1243600845336914 score: 0.096
loss: 4.167882919311523 score: 0.0849
loss: 4.1580810546875 score: 0.3283
loss: 3.797739267349243 score: 0.3159
loss: 4.178073406219482 score: 0.405
Epoch loss: 4.13995
Epoch: 14


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 4.586398124694824 score: 0.2958
loss: 3.7238595485687256 score: 0.3034
loss: 3.8958935737609863 score: 0.1517
loss: 4.710023403167725 score: 0.3202
loss: 3.929929256439209 score: 0.0984
Epoch loss: 3.90703
Epoch: 15


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 3.4666404724121094 score: 0.2181
loss: 3.590632200241089 score: 0.2459
loss: 3.414557695388794 score: 0.1622
loss: 4.01615571975708 score: 0.1363
loss: 4.424407005310059 score: 0.0455
Epoch loss: 3.72151
Epoch: 16


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 3.253783941268921 score: 0.0712
loss: 3.0496017932891846 score: 0.5472
loss: 3.980968952178955 score: 0.1852
loss: 3.9288644790649414 score: 0.0455
loss: 3.392162561416626 score: 0.23
Epoch loss: 3.57036
Epoch: 17


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 3.2838175296783447 score: 0.1983
loss: 3.1512959003448486 score: 0.0996
loss: 3.2098710536956787 score: 0.324
loss: 3.754456043243408 score: 0.091
loss: 3.884165048599243 score: 0.2066
Epoch loss: 3.36377
Epoch: 18


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.4741642475128174 score: 0.1738
loss: 3.038079023361206 score: 0.3497
loss: 3.3952219486236572 score: 0.2654
loss: 3.9681546688079834 score: 0.1641
loss: 3.0849618911743164 score: 0.339
Epoch loss: 3.20564
Epoch: 19


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 3.7605862617492676 score: 0.1783
loss: 3.1154775619506836 score: 0.2251
loss: 3.4027419090270996 score: 0.3025
loss: 2.947638750076294 score: 0.2038
loss: 3.5853614807128906 score: 0.0543
Epoch loss: 3.05595
Epoch: 20


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.4285850524902344 score: 0.161
loss: 2.9569919109344482 score: 0.7833
loss: 3.0592637062072754 score: 0.1127
loss: 3.1344974040985107 score: 0.2867
loss: 3.3452725410461426 score: 0.0704
Epoch loss: 2.91555
Epoch: 21


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.4120092391967773 score: 0.0693
loss: 2.3151001930236816 score: 0.0994
loss: 2.563652515411377 score: 0.0879
loss: 2.2399256229400635 score: 0.1346
loss: 2.7522597312927246 score: 0.1972
Epoch loss: 2.76233
Epoch: 22


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.526745080947876 score: 0.6283
loss: 1.8728582859039307 score: 0.1602
loss: 2.629197597503662 score: 0.2587
loss: 2.0636894702911377 score: 0.1508
loss: 2.0626513957977295 score: 0.0414
Epoch loss: 2.62310
Epoch: 23


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.7467591762542725 score: 0.467
loss: 2.439303159713745 score: 0.086
loss: 1.9384729862213135 score: 0.3129
loss: 2.513373374938965 score: 0.0873
loss: 2.6745805740356445 score: 0.2657
Epoch loss: 2.47120
Epoch: 24


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 2.4338889122009277 score: 0.2219
loss: 1.7518889904022217 score: 0.3628
loss: 2.1144773960113525 score: 0.7563
loss: 2.2498204708099365 score: 0.7166
loss: 2.5680336952209473 score: 0.2869
Epoch loss: 2.35807
Epoch: 25


  0%|          | 0/50 [00:00<?, ?it/s]

loss: 1.9401556253433228 score: 0.1478
loss: 2.2931458950042725 score: 0.6116
loss: 2.0408644676208496 score: 0.1866
loss: 2.161356210708618 score: 0.1696
loss: 1.8561944961547852 score: 0.5481
Epoch loss: 2.21570


In [49]:
checkpoint = torch.load("./drive/MyDrive/video_model.pth")

In [50]:
model = VideoEncoderDecoder(
    embed_size=checkpoint["embed_size"],
    vocab_size =checkpoint["vocab_size"],
    attention_dim=checkpoint["attention_dim"],
    encoder_dim=checkpoint["encoder_dim"],
    decoder_dim=checkpoint["decoder_dim"]
).to(device)
model.load_state_dict(checkpoint["state_dict"])



<All keys matched successfully>

In [21]:
!wget https://lodmedia.hb.bizmrg.com/case_files/1026506/test_dataset_test.zip

--2023-10-01 02:59:53--  https://lodmedia.hb.bizmrg.com/case_files/1026506/test_dataset_test.zip
Resolving lodmedia.hb.bizmrg.com (lodmedia.hb.bizmrg.com)... 95.163.53.117
Connecting to lodmedia.hb.bizmrg.com (lodmedia.hb.bizmrg.com)|95.163.53.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1696238228 (1.6G) [application/zip]
Saving to: ‘test_dataset_test.zip’


2023-10-01 03:00:44 (32.5 MB/s) - ‘test_dataset_test.zip’ saved [1696238228/1696238228]



In [51]:
import pickle

In [22]:
!unzip -qq test_dataset_test.zip

In [52]:
train_csv = pd.read_csv("./test/test.csv")

In [53]:
sample_submission = pd.read_csv("./sample_submission.csv")

In [54]:
sample_submission.head()

Unnamed: 0,video_name,generated_description
0,0.mp4,Полученное описание
1,1.mp4,Полученное описание
2,2.mp4,Полученное описание
3,3.mp4,Полученное описание
4,4.mp4,Полученное описание


In [55]:

data_location =  "./test/"
BATCH_SIZE = 1

transforms = T.Compose([
    T.Resize(226),
    T.RandomCrop(224),
    T.ToTensor(),
    T.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))
])


dataset =  VideoDataset(
    root_dir=data_location,
    captions=train_csv.video_name.tolist(),
    texts=train_csv.stt_name.tolist(),
    video_files=train_csv.video_name,
    transform=transforms,
    build=False
)

dataset.vocab = pickle.load(open("./vocab.pkl", "rb"))


data_loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    # batch_first=False
)

vocab_size = len(dataset.vocab)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

0it [00:00, ?it/s]

device(type='cuda', index=0)

In [56]:
import string

In [57]:
scores = []
model.eval()

for idx, (image, captions) in tqdm(enumerate(data_loader), total=len(data_loader)):
    img, targ = image.to(device),captions.to(device)
    with torch.no_grad():
        features = model.encoder(img[0:1].to(device))
        caps, alphas = model.decoder.generate_caption(features, vocab=dataset.vocab)
        caption = ' '.join(caps)
        caption = caption.replace("<EOS>", "")
        for i in string.punctuation:
            if i not in ["/", "\\", "-"]:
                caption = caption.replace(f" {i}", i)
        # print(targ, caption)
        sample_submission.loc[sample_submission.video_name == f"{idx}.mp4", "generated_description"] = [caption]
        print(caption)
        # target = " ".join([dataset.vocab.itos[i] for i in targ[0:1][0][1:].tolist()])

        # scores.append(score(target, caption))
        # print("true:", target)
        # print("pred:", caption)
        # print("loss:", loss.item(), "score:", score(target, caption))


  0%|          | 0/100 [00:00<?, ?it/s]

добро пожаловать на канал « бюро добрых дел »! его глава – это я, лариса брохман. а миссия нашего бюро – искать самые добрые и хорошие новости по всему миру, чтобы рассказать их вам. в этом выпуске:   - /   - /   - /   - /   - /   ссылка на email   в этом выпуске:   - /   - /   - /   в этом выпуске - /   гость - специальный гость шоу « пацанский клининг », чтобы рассказать их вам. в этом выпуске: -   гость - ксения егорова в рамках рубрики « добрый репортаж », чтобы поднять вам настроение сделать ваш день чуточку радостнее и светлее.   пишите свои добрые истории нам на почту: email   по вопросам рекламы пишите на email 
знаете, как в котором комики играют и не менее от шефа. 
вы смотрите шоу с детектором лжи « тру детектор »! шоу, в котором кристи крайм и её молчаливый друг приглашают в гости известных людей и задают им неудобные вопросы под пристальным взором полиграфа. в этом выпуске, что скрывают звёзды и их ближайшее окружение? какие вопросы они зададут друг другу? и кто окажется с

In [58]:
sample_submission.head()

Unnamed: 0,video_name,generated_description
0,0.mp4,добро пожаловать на канал « бюро добрых дел »!...
1,1.mp4,"знаете, как в котором комики играют и не менее..."
2,2.mp4,вы смотрите шоу с детектором лжи « тру детекто...
3,3.mp4,в этом выпуске: антон протеинов - / - / в ...
4,4.mp4,в этом выпуске ведущая алиса и дизайнер наталь...
