In [0]:
!wget http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt.tar.gz
!tar -xzf ru_conversational_cased_L-12_H-768_A-12_pt.tar.gz
!pip install youtube_transcript_api deeppavlov pytorch_pretrained_bert transformers

In [185]:
import sys
import random
import numpy as np
import pandas as pd
import torch 
import nltk
import pickle
import torch.nn as nn
import torch.nn.functional as F

from pathlib import Path
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from youtube_transcript_api import YouTubeTranscriptApi
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from transformers import AutoModel
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

from google.colab import drive
drive.mount('/content/drive')
sys.path.append('/content/drive/My Drive/youtube_timestamps')

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

nltk.download('punkt')
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
BERT_MODEL_PATH = '/content/ru_conversational_cased_L-12_H-768_A-12_pt'
ROOT_DATA_PATH = Path('/content/drive/My Drive/youtube_timestamps/data')
TRAIN_HOLDOUT_RATIO = 0.8      # split dataframe to involved and holdout parts
TRAIN_VALIDATE_RATIO = 0.8     # split involved to train and validate parts
CONTEXT_RADIUS = 3
CONTEXT_SIZE = CONTEXT_RADIUS * 2 + 1
BATCH_SIZE = 5

In [187]:
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)
bert_model = BertModel.from_pretrained(BERT_MODEL_PATH)
bert_model.eval()
bert_model.to('cuda')
print('RuConversationalBERT loaded to device')

RuConversationalBERT loaded to device


In [0]:
def eliminate_stop_words(sentence):
    tokens = [word 
              for word in nltk.word_tokenize(sentence) 
              if word not in stopwords.words('russian')]
    return ' '.join(tokens)

def get_embedding(sentence, nlp_model, nlp_tokenizer):
    if sentence is None or sentence == '':
        return torch.zeros(1, nlp_model.config.hidden_size)
    tokens = nlp_tokenizer.tokenize(sentence)
    indexed_tokens = nlp_tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
    with torch.no_grad():
        outputs = nlp_model.forward(tokens_tensor)[1].to('cpu')
    torch.cuda.empty_cache()
    return outputs

In [189]:
df = pd.read_csv(ROOT_DATA_PATH / 'zhiza_timestamps.csv')
df['pause'] = df['time_start'].shift(-1) - (df['time_start'] + df['time_duration'])
df['text'] = df['text'].apply(eliminate_stop_words)
df.drop('desc', axis=1, inplace=True)
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,video_id,sentence_num,text,time_start,time_duration,is_timestamp,pause
0,BCIurE0kubE,0,изучила страну 1000 городах,4.13,6.67,0,-3.03
1,BCIurE0kubE,1,побывала пил чая поезде,7.77,5.1,0,-2.07
2,BCIurE0kubE,2,подстаканника прочувствовал,10.8,4.41,0,-2.34
3,BCIurE0kubE,3,прелесть железных дорог плацкартом жизнь,12.87,5.669,0,-3.329
4,BCIurE0kubE,4,бурлит здорово общается,15.21,6.3,0,-2.971


In [0]:
holdout_split_idx = int(df['video_id'].nunique() * TRAIN_HOLDOUT_RATIO)
validate_split_idx = int(holdout_split_idx * TRAIN_VALIDATE_RATIO)
video_ids = df['video_id'].unique()

train_video_ids = video_ids[:validate_split_idx]
validate_video_ids = video_ids[validate_split_idx:holdout_split_idx]
test_video_ids = video_ids[holdout_split_idx:]

train_df = df[df['video_id'].isin(train_video_ids)]
validate_df = df[df['video_id'].isin(validate_video_ids)]
test_df = df[df['video_id'].isin(test_video_ids)]

In [191]:
train_df.shape, validate_df.shape, test_df.shape

((17584, 7), (3724, 7), (4033, 7))

In [0]:
def process_transcripts_dataframe(dataframe, nlp_model, nlp_tokenizer, video_ids):
    """Gets original (dataframe) and (video_ids) array, then 
    process sentences with (nlp_model), then 
    separate different videos to different tensor datasets.

    Returns list of ``torch.utils.data.TensorDataset`` of float32.
    """

    def convert_dataframe_to_tensor(dataframe):
        first_part = torch.tensor(dataframe.loc[:, dataframe.columns != 'text'].values, dtype=torch.float)
        embeddings = []
        for sentence in dataframe['text'].values:
            embeddings.append(get_embedding(sentence, nlp_model, nlp_tokenizer).float())
        second_part = torch.cat(embeddings, dim=0)
        return torch.cat((first_part, second_part), dim=1)

    datasets = []
    for video_id in video_ids:
        current_df = dataframe[dataframe['video_id'] == video_id]
        X_df = current_df.drop(['video_id', 'is_timestamp'], axis=1)
        y_df = current_df['is_timestamp']
        X_tensor = convert_dataframe_to_tensor(X_df)
        y_tensor = torch.tensor(y_df.values, dtype=torch.float)
        datasets.append(torch.utils.data.TensorDataset(X_tensor, y_tensor))

    return datasets

In [193]:
%%time
try:
    train_tensor_datasets = torch.load(ROOT_DATA_PATH / 'train_tensor_datasets.pt')
    validate_tensor_datasets = torch.load(ROOT_DATA_PATH / 'validate_tensor_datasets.pt')
    test_tensor_datasets = torch.load(ROOT_DATA_PATH / 'test_tensor_datasets.pt')
except IOError:
    train_tensor_datasets = process_transcripts_dataframe(train_df, bert_model, bert_tokenizer, train_video_ids)
    validate_tensor_datasets = process_transcripts_dataframe(validate_df, bert_model, bert_tokenizer, validate_video_ids)
    test_tensor_datasets = process_transcripts_dataframe(test_df, bert_model, bert_tokenizer, test_video_ids)
    torch.save(train_tensor_datasets, ROOT_DATA_PATH / 'train_tensor_datasets.pt')
    torch.save(validate_tensor_datasets, ROOT_DATA_PATH / 'validate_tensor_datasets.pt')
    torch.save(test_tensor_datasets, ROOT_DATA_PATH / 'test_tensor_datasets.pt')

CPU times: user 4.72 ms, sys: 55.4 ms, total: 60.1 ms
Wall time: 112 ms


In [0]:
class ContextSampler(torch.utils.data.BatchSampler):
    def __init__(self, sampler, context_size, stride):
        super().__init__(sampler, context_size, drop_last=False)
        assert stride <= context_size
        self.context_size = context_size
        self.stride = stride

    def __iter__(self):
        context = []
        for idx in self.sampler:
            context.append(idx)
            if len(context) == self.context_size:
                yield context
                context = context[self.stride:]
    
    def __len__(self):
        return len(self.sampler) - self.batch_size + 1

In [0]:
class ContextTensorsDataset(torch.utils.data.TensorDataset):
    def __init__(self, context_loader, context_size):
        X, y = [], []
        for entry in context_loader:
            X.append(entry[0].unsqueeze(0))
            y.append(entry[1][context_size // 2].unsqueeze(0))
        X_tensor = torch.cat(X, 0)
        y_tensor = torch.cat(y, 0)
        super().__init__(X_tensor, y_tensor)

In [0]:
def get_contexts_based_data_loader(dataset, context_size, batch_size, stride):
    """Splits (dataset) into contexts-tensors of (context_size) with (stride), 
    then produce batches of (batch_size), based on randomly permuted contexts.

    Returns ``torch.utils.data.DataLoader`` generator object.
    """
    ctx_sampler = ContextSampler(torch.utils.data.SequentialSampler(dataset), context_size, stride)
    ctx_loader = torch.utils.data.DataLoader(dataset, batch_sampler=ctx_sampler)
    ctx_dataset = ContextTensorsDataset(ctx_loader, context_size)
    random_ctx_sampler = torch.utils.data.RandomSampler(ctx_dataset)
    batch_sampler = torch.utils.data.BatchSampler(random_ctx_sampler, batch_size, drop_last=False)
    data_loader = torch.utils.data.DataLoader(ctx_dataset, batch_sampler=batch_sampler)

    return data_loader

In [0]:
class Model(nn.Module):
    def __init__(self, **kwargs):
        super(Model, self).__init__()
        self.context_size = kwargs['context_size']
        self.batch_size = kwargs['batch_size']
        self.input_length = kwargs['input_length']

        self.conv1 = nn.Conv1d(self.context_size, self.context_size // 2, 7)
        self.conv2 = nn.Conv1d(self.context_size // 2, 1, 5)
        self.max_pool = nn.MaxPool1d(5)

        self.fc1 = nn.Linear(int((self.input_length - 15) / 5 + 1), 300)
        self.fc2 = nn.Linear(300, 50)
        self.out = nn.Linear(50, 1)

        self.act = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.3)
        self.batch_norm = nn.BatchNorm1d(self.context_size)
        

    def forward(self, x):
        convoluted = self.max_pool(self.conv2(self.conv1(x)))
        hidden = self.act(self.fc2(self.act(self.fc1(convoluted))))
        embeds = hidden.view(hidden.shape[0], hidden.shape[1] * hidden.shape[2])
        return self.out(embeds).view(embeds.shape[0])

In [0]:
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient, belongs to [-1, 1]:
    
    -1 means trash, absolute incorrect prediction
    0 corresponds to random prediction
    1 means perfect prediction
    """
    return (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

In [0]:
model = Model(context_size=CONTEXT_SIZE, 
              batch_size=BATCH_SIZE, 
              input_length=ROW_LENGTH)
loss = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [243]:
for epoch in range(37):
    model.train()
    total_loss = 0
    for dataset in train_tensor_datasets:
        data_loader = get_contexts_based_data_loader(dataset, CONTEXT_SIZE, BATCH_SIZE, stride=1)
        for X_batch, y_batch in data_loader:
            optimizer.zero_grad()
            outputs = model.forward(X_batch)
            loss_val = loss(outputs, y_batch)
            loss_val.backward()
            total_loss += loss_val
            optimizer.step()
    print(f'epoch #{epoch} | loss: {total_loss}')

epoch #0 | loss: 47.97410202026367
epoch #1 | loss: 46.91338348388672
epoch #2 | loss: 41.32014846801758
epoch #3 | loss: 46.23487091064453
epoch #4 | loss: 37.94314956665039
epoch #5 | loss: 36.361167907714844
epoch #6 | loss: 33.61874008178711
epoch #7 | loss: 36.96523666381836
epoch #8 | loss: 31.8024959564209
epoch #9 | loss: 36.27674865722656


In [0]:
def test_model_on_datasets(model, datasets):
    model.eval()
    video_num = 0
    for dataset in datasets:
        data_loader = get_contexts_based_data_loader(dataset, CONTEXT_SIZE, 
                                                    batch_size=1, stride=CONTEXT_SIZE)
        TP, FP, TN, FN = 0, 0, 0, 0
        for X_batch, y_batch in data_loader:
            outputs = torch.round(torch.sigmoid(model.forward(X_batch)))
            if torch.sum(y_batch) > 0 and torch.sum(outputs) > 0:
                TP += 1
            elif torch.sum(y_batch) == 0 and torch.sum(outputs) == 0:
                TN += 1
            elif torch.sum(y_batch) == 0 and torch.sum(outputs) > 0:
                FP += 1
            else:
                FN += 1
        if TP == FP == FN == 0:
            precision, mcc_coef = 1, 1
        elif TP + FP != 0:
            precision = TP / (TP + FP)
            mcc_coef = mcc(TP, TN, FP, FN)
        else:
            precision, mcc_coef = 0, 0
        print(f'video: #{video_num} | precision: {precision:.3f} | mcc: {mcc_coef:.3f}')
        video_num += 1

In [266]:
test_model_on_datasets(model, validate_tensor_datasets)

video: #0 | precision: 0.000 | mcc: -0.033
video: #1 | precision: 0.000 | mcc: -0.088
video: #2 | precision: 0.333 | mcc: 0.257
video: #3 | precision: 0.000 | mcc: -0.048
video: #4 | precision: 0.000 | mcc: 0.000
video: #5 | precision: 1.000 | mcc: 1.000
video: #6 | precision: 0.250 | mcc: 0.483
video: #7 | precision: 0.333 | mcc: 0.549


In [267]:
test_model_on_datasets(model, test_tensor_datasets)

video: #0 | precision: 1.000 | mcc: 0.810
video: #1 | precision: 0.833 | mcc: 0.728
video: #2 | precision: 0.500 | mcc: 0.576
video: #3 | precision: 0.556 | mcc: 0.725
video: #4 | precision: 0.667 | mcc: 0.807
video: #5 | precision: 0.167 | mcc: 0.390
video: #6 | precision: 0.000 | mcc: nan
video: #7 | precision: 0.400 | mcc: 0.614
video: #8 | precision: 0.500 | mcc: 0.556
video: #9 | precision: 0.333 | mcc: 0.552


  
