In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 29.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [23]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import re
from torchtext.vocab import build_vocab_from_iterator
from collections import defaultdict
from nltk.corpus import stopwords
import nltk
from transformers import AutoTokenizer, AutoModel
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def seed_everything(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
seed = 3407
seed_everything(seed)

stop_en = stopwords.words('english')
stop_ru = stopwords.words('russian')

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased").to(device)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
filename = "/content/drive/MyDrive/ASR/lemma/lemma_train.csv"

df = pd.read_csv(filename)

In [12]:
columns = ["clean_text", "clipTarget"]
df = df[columns]
df.sample(5)

Unnamed: 0,clean_text,clipTarget
85686,comics clock ⏰ назад будущее😱,BOOKS
159106,хлеб приготовленный углях песками пустыни,TRAVEL
130368,⚡️ ⛈🌩,MOVIES
110892,бери доску выходи улицу 🤍 серфскейт сёрфскейти...,SPORT
120416,краткое руководство помощью клинча грамотного ...,SPORT


In [13]:
df.dropna(inplace=True)
df = df.reset_index(drop=True)

In [14]:
with open("/content/drive/MyDrive/ASR/classes.json", "r") as f:
    cls2index = json.load(f)

In [27]:
class DescriptionDataset(Dataset):
    def __init__(self, text, target):
        self.text = text
        self.target = target

    def __getitem__(self, item):
        sent = self.text[item]
        target = cls2index[self.target[item]]
        with torch.no_grad():
            tokens = tokenizer(sent, return_tensors="pt")
            for key, val in tokens.items():
                tokens[key] = val.to(device)
            embeds = model(**tokens)["pooler_output"].squeeze(0) # 768
        return embeds, target

    def __len__(self):
        return len(self.text)

dataset = DescriptionDataset(df["clean_text"].to_list(), df["clipTarget"].to_list())
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=512)

In [30]:
class Classifier(nn.Module):
    def __init__(self, input_dim: int = 768, hidden_dim: int = 512, output_dim: int = 52, p: float = 0.5):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        return self.fc3(x)

In [None]:
classifier = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
lr = 10e-3
opt = optim.Adam(classifier.parameters(), lr=lr)
EPOCHS = 10
losses = []
for epoch in range(EPOCHS):
    loss_acc = []
    running_score = 0
    for input, output in tqdm(dataloader):
        opt.zero_grad()
        input = input#.to(device)
        output = output.to(device)

        preds = classifier(input)
        loss = criterion(preds, output)

        loss_acc.append(loss.detach().cpu().item())
        loss.backward()

        output = output.detach().cpu().numpy()
        preds = preds.argmax(dim=1).detach().cpu().numpy()
        running_score += f1_score(output, preds, average='macro')

        opt.step()
    loss_acc = np.mean(loss_acc)
    score = running_score / len(dataloader)
    print(f"Epoch: {epoch + 1}/{EPOCHS} | Loss :{loss_acc:.4f} | F1-score :{score:.4f}")
    torch.save(model.state_dict(), f"/content/drive/MyDrive/ASR/rubert/classifier_e{epoch}.pt")

100%|██████████| 440/440 [44:10<00:00,  6.02s/it]


Epoch: 1/10 | Loss :3.0098 | F1-score :0.1300


100%|██████████| 440/440 [43:33<00:00,  5.94s/it]


Epoch: 2/10 | Loss :2.7540 | F1-score :0.1943


100%|██████████| 440/440 [43:52<00:00,  5.98s/it]


Epoch: 3/10 | Loss :2.6974 | F1-score :0.2149


100%|██████████| 440/440 [43:32<00:00,  5.94s/it]


Epoch: 4/10 | Loss :2.6666 | F1-score :0.2238


100%|██████████| 440/440 [46:02<00:00,  6.28s/it]


Epoch: 5/10 | Loss :2.6456 | F1-score :0.2305


100%|██████████| 440/440 [45:19<00:00,  6.18s/it]


Epoch: 6/10 | Loss :2.6405 | F1-score :0.2315


 86%|████████▌ | 378/440 [38:45<06:11,  6.00s/it]