In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
from torch.autograd import Variable
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
import torch
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from numpy.linalg import norm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader

import os
import pandas as pd
import skimage.io
from skimage.transform import resize
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import pickle

import gdown
%matplotlib inline

BASE_DIR = "/content/drive/MyDrive/Colab_data"
DATA_DIR = "/content/Colab_Data"

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
# Simple tokenizer -> Rewrite
class MyTokenizer:
    def __init__(self):
        pass
    def tokenize(self, text):
        return [x.lower() for x in re.findall(r'\w+', text)]

    def encode(self, text):
        tokens = [self.word2ind.get(word, self.word2ind['<unk>']) for word in self.tokenize(text)]
        return tokens

In [6]:
tokenizer = MyTokenizer()

In [15]:
train_data = np.array(pd.read_csv(f"{BASE_DIR}/Hack_Change/train.csv"))
test_data = np.array(pd.read_csv(f"{BASE_DIR}/Hack_Change/test.csv"))

In [17]:
def show_item(ind, data):
  print("Id: ", data[ind][0])
  print("Text: ", data[ind][1])
  print("Source: ", data[ind][2])


show_item(1, train_data)
print(len(train_data))
print(len(test_data))

Id:  198426
Text:  –°–ª–µ–≤–∞ –æ—Ç –º–µ–Ω—è –ê–ª–µ–∫—Å–µ–π –ò–ª—å–º—É—Ö–∏–Ω. –ü—É—Ç–µ—à–µ—Å—Ç–≤–µ–Ω–Ω–∏–∫ –∏ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–π —á–µ–ª–æ–≤–µ–∫. –í —á–∏—Å–ª–µ –µ–≥–æ –ª–∏—á–Ω—ã—Ö –ø–æ–±–µ–¥ - –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–µ –Ω–∞ –≤–µ–ª–æ—Å–∏–ø–µ–¥–µ –æ—Ç –ß–µ–ª—è–±–∏–Ω—Å–∫–∞ –¥–æ –ë–∞–π–∫–∞–ª–∞... –ü—Ä–∏–µ—Ö–∞–ª –∫ –Ω–∞–º –ø–æ–º–æ—á—å –≤ –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏ —Å–µ–º–∏–Ω–∞—Ä–∞ –ú—É—Ö—Ç–∞—Ä–∞. –ü—Ä–æ—Å—Ç–æ –≤–∑—è–ª –∏ –ø—Ä–∏–µ—Ö–∞–ª –∏–∑ –ß–µ–ª—è–±–∏–Ω—Å–∫–∞ –∏ –Ω–∞—á–∞–ª –ø–æ–º–æ–≥–∞—Ç—å... ))üëç #–º–æ–∂–µ—Ç–∫–∞–∂–¥—ã–π
Source:  rusentiment
232366
58092


In [18]:
# print(pd.read_csv(f"{BASE_DIR}/Hack&Change/train.csv").head())

In [19]:
# Creating vocabulary
words = Counter()

for i in range(len(train_data)):
  proccessed_text = train_data[i][1].lower().translate(str.maketrans('', '', string.punctuation))

  for word in word_tokenize(proccessed_text):
    words[word] += 1

vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
counter_threshold = 5

for char, cnt in words.items():
    if cnt > counter_threshold:
        vocab.add(char)

print(f'Vocabulary length: {len(vocab)}')

Vocabulary length: 87012


In [20]:
word2ind = {char: i for i, char in enumerate(vocab)}
ind2word = {i: char for char, i in word2ind.items()}

In [21]:
class CustomDataset:
    def __init__(self, sentences):
        self.data = sentences
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx):
        processed_text = tokenizer.tokenize(self.data[idx][1])
        source = self.data[idx][2]

        tokenized_sentence = [self.bos_id]
        tokenized_sentence += [
            word2ind.get(word, self.unk_id) for word in processed_text
        ]
        tokenized_sentence += [self.eos_id]

        train_sample = {
            "text": tokenized_sentence,
            "source": source,
            "label": self.data[idx][3]
        }

        return train_sample

    def __len__(self) -> int:
        return len(self.data)


def collate_fn_with_padding(input_batch, pad_id=word2ind['<pad>'], max_len=256):
    seq_lens = [len(x['text']) for x in input_batch]
    max_seq_len = min(max(seq_lens), max_len)

    new_batch = []
    for sequence in input_batch:
        sequence['text'] = sequence['text'][:max_seq_len]
        for _ in range(max_seq_len - len(sequence['text'])):
            sequence['text'].append(pad_id)

        new_batch.append(sequence['text'])

    sequences = torch.LongTensor(new_batch).to(device)
    labels = torch.LongTensor([x['label'] for x in input_batch]).to(device)

    new_batch = {
        'input_ids': sequences,
        'label': labels
    }

    return new_batch

In [22]:
X_train, X_val = train_test_split(train_data, test_size=0.1, shuffle=False)

train_dataset = CustomDataset(X_train)
val_dataset = CustomDataset(X_val)

batch_size = 25
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=collate_fn_with_padding, batch_size=batch_size)

val_dataloader = DataLoader(
    val_dataset, shuffle=False, collate_fn=collate_fn_with_padding, batch_size=batch_size)

In [27]:
def F1macro(model, val_dataloader) -> float:
    predictions = []
    target = []
    model.eval()

    with torch.no_grad():
        for batch in val_dataloader:
            logits = model(batch['input_ids'].to(device))
            predictions.append(logits.argmax(dim=1))
            target.append(batch['label'].to(device))

    predictions = torch.cat(predictions).cpu().numpy()
    target = torch.cat(target).cpu().numpy()

    macro_f1 = f1_score(predictions, target, average='macro')

    return macro_f1

def accuracy(model, val_dataloader):
    predictions = []
    target = []
    model.eval()

    with torch.no_grad():
        for batch in val_dataloader:
            logits = model(batch['input_ids'].to(device))
            predictions.append(logits.argmax(dim=1))
            target.append(batch['label'].to(device))

    predictions = torch.cat(predictions).cpu().numpy()
    target = torch.cat(target).cpu().numpy()

    macro_f1 = f1_score(predictions, target, average='macro')

    return macro_f1

In [28]:
class BaseModel(nn.Module):
    def __init__(
        self, hidden_dim, vocab_size, num_classes,
        aggregation_type: str = 'max', lstm_layers: int = 1
        ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.gru = nn.LSTM(hidden_dim, hidden_dim, num_layers=lstm_layers, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2 * 2, hidden_dim)
        self.projection = nn.Linear(hidden_dim, num_classes)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.3)

        self.aggregation_type = aggregation_type

    def forward(self, input_batch) -> torch.Tensor:
        embeddings = self.embedding(input_batch)
        output, _–í–µ = self.gru(embeddings)

        if self.aggregation_type == 'max':
            output = output.max(dim=1)[0]
        elif self.aggregation_type == 'mean':
            output = output.mean(dim=1)
        elif self.aggregation_type == 'max+mean':
            max_pool = output.max(dim=1)[0]
            mean_pool = output.mean(dim=1)

            output = torch.cat([max_pool, mean_pool], dim=1)
        else:
            raise ValueError("Invalid aggregation_type")

        output = self.dropout(self.linear(self.non_lin(output)))
        prediction = self.projection(self.non_lin(output))

        return prediction

In [29]:
def train(model, optimizer, epochs, criterion, train_loader, val_loader):
    train_losses = []
    val_losses = []

    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        train_losses_per_epoch = []
        val_losses_per_epoch = []

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for batch in pbar:
            optimizer.zero_grad()

            logits = model(batch['input_ids'].to(device))
            loss = criterion(logits, batch['label'])

            loss.backward()
            optimizer.step()
            train_losses_per_epoch.append(loss.item())

        train_losses.append(np.mean(train_losses_per_epoch))

        with torch.no_grad():
            model.eval()
            for batch in val_loader:
                logits = model(batch['input_ids'].to(device))
                loss = criterion(logits, batch['label'])

                val_losses_per_epoch.append(loss.item())

        val_losses.append(np.mean(val_losses_per_epoch))

        print(f"Epoch {epoch+1}/{epochs} | "
              f"train_loss={train_losses[-1]:.2f} | "
              f"val_loss={val_losses[-1]:.2f} |"
              f"F1-micro={F1macro(model, val_dataloader)}")

    return train_losses, val_losses

In [30]:
model = BaseModel(
    hidden_dim=256,
    vocab_size=len(vocab),
    num_classes=3,
    lstm_layers=2,
    aggregation_type='max+mean'
    ).to(device)
criterion = nn.CrossEntropyLoss()

In [31]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
epochs = 4

train_losses, val_losses = train(model, optimizer, epochs, criterion, train_dataloader, val_dataloader)

Epoch 1/4:   0%|          | 0/8366 [00:00<?, ?it/s]

Epoch 1/4 | train_loss=0.83 | val_loss=0.74 |F1-micro=0.66249975031834


Epoch 2/4:   0%|          | 0/8366 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
plt.figure(figsize=(3, 2))
plt.plot(np.arange(len(train_losses)), train_losses, label='Train', color='blue')
plt.plot(np.arange(len(val_losses)), val_losses, label='Validation', color='orange')

plt.xlabel('Epoch')
plt.title('MSE loss')
plt.legend()
plt.show()


NameError: name 'train_losses' is not defined

<Figure size 300x200 with 0 Axes>

In [33]:
save_data = [model, word2ind]

with open("save_data.pkl", "wb") as f:
  pickle.dump(save_data, f)

In [35]:
!git clone https://dmiptrv0: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@github.com/mirkuriit/hack-change-2025.git


Cloning into 'hack-change-2025'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 78 (delta 19), reused 73 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (78/78), 59.08 KiB | 889.00 KiB/s, done.
Resolving deltas: 100% (19/19), done.


In [36]:
import os
!git push origin HEAD
os.chdir("hack-change-2025")

In [47]:
!git add ./../drive

Enumerating objects: 6, done.
Counting objects:  16% (1/6)Counting objects:  33% (2/6)Counting objects:  50% (3/6)Counting objects:  66% (4/6)Counting objects:  83% (5/6)Counting objects: 100% (6/6)Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects:  20% (1/5)Compressing objects:  40% (2/5)Compressing objects:  60% (3/5)Compressing objects:  80% (4/5)Compressing objects: 100% (5/5)Compressing objects: 100% (5/5), done.
Writing objects:  20% (1/5)Writing objects:  40% (2/5)Writing objects:  60% (3/5)Writing objects:  80% (4/5)Writing objects: 100% (5/5)Writing objects: 100% (5/5), 1.35 KiB | 1.35 MiB/s, done.
Total 5 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/mirkuriit/hack-change-2025.git
   72cc1dd..ec4c609  HEAD -> ml


In [46]:
!git commit -m "First commit"

[ml ec4c609] First commit
 3 files changed, 78 insertions(+)
 create mode 100644 main.py
 create mode 100644 model.py
 create mode 100644 tokenizer.py


In [None]:
!git status

fatal: not a git repository (or any of the parent directories): .git
