In [30]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import tarfile
import re
from collections import Counter
import pickle

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups

# Fix the RNG seed so that weight initialisation and the shuffled training
# batches are reproducible after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
BATCH_SIZE = 32

In [2]:
# One-time setup, intentionally disabled: unpack the downloaded aclImdb archive.
# Kept as real comments instead of a bare string literal, because the notebook
# evaluates a trailing string expression and echoes it as the cell's output.
# Uncomment the lines below only to unpack the downloaded data.
# archive_path = "./aclImdb_v1.tar.gz"
# with tarfile.open(archive_path, "r:gz") as tar:
#     tar.extractall()

' uncomment only for unpacking the downloaded data\narchive_path = "./aclImdb_v1.tar.gz"\nwith tarfile.open(archive_path, "r:gz") as tar:\n    tar.extractall()\n'

In [3]:
'''
reviews = load_files('./aclImdb/train')
texts = reviews.data
labels = reviews.target
'''
# Load a two-category subset of the 20 Newsgroups corpus; the string literal
# above is a disabled alternative loader and is evaluated as a no-op.
categories = ['rec.sport.baseball', 'sci.space']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Raw documents and their integer class labels.
texts, labels = newsgroups.data, newsgroups.target

print(f"Всего текстов: {len(texts)}")
print(f"Пример текста: {texts[0][:200]}...")

Всего текстов: 1981
Пример текста: From: mss@netcom.com (Mark Singer)
Subject: Re: Young Catchers
Article-I.D.: netcom.mssC52qMx.768
Organization: Netcom Online Communications Services (408-241-9760 login: guest)
Lines: 86

In article ...


In [16]:
# Hold out 20% of the documents for validation; random_state pins the split
# so the same partition is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [17]:
def simple_tokenizer(text):
    """Return the lowercase word tokens of ``text`` in order of appearance.

    The text is lowercased first; a token is any maximal run of regex word
    characters (letters, digits, underscore). Punctuation and whitespace act
    as separators and are dropped.
    """
    word_pattern = r'\b\w+\b'
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(word_pattern, lowered)]
def build_vocab(texts, max_vocab_size=10000):
    """Build a token -> integer-id vocabulary from an iterable of texts.

    Tokens come from ``simple_tokenizer``. Ids 0 and 1 are reserved for
    ``'<pad>'`` and ``'<unk>'``; the ``max_vocab_size - 2`` most frequent
    tokens receive ids 2, 3, ... in descending frequency order.

    Returns:
        dict mapping token to id. The frequent tokens are inserted first,
        followed by the two special entries.
    """
    token_counts = Counter(
        token for text in texts for token in simple_tokenizer(text)
    )

    vocab = {}
    next_id = 2  # ids 0 and 1 are reserved for the special tokens
    for word, _count in token_counts.most_common(max_vocab_size - 2):
        vocab[word] = next_id
        next_id += 1

    # Special tokens are added last so the dict's insertion order is unchanged.
    vocab.update({'<pad>': 0, '<unk>': 1})
    return vocab
    
def text_to_indices(text, vocab, max_len=50):
    """Encode ``text`` as a list of exactly ``max_len`` vocabulary ids.

    Tokens missing from ``vocab`` map to the ``'<unk>'`` id. The sequence is
    truncated to the first ``max_len`` tokens and, when shorter, right-padded
    with the ``'<pad>'`` id.
    """
    token_ids = [vocab.get(token, vocab['<unk>']) for token in simple_tokenizer(text)]
    clipped = token_ids[:max_len]

    # Pad on the right when the text has fewer than max_len tokens.
    missing = max_len - len(clipped)
    if missing > 0:
        clipped.extend([vocab['<pad>']] * missing)
    return clipped

In [20]:
class TextDataset(Dataset):
    """Dataset yielding ``(token_ids, label)`` tensor pairs.

    Texts are stored as given and encoded on access with ``text_to_indices``,
    using the supplied vocabulary and ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=50):
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and its label, both as ``torch.long`` tensors."""
        token_ids = text_to_indices(self.texts[idx], self.vocab, self.max_len)
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Build the vocabulary from the training split only, capped at 1000 entries
# (the two special tokens included).
vocab = build_vocab(train_texts, max_vocab_size=1000)
print(f"Словарь: {len(vocab)} слов")
print("Пример словаря:", dict(list(vocab.items())[:25]))

# Create a demo dataset with max_len=10 and print every 60th encoded sample
dataset = TextDataset(train_texts, train_labels, vocab, max_len=10)
print("Пример данных:")
for i in range(0,len(dataset),60):
    indices, label = dataset[i]
    print(f"Текст {i}: {indices} -> Метка: {label}")

Словарь: 1000 слов
Пример словаря: {'the': 2, 'to': 3, 'of': 4, 'a': 5, 'and': 6, 'in': 7, 'i': 8, 'is': 9, 'that': 10, 'edu': 11, 'it': 12, 'for': 13, 's': 14, 'from': 15, 'on': 16, 'you': 17, 'be': 18, 'this': 19, 't': 20, 'have': 21, 'was': 22, 'are': 23, '0': 24, 'as': 25, 'with': 26}
Пример данных:
Текст 0: tensor([15,  1,  1,  1, 41,  1,  1, 33, 38, 87]) -> Метка: 1
Текст 60: tensor([ 15, 456,   1,  41, 456,   1,  33,  38, 104, 297]) -> Метка: 0
Текст 120: tensor([ 15,   1,   1,  11,  33,  38, 302,   1, 166,   1]) -> Метка: 1
Текст 180: tensor([ 15,   1,   1, 948,  11, 200, 269,  33,  38,   1]) -> Метка: 0
Текст 240: tensor([ 15,   1,   1,   1, 125, 358,   1,  33, 480,  13]) -> Метка: 0
Текст 300: tensor([ 33,  32, 778, 311, 180, 170, 852,  15,   1, 100]) -> Метка: 1
Текст 360: tensor([ 15,   1,   1,   1,  11,  20,   1,  33, 469, 223]) -> Метка: 1
Текст 420: tensor([ 15,   1,   1, 182, 413,  11,   1,   1,  33,  38]) -> Метка: 1
Текст 480: tensor([ 33,  32, 778, 361, 180, 914,   1

In [23]:
# Both datasets share the training vocabulary and use TextDataset's default max_len.
train_dataset = TextDataset(train_texts, train_labels, vocab)
val_dataset = TextDataset(val_texts, val_labels, vocab)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [24]:
class simple_LSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    ``forward`` takes a batch-first tensor of token ids and returns the linear
    layer applied to the final hidden state of the last LSTM layer, giving
    ``output_dim`` scores per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Submodules are created in the order embedding, lstm, fc.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (h_n, _) = self.lstm(embedded)
        last_layer_state = h_n[-1]
        return self.fc(last_layer_state)

In [34]:
# Two-class LSTM classifier sized to the vocabulary, placed on the selected device.
model = simple_LSTM(
    vocab_size=len(vocab),
    embed_dim=64,
    hidden_dim=128,
    output_dim=2,
).to(device)

# Cross-entropy over the two class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(num_epochs=10, patience = 3, train_loader=train_loader, val_loader=val_loader):
    """Train the notebook-level ``model`` with early stopping on validation accuracy.

    Relies on the globals ``model``, ``criterion``, ``optimizer`` and ``device``.
    After each epoch the model is scored on ``val_loader``. Whenever accuracy
    strictly improves, a checkpoint (epoch, model state, optimizer state,
    accuracy) is written to ``best_RNN_trained.pth``. Training stops once
    ``patience`` consecutive epochs bring no improvement.

    Args:
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping early.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.

    Raises:
        ValueError: If ``train_loader`` has no batches or ``val_loader``
            yields no samples (previously a bare ``ZeroDivisionError``).

    Note:
        The default loaders are bound when this cell is executed. After
        rebuilding the loaders, re-run this cell or pass them explicitly.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty")

    best_accuracy = 0
    trigger_counter = 0  # consecutive epochs without an accuracy improvement
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        e_loss = 0.0
        for x, y in tqdm(train_loader):
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            outp = model(x)
            loss = criterion(outp, y)
            loss.backward()
            optimizer.step()
            e_loss += loss.item()

        print(f'On {epoch+1}/{num_epochs} epoch loss = {e_loss/len(train_loader)}')

        # ---- validation pass (accuracy only; no loss is needed here) ----
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for x, y in val_loader:
                x = x.to(device)
                y = y.to(device)
                outputs = model(x)

                _, preds = torch.max(outputs, 1)
                total += y.size(0)
                correct += (preds == y).sum().item()
        if total == 0:
            raise ValueError("val_loader produced no samples")
        accuracy = 100 * correct / total

        # ---- checkpointing and early stopping ----
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            trigger_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': accuracy,
            }, "best_RNN_trained.pth")
        else:
            trigger_counter += 1
            if trigger_counter >= patience:
                break
    print("Done!")

In [35]:
# Train for up to 30 epochs; early stopping inside train() may end the run sooner.
train(30)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.37it/s]


On 1/30 epoch loss = 0.6887738001346588


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.74it/s]


On 2/30 epoch loss = 0.5845504969358444


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.31it/s]


On 3/30 epoch loss = 0.5007320874929428


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.93it/s]


On 4/30 epoch loss = 0.4257336527109146


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.28it/s]


On 5/30 epoch loss = 0.4331625375151634


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.00it/s]


On 6/30 epoch loss = 0.4704365673661232


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.94it/s]


On 7/30 epoch loss = 0.31757319226861


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.47it/s]


On 8/30 epoch loss = 0.2346352843940258


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.03it/s]


On 9/30 epoch loss = 0.17254639424383642


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.02it/s]


On 10/30 epoch loss = 0.16666438408195972


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.22it/s]


On 11/30 epoch loss = 0.13387531319633125


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.20it/s]


On 12/30 epoch loss = 0.10358375798910856


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.47it/s]


On 13/30 epoch loss = 0.06914439069107176


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.20it/s]


On 14/30 epoch loss = 0.05469030514359474


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.85it/s]


On 15/30 epoch loss = 0.05023158026859164


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.40it/s]


On 16/30 epoch loss = 0.03316193690523505


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.47it/s]


On 17/30 epoch loss = 0.024239648203365504


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.18it/s]


On 18/30 epoch loss = 0.029908204567618668


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.74it/s]


On 19/30 epoch loss = 0.023035225874045863


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.52it/s]


On 20/30 epoch loss = 0.018607535033952446


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.70it/s]


On 21/30 epoch loss = 0.036145967345219104


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.57it/s]


On 22/30 epoch loss = 0.020241148504428565


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.21it/s]


On 23/30 epoch loss = 0.02931147494353354


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.35it/s]


On 24/30 epoch loss = 0.03722226091194898
Done!


In [36]:
# Rebuild the architecture and restore the best checkpoint written by train().
model = simple_LSTM(vocab_size = len(vocab), embed_dim = 64, hidden_dim = 128, output_dim = 2).to(device)
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one.
checkpoint = torch.load("best_RNN_trained.pth", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# The earlier optimizer still references the parameters of the previous model
# object, so create one bound to the restored model before loading its state
# (the saved state also restores the hyperparameters such as lr).
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
accuracy = checkpoint['accuracy']
print(accuracy)

# Re-score the restored model; the result should match the stored accuracy.
# NOTE: val_loader is the same split used for early stopping and checkpoint
# selection, so the figure below is a validation accuracy, not a held-out test score.
model.eval()
correct, total = 0,0
with torch.no_grad():
    for x,y in val_loader:
        x = x.to(device)
        y = y.to(device)
        outputs = model(x)
        _, preds = torch.max(outputs, 1)
        total += y.size(0)
        correct += (preds == y).sum().item()

print(correct, total)
# Every validation sample must have been scored exactly once.
assert total == len(val_dataset)

accuracy = 100 * correct / total
print(f'Accuracy on test: {accuracy}%')

94.20654911838791
374 397
Accuracy on test: 94.20654911838791%
