In [1]:
!pip install torch==2.0.1 torchtext==0.5.0



In [2]:
import random

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import OneCycleLR
from torchtext.vocab import GloVe

from torch.utils.data import DataLoader, Dataset

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [3]:
categories = ['sci.med','sci.electronics', 'sci.space', 'rec.sport.baseball', 'soc.religion.christian']

news_data = fetch_20newsgroups(subset='all', categories=categories)

# Создание обучающей и тестовой выборок
X, y = news_data.data, news_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(len(news_data.data))

4952


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Удаление метаданных, токенизация и лемматизация
def clean_text(text):
    text = re.sub(r"(^From|Subject|Organization|Lines):.*", "", text, flags=re.MULTILINE)
    text = re.sub(r"\b(?:http|https|www)\S+\b", "", text)  # Удаляем URL
    text = re.sub(r"\W", " ", text)  # Удаление знаков препинания
    text = re.sub(r"\d+", " ", text)  # Удаление чисел
    text = re.sub(r"\s+", " ", text).strip().lower()  # Приведение к нижнему регистру
    return text

# Загрузка необходимых ресурсов NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Токенизация
    tokens = word_tokenize(text)
    # Удаление стоп-слов и лемматизация
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Применяем очистку и предобработку
X_train_cleaned = [preprocess_text(clean_text(doc)) for doc in X_train]
X_test_cleaned = [preprocess_text(clean_text(doc)) for doc in X_test]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from collections import Counter

# Создаем словарь из тренировочных данных
all_tokens = [word for doc in X_train_cleaned for word in doc.split()]
vocab = {word: idx for idx, (word, _) in enumerate(Counter(all_tokens).most_common(), start=1)}

# Добавляем новые два ключа в словарь - специальные токены '<unk'> и '<pad>'.
vocab["<unk>"] = 0  # Для неизвестных слов
vocab["<pad>"] = len(vocab)  # Для паддинга

# Преобразуем текст в последовательности индексов
def text_to_sequence(text, vocab):
    return [vocab.get(word, vocab["<unk>"]) for word in text.split()]

X_train_seq = [text_to_sequence(doc, vocab) for doc in X_train_cleaned]
X_test_seq = [text_to_sequence(doc, vocab) for doc in X_test_cleaned]

In [6]:
# Функция для выравнивания длины последовательностей
def pad_sequences(sequences, vocab, max_len=None):
    max_len = max_len or max(len(seq) for seq in sequences)
    padded = [seq + [vocab["<pad>"]] * (max_len - len(seq)) for seq in sequences]
    return torch.tensor(padded, dtype=torch.long)

# Паддинг для тренировочных и тестовых данных
X_train_padded = pad_sequences(X_train_seq, vocab)
X_test_padded = pad_sequences(X_test_seq, vocab)

In [7]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_padded, y_train, test_size=0.2, random_state=42
)

def shorten_sequences(data, max_len):
    return [seq[:max_len] for seq in data]

X_train_small = shorten_sequences(X_train_split, max_len=400)
X_val_small = shorten_sequences(X_val_split, max_len=400)

# Создание датасетов
train_dataset = TextDataset(X_train_small, y_train_split)
val_dataset = TextDataset(X_val_small, y_val_split)
test_dataset = TextDataset(X_test_padded, y_test)

# DataLoader для тренировочных, валидационных и тестовых данных
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

In [8]:
# Создание матрицы эмбеддингов с использованием GloVe из Torchtext
def create_embedding_matrix(vocab, glove, embedding_dim=100):
    vocab_size = len(vocab)
    embedding_matrix = np.random.normal(size=(vocab_size, embedding_dim))  # Для неизвестных слов

    for word, idx in vocab.items():
        if word in glove.stoi:
            embedding_matrix[idx] = glove[word].numpy()  # Получение вектора для слова
        else:
            embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))

    return torch.tensor(embedding_matrix, dtype=torch.float)

# Загрузка эмбеддингов GloVe
glove = GloVe(name='6B', dim=100)

embedding_dim = 100
embedding_matrix = create_embedding_matrix(vocab, glove, embedding_dim)

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # Инициализация матрицы эмбеддингов
        self.embedding.weight.data.copy_(embedding_matrix)  # Предобученные веса
        self.embedding.weight.requires_grad = False  # Предобученные эмбеддинги не будут изменяться
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim) # Выходной слой
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded).squeeze(3)) for conv in self.convs]
        pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [12]:
vocab_size = len(vocab)
n_filters = 120
filter_sizes = [2, 3, 4, 5] # Рассматриваются последовательности из 2, 3, 4 и 5 слов
output_dim = len(set(y_train)) # Количество классов
dropout = 0.5

# Инициализация модели
model = CNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, embedding_matrix)
# Инициализация оптимизатора
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

criterion = nn.CrossEntropyLoss()
print(model)

# Аппаратные параметры для обучения
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

CNN(
  (embedding): Embedding(35338, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 120, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 120, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 120, kernel_size=(4, 100), stride=(1, 1))
    (3): Conv2d(1, 120, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=480, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [13]:
n_epochs = 25
patience = 3  # Количество эпох без улучшения перед остановкой
best_val_loss = float('inf')
early_stop_counter = 0  # Счётчик эпох без улучшений

# Планировщик для изменения постоянной обучения
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=n_epochs
)

for epoch in range(n_epochs):
    # Обучение модели
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    train_preds = []
    train_targets = []

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        correct_train += (predictions.argmax(1) == y_batch).sum().item()
        total_train += y_batch.size(0)

        # Для F1-метрики
        train_preds.extend(predictions.argmax(1).cpu().numpy())
        train_targets.extend(y_batch.cpu().numpy())

    train_accuracy = correct_train / total_train
    train_f1 = f1_score(train_targets, train_preds, average='macro')

    # Оценка на валидационном наборе
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()
            correct_val += (predictions.argmax(1) == y_batch).sum().item()
            total_val += y_batch.size(0)

            # Для F1-метрики
            val_preds.extend(predictions.argmax(1).cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())

    val_accuracy = correct_val / total_val
    val_loss /= len(val_loader)
    val_f1 = f1_score(val_targets, val_preds, average='macro')

    # Вывод метрик за эпоху
    current_lr = scheduler.optimizer.param_groups[0]['lr']
    print(f"Epoch: {epoch + 1:02}, "
          f"Train Loss: {train_loss / len(train_loader):.3f}, "
          f"Train Acc: {train_accuracy * 100:.2f}%, "
          f"Train F1: {train_f1:.3f}, "
          f"Val. Loss: {val_loss:.3f}, "
          f"Val. Acc: {val_accuracy * 100:.2f}%, "
          f"Val. F1: {val_f1:.3f}, "
          f"Current learning rate: {current_lr:.6f}")

    # Ранняя остановка: проверка улучшения валидационного лосса
    if val_loss < best_val_loss:
        best_val_loss = val_loss  # Сохраняем лучшее значение функции потерь
        early_stop_counter = 0  # Сбрасываем счётчик
        # Сохраняем лучшую модель
        torch.save(model.state_dict(), "best_model.pth")
    else:
        early_stop_counter += 1  # Увеличиваем счётчик эпох без улучшений

    # Прекращение обучения, если модель на плато
    if early_stop_counter >= patience:
        break


Epoch: 01, Train Loss: 1.750, Train Acc: 20.82%, Train F1: 0.163, Val. Loss: 1.561, Val. Acc: 37.75%, Val. F1: 0.350, Current learning rate: 0.000875
Epoch: 02, Train Loss: 1.665, Train Acc: 24.13%, Train F1: 0.237, Val. Loss: 1.514, Val. Acc: 41.07%, Val. F1: 0.316, Current learning rate: 0.002207
Epoch: 03, Train Loss: 1.537, Train Acc: 31.60%, Train F1: 0.289, Val. Loss: 1.296, Val. Acc: 64.41%, Val. F1: 0.647, Current learning rate: 0.004132
Epoch: 04, Train Loss: 1.273, Train Acc: 49.17%, Train F1: 0.496, Val. Loss: 1.018, Val. Acc: 82.28%, Val. F1: 0.817, Current learning rate: 0.006268
Epoch: 05, Train Loss: 0.972, Train Acc: 69.37%, Train F1: 0.685, Val. Loss: 0.686, Val. Acc: 89.19%, Val. F1: 0.893, Current learning rate: 0.008193
Epoch: 06, Train Loss: 0.646, Train Acc: 81.53%, Train F1: 0.816, Val. Loss: 0.424, Val. Acc: 91.93%, Val. F1: 0.919, Current learning rate: 0.009525
Epoch: 07, Train Loss: 0.419, Train Acc: 88.24%, Train F1: 0.882, Val. Loss: 0.310, Val. Acc: 91.79%

In [14]:
# Финальная проверка на тестовых данных
model.eval()  # Перевод модели в режим оценки
test_loss = 0
correct_test = 0
total_test = 0

test_preds = []
test_targets = []

with torch.no_grad():  # Отключение вычисления градиентов
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        test_loss += loss.item()
        correct_test += (predictions.argmax(1) == y_batch).sum().item()
        total_test += y_batch.size(0)

        test_preds.extend(predictions.argmax(1).cpu().numpy())
        test_targets.extend(y_batch.cpu().numpy())


test_accuracy = correct_test / total_test
test_loss /= len(test_loader)
test_f1 = f1_score(test_targets, test_preds, average='macro')

test_accuracy = correct_test / total_test

print(f"Test Loss: {test_loss / len(test_loader):.3f}, "
      f"Test Accuracy: {test_accuracy * 100:.2f}%, "
      f"Test F1: {test_f1:.3f}")


Test Loss: 0.091, Test Accuracy: 91.92%, Test F1: 0.919
