In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, random_split


from sklearn.metrics import f1_score
from tqdm import tqdm

from collections import deque
from collections import Counter
from itertools import chain
from os import listdir
from warnings import filterwarnings

filterwarnings('ignore')
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')


In [3]:
(train_df[:2]['abstract'].values,
train_df[:2]['title'].values)

(array(['we consider the problem of utility maximization for investors with power utility functions. building on the earlier work larsen et al. (2016), we prove that the value of the problem is a frechet-differentiable function of the drift of the price process, provided that this drift lies in a suitable banach space.   we then study optimal investment problems with non-markovian driving processes. in such models there is no hope to get a formula for the achievable maximal utility. applying results of the first part of the paper we provide first order expansions for certain problems involving fractional brownian motion either in the drift or in the volatility. we also point out how asymptotic results can be derived for models with strong mean reversion.',
        'in this paper we provide an explicit formula for calculating the boolean number of a ferrers graph. by previous work of the last two authors, this determines the homotopy type of the boolean complex of the graph. specializin

In [4]:
train_df.columns=['question', 'answer']
test_df.columns=['question']

In [5]:
train_df.columns

Index(['question', 'answer'], dtype='object')

In [6]:
all_texts = np.array(list(chain((train_df['question'], train_df['answer'])))).reshape(-1)


In [7]:
all_texts.shape

(270000,)

In [8]:

class Vocabulary:
    def __init__(self, pad_token="<PAD>", unk_token="<UNKNOWN>"):
        """
        Инициализируем словарь с базовыми токенами.
        """
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.word2idx = {pad_token: 0, unk_token: 1}
        self.idx2word = [pad_token, unk_token]

    def add_word(self, word):
        """
        Добавляем слово в словарь, если его еще нет.
        """
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)

    def encode(self, text):
        """
        Преобразуем список слов в список индексов.
        Если слово отсутствует в словаре, заменяем его на <UNKNOWN>.
        """
        return [self.word2idx.get(word, self.word2idx[self.unk_token]) for word in text]

    def decode(self, indices):
        """
        Преобразуем список индексов в список слов.
        """
        return [self.idx2word[idx] for idx in indices]


    def build_vocabulary(self, sequences,  min_frequency=1 , max_frequency=19610000001961):
        """
        Строим словарь на основе переданных последовательностей.
        Каждая последовательность — это список слов.
        Слова добавляются в словарь в порядке убывания их частоты.
        """

        # Добавляем слова в порядке убывания частоты
        [vocab.add_word(word=word) for word , frequency in Counter(
                                                    tuple(chain.from_iterable(
                                                        [sequence.split() for sequence in sequences]))
                                                          ).items() if min_frequency<frequency<max_frequency]

    def __len__(self):
        """
        Возвращаем размер словаря.
        """
        return len(self.idx2word)


In [9]:
vocab=Vocabulary()

In [10]:
vocab.build_vocabulary(sequences=all_texts)

In [11]:
len(vocab)

223953

In [12]:


class TextDataset(Dataset):
    def __init__(self, data, vocab, max_question_length, max_answer_length):
        # Кодируем все вопросы и ответы
        self.data = [(vocab.encode(question), vocab.encode(answer)) for question, answer in data]
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        question, answer = self.data[index]
        # Паддинг всех вопросов и ответов до максимальной длины
        question = question[:self.max_question_length] + [1] * (self.max_question_length - len(question))  # Паддинг для вопроса
        answer = answer[:self.max_answer_length] + [1] * (self.max_answer_length - len(answer))  # Паддинг для ответа
        return torch.tensor(question), torch.tensor(answer)




# Функция для обработки батчей с паддингом
def collate_fn(batch):
    questions, answers = zip(*batch)

    # Паддинг последовательностей до одинаковой длины
    pad_questions = pad_sequence([torch.tensor(question) for question in questions], batch_first=True, padding_value=1)  # pad_value=1 для <pad>
    pad_answers = pad_sequence([torch.tensor(answer) for answer in answers], batch_first=True, padding_value=1)

    return pad_questions, pad_answers





# Разделение датасета
dataset = TextDataset(data=train_df.values, vocab=vocab,
                      max_question_length=len(train_df.question.max()),
                      max_answer_length=len(train_df.answer.max()))

# Размеры для разделения
train_size = int(0.75 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size  # Остаток для теста

# Разделение датасета
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])


# Создание DataLoader'ов для каждой части
batch_size = 64 # Для демонстрации используем маленький размер батча
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [13]:
vocab.__len__()

223953

In [14]:

# Пример итерации по train_loader
for batch in train_loader:
    pad_questions, pad_answers = batch
    print(f"Паддинговые вопросы:\n{pad_questions.size()}")
    print(f"Паддинговые ответы:\n{pad_answers.size()}")
    break

Паддинговые вопросы:
torch.Size([64, 821])
Паддинговые ответы:
torch.Size([64, 58])


In [None]:
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

In [16]:
# Параметры модели
embedding_dim = 400
hidden_size = 64
output_size = 58
vocab_size = len(vocab.idx2word)

In [67]:
def collate_fn(batch):
    questions, answers = zip(*batch)

    # Паддинг последовательностей до одинаковой длины
    pad_questions = pad_sequence(questions, batch_first=True, padding_value=1)  # pad_value=1 для <pad>
    pad_answers = pad_sequence(answers, batch_first=True, padding_value=1)

    # Преобразуем тип для совместимости с моделью
    pad_questions = pad_questions.float()
    pad_answers = pad_answers.long()  # Для CrossEntropyLoss
    return pad_questions, pad_answers


In [68]:
class TextDataset(Dataset):
    def __init__(self, data, vocab, max_question_length, max_answer_length):
        # Кодируем все вопросы и ответы
        self.data = [(vocab.encode(question), vocab.encode(answer)) for question, answer in data]
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        question, answer = self.data[index]
        # Паддинг всех вопросов и ответов до максимальной длины
        question = question[:self.max_question_length] + [1] * (self.max_question_length - len(question))  # Паддинг для вопроса
        answer = answer[:self.max_answer_length] + [1] * (self.max_answer_length - len(answer))  # Паддинг для ответа
        return torch.tensor(question, dtype=torch.float32), torch.tensor(answer, dtype=torch.long)


In [69]:
batch_size = 128  # Размер батча для отладки

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [70]:
for questions, answers in train_loader:
    print("Questions shape:", questions.shape)  # (batch_size, max_sequence_length)
    print("Answers shape:", answers.shape)      # (batch_size, max_sequence_length)
    break


Questions shape: torch.Size([128, 821])
Answers shape: torch.Size([128, 58])


In [71]:
hidden_size = 256  # Размер скрытого слоя
output_size = 58   # Количество возможных токенов в ответе



In [72]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Слой эмбеддингов
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)  # Преобразование токенов в эмбеддинги
        lstm_out, (ht, ct) = self.lstm(x)
        output = self.fc(ht[-1])  # Используем последний скрытый слой
        return output


In [73]:

vocab_size = len(vocab.idx2word)  # Размер словаря
input_size = 400  # Размер эмбеддинга
embedding_dim = 400              # Размерность эмбеддингов
hidden_size = 256                # Размерность скрытого слоя
output_size = 58                 # Количество классов (или размер словаря ответов)
max_sequence_length = 821

model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    output_size=output_size
).to(device)


In [74]:
criterion = nn.CrossEntropyLoss()  # Игнорируем <pad>
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)



100.0

In [49]:
for index ,(questions, answers) in enumerate(train_loader):
    questions, answers = questions.to(device), answers.to(device)

    # Прогон через модель
    outputs = model(questions.long())  # Размерность: (batch_size, output_size)

    # Вычисляем потери


    loss = criterion(outputs.float(), answers.float())

    loss.backward()
    optimizer.step()

    print(index,loss, sep="---->")
    if index>3:
        break



0---->tensor(1111383.2500, device='cuda:0', grad_fn=<DivBackward1>)
1---->tensor(1058081.8750, device='cuda:0', grad_fn=<DivBackward1>)
2---->tensor(1163778.2500, device='cuda:0', grad_fn=<DivBackward1>)
3---->tensor(1096949.5000, device='cuda:0', grad_fn=<DivBackward1>)
4---->tensor(1143338.7500, device='cuda:0', grad_fn=<DivBackward1>)


In [50]:
batch_size

64

In [51]:
answers.size()

torch.Size([64, 58])

In [52]:
# Размерности
# Преобразуем outputs: [batch_size, seq_length, vocab_size] -> [batch_size * seq_length, vocab_size]
outputs = outputs.view(-1)  # Это будет [batch_size * seq_length, vocab_size]

# Преобразуем answers: [batch_size, seq_length] -> [batch_size * seq_length]
answers = answers.view(-1).float()  # Это будет [batch_size * seq_length]

# Убедимся, что answers имеют тип long

# Теперь можно вычислить потери
loss = criterion(outputs, answers)
loss.backward()
optimizer.step()

print(f"Loss: {loss.item():.4f}")


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [53]:
for questions, answers in train_loader:
    questions, answers = questions.to(device), answers.to(device)

    # Прогон через модель
    outputs = model(questions.long())  # Размерность: (batch_size, output_size)

    outputs = outputs.view(-1)  # Это будет [batch_size * seq_length, vocab_size]

    answers = answers.view(-1).float()  # Это будет [batch_size * seq_length]




    loss = criterion(outputs, answers)

    loss.backward()
    optimizer.step()

    print(loss)


tensor(1.4211e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4579e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.5415e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4369e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4937e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4503e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.3925e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4506e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4565e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.5030e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4887e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4301e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4180e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4444e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4383e+08, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.4598e+08, device='cuda:0', grad_fn=<DivBackward1>)


KeyboardInterrupt: 

In [54]:
criterion = nn.CrossEntropyLoss()  # Игнорируем <pad>
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [75]:

def train(model, train_loader, val_loader, optimizer, criterion, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for questions, answers in tqdm(train_loader):
            optimizer.zero_grad()
            questions, answers = questions.to(device), answers.to(device)
            outputs = model(questions.long())  # Размерность: (batch_size, output_size)
            loss = criterion(outputs.float(), answers.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Валидация
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for questions, answers in tqdm(val_loader):
                questions, answers = questions.to(device), answers.to(device)
                outputs = model(questions.long())  # Размерность: (batch_size, output_size)
                loss = criterion(outputs.float(), answers.float())
                # optimizer.step()
                val_loss += loss.item()

            val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Вызов функции обучения


In [76]:
train(model, train_loader, val_loader, optimizer, criterion, num_epochs=10)

100%|██████████| 792/792 [04:44<00:00,  2.79it/s]
100%|██████████| 106/106 [00:13<00:00,  7.85it/s]


Epoch 1, Train Loss: 1167109.1542, Val Loss: 1253332.6722


100%|██████████| 792/792 [04:42<00:00,  2.80it/s]
100%|██████████| 106/106 [00:13<00:00,  7.86it/s]


Epoch 2, Train Loss: 1142437.7648, Val Loss: 1169991.6981


100%|██████████| 792/792 [04:41<00:00,  2.81it/s]
100%|██████████| 106/106 [00:13<00:00,  7.84it/s]


Epoch 3, Train Loss: 1137492.4792, Val Loss: 1173739.8066


100%|██████████| 792/792 [04:40<00:00,  2.82it/s]
100%|██████████| 106/106 [00:13<00:00,  7.72it/s]


Epoch 4, Train Loss: 1145150.6968, Val Loss: 1200444.5920


100%|██████████| 792/792 [04:40<00:00,  2.82it/s]
100%|██████████| 106/106 [00:13<00:00,  7.89it/s]


Epoch 5, Train Loss: 1138283.6555, Val Loss: 1146684.5837


100%|██████████| 792/792 [04:40<00:00,  2.82it/s]
100%|██████████| 106/106 [00:13<00:00,  7.87it/s]


Epoch 6, Train Loss: 1138900.0866, Val Loss: 1147318.3325


  9%|▉         | 75/792 [00:27<04:18,  2.77it/s]


KeyboardInterrupt: 

In [77]:
# prompt: how to save torch model weights

# Save the model's state_dict()
torch.save(model.state_dict(), 'model_weights.pth')

In [40]:
model

LSTMModel(
  (embedding): Embedding(223953, 400)
  (lstm): LSTM(400, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=58, bias=True)
)

In [46]:
model.load_state_dict(torch.load('model_weights.pth'))

<All keys matched successfully>

In [42]:
criterion = nn.CrossEntropyLoss()  # Игнорируем <pad>
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [45]:
train(model, train_loader, val_loader, optimizer, criterion, num_epochs=10)

  5%|▌         | 87/1583 [00:10<02:56,  8.48it/s]


KeyboardInterrupt: 