In [2]:
import torch

# Query CUDA availability once and reuse the answer for both the report and
# the device choice.
has_cuda = torch.cuda.is_available()

print(has_cuda)
print(torch.cuda.device_count())
if has_cuda:
    print(torch.cuda.get_device_name(0))
else:
    print("no gpu")

# Later cells place the model on this device and pass it to the training loop.
device = torch.device("cuda" if has_cuda else "cpu")

True
1
Tesla T4


In [3]:
# !!! ЗАГРУЗКА ДАТАСЕТА !!!

from tqdm import tqdm

from src.data_utils import clean_string
from src.data_utils import save_dataset
from src.next_token_dataset import load_sentiment140

# Load the texts returned by load_sentiment140 and store them unmodified.
texts = load_sentiment140()
save_dataset(texts, "raw_dataset.csv")

# Run every text through clean_string, then store the cleaned corpus.
# The final call is left as a bare expression so the notebook displays its
# return value.
cleaned_texts = [clean_string(raw_text) for raw_text in texts]
save_dataset(cleaned_texts, "dataset_processed.csv")

  from .autonotebook import tqdm as notebook_tqdm


Saved 1600000 texts to data/raw_dataset.csv
Saved 1600000 texts to data/dataset_processed.csv


'data/dataset_processed.csv'

In [4]:
# !!! РАЗБИВКА НА TRAIN/VAL/TEST !!!

from sklearn.model_selection import train_test_split

# First cut: 80% train, 20% hold-out (the hold-out is divided again below).
# random_state is fixed so the split is reproducible.
train_texts, temp_texts = train_test_split(cleaned_texts, test_size=0.2, random_state=42)

# Second cut: halve the hold-out, so validation and test are each 10% of the data.
val_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)

# Store every split under its own file name.
save_dataset(train_texts, "train.csv")
save_dataset(val_texts, "val.csv")
save_dataset(test_texts, "test.csv")

# Sanity check: print the three split sizes on one line.
print(*map(len, (train_texts, val_texts, test_texts)))

Saved 1280000 texts to data/train.csv
Saved 160000 texts to data/val.csv
Saved 160000 texts to data/test.csv
1280000 160000 160000


In [5]:
# !!! ТОКЕНИЗАЦИЯ И ДАТАЛОУДЕРЫ !!!

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
from src.data_utils import BertDataset
from src.data_utils import collate_fn
from src.next_token_dataset import TextCompletionDataset
from transformers import AutoTokenizer

# Tokenizer for the LSTM pipeline: the distilgpt2 vocabulary extended with the
# placeholder tokens below (presumably produced by clean_string; confirm in
# src.data_utils).
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
special_tokens = ['<user>', '<url>', '<emotion>']
tokenizer.add_tokens(special_tokens)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Stock distilgpt2 tokenizer (no added tokens) for the pretrained transformer.
transformer_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Take EOS from this tokenizer itself rather than from the extended one, so the
# two tokenizers stay independent of each other.
transformer_tokenizer.pad_token = transformer_tokenizer.eos_token

train_dataset = BertDataset(train_texts, tokenizer)
val_dataset = BertDataset(val_texts, tokenizer)

# Store the tokenized samples as CSV.
save_dataset(train_dataset.samples, "train_dataset_tokenized.csv")
save_dataset(val_dataset.samples, "val_dataset_tokenized.csv")

print(f"train_dataset: {len(train_dataset)}, val_dataset: {len(val_dataset)}")

# Datasets that back the data loaders.
# NOTE(review): validation is built with mode='train' as well; presumably so
# that the validation loss is computed on the same kind of samples. Confirm.
final_train_dataset = TextCompletionDataset(train_texts, tokenizer, mode='train')
final_val_dataset = TextCompletionDataset(val_texts, tokenizer, mode='train')
final_test_dataset = TextCompletionDataset(test_texts, tokenizer, mode='inference')

# Data loaders: only training is shuffled; the test loader yields one sample
# per batch.
train_loader = DataLoader(final_train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(final_val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(final_test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

print(f"train_loader: {len(train_loader)}, val_loader: {len(val_loader)}")

Saved 12039314 texts to data/train_dataset_tokenized.csv
Saved 1510291 texts to data/val_dataset_tokenized.csv
train_dataset: 12039314, val_dataset: 1510291
train_loader: 20000, val_loader: 2500


In [6]:
# !!! СОЗДАНИЕ LSTM !!!

from src.lstm_model import LSTMClassifier
from src.lstm_model import count_parameters

# len(tokenizer) is used instead of tokenizer.vocab_size so that the tokens
# added through add_tokens are counted as well.
vocab_size = len(tokenizer)
hidden_dim = 128

model = LSTMClassifier(vocab_size, hidden_dim)
model = model.to(device)
param_count = count_parameters(model)

print(param_count)

13048916


In [7]:
# !!! ОБУЧЕНИЕ LSTM !!!

from src.eval_lstm import evaluate
from src.lstm_train import train
from torch.nn.utils.rnn import pad_sequence

import torch
import torch.nn as nn

# Targets equal to the padding id are excluded from the loss.
# NOTE(review): the pad token was set to EOS when the tokenizer was built, so
# EOS targets are excluded from the loss too. Confirm this is intended.
pad_id = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# Arguments are passed positionally in the order used by src.lstm_train.train;
# the trailing 3 is presumably the number of epochs (three epochs are logged).
train(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    evaluate,
    pad_id,
    device,
    3,
)

100%|██████████| 20000/20000 [16:01<00:00, 20.81it/s]


Epoch 1 | Train Loss: 2.939 | Val Loss: 2.278 | Val Accuracy: 64.04%


100%|██████████| 20000/20000 [16:03<00:00, 20.76it/s]


Epoch 2 | Train Loss: 2.234 | Val Loss: 2.249 | Val Accuracy: 64.33%


100%|██████████| 20000/20000 [16:09<00:00, 20.63it/s]


Epoch 3 | Train Loss: 2.203 | Val Loss: 2.239 | Val Accuracy: 64.46%


In [None]:
from src.eval_transformer_pipeline import evaluate_rouge

# !!! ROUGE EVALUATION OF THE LSTM !!!
predictionsLSTM = []
referencesLSTM = []

# Generation is pure inference: put the model in eval mode so train-only
# layers (e.g. dropout) are inactive, and disable autograd tracking.
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating"):
        # test_loader uses batch_size=1, so index 0 is the only sample in the batch.
        input_ids = batch["input_ids"][0]
        target_ids = batch["target_ids"][0]

        # Ask for as many new tokens as the reference continuation contains.
        gen_text = model.generate(input_ids, max_new_tokens=len(target_ids), tokenizer=tokenizer)
        target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        predictionsLSTM.append(gen_text)
        referencesLSTM.append(target_text)

evaluate_rouge(predictions=predictionsLSTM, references=referencesLSTM)


Generating:   4%|▎         | 5964/160000 [00:12<05:33, 462.50it/s]

In [None]:
# !!! РАБОТА ТРАНСФОРМЕРА !!!

from transformers import pipeline
from src.eval_transformer_pipeline import evaluate_rouge
from tqdm import tqdm

# Pretrained distilgpt2 baseline, driven through the stock tokenizer.
generator = pipeline(
    "text-generation",
    model="distilgpt2",
    tokenizer=transformer_tokenizer,
    # First GPU when CUDA is present, otherwise CPU (-1), so the cell also
    # runs on a machine without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

predictions = []
references = []

for sample in tqdm(final_test_dataset, desc="Generating", unit="sample"):
    # The test samples were encoded with the extended tokenizer, so they are
    # decoded with that same tokenizer before being handed to the pipeline.
    input_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    target_text = tokenizer.decode(sample["target_ids"], skip_special_tokens=True)

    # Deterministic decoding, limited to the length of the reference continuation.
    out = generator(
        input_text,
        max_new_tokens=len(sample["target_ids"]),
        do_sample=False,
        temperature=None,  # presumably cleared because sampling is off; confirm
    )

    # The slice assumes generated_text starts with the prompt and strips it,
    # leaving only the continuation.
    gen_text = out[0]["generated_text"]
    gen_completion = gen_text[len(input_text):].strip()

    predictions.append(gen_completion)
    references.append(target_text)

Device set to use cuda:0
Generating:   0%|          | 10/5000 [00:00<03:30, 23.65sample/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating:   1%|          | 27/5000 [00:00<01:52, 44.20sample/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating: 100%|██████████| 5000/5000 [01:43<00:00, 48.16sample/s]
Device set to use cuda:0


In [None]:
# !!! ROUGE EVALUATION OF THE TRANSFORMER !!!

# Score the distilgpt2 completions collected in `predictions` against the
# reference continuations in `references`.
evaluate_rouge(predictions=predictions, references=references)

Metrics
rouge1: 0.0687
rouge2: 0.0105
rougeL: 0.0681
rougeLsum: 0.0681
Examples:
ref:  internet for april and may, prediction: texts.”
ref:  of hoarding out of apartment, prediction: of the money.
ref: s not feeling up to par, prediction: was a girl she would be
ref:  hate smiling, prediction: can't
ref:  good friend riley, prediction: good person.
ref:  need to mention that sorry, prediction: know i was going to


In [None]:
# !!! CONCLUSIONS !!!

# - The pretrained GPT-2 transformer scored slightly better on the ROUGE metrics. Rouge1: LSTM: 0.0468 vs Transformer: 0.0687
# - The LSTM model slightly improves its prediction accuracy with every additional training epoch. It does not overfit: the train loss and val loss values are close.
#       Epoch 1 | Train Loss: 2.939 | Val Loss: 2.278 | Val Accuracy: 64.04%
#       Epoch 2 | Train Loss: 2.234 | Val Loss: 2.249 | Val Accuracy: 64.33%
#       Epoch 3 | Train Loss: 2.203 | Val Loss: 2.239 | Val Accuracy: 64.46%
# - A manual check of how meaningful the completions are on selected examples leads to the decision to use the gpt2 transformer.