In [1]:
import torch

# Report which CUDA hardware is visible, then pick the device used by the rest of the notebook.
cuda_ready = torch.cuda.is_available()

print(cuda_ready)
print(torch.cuda.device_count())
if cuda_ready:
    print(torch.cuda.get_device_name(0))
else:
    print("no gpu")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

True
1
Tesla T4


In [None]:
# !!! DATASET LOADING !!!

from tqdm import tqdm

from src.data_utils import clean_string, save_dataset
from src.next_token_dataset import load_sentiment140

# Load the raw texts and save them as-is before any processing.
texts = load_sentiment140()
save_dataset(texts, "raw_dataset.csv")

# Clean only the first 50000 texts; every later cell works on this cleaned subset.
cleaned_texts = [clean_string(text) for text in texts[:50000]]
save_dataset(cleaned_texts, "dataset_processed.csv")

  from .autonotebook import tqdm as notebook_tqdm


Saved 1600000 texts to data/raw_dataset.csv
Saved 1600000 texts to data/dataset_processed.csv


'data/dataset_processed.csv'

In [3]:
# !!! TRAIN/VAL/TEST SPLIT !!!

from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the cleaned texts; the fixed seed keeps the split reproducible.
train_texts, temp_texts = train_test_split(cleaned_texts, test_size=0.2, random_state=42)

# Stage 2: halve the held-out part into validation and test (10% of the total each).
val_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)

# Save each split under its own file name.
for split_texts, file_name in (
    (train_texts, "train.csv"),
    (val_texts, "val.csv"),
    (test_texts, "test.csv"),
):
    save_dataset(split_texts, file_name)

# Sanity check: sizes of the three splits, in train / val / test order.
print(*map(len, (train_texts, val_texts, test_texts)))

Saved 1280000 texts to data/train.csv
Saved 160000 texts to data/val.csv
Saved 160000 texts to data/test.csv
1280000 160000 160000


In [4]:
# !!! TOKENIZATION AND DATALOADERS !!!

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
from src.data_utils import BertDataset
from src.data_utils import collate_fn
from src.next_token_dataset import TextCompletionDataset
from transformers import AutoTokenizer

# Tokenizer for the LSTM pipeline: the distilgpt2 vocabulary extended with the extra
# tokens listed in special_tokens. The EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
special_tokens = ['<user>', '<url>', '<emotion>']
tokenizer.add_tokens(special_tokens)
tokenizer.pad_token = tokenizer.eos_token

# Tokenizer for the pretrained transformer baseline: stock vocabulary, no added tokens.
# Its padding token is taken from its OWN EOS token so that this object does not
# silently depend on how `tokenizer` above is configured.
transformer_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
transformer_tokenizer.pad_token = transformer_tokenizer.eos_token

# Build BertDataset objects for the train/val texts and save their `.samples`.
train_dataset = BertDataset(train_texts, tokenizer)
val_dataset = BertDataset(val_texts, tokenizer)

save_dataset(train_dataset.samples, "train_dataset_tokenized.csv")
save_dataset(val_dataset.samples, "val_dataset_tokenized.csv")

print(f"train_dataset: {len(train_dataset)}, val_dataset: {len(val_dataset)}")

# Datasets that actually feed the loaders.
# NOTE(review): validation is also built with mode='train' - presumably so it yields
# the same input/target layout as training for loss computation; confirm.
final_train_dataset = TextCompletionDataset(train_texts, tokenizer, mode='train')
final_val_dataset = TextCompletionDataset(val_texts, tokenizer, mode='train')
final_test_dataset = TextCompletionDataset(test_texts, tokenizer, mode='inference')

# Dataloaders. Only the training loader shuffles; the test loader yields one sample
# per batch (the LSTM ROUGE cell reads element 0 of every batch).
train_loader = DataLoader(final_train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(final_val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(final_test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

print(f"train_loader: {len(train_loader)}, val_loader: {len(val_loader)}")

KeyboardInterrupt: 

In [None]:
# !!! LSTM CREATION !!!

from src.lstm_model import LSTMClassifier, count_parameters

# len(tokenizer) is the full vocabulary size, including the tokens added on top of
# the base vocabulary, so no manual "+ len(special_tokens)" correction is needed.
hidden_dim = 128
vocab_size = len(tokenizer)

# Build the model, then move it to the selected device.
model = LSTMClassifier(vocab_size, hidden_dim)
model = model.to(device)

param_count = count_parameters(model)
print(param_count)

13048916


In [None]:
# !!! LSTM TRAINING !!!

from src.eval_lstm import evaluate
from src.lstm_train import train
from torch.nn.utils.rnn import pad_sequence

import torch
import torch.nn as nn

# Padding positions are excluded from the loss via ignore_index.
pad_id = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.002)

# NOTE(review): the trailing positional 3 is presumably the number of epochs - confirm
# against the signature of src.lstm_train.train.
train(
    model, train_loader, val_loader,
    criterion, optimizer, evaluate,
    pad_id, device, 3,
)

  0%|          | 100/20000 [00:04<16:07, 20.57it/s]


KeyboardInterrupt: 

In [None]:
import torch

from src.eval_transformer_pipeline import evaluate_rouge

# !!! ROUGE EVALUATION OF THE LSTM !!!
predictionsLSTM = []
referencesLSTM = []

# Generation is pure inference: put the model in eval mode (so train-only layers such
# as dropout are disabled) and turn off autograd tracking for the whole loop.
# NOTE(review): model.generate may already do this internally; doing it here is harmless.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Generating"):
            # test_loader is built with batch_size=1, so index 0 is the only sample
            input_ids = batch["input_ids"][0]
            target_ids = batch["target_ids"][0]

            # Ask for exactly as many new tokens as the reference continuation has
            gen_text = model.generate(input_ids, max_new_tokens=len(target_ids), tokenizer=tokenizer)

            target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

            predictionsLSTM.append(gen_text)
            referencesLSTM.append(target_text)
finally:
    # Restore the mode the model was in, so re-running training cells is unaffected
    model.train(was_training)

evaluate_rouge(predictions=predictionsLSTM, references=referencesLSTM)


Generating: 100%|██████████| 10000/10000 [00:48<00:00, 208.24it/s]


Metrics
rouge1: 0.0455
rouge2: 0.0038
rougeL: 0.0429
rougeLsum: 0.0429
Examples:
ref:  dates im so sad, prediction: <user> warped but you guys wont be on the florida isnt working on


In [None]:
# !!! TRANSFORMER RUN !!!

import torch

from transformers import pipeline
from src.eval_transformer_pipeline import evaluate_rouge
from tqdm import tqdm

# Pretrained distilgpt2 baseline. The device is chosen at runtime instead of being
# hard-coded to GPU 0, so the cell also runs on CPU-only machines.
generator = pipeline(
    "text-generation",
    model="distilgpt2",
    tokenizer=transformer_tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # first GPU, or -1 for CPU
)

predictions = []
references = []

for sample in tqdm(final_test_dataset, desc="Generating", unit="sample"):
    # Turn the stored token ids back into text for the prompt and the reference
    input_text = tokenizer.decode(
        sample["input_ids"],
        skip_special_tokens=True
    )

    target_text = tokenizer.decode(
        sample["target_ids"],
        skip_special_tokens=True
    )

    # Greedy decoding, generating as many new tokens as the reference continuation has
    out = generator(
        input_text,
        max_new_tokens=len(sample["target_ids"]),
        do_sample=False,
        temperature=None
    )

    # The pipeline returns prompt + continuation; keep only the continuation
    gen_text = out[0]["generated_text"]
    gen_completion = gen_text[len(input_text):].strip()

    predictions.append(gen_completion)
    references.append(target_text)

Device set to use cpu
Generating: 100%|██████████| 10000/10000 [15:15<00:00, 10.92sample/s]
Device set to use cpu


In [None]:
# !!! ROUGE EVALUATION OF THE TRANSFORMER !!!

# Score the transformer completions against their reference continuations.
evaluate_rouge(
    predictions=predictions,
    references=references,
)

Metrics
rouge1: 0.0707
rouge2: 0.0087
rougeL: 0.0702
rougeLsum: 0.0703
Examples:
ref:  dates im so sad, prediction: .
