In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Read the IMDB movie-review CSV from the Kaggle input directory.
reviews_csv = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(reviews_csv)
print(f"Original dataset size: {len(df)}")

In [None]:
def generate_rating(review):
    """Derive a synthetic 1-5 rating from sentiment keywords found in a review.

    Every keyword is tested once as a case-insensitive substring of the review,
    so inflected forms such as "recommended" or "wasted" also count. Strong
    keywords weigh 2 and moderate keywords weigh 1; negative keywords subtract.

    Score to rating mapping:
        score >= 3        -> 5
        1 <= score <= 2   -> 4
        score == 0        -> 3
        -2 <= score <= -1 -> 2
        score <= -3       -> 1

    Args:
        review: Review text. Non-string values (for example a missing value
            read as NaN) are converted with ``str`` instead of raising.

    Returns:
        int: Rating between 1 and 5 inclusive.
    """
    strong_positive = ('masterpiece', 'excellent', 'perfect', 'outstanding', 'brilliant')
    moderate_positive = ('great', 'good', 'wonderful', 'enjoyable', 'recommend')
    moderate_negative = ('poor', 'bad', 'disappointing', 'weak', 'lacking')
    strong_negative = ('awful', 'terrible', 'horrible', 'worst', 'waste')

    # Lower-case once instead of once per keyword.
    text = str(review).lower()

    def count_hits(keywords):
        # Each keyword contributes at most 1, however often it occurs.
        return sum(1 for word in keywords if word in text)

    total_score = (
        2 * count_hits(strong_positive)
        + count_hits(moderate_positive)
        - count_hits(moderate_negative)
        - 2 * count_hits(strong_negative)
    )

    if total_score >= 3:
        return 5
    if total_score >= 1:
        return 4
    if total_score == 0:
        return 3
    if total_score >= -2:
        return 2
    return 1

In [None]:
# Label every review with the keyword-based rating.
df['rating'] = df['review'].apply(generate_rating)
df['class_label'] = df['rating'] - 1  # shift ratings 1-5 to class ids 0-4

# Inspect how many reviews landed in each rating bucket.
print("\nRealistic rating distribution:")
rating_dist = df['rating'].value_counts().sort_index()
print(rating_dist)

# Bar chart of the rating counts, with the count written above every bar.
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=rating_dist.index, y=rating_dist.values, palette="viridis")
plt.title('Distribution of Movie Ratings', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.0f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),  # nudge the label 9 points above the bar top
        textcoords='offset points',
        fontsize=12,
    )

plt.tight_layout()
plt.savefig('rating_distribution.png')
plt.show()


In [None]:
# Hold out 10% of the data for testing, then 10% of the remainder for
# validation; both splits are stratified on the class label.
X = df['review'].values
y = df['class_label'].values
x_rest, x_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=0.1, stratify=y_rest, random_state=42
)

print(f"\nTrain size: {len(x_train)}")
print(f"Validation size: {len(x_val)}")
print(f"Test size: {len(x_test)}")

# Balanced class weights, used later to counter the class imbalance in the loss.
present_classes = np.unique(y_train)
balanced_weights = compute_class_weight('balanced', classes=present_classes, y=y_train)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
print("\nClass weights:", class_weights.cpu().numpy())

In [None]:
# Tokenizer for the uncased BERT-base checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        reviews: Indexable collection of review texts.
        labels: Indexable collection of integer class labels, aligned with
            ``reviews``.
        tokenizer: Callable tokenizer; invoked with the review text and the
            keyword arguments used in ``__getitem__``.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len=256):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded review at index ``item``.

        Returns:
            dict with keys ``'review_text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'label'`` (0-dim ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        label = self.labels[item]

        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # the keyword arguments are unchanged.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns a leading batch dimension; drop it so the
            # DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
MAX_LEN = 256
BATCH_SIZE = 16

# One ReviewDataset per split, all sharing the tokenizer and length limit.
train_dataset = ReviewDataset(reviews=x_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = ReviewDataset(reviews=x_val, labels=y_val, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = ReviewDataset(reviews=x_test, labels=y_test, tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# BERT sequence classifier with five output labels (class ids 0-4).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=5,
)
# Left as the cell's final expression so the notebook output is unchanged.
model.to(device)

In [None]:
# Optimisation setup
EPOCHS = 4
optimizer = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs.
total_steps = EPOCHS * len(train_loader)
# Warm up over the first 10% of the steps.
# NOTE(review): this value is a float; the scheduler's documented type is
# presumably int - confirm before changing.
warmup_steps = 0.1 * total_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler, class_weights=None):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Classifier called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.logits`` and, when labels are
            passed, ``.loss``. ``model.config.num_labels`` is read when
            ``class_weights`` is given.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors) that also has a
            ``.dataset`` attribute supporting ``len``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        class_weights: Optional per-class weight tensor. When given, the loss is
            a weighted cross-entropy computed from the logits; otherwise the
            model's own ``outputs.loss`` is used.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch losses (NaN if there were no batches) and ``accuracy`` is a
        0-dim double tensor: correct predictions / ``len(data_loader.dataset)``.
    """
    model.train()
    losses = []
    # Start from a tensor so the return type is the same for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Build the weighted criterion once rather than once per batch.
    loss_fct = nn.CrossEntropyLoss(weight=class_weights) if class_weights is not None else None

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients up front so leftovers from an interrupted run cannot
        # leak into this update.
        optimizer.zero_grad()

        if loss_fct is None:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
        else:
            # Skip the model's built-in unweighted loss; only the weighted one
            # is used.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            loss = loss_fct(
                outputs.logits.view(-1, model.config.num_labels),
                labels.view(-1)
            )

        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = np.mean(losses) if losses else float("nan")
    accuracy = correct_predictions.double() / len(data_loader.dataset)

    return avg_loss, accuracy


In [None]:
def eval_model(model, data_loader, device, class_weights=None):
    """Evaluate ``model`` on ``data_loader`` without updating its parameters.

    Args:
        model: Classifier called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.logits`` and, when labels are
            passed, ``.loss``. ``model.config.num_labels`` is read when
            ``class_weights`` is given.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors) that also has a
            ``.dataset`` attribute supporting ``len``.
        device: Device the batch tensors are moved to.
        class_weights: Optional per-class weight tensor. When given, the loss is
            a weighted cross-entropy computed from the logits; otherwise the
            model's own ``outputs.loss`` is used.

    Returns:
        tuple: ``(avg_loss, accuracy, all_preds, all_labels)``. ``avg_loss`` is
        the mean of the per-batch losses (NaN if there were no batches),
        ``accuracy`` is a 0-dim double tensor (correct predictions /
        ``len(data_loader.dataset)``), and the two lists hold the predicted and
        true class ids in iteration order.
    """
    model.eval()
    losses = []
    all_preds = []
    all_labels = []
    # Start from a tensor so the return type is the same for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Build the weighted criterion once rather than once per batch.
    loss_fct = nn.CrossEntropyLoss(weight=class_weights) if class_weights is not None else None

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            if loss_fct is None:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
            else:
                # Skip the model's built-in unweighted loss; only the weighted
                # one is reported.
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                loss = loss_fct(
                    outputs.logits.view(-1, model.config.num_labels),
                    labels.view(-1)
                )

            preds = outputs.logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum()
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = np.mean(losses) if losses else float("nan")
    accuracy = correct_predictions.double() / len(data_loader.dataset)

    return avg_loss, accuracy, all_preds, all_labels

In [None]:
# Per-epoch metrics, plotted after training.
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Optimisation pass over the training split.
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, device, scheduler, class_weights
    )
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc.cpu().item())

    # Score the validation split; predictions and labels are not needed here.
    val_loss, val_acc, _, _ = eval_model(model, val_loader, device, class_weights)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc.cpu().item())

    print(f"Train loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"Validation loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")

    # Checkpoint only when validation accuracy beats the best seen so far.
    improved = val_acc > best_accuracy
    if improved:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
        print("Saved best model!")

In [None]:
plt.figure(figsize=(12, 5))

# One row per panel: history keys, legend labels, title and y-axis label.
# The first row is the loss panel, the second the accuracy panel.
curve_panels = [
    ('train_loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    ('train_acc', 'val_acc', 'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
]
for position, panel in enumerate(curve_panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    plt.subplot(1, 2, position)
    plt.plot(history[train_key], 'b-o', label=train_label)
    plt.plot(history[val_key], 'r-o', label=val_label)
    plt.title(panel_title, fontsize=14)
    plt.xlabel('Epochs', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.savefig('learning_curves.png')
plt.show()


In [None]:
# Restore the weights saved at the best validation accuracy.
# map_location keeps the load working when the checkpoint was written from a
# different device than the one in use now (e.g. saved on GPU, reloaded on CPU).
best_state = torch.load('best_model_state.bin', map_location=device)
model.load_state_dict(best_state)
model = model.to(device)

# Evaluate on the held-out test split.
test_loss, test_acc, all_preds, all_labels = eval_model(
    model,
    test_loader,
    device,
    class_weights
)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")

In [None]:
# Fix the class order explicitly so the matrix is always 5x5 and lines up with
# the tick labels, even if some class never occurs in the test labels or
# predictions.
rating_names = ['1', '2', '3', '4', '5']
class_ids = list(range(len(rating_names)))  # class id i corresponds to rating i + 1

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=rating_names,
            yticklabels=rating_names)
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted Ratings', fontsize=14)
plt.ylabel('True Ratings', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

In [None]:
# Pass the class ids explicitly: with target_names alone the call raises when
# fewer than five classes occur in the test labels and predictions.
rating_names = ['1', '2', '3', '4', '5']
class_ids = list(range(len(rating_names)))  # class id i corresponds to rating i + 1

print("\nClassification Report:")
# Dict form is kept in `report` for the per-class metric plot.
report = classification_report(
    all_labels, all_preds, labels=class_ids, target_names=rating_names, output_dict=True
)
# Text form for display.
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=rating_names))

In [None]:
metrics = ['precision', 'recall', 'f1-score']
class_names = ['1', '2', '3', '4', '5']

# One horizontal panel per metric, with one bar per rating class.
plt.figure(figsize=(12, 8))
for i, metric in enumerate(metrics):
    plt.subplot(3, 1, i+1)
    values = [report[class_name][metric] for class_name in class_names]
    plt.bar(class_names, values, color=sns.color_palette("viridis", len(class_names)))
    plt.title(f'{metric.capitalize()} per Class', fontsize=14)
    plt.ylabel(metric, fontsize=12)

    # Keep the zoomed 0.7-1.0 view when every score fits in it; otherwise lower
    # the floor so weak classes are not drawn as missing bars.
    lowest = min(values)
    y_floor = 0.7 if lowest >= 0.7 else max(0.0, lowest - 0.05)
    plt.ylim(y_floor, 1.0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write each value just above its bar.
    for j, v in enumerate(values):
        plt.text(j, v + 0.01, f"{v:.2f}", ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('class_metrics.png')
plt.show()

In [None]:
# Write the model and its tokenizer to the same output directory.
export_dir = "bert_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)
print("\nModel and tokenizer saved!")