In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import re
import torch.nn.utils.rnn as rnn_utils

# 데이터 불러오기 및 전처리
df = pd.read_csv("netflix_reviews.csv")
df['content'] = df['content'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower().strip()) if isinstance(x, str) else "")
X, y = df['content'].tolist(), df['score'].tolist()

# 데이터셋 클래스 정의
class ReviewDataset(Dataset):
    def __init__(self, X, y, text_pipeline, label_pipeline):
        self.X, self.y = X, y
        self.text_pipeline, self.label_pipeline = text_pipeline, label_pipeline

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.text_pipeline(self.X[idx]), dtype=torch.long), torch.tensor(self.label_pipeline(self.y[idx]), dtype=torch.long)

# 토크나이저 및 어휘 사전 생성
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator([tokenizer(text) for text in X])

# 데이터 로더 정의
def collate_batch(batch):
    reviews, labels = zip(*batch)
    reviews_padded = rnn_utils.pad_sequence(reviews, batch_first=True)
    return reviews_padded, torch.tensor(labels, dtype=torch.long)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_dataset = ReviewDataset(X_train, y_train, lambda text: [vocab[token] for token in tokenizer(text)], lambda label: int(label) - 1)
test_dataset = ReviewDataset(X_test, y_test, lambda text: [vocab[token] for token in tokenizer(text)], lambda label: int(label) - 1)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

# LSTM 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded.unsqueeze(1))
        return self.fc(hidden[-1])

# 모델, 손실 함수, 옵티마이저 초기화
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(len(vocab), 64, 128, len(set(y))).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 모델 학습 함수
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for reviews, labels in dataloader:
            reviews, labels = reviews.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model(reviews)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}')

# 모델 평가 함수
def evaluate_model(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for reviews, ratings in dataloader:
            reviews, ratings = reviews.to(device), ratings.to(device)
            outputs = model(reviews)
            predicted = torch.max(outputs, 1)[1]
            total += ratings.size(0)
            correct += (predicted == ratings).sum().item()
    print(f'Accuracy: {100 * correct / total:.2f}%')

# 학습 및 평가
train_model(model, train_dataloader, criterion, optimizer, num_epochs=10)
evaluate_model(model, test_dataloader)


117134lines [00:00, 174254.39lines/s]


Epoch 1, Loss: 1.4636060463283656
Epoch 2, Loss: 1.4364671775505404
Epoch 3, Loss: 1.4358282369151458
Epoch 4, Loss: 1.4349439838233662
Epoch 5, Loss: 1.434109673809273
Epoch 6, Loss: 1.4325827921616747
Epoch 7, Loss: 1.4309258018337419
Epoch 8, Loss: 1.4291480773138105
Epoch 9, Loss: 1.427471437795984
Epoch 10, Loss: 1.425643097418567
Accuracy: 39.14%
