In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split as tts
import numpy as np
import re
import torch.nn.utils.rnn as rnn_utils

# 데이터 불러오기
df = pd.read_csv("netflix_reviews.csv")  # 파일 불러오기

# 텍스트 전처리 함수
def preprocess_text(text):
    if isinstance(text, float):
        return ""
    text = text.lower()  # 대문자를 소문자로
    text = re.sub(r'[^\w\s]', '', text)  # 구두점 제거
    text = re.sub(r'\d+', '', text)  # 숫자 제거
    text = text.strip()  # 띄어쓰기 제외하고 빈 칸 제거
    return text

df['content'] = df['content'].apply(preprocess_text)  # 텍스트 전처리

X = df['content'].tolist()  # 리뷰 리스트
y = df['score'].tolist()  # 점수 리스트

# 어휘 정수화
def text_pipeline(text):
    return [vocab[token] for token in tokenizer(text)]

def label_pipeline(label):
    return np.int64(label) - 1

X_tr, X_te, y_tr, y_te = tts(X, y, test_size=0.2, random_state=42)

# 데이터셋 클래스 정의
class ReviewDataset(Dataset):
    def __init__(self, X, y, text_pipeline, label_pipeline):
        self.X = X
        self.y = y
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        review = self.text_pipeline(self.X[idx])
        rating = self.label_pipeline(self.y[idx])
        return torch.tensor(review, dtype=torch.long), torch.tensor(rating, dtype=torch.long)

# 토크나이저 정의
tokenizer = get_tokenizer('basic_english')

# 어휘 사전 생성 함수
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# 데이터셋 정의
train_dataset = ReviewDataset(X_tr, y_tr, text_pipeline, label_pipeline)
test_dataset = ReviewDataset(X_te, y_te, text_pipeline, label_pipeline)

# 어휘 사전 생성
vocab = build_vocab_from_iterator(yield_tokens(X))

# 데이터 로더 정의
BATCH_SIZE = 16

def collate_batch(batch):
    reviews, labels = zip(*batch)
    reviews_padded = rnn_utils.pad_sequence(reviews, batch_first=True)
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    return reviews_padded, labels_tensor

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# LSTM 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super(LSTMModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, batch_first=True, dropout=dropout_rate)  # num_layers=2로 변경
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)  # 드롭아웃 레이어

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded.unsqueeze(1))  # 배치 차원 추가
        hidden = self.dropout(hidden[-1])  # 드롭아웃 적용
        return self.fc(hidden)

# 하이퍼파라미터 정의
VOCAB_SIZE = len(vocab)
EMBED_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = len(set(y))

# 모델 초기화 및 GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)  # 모델을 GPU로 이동

# 손실 함수와 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 모델 학습 함수 정의
def train_model(model, train_dataloader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_dataloader:
            reviews, labels = batch
            reviews, labels = reviews.to(device), labels.to(device)  # GPU로 이동
            optimizer.zero_grad()  # 기울기 초기화
            output = model(reviews)  # 순전파
            loss = criterion(output, labels)  # 손실 계산
            loss.backward()  # 역전파
            optimizer.step()  # 매개변수 업데이트
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}')

# 모델 학습
num_epochs = 10
train_model(model, train_dataloader, criterion, optimizer, num_epochs)

# 테스트 세트에서 모델 평가
def evaluate_model(model, test_dataloader):
    correct = 0
    total = 0
    model.eval()  # 평가 모드로 전환
    with torch.no_grad():
        for reviews, ratings in test_dataloader:
            reviews, ratings = reviews.to(device), ratings.to(device)  # GPU로 이동
            outputs = model(reviews)
            _, predicted = torch.max(outputs, 1)
            total += ratings.size(0)
            correct += (predicted == ratings).sum().item()

    print(f'Accuracy: {100 * correct / total}%')

evaluate_model(model, test_dataloader)


117134lines [00:04, 27893.06lines/s]


Epoch 1, Loss: 1.4418251464569225
Epoch 2, Loss: 1.4283797522443582
Epoch 3, Loss: 1.4211965424016777
Epoch 4, Loss: 1.41756171771685
Epoch 5, Loss: 1.4145471741564144
Epoch 6, Loss: 1.4112727279465251
Epoch 7, Loss: 1.4057372415535927
Epoch 8, Loss: 1.3953191742201765
Epoch 9, Loss: 1.3764440944643788
Epoch 10, Loss: 1.352955166198948
Accuracy: 47.15499210312887%
