In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
# Load data
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv')

# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [3]:
# 1. Define Custom Dataset Class
class NumpyDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.targets[idx]
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        return x, y

# 2. Define LSTM Model Class
class LSTMNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# 3. Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 4. Create Test Function
def test_model(model, data_loader):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            outputs = model(inputs)
            all_preds.append(outputs.cpu().numpy())
            all_targets.append(targets.cpu().numpy())
    all_preds = np.concatenate(all_preds).flatten().round()
    all_targets = np.concatenate(all_targets).flatten()
    return all_targets, all_preds

# 5. Create Evaluation Function
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [None]:
# Define Training Parameters
#input_size = 768  # Number of features
hidden_size = 128  # Example hidden layer size
num_layers = 2 # Number of LSTM layers
output_size = 1 # Output size (single scalar value)
batch_size = 32 # Define Batch Size
num_epochs = 10 # Number of epochs

# BoW

## 3000 Features

### Unigram

In [None]:
input_size = 3000  # Number of features

# Bow 3000 Unigram
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer


# Unigram (varsayılan)
vectorizer_bow_unigram = CountVectorizer(max_features=3000)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

# Convert sparse matrix to dense matrix
X_bow_unigram = X_bow_unigram.todense()

print("X_bow_unigram Feature Shape:", X_bow_unigram.shape)

# Convert to numpy array
X_bow_unigram = np.asarray(X_bow_unigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_unigram)
X_bow_unigram = model.transform(X_bow_unigram)


In [13]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_bow_unigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [19]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)
     

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)
     

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using unigrams and 3000 features:")
evaluate_model(y_test, y_pre)

### Bigram

In [23]:
input_size = 3000  # Number of features

# Bow 3000 Unigram
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer


# Bigram
vectorizer_bow_bigram = CountVectorizer(max_features=3000, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

# Convert sparse matrix to dense matrix
X_bow_bigram = X_bow_bigram.todense()

print("X_bow_bigram Feature Shape:", X_bow_bigram.shape)

# Convert to numpy array
X_bow_bigram = np.asarray(X_bow_bigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_bigram)
X_bow_bigram = model.transform(X_bow_bigram)


X_bow_bigram Feature Shape: (50000, 3000)


In [24]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_bow_bigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

Train Feature Shape: (25000, 3000)
Train Labels Shape: (25000,)
Test Feature Shape: (25000, 3000)
Test Labels Shape: (25000,)


In [25]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

Epoch [1/10], Loss: 0.0137
Epoch [2/10], Loss: 0.0263
Epoch [3/10], Loss: 0.0005
Epoch [4/10], Loss: 0.0002
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0001


In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using bigrams and 3000 features:")
evaluate_model(y_test, y_pre)

LSTM with BoW using unigrams and 3000 features:
Accuracy: 0.87
Precision: 0.86
Recall: 0.88
F1-Score: 0.87


### Trigram

In [None]:
input_size = 3000  # Number of features

# Bow 3000 Unigram
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

# Trigram
vectorizer_bow_trigram = CountVectorizer(max_features=3000, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

# Convert sparse matrix to dense matrix
X_bow_trigram = X_bow_trigram.todense()


print("X_bow_trigram Feature Shape:", X_bow_trigram.shape)

# Convert to numpy array
X_bow_trigram = np.asarray(X_bow_trigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_trigram)
X_bow_trigram = model.transform(X_bow_trigram)

In [None]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_bow_trigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using trigrams and 3000 features:")
evaluate_model(y_test, y_pre)