## LSTM

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize


In [2]:
# Load data
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv') 
df.head()


Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned_review,sentiment_numeric,tokens
0,0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1,"['one', 'reviewer', 'mentioned', 'watching', '..."
1,1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1,"['wonderful', 'little', 'production', 'filming..."
2,2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1,"['thought', 'wonderful', 'way', 'spend', 'time..."
3,3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0,"['basically', 'there', 'family', 'little', 'bo..."
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1,"['petter', 'matteis', 'love', 'time', 'money',..."


## LSTM related classes and methods

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# 1. Define Custom Dataset Class
class NumpyDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.targets[idx]
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        return x, y

# 2. Define LSTM Model Class
class LSTMNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# 3. Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 4. Create Test Function
def test_model(model, test_loader):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            outputs = model(inputs)
            all_preds.append(outputs.cpu().numpy())
            all_targets.append(targets.cpu().numpy())
    all_preds = np.concatenate(all_preds).flatten().round()
    all_targets = np.concatenate(all_targets).flatten()
    return all_targets, all_preds

# Define LSTM Training Parameters
# input_size = 5000  # Input size (BoW features)
hidden_size = 128  # Example hidden layer size
num_layers = 2 # Number of LSTM layers
output_size = 1 # Output size (single scalar value)
batch_size = 32 # Define Batch Size
num_epochs = 10 # Number of epochs

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [None]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

'# Function to print evaluation metrics\ndef evaluate_model(true_labels, predicted_labels):\n    accuracy = accuracy_score(true_labels, predicted_labels)\n    precision = precision_score(true_labels, predicted_labels)\n    recall = recall_score(true_labels, predicted_labels)\n    f1 = f1_score(true_labels, predicted_labels)\n    \n    print(f"Accuracy: {accuracy:.2f}")\n    print(f"Precision: {precision:.2f}")\n    print(f"Recall: {recall:.2f}")\n    print(f"F1-Score: {f1:.2f}")'

## 3000 Features

In [None]:
input_size = 3000

### Unigram

In [19]:
# Unigram (varsayılan)
vectorizer_bow_unigram = CountVectorizer(max_features=3000)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review']).toarray()

print("BoW Unigram Feature Shape with 3000 features:", X_bow_unigram.shape)

# Convert to numpy array
X_bow_unigram = np.asarray(X_bow_unigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_unigram)
X_bow_unigram = model.transform(X_bow_unigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_unigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)


BoW Unigram Feature Shape with 3000 features: (50000, 3000)


In [20]:

train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)


In [21]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)


Epoch [1/10], Loss: 0.0971
Epoch [2/10], Loss: 0.0022
Epoch [3/10], Loss: 0.0001
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0001
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [22]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using unigrams and 3000 features:")
evaluate_model(y_test, y_pre)

LSTM with BoW using unigrams and 3000 features:
Accuracy: 0.87
Precision: 0.85
Recall: 0.88
F1-Score: 0.87


### Bigram

In [11]:
# Bigram
vectorizer_bow_bigram = CountVectorizer(max_features=3000, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review']).toarray()

print("BoW Bigram Feature Shape with 3000 features:", X_bow_bigram.shape)

# Convert to numpy array
X_bow_bigram = np.asarray(X_bow_bigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_bigram)
X_bow_bigram = model.transform(X_bow_bigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_bigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

BoW Bigram Feature Shape with 3000 features: (50000, 3000)


In [12]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [13]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

Epoch [1/10], Loss: 0.1599
Epoch [2/10], Loss: 0.0019
Epoch [3/10], Loss: 0.0002
Epoch [4/10], Loss: 0.0001
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [14]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using bigrams and 3000 features:")
evaluate_model(y_test, y_pre)

LSTM with BoW using bigrams and 3000 features:
Accuracy: 0.86
Precision: 0.85
Recall: 0.88
F1-Score: 0.87


### Trigram

In [15]:
# Trigram
vectorizer_bow_trigram = CountVectorizer(max_features=3000, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review']).toarray()

print("BoW Trigram Feature Shape with 3000 features:", X_bow_trigram.shape)

# Convert to numpy array
X_bow_trigram = np.asarray(X_bow_trigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_trigram)
X_bow_trigram = model.transform(X_bow_trigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_trigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

BoW Trigram Feature Shape with 3000 features: (50000, 3000)


In [16]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [17]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

Epoch [1/10], Loss: 0.1205
Epoch [2/10], Loss: 0.0084
Epoch [3/10], Loss: 0.0001
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [18]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using trigrams and 3000 features:")
evaluate_model(y_test, y_pre)

LSTM with BoW using trigrams and 3000 features:
Accuracy: 0.86
Precision: 0.85
Recall: 0.89
F1-Score: 0.87


## 5000 Features

In [23]:
input_size = 5000

### Unigram

In [24]:
# Unigram (varsayılan)
vectorizer_bow_unigram = CountVectorizer(max_features=5000)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review']).toarray()

print("BoW Unigram Feature Shape with 5000 features:", X_bow_unigram.shape)

# Convert to numpy array
X_bow_unigram = np.asarray(X_bow_unigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_unigram)
X_bow_unigram = model.transform(X_bow_unigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_unigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

BoW Unigram Feature Shape with 5000 features: (50000, 5000)


In [25]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [26]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)


Epoch [1/10], Loss: 0.0632
Epoch [2/10], Loss: 0.0016
Epoch [3/10], Loss: 0.0016
Epoch [4/10], Loss: 0.0002
Epoch [5/10], Loss: 0.0001
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [27]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using unigrams and 5000 features:")
evaluate_model(y_test, y_pre)


LSTM with BoW using unigrams and 5000 features:
Accuracy: 0.86
Precision: 0.83
Recall: 0.90
F1-Score: 0.86


### Bigram

In [None]:
vectorizer_bow_bigram = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review']).toarray()

print("BoW bigram Feature Shape with 5000 features:", X_bow_bigram.shape)

# Convert to numpy array
X_bow_bigram = np.asarray(X_bow_bigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_bigram)
X_bow_bigram = model.transform(X_bow_bigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_bigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [None]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using bigrams and 5000 features:")
evaluate_model(y_test, y_pre)

### Trigram

In [None]:
# Trigram
vectorizer_bow_trigram = CountVectorizer(max_features=5000, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review']).toarray()

print("BoW trigram Feature Shape with 5000 features:", X_bow_trigram.shape)

# Convert to numpy array
X_bow_trigram = np.asarray(X_bow_trigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_trigram)
X_bow_trigram = model.transform(X_bow_trigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_trigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [None]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using trigrams and 5000 features:")
evaluate_model(y_test, y_pre)

## 7000 Features

In [None]:
input_size = 7000

### Unigram

In [None]:
# Unigram (varsayılan)
vectorizer_bow_unigram = CountVectorizer(max_features=7000)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review']).toarray()

print("BoW unigram Feature Shape with 7000 features:", X_bow_unigram.shape)

# Convert to numpy array
X_bow_unigram = np.asarray(X_bow_unigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_unigram)
X_bow_unigram = model.transform(X_bow_unigram)


# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_unigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [None]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using unigrams and 7000 features:")
evaluate_model(y_test, y_pre)

### Bigram

In [None]:
# Bigram
vectorizer_bow_bigram = CountVectorizer(max_features=7000, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review']).toarray()

print("BoW bigram Feature Shape with 7000 features:", X_bow_bigram.shape)

# Convert to numpy array
X_bow_bigram = np.asarray(X_bow_bigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_bigram)
X_bow_bigram = model.transform(X_bow_bigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_bigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)


In [None]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using bigrams and 7000 features:")
evaluate_model(y_test, y_pre)

### Trigram

In [None]:
# Trigram
vectorizer_bow_trigram = CountVectorizer(max_features=7000, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review']).toarray()

print("BoW trigram Feature Shape with 7000 features:", X_bow_trigram.shape)

# Convert to numpy array
X_bow_trigram = np.asarray(X_bow_trigram)

scaler = StandardScaler()
model = scaler.fit(X_bow_trigram)
X_bow_trigram = model.transform(X_bow_trigram)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow_trigram, df['sentiment_numeric'].values, test_size=0.5, random_state=42)


In [None]:
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW using trigrams and 7000 features:")
evaluate_model(y_test, y_pre)