# LSTM Model

### Import Datasets

In [13]:
import pandas as pd

# Import Datasets
df1 = pd.read_csv(r'C:\Users\megan\Desktop\ML Project\processed_df1.csv')
df2 = pd.read_csv(r'C:\Users\megan\Desktop\ML Project\processed_df2.csv')
df3 = pd.read_csv(r'C:\Users\megan\Desktop\ML Project\processed_df3.csv')

### Imports

In [14]:
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, f1_score
from collections import Counter
from torchtext.vocab import Vocab
import torch
import torch.nn as nn
import torch.optim as optim
import re

In [None]:
# Hyperparameters
learning_rate = 0.001   # Learning rate for the optimizer
batch_size = 32         # Batch size for DataLoader
embed_dim = 100         # Embedding dimension for word vectors
hidden_dim = 128        # Hidden dimension for LSTM
num_layers = 2          # Number of LSTM layers
dropout_rate = 0.2      # Dropout rate for regularization
epochs = 5              # Number of training epochs

# Define tokenizer function
def tokenizer(text):
    return re.findall(r'\b\w+\b', text.lower())

# Label encoding
label_encoder = LabelEncoder()
df1['encoded_label'] = label_encoder.fit_transform(df1['sentiment'])  
df3['encoded_label'] = label_encoder.transform(df3['sentiment']) 

# Manual token-to-index
counter = Counter()
for text in df1['cleaned_reviewText']: 
    counter.update(tokenizer(text))

# Add special tokens
counter.update(['<pad>', '<unk>'])

# Build vocab with special tokens
vocab = {word: idx for idx, (word, _) in enumerate(counter.items())}

# Map tokens to indices - <unk> if not found in vocab
def encode(tokens):
    unk_index = vocab['<unk>']
    return [vocab[token] if token in vocab else unk_index for token in tokens]

# Dataset class 
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        indices = encode(tokens)  
        return torch.tensor(indices), self.labels[idx]

# Collate function for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=vocab['<pad>']).long()
    return padded, torch.tensor(labels, dtype=torch.long)

# DataLoaders
train_dataset = SentimentDataset(df1['cleaned_reviewText'].tolist(), df1['encoded_label'].tolist(), vocab, tokenizer)
test_dataset = SentimentDataset(df3['review_description'].tolist(), df3['encoded_label'].tolist(), vocab, tokenizer)

train_loader_lstm = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader_lstm = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Define LSTM model
class modelLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout_rate):
        super(modelLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab['<pad>'])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout_rate)  # Dropout layer

    def forward(self, x):
        x = self.embedding(x)                  # (batch_size, seq_len, embed_dim)
        lstm_out, (hn, cn) = self.lstm(x)      # (batch_size, seq_len, hidden_dim)
        x = self.dropout(hn[-1])               # Take the last hidden state of last LSTM layer
        return self.fc(x)                      # (batch_size, num_classes)

# Instantiate model
model_lstm = modelLSTM(vocab_size=len(vocab), embed_dim=embed_dim, hidden_dim=hidden_dim, num_classes=3, 
                       num_layers=num_layers, dropout_rate=dropout_rate)

# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_lstm.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model_lstm.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader_lstm:
        optimizer.zero_grad()
        outputs = model_lstm(inputs)  # Forward pass
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    accuracy = correct_preds / total_preds
    avg_loss = running_loss / len(train_loader_lstm)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {round(avg_loss, 4)}, Accuracy: {round(accuracy, 4)}")

# Evaluate the model on the test set df3
model_lstm.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader_lstm: 
        outputs = model_lstm(inputs)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')
print("")
print("Evaluation using Test Data(df3): ")
print(f"Accuracy: {round(accuracy, 4)}")
print(f"Recall: {round(recall, 4)}")
print(f"F1 Score: {round(f1, 4)}")

Epoch 1/5, Loss: 1.016, Accuracy: 0.4984
Epoch 2/5, Loss: 1.01, Accuracy: 0.5027
Epoch 3/5, Loss: 1.0073, Accuracy: 0.5049
Epoch 4/5, Loss: 1.0027, Accuracy: 0.5067
Epoch 5/5, Loss: 1.0011, Accuracy: 0.5074

Evaluation using Test Data(df3): 
Accuracy: 0.9826
Recall: 0.9826
F1 Score: 0.9802


## Experimentation

### Hyperparameter Tuning

In [None]:
# New Hyperparameters
learning_rate = 0.005  # Increase lr from .001 to .005
batch_size = 16        # Smaller batch size
embed_dim = 100
hidden_dim = 64        # Decrease from 128 to 64
num_layers = 2
dropout_rate = 0.3     # Increase dropout_rate
epochs = 10            # Increase epochs

# Define tokenizer function
def tokenizer(text):
    return re.findall(r'\b\w+\b', text.lower())

# Label encoding
label_encoder = LabelEncoder()
df1['encoded_label'] = label_encoder.fit_transform(df1['sentiment'])  
df3['encoded_label'] = label_encoder.transform(df3['sentiment']) 

# Manual token-to-index
counter = Counter()
for text in df1['cleaned_reviewText']:  
    counter.update(tokenizer(text))

# Add special tokens
counter.update(['<pad>', '<unk>'])

# Build vocab with special tokens
vocab = {word: idx for idx, (word, _) in enumerate(counter.items())}

# Manually map tokens to indices
def encode(tokens):
    unk_index = vocab['<unk>']
    return [vocab[token] if token in vocab else unk_index for token in tokens]

# Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        indices = encode(tokens)  
        return torch.tensor(indices), self.labels[idx]

# Collate function for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=vocab['<pad>']).long()
    return padded, torch.tensor(labels, dtype=torch.long)

# DataLoaders
train_dataset = SentimentDataset(df1['cleaned_reviewText'].tolist(), df1['encoded_label'].tolist(), vocab, tokenizer)
test_dataset = SentimentDataset(df3['review_description'].tolist(), df3['encoded_label'].tolist(), vocab, tokenizer)

train_loader_lstm = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader_lstm = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define LSTM model
class modelLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout_rate):
        super(modelLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab['<pad>'])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout_rate)  # Dropout layer

    def forward(self, x):
        x = self.embedding(x)              
        lstm_out, (hn, cn) = self.lstm(x) 
        x = self.dropout(hn[-1])            
        return self.fc(x)                   

# Instantiate model
model_lstm = modelLSTM(vocab_size=len(vocab), embed_dim=embed_dim, hidden_dim=hidden_dim, num_classes=3, 
                       num_layers=num_layers, dropout_rate=dropout_rate)

# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_lstm.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model_lstm.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader_lstm:
        optimizer.zero_grad()
        outputs = model_lstm(inputs)  # Forward pass
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    accuracy = correct_preds / total_preds
    avg_loss = running_loss / len(train_loader_lstm)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {round(avg_loss, 4)}, Accuracy: {round(accuracy, 4)}")

# Evaluate the model on the test set df3
model_lstm.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader_lstm: 
        outputs = model_lstm(inputs)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')
print("")
print(f"[New HP] Accuracy: {round(accuracy, 4)}")
print(f"[New HP] Recall: {round(recall, 4)}")
print(f"[New HP] F1 Score: {round(f1, 4)}")

Epoch 1/10, Loss: 1.0139, Accuracy: 0.4992
Epoch 2/10, Loss: 1.0092, Accuracy: 0.5014
Epoch 3/10, Loss: 1.006, Accuracy: 0.5058
Epoch 4/10, Loss: 0.9985, Accuracy: 0.5089
Epoch 5/10, Loss: 0.9915, Accuracy: 0.5121
Epoch 6/10, Loss: 0.9871, Accuracy: 0.5145
Epoch 7/10, Loss: 0.9774, Accuracy: 0.5268
Epoch 8/10, Loss: 0.8127, Accuracy: 0.6622
Epoch 9/10, Loss: 0.6302, Accuracy: 0.7573
Epoch 10/10, Loss: 0.5345, Accuracy: 0.788

[New HP] Accuracy: 0.8228
[New HP] Recall: 0.8228
[New HP] F1 Score: 0.894


### Dataset Sizes

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load your dataset - Using df1
df_ds = pd.read_csv(r'C:\Users\megan\Desktop\ML Project\processed_df1.csv')

# Split data into training and testing
train_data, test_data = train_test_split(df_ds, test_size=0.2, random_state=42)

# Tokenization and padding function
def preprocess_data(data, tokenizer=None, max_words=10000, max_len=100):
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(data['cleaned_reviewText'])
    sequences = tokenizer.texts_to_sequences(data['cleaned_reviewText'])
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    return padded_sequences, tokenizer

# Convert labels to integers (sentiment)
def encode_labels(labels):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels)

# LSTM model
def ds_lstm_model(input_length, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=100, input_length=input_length))
    model.add(LSTM(128, activation='relu', return_sequences=False))  # LSTM layer
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))  # Softmax for multi-class classification
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to train and evaluate model
def train_and_evaluate_lstm(train_data, test_data, subset_size):
    # Subset the training data
    train_subset = train_data.sample(frac=subset_size, random_state=42)
    
    # Preprocess the data
    X_train, tokenizer = preprocess_data(train_subset, max_words=10000)
    X_test, _ = preprocess_data(test_data, tokenizer=tokenizer, max_words=10000)
    
    y_train = encode_labels(train_subset['sentiment'])
    y_test = encode_labels(test_data['sentiment'])
    
    # Build and train LSTM model
    model = ds_lstm_model(input_length=X_train.shape[1], num_classes=len(set(y_train)))
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate model
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Dataset Size: {subset_size * 100}% | Accuracy: {round(accuracy, 4)}\n")

# Experiment with 25% and 50% dataset sizes
dataset_sizes = [0.25, 0.5]  # 25% and 50% dataset sizes
for size in dataset_sizes:
    train_and_evaluate_lstm(train_data, test_data, size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dataset Size: 25.0% | Accuracy: 0.6058

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dataset Size: 50.0% | Accuracy: 0.6438



In [None]:
# Load your dataset - Using df3
df_ds = pd.read_csv(r'C:\Users\megan\Desktop\ML Project\processed_df3.csv')

# Split data into training and testing 
train_data, test_data = train_test_split(df_ds, test_size=0.2, random_state=42)

# Tokenization and padding function
def preprocess_data(data, tokenizer=None, max_words=10000, max_len=100):
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(data['review_description'])
    sequences = tokenizer.texts_to_sequences(data['review_description'])
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    return padded_sequences, tokenizer

# Convert labels to integers (sentiment)
def encode_labels(labels):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels)

# Build LSTM model
def ds_lstm_model(input_length, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=100, input_length=input_length))
    model.add(LSTM(128, activation='relu', return_sequences=False))  # LSTM layer
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))  # Softmax for multi-class classification
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to train and evaluate model
def train_and_evaluate_lstm(train_data, test_data, subset_size):
    # Subset the training data
    train_subset = train_data.sample(frac=subset_size, random_state=42)
    
    # Preprocess the data
    X_train, tokenizer = preprocess_data(train_subset, max_words=10000)
    X_test, _ = preprocess_data(test_data, tokenizer=tokenizer, max_words=10000)
    
    y_train = encode_labels(train_subset['sentiment'])
    y_test = encode_labels(test_data['sentiment'])
    
    # Build and train LSTM model
    model = ds_lstm_model(input_length=X_train.shape[1], num_classes=len(set(y_train)))
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate model
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Dataset Size: {subset_size * 100}% | Accuracy: {round(accuracy, 4)}\n")

# Experiment with 25% and 50% dataset sizes
dataset_sizes = [0.25, 0.5]  # 25% and 50% dataset sizes
for size in dataset_sizes:
    train_and_evaluate_lstm(train_data, test_data, size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dataset Size: 25.0% | Accuracy: 0.0109

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dataset Size: 50.0% | Accuracy: 0.0109

