In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load Dataset
try:
    amazon_df = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', header=None, names=['sentence', 'label'])
    imdb_df = pd.read_csv('imdb_labelled.txt', delimiter='\t', header=None, names=['sentence', 'label'])
    yelp_df = pd.read_csv('yelp_labelled.txt', delimiter='\t', header=None, names=['sentence', 'label'])
except FileNotFoundError:
    print("Dataset files not found. Please ensure the dataset files are in the current directory.")
    exit()

# Combine datasets
df = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)

# Preprocess Text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return words

df['tokens'] = df['sentence'].apply(preprocess_text)

# Build Vocabulary
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab_size = 1000
most_common_tokens = Counter(all_tokens).most_common(vocab_size)
word_to_idx = {word: idx + 1 for idx, (word, _) in enumerate(most_common_tokens)}  # Start from 1 for padding

# Vectorize Tokens
def vectorize_tokens(tokens, word_to_idx, max_len=20):
    vector = [word_to_idx.get(token, 0) for token in tokens]
    if len(vector) < max_len:
        vector += [0] * (max_len - len(vector))
    else:
        vector = vector[:max_len]
    return vector

max_len = 20
df['vector'] = df['tokens'].apply(lambda x: vectorize_tokens(x, word_to_idx, max_len))

# Split Data
X = np.stack(df['vector'].values)
y = df['label'].values

split_ratio = 0.8
split_index = int(len(df) * split_ratio)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Create DataLoader
batch_size = 32
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float32))
test_data = TensorDataset(torch.tensor(X_test, dtype=torch.long), torch.tensor(y_test, dtype=torch.float32))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, dropout=0.5):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

# Model Parameters
embed_size = 64
hidden_size = 128
output_size = 1
num_layers = 2

model = SentimentLSTM(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()

accuracy = correct / total
print(f'\nTest Accuracy: {accuracy * 100:.2f}%')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asraf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [1/20], Loss: 0.6937
Epoch [2/20], Loss: 0.6773
Epoch [3/20], Loss: 0.6096
Epoch [4/20], Loss: 0.5296
Epoch [5/20], Loss: 0.4544
Epoch [6/20], Loss: 0.3887
Epoch [7/20], Loss: 0.3360
Epoch [8/20], Loss: 0.2770
Epoch [9/20], Loss: 0.2426
Epoch [10/20], Loss: 0.2003
Epoch [11/20], Loss: 0.1717
Epoch [12/20], Loss: 0.1502
Epoch [13/20], Loss: 0.1534
Epoch [14/20], Loss: 0.1345
Epoch [15/20], Loss: 0.1158
Epoch [16/20], Loss: 0.1239
Epoch [17/20], Loss: 0.1088
Epoch [18/20], Loss: 0.1039
Epoch [19/20], Loss: 0.1122
Epoch [20/20], Loss: 0.0970

Test Accuracy: 73.82%
