In [27]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [29]:
df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [30]:
def clean_text(text):
    text = re.sub(r'<br\s*/><br\s*/>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

In [32]:
df['review']=df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [34]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [35]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [10]:
max_vocab_size = 20000
max_sequence_length = 200


In [11]:
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

In [12]:
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = df['sentiment'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape

(40000, 200)

In [15]:
embedding_dim = 100


In [None]:
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

In [None]:
embeddings_index['the'].shape

In [18]:
word_index = tokenizer.word_index
num_words = min(max_vocab_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

In [19]:
embedding_matrix.shape

(20000, 100)

In [20]:
for word, i in word_index.items():
    if i < max_vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [21]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.0035074 , -0.14286   ,  0.80261999, ..., -0.58814001,
         0.31889999,  0.012209  ],
       [ 0.20203   , -0.25244001, -0.12557   , ..., -0.16885   ,
        -0.99378997,  0.32501   ],
       [ 0.097328  ,  0.37051001, -0.34889001, ...,  0.037943  ,
         0.27794001,  0.68112999]])

In [22]:
model=Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_length,
              trainable=False),
    SimpleRNN(10,return_sequences=True),
    SimpleRNN(5,return_sequences=False),
    Dense(1, activation='sigmoid')
])

In [23]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 66ms/step - accuracy: 0.8544 - loss: 0.3313 - val_accuracy: 1.0000 - val_loss: 0.0484
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 65ms/step - accuracy: 1.0000 - loss: 0.0407 - val_accuracy: 1.0000 - val_loss: 0.0253
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 65ms/step - accuracy: 1.0000 - loss: 0.0224 - val_accuracy: 1.0000 - val_loss: 0.0158
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 66ms/step - accuracy: 1.0000 - loss: 0.0143 - val_accuracy: 1.0000 - val_loss: 0.0108
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 66ms/step - accuracy: 1.0000 - loss: 0.0100 - val_accuracy: 1.0000 - val_loss: 0.0078


In [24]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step
Test Accuracy: 1.0


In [25]:
model2=Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_length,
              trainable=False),
    LSTM(10,return_sequences=True),
    LSTM(5,return_sequences=False),
    Dense(1, activation='sigmoid')
])

In [26]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model2.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 108ms/step - accuracy: 0.9550 - loss: 0.3002 - val_accuracy: 1.0000 - val_loss: 0.0593
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 107ms/step - accuracy: 1.0000 - loss: 0.0472 - val_accuracy: 1.0000 - val_loss: 0.0258
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 108ms/step - accuracy: 1.0000 - loss: 0.0224 - val_accuracy: 1.0000 - val_loss: 0.0151
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 107ms/step - accuracy: 1.0000 - loss: 0.0136 - val_accuracy: 1.0000 - val_loss: 0.0100
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 107ms/step - accuracy: 1.0000 - loss: 0.0092 - val_accuracy: 1.0000 - val_loss: 0.0071


In [27]:
y_pred = (model2.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step
Test Accuracy: 1.0


Pytorch Embedding Implementation

In [23]:
import torch
from torch import nn
import torch.optim as optim
from torchtext.vocab import GloVe
import torchtext
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,0
1,a wonderful little production the filming tec...,0
2,i thought this was a wonderful way to spend ti...,0
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,0


In [22]:
reviews = df['review'].values
labels = df['sentiment'].values

In [48]:
# Tokenize reviews
def tokenize(text):
    return word_tokenize(text.lower())

# Tokenize all reviews
tokenized_reviews = [tokenize(review) for review in reviews]


In [49]:
vocab = Counter([word for review in tokenized_reviews for word in review])

# Create word to index mapping
word2idx = {word: i+2 for i, word in enumerate(vocab)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1


In [50]:
embedding_dim = 100  # or 50, 200 depending on the GloVe file you use
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

for word, idx in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [51]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels, word2idx, max_len=200):
        self.reviews = reviews
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        
        # Convert words to indices, pad if necessary
        indices = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in review]
        if len(indices) > self.max_len:
            indices = indices[:self.max_len]
        else:
            indices = indices + [self.word2idx['<PAD>']] * (self.max_len - len(indices))
        
        return torch.tensor(indices), torch.tensor(label)


In [52]:
# Split the data into training and testing sets
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    tokenized_reviews, labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = IMDBDataset(train_reviews, train_labels, word2idx)
test_dataset = IMDBDataset(test_reviews, test_labels, word2idx)

# Create dataloaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [53]:
class SentimentLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, dropout):
        super(SentimentLSTM, self).__init__()
        
        vocab_size, embedding_dim = embedding_matrix.shape
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Freeze GloVe embeddings
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        output = self.fc(self.dropout(hidden[-1]))
        return output

# Define model parameters
hidden_dim = 128
output_dim = 1
n_layers = 2
dropout = 0.5

# Initialize model
model = SentimentLSTM(embedding_matrix, hidden_dim, output_dim, n_layers, dropout).to(device)


In [54]:
# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        epoch_acc = 0
        for reviews, labels in train_loader:
            reviews, labels = reviews.to(device), labels.to(device).float()
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(reviews).squeeze(1)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=5)


Epoch 1/5, Loss: 0.009315132898115553
Epoch 2/5, Loss: 3.394587571820011e-05
Epoch 3/5, Loss: 1.4156521410041023e-05
Epoch 4/5, Loss: 7.81111176183913e-06
Epoch 5/5, Loss: 4.8000471600971654e-06


In [55]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for reviews, labels in test_loader:
            reviews, labels = reviews.to(device), labels.to(device).float()
            outputs = model(reviews).squeeze(1)
            predictions = torch.round(torch.sigmoid(outputs))
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
        
        print(f'Accuracy: {correct/total*100:.2f}%')

# Evaluate the model
evaluate_model(model, test_loader)


Accuracy: 100.00%
