In [50]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [51]:
df=pd.read_csv(r"C:\Users\ASUS\Desktop\College\Sem VII\NLP\SNLP-main\SNLP-main\IMDB Dataset.csv\IMDB Dataset.csv")

In [52]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [53]:
def clean_text(text):
    text = re.sub(r'<br\s*/><br\s*/>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

In [54]:
df['review']=df['review'].apply(clean_text)

In [55]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [56]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [57]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [58]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [59]:
max_vocab_size = 20000
max_sequence_length = 200

In [60]:
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

In [61]:
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = df['sentiment'].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
X_train.shape

(40000, 200)

In [64]:
embedding_dim = 100

In [65]:
embeddings_index = {}
glove_path=r"C:\Users\ronit\OneDrive\Desktop\College\Sem 7\SNLP\glove.6B\glove.6B.100d.txt"
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

In [66]:
embeddings_index['the'].shape

(100,)

In [67]:
word_index = tokenizer.word_index
num_words = min(max_vocab_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

In [68]:
embedding_matrix.shape

(20000, 100)

In [69]:
for word, i in word_index.items():
    if i < max_vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [70]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.0035074 , -0.14286   ,  0.80261999, ..., -0.58814001,
         0.31889999,  0.012209  ],
       [ 0.20203   , -0.25244001, -0.12557   , ..., -0.16885   ,
        -0.99378997,  0.32501   ],
       [ 0.097328  ,  0.37051001, -0.34889001, ...,  0.037943  ,
         0.27794001,  0.68112999]])

In [71]:
model=Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_length,
              trainable=False),
    SimpleRNN(10,return_sequences=True),
    SimpleRNN(5,return_sequences=False),
    Dense(1, activation='sigmoid')
])

In [72]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [73]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 1.0


In [74]:
model2=Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_length,
              trainable=False),
    LSTM(10,return_sequences=True),
    LSTM(5,return_sequences=False),
    Dense(1, activation='sigmoid')
])

In [75]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model2.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [76]:
y_pred = (model2.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 1.0


In [77]:
import torch
from torch import nn
import torch.optim as optim
from torchtext.vocab import GloVe
import torchtext
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter

In [78]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,0
1,a wonderful little production the filming tec...,0
2,i thought this was a wonderful way to spend ti...,0
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,0


In [79]:
reviews = df['review'].values
labels = df['sentiment'].values

In [80]:
def tokenize(text):
    return word_tokenize(text.lower())

tokenized_reviews = [tokenize(review) for review in reviews]

In [81]:
vocab = Counter([word for review in tokenized_reviews for word in review])

word2idx = {word: i+2 for i, word in enumerate(vocab)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

In [82]:
embedding_dim = 100
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

for word, idx in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [83]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels, word2idx, max_len=200):
        self.reviews = reviews
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        
        indices = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in review]
        if len(indices) > self.max_len:
            indices = indices[:self.max_len]
        else:
            indices = indices + [self.word2idx['<PAD>']] * (self.max_len - len(indices))
        
        return torch.tensor(indices), torch.tensor(label)

In [84]:
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    tokenized_reviews, labels, test_size=0.2, random_state=42)

train_dataset = IMDBDataset(train_reviews, train_labels, word2idx)
test_dataset = IMDBDataset(test_reviews, test_labels, word2idx)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [85]:
class SentimentLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, dropout):
        super(SentimentLSTM, self).__init__()
        
        vocab_size, embedding_dim = embedding_matrix.shape
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        output = self.fc(self.dropout(hidden[-1]))
        return output

hidden_dim = 128
output_dim = 1
n_layers = 2
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentLSTM(embedding_matrix, hidden_dim, output_dim, n_layers, dropout).to(device)

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        epoch_acc = 0
        for reviews, labels in train_loader:
            reviews, labels = reviews.to(device), labels.to(device).float()
            
            optimizer.zero_grad()
            outputs = model(reviews).squeeze(1)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')

train_model(model, train_loader, criterion, optimizer, num_epochs=5)

In [93]:
def evaluate_model(model, test_loader):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for reviews, labels in test_loader:
            reviews, labels = reviews.to(device), labels.to(device).float()
            outputs = model(reviews).squeeze(1)
            predictions = torch.round(torch.sigmoid(outputs))
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
        
        print(f'Accuracy: {correct/total*100:.2f}%')

evaluate_model(model, test_loader)

Accuracy: 100.00%
