In [123]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical

# Download the IMDb movie reviews dataset from NLTK
nltk.download('movie_reviews')

# Prepare the movie reviews dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents
import random
random.shuffle(documents)

# Preprocess the movie reviews data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(words):
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Prepare features and labels
X = [preprocess_text(words) for words, category in documents]
y = [category for words, category in documents]

# Tokenize the text and convert labels to numerical values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
encoded_texts = tokenizer.texts_to_sequences(X)

label_categories = list(set(y))
label_encoder = {label: index for index, label in enumerate(label_categories)}
encoded_labels = [label_encoder[label] for label in y]
encoded_labels = to_categorical(encoded_labels)

# Pad sequences for equal input length
max_length = 200  # you can adjust this based on your dataset and input length
padded_texts = pad_sequences(encoded_texts, maxlen=max_length, padding='post')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_texts, encoded_labels, test_size=0.2, random_state=42)

# Build the LSTM model
embedding_dim = 50  # dimension of word embeddings
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=100))
model.add(Dense(units=len(label_categories), activation='softmax'))




[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/achakraborty/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [124]:
from keras.metrics import Precision, AUC
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
num_epochs = 5
batch_size = 32
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2a7e73940>

In [None]:
enumerate(label_categories)

In [None]:
label_encoder = {label: index for index, label in label_categories}

In [None]:
encoded_labels = [label_encoder[label] for label in y]

In [None]:
encoded_labels = to_categorical(encoded_labels)
encoded_labels

In [None]:
model.predict("I hate it")

In [134]:
import numpy as np


preprocessed_text = preprocess_text("".join(documents[1][0]))
input_sequences = tokenizer.texts_to_sequences([preprocessed_text])

input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')

prediction = model.predict(input_sequences_padded)
index = np.argmax(prediction) if np.max(prediction) > 0.8 else np.argmin(prediction)
predicted_sentiment_label = label_categories[index]

prediction



array([[0.99777657, 0.00222339]], dtype=float32)

In [None]:
predicted_sentiment_label

In [None]:
movie_reviews

In [None]:
documents

In [None]:
len(documents)

In [None]:
movie_reviews.categories()

In [None]:
len(movie_reviews.fileids())

In [132]:
encoded_labels = [label_encoder[label] for label in y]

In [None]:
encoded_labels

In [None]:
to_categorical(encoded_labels)

In [None]:
tokenizer.word_index

In [133]:
a = ("".join(documents[1][0]))