In [94]:
import pandas as pd 
import nltk
import re
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense ,GRU





In [95]:
# Load the IMDB review dataset from a CSV in the notebook's working directory
# and preview the first rows. Later cells read the 'review' and 'sentiment'
# columns from this frame.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [96]:
# Raw reviews, cast to str so that a non-string cell (e.g. a missing value)
# cannot make the regex calls in clean_text raise a TypeError.
corpus = df['review'].astype(str)

# English stop words from NLTK, held in a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Deliberately NOT called `tokenizer`: a later cell binds that name to a Keras
# Tokenizer, which has no `.tokenize` method, so clean_text would break if it
# were called again after that cell had run.
sentence_tokenizer = PunktSentenceTokenizer()


def clean_text(corpus):
    """Normalize one raw review into a lowercase, stop-word-free string.

    Steps: replace HTML-like tags with a space, replace runs of , ! ? ; : -
    with a space, split into sentences with Punkt, keep only ``\\w+`` tokens,
    lowercase them and drop NLTK English stop words.

    Args:
        corpus: Text of a single review (the parameter name is kept for
            backward compatibility; it is one document, not the whole corpus).

    Returns:
        The cleaned words of every sentence joined by single spaces. A
        sentence made up only of stop words contributes an empty string.
    """
    text = re.sub(r'<.*?>', ' ', corpus)
    text = re.sub(r"[,!?;:\-]+", " ", text)

    cleaned_sentences = []
    for sent in sentence_tokenizer.tokenize(text):
        # Lowercase once per word, then filter on the lowercased form.
        lowered = (word.lower() for word in re.findall(r'\b\w+\b', sent))
        cleaned_sentences.append(
            " ".join(word for word in lowered if word not in stop_words)
        )
    return " ".join(cleaned_sentences)


# Apply to the str-cast reviews prepared above rather than the raw column.
df['cleaned_review'] = corpus.apply(clean_text)
df.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mizog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [97]:
# Fit a word-index vocabulary on the cleaned reviews. num_words caps the ids
# used when encoding; words outside that cap are encoded as the "<OOV>" token.
tokenizer = Tokenizer(
    num_words=20000,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(df['cleaned_review'])

# Encode each review as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])

# Force every review to exactly 200 ids: short reviews are zero-padded at the
# end, long reviews lose tokens from the front.
padded_sequences = pad_sequences(
    sequences,
    maxlen=200,
    padding='post',
    truncating='pre',  # Keras default, written out to make the asymmetry visible
)


In [98]:

# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible. Labels are encoded as positive -> 1, negative -> 0.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['sentiment'].map(dict(positive=1, negative=0)),
    random_state=42,
    test_size=0.2,
)


In [99]:
# Binary sentiment classifier: word embeddings -> two stacked GRUs -> small
# dense head -> single sigmoid unit.
model = Sequential([
    # input_dim matches the Tokenizer's num_words (20000), the upper bound on
    # the word ids it emits.
    # input_length is intentionally omitted: the previous value (300) did not
    # match the 200-token padded sequences, and the argument is deprecated in
    # Keras 3. The sequence length is taken from the data the model is fed.
    Embedding(input_dim=20000, output_dim=200),
    # return_sequences=True passes the full per-timestep output to the next GRU.
    GRU(64, return_sequences=True),
    # The second GRU returns only its final output, a fixed-size review vector.
    GRU(32),
    Dense(16, activation='relu'),
    # Sigmoid output is the predicted probability of label 1 (positive).
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])






In [100]:

# Train for 5 epochs in mini-batches of 64. The per-epoch validation metrics
# are computed on (X_test, y_test), i.e. the held-out test split itself, so
# they are not an independent estimate if used to tune the model.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=5,
)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 120ms/step - accuracy: 0.5434 - loss: 0.6646 - val_accuracy: 0.8710 - val_loss: 0.3169
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 124ms/step - accuracy: 0.9047 - loss: 0.2471 - val_accuracy: 0.8994 - val_loss: 0.2460
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 130ms/step - accuracy: 0.9581 - loss: 0.1211 - val_accuracy: 0.8953 - val_loss: 0.2909
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 144ms/step - accuracy: 0.9840 - loss: 0.0551 - val_accuracy: 0.8921 - val_loss: 0.3675
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 146ms/step - accuracy: 0.9924 - loss: 0.0274 - val_accuracy: 0.8865 - val_loss: 0.4483


In [101]:
# Score the trained model on the held-out split. The model was compiled with
# metrics=['accuracy'], so evaluate() yields the loss followed by accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
# Report both values rounded to four decimal places.
print(f'Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.8858 - loss: 0.4396
Loss: 0.4483, Accuracy: 0.8865
