In [1]:
!pip install kaggle



In [2]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
kaggle_dict = json.load(open("kaggle.json"))
os.environ["KAGGLE_USERNAME"] = kaggle_dict['username']
os.environ["KAGGLE_KEY"] = kaggle_dict['key']

!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s]
100% 25.7M/25.7M [00:00<00:00, 1.44GB/s]


In [4]:
# List the working directory (the recorded output shows the downloaded zip next to kaggle.json).
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [5]:
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as zip_ref :
  zip_ref.extractall()
!ls

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


In [6]:
# Read the extracted CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
# Print (rows, columns), then display the first five rows as the cell output.
print(data.shape)
data.head(5)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Count the reviews per sentiment label to check the class balance.
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [8]:
# Encode the sentiment labels as integers (positive -> 1, negative -> 0).
#
# An explicit mapping followed by astype("int64") replaces DataFrame.replace(...,
# inplace=True), which relied on pandas' deprecated silent downcasting of the
# column (the source of the warning this cell used to emit).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running the cell on already-encoded data is a no-op, while an unexpected
# label (or a missing value) fails loudly in astype instead of silently leaving
# a mixed-type column behind.
label_to_id = {"positive": 1, "negative": 0}
data["sentiment"] = (
    data["sentiment"]
    .map(lambda label: label_to_id.get(label, label))
    .astype("int64")
)
data.head()

  data.replace({'sentiment':{"positive" : 1, "negative" : 0}}, inplace=True)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=40,
)
print(train_data.shape)

(40000, 2)


In [10]:
# Build the vocabulary from the training reviews only, so that no information
# from the test set leaks into the preprocessing. num_words=5000 caps the
# vocabulary that is used when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Turn each review into a list of word ids ...
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# ... and bring every sequence to exactly 200 ids (the printed arrays show the
# zero padding at the front of short reviews).
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

# Targets are the integer sentiment column of each split.
y_train = train_data["sentiment"]
y_test = test_data["sentiment"]

print(X_train)
print(X_test)

[[   0    0    0 ...   18   12  644]
 [   0    0    0 ...   38   94   32]
 [   0    0    0 ...   45   51  915]
 ...
 [   0    0    0 ...   52   68  589]
 [   0    0    0 ...    1  682  890]
 [   0    0    0 ... 1829    4  619]]
[[   4    1  204 ...  455  125   55]
 [   0    0    0 ... 1079   51 2935]
 [   0    0    0 ...    4    1  312]
 ...
 [   0    0    0 ...    6  350 2404]
 [ 317    1  249 ...    8    1  181]
 [   0    0    0 ... 3166 1738 1832]]


In [11]:
# Sentiment classifier: word ids -> 128-d embeddings -> one LSTM layer ->
# a single sigmoid unit giving the probability of a positive review.
model = Sequential([
    # input_dim matches the num_words=5000 cap used for the tokenizer.
    # The `input_length` argument is omitted because Keras 3 deprecates it;
    # the sequence length is supplied through model.build() below instead.
    Embedding(input_dim=5000, output_dim=128),
    # NOTE(review): per the Keras LSTM documentation a non-zero
    # recurrent_dropout rules out the cuDNN implementation, which likely
    # explains the slow steps in the training log - confirm before changing,
    # since removing it alters the regularisation.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
# Build for (batch, 200) inputs - 200 is the padded review length - so that
# summary() can report output shapes and parameter counts before training.
model.build(input_shape=(None, 200))
model.summary()



In [12]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reporting metric and Adam is used as the optimizer.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)

In [13]:
# Train for 5 epochs in mini-batches of 64; validation_split=0.2 sets aside 20%
# of the training arrays to compute the per-epoch validation metrics.
# NOTE(review): the recorded output below reads "Epoch n/10" and ends in a
# KeyboardInterrupt, so it was produced by an earlier run with a different
# epochs value - re-run this cell to refresh it.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 430ms/step - accuracy: 0.7206 - loss: 0.5254 - val_accuracy: 0.8446 - val_loss: 0.3554
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 393ms/step - accuracy: 0.8543 - loss: 0.3509 - val_accuracy: 0.8543 - val_loss: 0.3440
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 390ms/step - accuracy: 0.8299 - loss: 0.3817 - val_accuracy: 0.8397 - val_loss: 0.4210
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 387ms/step - accuracy: 0.8965 - loss: 0.2632 - val_accuracy: 0.8572 - val_loss: 0.3549
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 371ms/step - accuracy: 0.9142 - loss: 0.2209 - val_accuracy: 0.8650 - val_loss: 0.3345
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 388ms/step - accuracy: 0.9225 - loss: 0.2036 - val_accuracy: 0.8717 - val_loss: 0.3465
Epoc

KeyboardInterrupt: 

In [14]:
# Print the layer-by-layer summary again; when the cells are run in order this
# happens after training, so the model has been built by now.
model.summary()

In [15]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the metrics passed to compile() (accuracy here).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test loss: {loss}, accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 110ms/step - accuracy: 0.8661 - loss: 0.3519
Test loss: 0.3433963656425476, accuracy: 0.8711000084877014


In [16]:
def predict_sentiment(review, threshold=0.5):
  """Classify a single review text as "positive" or "negative".

  The text is encoded with the notebook-level ``tokenizer``, padded/truncated
  to 200 ids (the length the model was trained on) and scored by the
  notebook-level ``model``.

  Args:
    review: Raw review text.
    threshold: Probability above which the review is labelled "positive".
      Defaults to 0.5, the previously hard-coded cut-off.

  Returns:
    The string "positive" or "negative".

  Warns:
    UserWarning: if the review yields no word ids (empty text, or none of its
      words are in the tokenizer's vocabulary). The model then only sees
      padding, so the returned label carries no information about the text.
  """
  import warnings  # local import keeps this cell self-contained

  token_ids = tokenizer.texts_to_sequences([review])
  if not token_ids[0]:
    warnings.warn(
        "Review contains no in-vocabulary words; the prediction is based on "
        "padding only and is not meaningful.",
        stacklevel=2,
    )
  sequence = pad_sequences(token_ids, maxlen=200)
  prediction = model.predict(sequence)
  # predict() returns one row per input with a single sigmoid probability.
  sentiment = "positive" if prediction[0][0] > threshold else "negative"
  return sentiment

In [25]:
# Try the classifier on one hand-written review. An empty string would be
# encoded as pure padding and give a label unrelated to any text, so a real
# sentence is used; replace it with any review you want to score.
new_review = "This movie was fantastic! The acting was great and I loved every minute of it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
The sentiment of the review is positive
