In [16]:

!pip install kaggle



In [17]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; later cells read files from under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import getpass
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Updated import statement
from tensorflow.keras.preprocessing.text import Tokenizer
# Updated import statement
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Location of the Kaggle API credentials file on the mounted Google Drive.
kaggle_json_path = '/content/drive/MyDrive/kaggle.json'

# Either load previously stored credentials or ask the user for them.
credentials_file_exists = os.path.exists(kaggle_json_path)
if credentials_file_exists:
    print(f"File exists at {kaggle_json_path}. Loading credentials.")
    with open(kaggle_json_path, 'r') as f:
        kaggle_dictionary = json.load(f)
else:
    print(f"File not found at {kaggle_json_path}. Creating a new file.")
    username = input("Enter your Kaggle username: ").strip()
    # getpass hides the typed key, so the secret is not echoed into the
    # notebook's saved output the way input() would echo it.
    key = getpass.getpass("Enter your Kaggle API key: ").strip()
    kaggle_dictionary = {"username": username, "key": key}

# Fail early with a clear message instead of persisting/exporting empty or
# missing credentials that would only surface later as a CLI auth error.
missing_fields = [
    field for field in ("username", "key") if not kaggle_dictionary.get(field)
]
if missing_fields:
    raise ValueError(
        f"Kaggle credentials are missing required field(s): {', '.join(missing_fields)}"
    )

# Persist freshly entered credentials only after they passed validation.
if not credentials_file_exists:
    with open(kaggle_json_path, 'w') as f:
        json.dump(kaggle_dictionary, f)
    print(f"File created at {kaggle_json_path}.")

# Export the credentials through the environment variables set here so the
# Kaggle command-line call in the following cell can pick them up.
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

File exists at /content/drive/MyDrive/kaggle.json. Loading credentials.


In [21]:
# Fetch the IMDB 50k movie-review dataset archive with the Kaggle CLI.
# The extraction cell below expects the resulting zip under /content.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [22]:
from zipfile import ZipFile

# Unpack every member of the dataset archive into /content so the CSV can be read directly.
archive_path = "/content/imdb-dataset-of-50k-movie-reviews.zip"
with ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path="/content")

In [23]:
# Read the extracted reviews CSV into a DataFrame and preview its first rows.
reviews_csv_path = "/content/IMDB Dataset.csv"
dataset = pd.read_csv(reviews_csv_path)
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
#
# An explicit map + astype is used instead of DataFrame.replace(..., inplace=True):
# replacing strings with ints relies on pandas silently downcasting the object
# column, which is deprecated behaviour. Values that are not string labels
# (e.g. already-encoded 0/1 when the cell is re-run) pass through unchanged, and
# astype("int64") raises if any value cannot be interpreted as an integer.
label_to_int = {"positive": 1, "negative": 0}
dataset["sentiment"] = (
    dataset["sentiment"]
    .map(lambda label: label_to_int.get(label, label))
    .astype("int64")
)
dataset.head()

  dataset.replace ({"sentiment":{"positive":1,"negative":0}}, inplace=True)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [25]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)
print(*(split.shape for split in (train_data, test_data)))

(40000, 2) (10000, 2)


In [26]:
# Fit the word index on the training reviews only (num_words=5000 caps the vocabulary used for encoding).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Turn each review into a sequence of word indices...
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# ...then bring every sequence to a common length of 200 so they stack into one array.
x_train = pad_sequences(train_sequences, maxlen=200)
x_test = pad_sequences(test_sequences, maxlen=200)

In [27]:

# Targets are the sentiment column of the corresponding split.
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

In [29]:
embedding_dim = 32
vocab_size = 5000

# Stack: embedding -> bidirectional LSTM -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
# Note: an optional Dense(16, activation='relu') hidden layer was left disabled at this position.
model.add(Dense(1, activation='sigmoid'))

In [30]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [32]:
# Train for 3 epochs in batches of 32, setting aside 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 191ms/step - accuracy: 0.8900 - loss: 0.2741 - val_accuracy: 0.8474 - val_loss: 0.3791
Epoch 2/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 180ms/step - accuracy: 0.9114 - loss: 0.2287 - val_accuracy: 0.8457 - val_loss: 0.3490
Epoch 3/3
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 176ms/step - accuracy: 0.9189 - loss: 0.2102 - val_accuracy: 0.8767 - val_loss: 0.3167


In [33]:
# Score the trained model on the held-out test split and report both numbers.
loss, accuracy = model.evaluate(x_test, y_test)
for metric_name, metric_value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(metric_name, metric_value)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 47ms/step - accuracy: 0.8828 - loss: 0.2958
Test Loss: 0.29586002230644226
Test Accuracy: 0.8847000002861023


In [34]:
def predict_sentiment(review, maxlen=200, threshold=0.5):
    """Classify a single movie review as "positive" or "negative".

    The review is encoded with the notebook-level ``tokenizer``, padded to
    ``maxlen`` and scored by the notebook-level ``model``. The raw model
    output is printed before the label is returned.

    Args:
        review: Raw review text (a single string).
        maxlen: Length the encoded review is padded/truncated to. The default
            of 200 matches the value used when ``x_train``/``x_test`` were
            built; keep the two in sync.
        threshold: Score above which the review is labelled "positive".
            Defaults to 0.5.

    Returns:
        "positive" if the model score is strictly greater than ``threshold``,
        otherwise "negative".
    """
    # texts_to_sequences works on a list of texts, so wrap the single review.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)

    # One input row -> the score is read from position [0][0] of the output.
    prediction = model.predict(padded_sequence)
    print(prediction)

    return "positive" if prediction[0][0] > threshold else "negative"

In [35]:
# Interactive loop: classify reviews typed by the user until they enter 'exit'.
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " still end the loop.
        new_review = input("Enter a movie review (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted cell ends the session without a traceback.
        break
    if new_review.lower() == "exit":
        break
    if not new_review:
        # A blank line would be encoded as pure padding, so the score would be meaningless.
        print("Please enter some text.")
        continue
    sentiment = predict_sentiment(new_review)
    print("Sentiment of the review is:", sentiment)

Enter a movie review (or type 'exit' to quit): i love it 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357ms/step
[[0.54348725]]
Sentiment of the review is: positive
Enter a movie review (or type 'exit' to quit): amazing movie!!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[[0.81187344]]
Sentiment of the review is: positive
Enter a movie review (or type 'exit' to quit): i thought the special effects were good but the plot was pretty bad
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[[0.11990261]]
Sentiment of the review is: negative
Enter a movie review (or type 'exit' to quit): the acting was really good
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[[0.2793054]]
Sentiment of the review is: negative
Enter a movie review (or type 'exit' to quit): i hate this movie
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[[0.19815944]]
Sentiment of the review is: negative
Enter