In [None]:
!pip install kaggle




In [None]:
import json
import os
from zipfile import ZipFile

import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the Kaggle API credentials from kaggle.json in the working directory.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [None]:
# Export the credentials as environment variables so the `kaggle` command
# invoked later in the notebook can pick them up.
for env_name, json_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[json_field]

In [None]:
# Download the IMDB 50k movie-review dataset with the Kaggle CLI.
# The archive (imdb-dataset-of-50k-movie-reviews.zip) is written to the current working directory.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 86% 22.0M/25.7M [00:00<00:00, 224MB/s]
100% 25.7M/25.7M [00:00<00:00, 234MB/s]


In [None]:
# Unpack the downloaded archive into the current working directory
# (ZipFile opens in read mode by default).
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as archive:
    archive.extractall()


In [None]:
# Read the reviews from the CSV that was extracted into the current working directory.
# A relative path keeps the notebook runnable outside Colab, where the working
# directory is not necessarily /content.
data = pd.read_csv("IMDB Dataset.csv")

In [None]:
# Quick look at the raw data: size, first/last rows, and class balance.
print(data.shape)
print(data.head())
print(data.tail())
print(data["sentiment"].value_counts())

# Encode the string labels as integers (positive -> 1, negative -> 0).
# Series.map plus an explicit astype replaces DataFrame.replace(..., inplace=True),
# whose implicit object -> int downcast is deprecated in pandas 2.2.
# astype("int64") also fails loudly if a label outside the mapping shows up (it would map to NaN).
# The dtype guard keeps the cell safe to re-run once the column is already encoded.
if not pd.api.types.is_integer_dtype(data["sentiment"]):
    data["sentiment"] = (
        data["sentiment"].map({"positive": 1, "negative": 0}).astype("int64")
    )

# Check the value counts again
print(data["sentiment"].value_counts())


(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                                  review sentiment
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
sentiment
1    25000
0    25000
Name: count, dtype: int64


  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Report the size of each partition (train first, then test).
for partition in (train_data, test_data):
    print(partition.shape)

(40000, 2)
(10000, 2)


In [None]:
# Fit the vocabulary on the training reviews only, so nothing from the test set leaks in.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])


def _encode_reviews(reviews):
    """Turn raw review texts into integer sequences of fixed length 200."""
    return pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=200)


# Model inputs: one fixed-length row of token ids per review.
X_train = _encode_reviews(train_data["review"])
X_test = _encode_reviews(test_data["review"])

# Targets: the integer-encoded sentiment column.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [None]:
# **CNN Model**
# Build a small 1D-convolutional sentiment classifier.
model = Sequential()
# Declare the input shape explicitly: each sample is a row of 200 token ids (the padded
# length used for X_train / X_test). Embedding's `input_length` argument is deprecated in
# Keras 3, and without a declared input the model is still unbuilt when summary() runs.
model.add(Input(shape=(200,), dtype="int32"))
model.add(Embedding(input_dim=5000, output_dim=128))  # 5000 matches the tokenizer's num_words
model.add(Conv1D(filters=8, kernel_size=7, activation='relu'))  # 8 filters over 7-token windows
model.add(GlobalMaxPooling1D())  # keep the strongest response of each filter
model.add(Dropout(0.7))  # High dropout rate
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: probability of the positive class (label 1)

model.summary()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# **Training the Model**
# 3 epochs, batches of 128, with 20% of the training data held out for validation.
model.fit(X_train, Y_train, epochs=3, batch_size=128, validation_split=0.2)

# **Model Evaluation**
# Score the trained model on the held-out test set.
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")



Epoch 1/3
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 116ms/step - accuracy: 0.5498 - loss: 0.6744 - val_accuracy: 0.7919 - val_loss: 0.5243
Epoch 2/3
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 114ms/step - accuracy: 0.7130 - loss: 0.5592 - val_accuracy: 0.8366 - val_loss: 0.4478
Epoch 3/3
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 119ms/step - accuracy: 0.7476 - loss: 0.4976 - val_accuracy: 0.8434 - val_loss: 0.3995
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8520 - loss: 0.3971
Test Loss: 0.39568543434143066
Test Accuracy: 0.8511000275611877


In [None]:
# Predicted probabilities for the test set, thresholded at 0.5 into hard 0/1 labels.
Y_pred_probs = model.predict(X_test)
Y_pred = (Y_pred_probs > 0.5).astype("int32")

# Calculate precision, recall, and F1 score (in that order) on the hard labels.
precision, recall, f1 = (
    metric(Y_test, Y_pred) for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Confusion matrix of true labels against predicted labels.
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Precision: 0.834023334587881
Recall: 0.8795395911887279
F1 Score: 0.8561769535400368
Confusion Matrix:
[[4079  882]
 [ 607 4432]]


In [None]:
def predict_sentiment(review):
    """Classify a single review text as "positive" or "negative".

    The review is encoded with the notebook-level ``tokenizer``, padded to
    200 tokens, and scored by the notebook-level ``model``.

    Args:
        review: The raw review text.

    Returns:
        "positive" when the model's output is strictly greater than 0.5,
        otherwise "negative".
    """
    # Wrap the review in a list: the tokenizer and the model both work on batches.
    token_ids = tokenizer.texts_to_sequences([review])
    model_input = pad_sequences(token_ids, maxlen=200)

    # First (only) sample, first (only) output unit.
    score = model.predict(model_input)[0][0]
    if score > 0.5:
        return "positive"
    return "negative"

In [None]:
# Smoke-test the helper on a single hand-written review.
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)  # returns "positive" or "negative"
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The sentiment of the review is: positive
