<a href="https://colab.research.google.com/github/Dhawaldwivedi/Bca-5th-sem-AI-mmdu/blob/main/ai-ml2-g1/sentimentals_detection_using_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np

In [11]:
import zipfile
import os

# Where the archive lives and which folder its contents are unpacked into
zip_file_path = 'sentimental.zip'  # Updated zip file name
extracted_dir = 'extracted_dataset'

# Make sure the destination folder is present (no error if it already exists)
os.makedirs(extracted_dir, exist_ok=True)

# Only these two archive members are needed by the notebook; they are pulled
# out one at a time, reviews first and labels second.
required_members = ('reviews.txt', 'labels.txt')
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    for member_name in required_members:
        archive.extract(member_name, extracted_dir)

print(f"Specified files extracted to: {extracted_dir}")

Specified files extracted to: extracted_dataset


In [15]:
# Paths of the two plain-text files extracted from the archive. Each file is
# read as a list of lines; later cells pair reviews[i] with labels[i] by position.
reviews_file_path = os.path.join(extracted_dir, 'reviews.txt')
labels_file_path = os.path.join(extracted_dir, 'labels.txt')


# Read both files into lists of lines.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale (assumes the files are UTF-8/ASCII -- confirm if
# the dataset source changes).
try:
    with open(reviews_file_path, 'r', encoding='utf-8') as f:
        reviews = f.readlines()
    with open(labels_file_path, 'r', encoding='utf-8') as f:
        labels = f.readlines()
except FileNotFoundError:
    print(f"Error: Dataset files not found at {reviews_file_path} or {labels_file_path}")
    # Re-raise: every later cell needs `reviews` and `labels`, so carrying on
    # would only resurface as a confusing NameError further down.
    raise
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    raise

# Reviews and labels are matched by index, so unequal lengths mean the two
# files are out of step and any model trained on them would be mislabeled.
if len(reviews) != len(labels):
    raise ValueError(
        f"Mismatched dataset: {len(reviews)} reviews but {len(labels)} labels"
    )

print("Reviews and labels loaded successfully!")
print("First 5 reviews:", reviews[:5])
print("First 5 labels:", labels[:5])

Reviews and labels loaded successfully!
First 5 reviews: ['bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \n', 'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrifi

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# --- Settings for the split and the text vectorisation ---
training_size = 20000    # number of leading samples used for training
vocab_size = 10000       # passed to the Tokenizer as num_words
max_length = 512         # every sequence is padded/truncated to this length
padding_type = 'post'    # pad at the end of short sequences
trunc_type = 'post'      # cut from the end of long sequences

# Positional split: the first `training_size` entries train, the remainder test
training_sentences, testing_sentences = reviews[:training_size], reviews[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

# The vocabulary is fitted on the training text only; words outside it map to "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Text -> integer id sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Integer sequences -> fixed-length arrays, using identical options for both splits
shared_pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_sequences, **shared_pad_options)
testing_padded = pad_sequences(testing_sequences, **shared_pad_options)


def _encode_labels(raw_labels):
    """Return an array with 1 where the label line contains 'positive', else 0.

    The substring test means the trailing newline left by readlines() does not
    need to be stripped first.
    """
    return np.array([int('positive' in line) for line in raw_labels])


training_labels_numpy = _encode_labels(training_labels)
testing_labels_numpy = _encode_labels(testing_labels)

# NumPy copies of the padded inputs, under the names the training cell expects
training_padded_numpy = np.array(training_padded)
testing_padded_numpy = np.array(testing_padded)



In [14]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.

In [17]:
import tensorflow as tf

# Stacked bidirectional-LSTM binary sentiment classifier.
#
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument: Keras 3 (shipped with recent TensorFlow
# releases) deprecates that argument and ignores it, which is why a manual
# build() call was previously needed before summary(). With the Input layer
# the model is built on construction, so summary() works directly.
multiLayerModel = tf.keras.models.Sequential([
    tf.keras.Input(shape=(max_length,)),                # one padded token-id sequence per sample
    tf.keras.layers.Embedding(vocab_size, 64),          # token id -> 64-dim vector
    # First recurrent layer returns the full sequence so the second one can consume it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')      # probability of the positive class
])

multiLayerModel.summary()

# Single sigmoid output -> binary cross-entropy
multiLayerModel.compile(loss='binary_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])



In [18]:
# Train for at most 15 epochs, but stop once validation loss has failed to
# improve for 3 consecutive epochs and roll the model back to its best weights.
# Without this the model keeps fitting the training set while validation loss
# climbs, and the weights left in memory are the overfit final ones.
# NOTE(review): the held-out split is used both for validation and for early
# stopping, so the reported val_accuracy is not an unbiased test estimate.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True)

history = multiLayerModel.fit(
            training_padded_numpy,
            training_labels_numpy,
            validation_data=(testing_padded_numpy, testing_labels_numpy),
            epochs=15,
            callbacks=[early_stopping],
            verbose=1)

Epoch 1/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 319ms/step - accuracy: 0.6632 - loss: 0.5794 - val_accuracy: 0.8290 - val_loss: 0.4153
Epoch 2/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 319ms/step - accuracy: 0.8781 - loss: 0.3203 - val_accuracy: 0.7972 - val_loss: 0.4419
Epoch 3/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 319ms/step - accuracy: 0.8952 - loss: 0.2763 - val_accuracy: 0.8284 - val_loss: 0.4463
Epoch 4/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 320ms/step - accuracy: 0.9365 - loss: 0.1764 - val_accuracy: 0.8164 - val_loss: 0.4139
Epoch 5/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 318ms/step - accuracy: 0.9447 - loss: 0.1519 - val_accuracy: 0.7976 - val_loss: 0.5160
Epoch 6/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 319ms/step - accuracy: 0.9343 - loss: 0.1852 - val_accuracy: 0.7904 - val_loss: 0.5258
Epoc

In [19]:
# Two short hand-written examples to sanity-check the trained model
sentences = ['This movie was absolutely fantastic!',
             'I was very disappointed with the service.']

# Apply the same tokenisation and padding that the training data went through
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
predictions = multiLayerModel.predict(padded)

# One output row per sentence; values above 0.5 are reported as positive
for score in predictions:
    print('POSITIVE' if score > 0.5 else 'NEGATIVE')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367ms/step
POSITIVE
NEGATIVE


In [20]:
# Two longer, review-style examples (one negative, one positive in tone)
sentences = ['Simply awful and disappointed! It’s a very shallow story, and there in no time to feel an affinity for the main character/characters. It’s almost like whoever edited this movie, edited the first 30% and the last 30% and the movie begins as if you missed part of it. It ends, as if the signal on your TV went out, and it’s abrupt snd makes no sense! OMG! Wait for this movie to come out on DVD! HONESTLY! I am an avid movie goer!',
             'Great movie overall! Hoping for a 2nd one to pickup where the ending left off. Left with a cliffhanger so fingers crossed.']

# Apply the same tokenisation and padding that the training data went through
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
predictions = multiLayerModel.predict(padded)

# One output row per sentence; values above 0.5 are reported as positive
for score in predictions:
    print('POSITIVE' if score > 0.5 else 'NEGATIVE')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
NEGATIVE
POSITIVE
