K Bala Sai Manvitha - CS22B1030

In [None]:
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Locations of the SST-2 train / validation parquet files.
# NOTE(review): both paths live under /content/drive/MyDrive, presumably a
# Colab Google Drive mount -- confirm the drive is mounted before running.
train_path, val_path = (
    "/content/drive/MyDrive/sst2_sentiment_dataset/sst2_train.parquet",
    "/content/drive/MyDrive/sst2_sentiment_dataset/sst2_valid.parquet",
)

In [None]:
# Read both parquet files into DataFrames (train first, then validation).
train_df, val_df = (
    pd.read_parquet(parquet_path) for parquet_path in (train_path, val_path)
)

In [None]:
# Show the training frame as the cell output; the saved rendering lists the
# columns idx, sentence and label, truncated to the first and last rows.
train_df

Unnamed: 0,idx,sentence,label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...
67344,67344,a delightful comedy,1
67345,67345,"anguish , anger and frustration",0
67346,67346,"at achieving the modest , crowd-pleasing goals...",1
67347,67347,a patient viewer,1


In [None]:
# Hold out 5,000 rows of the original training frame as a test set and keep
# the remainder for training. The split is stratified on the label column and
# uses a fixed random_state so it is repeatable.
train_data, test_data = train_test_split(
    train_df,
    test_size=5000,
    stratify=train_df["label"],
    random_state=42,
)

In [None]:
# Pull the sentence and label columns out of each split as arrays.
train_texts = train_data["sentence"].values
train_labels = train_data["label"].values

test_texts = test_data["sentence"].values
test_labels = test_data["label"].values

val_texts = val_df["sentence"].values
val_labels = val_df["label"].values

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw sentence before tokenization.

    The text is lowercased, every ASCII punctuation character and every run
    of digits is removed, and whitespace is collapsed to single spaces with
    no leading or trailing space.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str``.
    """
    text = text.lower()
    # The punctuation set must be escaped before it goes inside "[...]":
    # unescaped, the "\" in string.punctuation escapes the "]" that follows
    # it, so literal backslashes were never removed, and ",-." only worked
    # because it happened to form a valid range.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)
    # Deleting punctuation/digits leaves gaps such as "wit  only"; split/join
    # collapses every whitespace run and trims both ends in one step.
    return " ".join(text.split())

In [None]:
# Clean every sentence of every split; each result is a plain Python list.
train_texts = list(map(preprocess_text, train_texts))
test_texts = list(map(preprocess_text, test_texts))
val_texts = list(map(preprocess_text, val_texts))

In [None]:
# Tokenization / model hyperparameters
# Vocabulary size passed to the tokenizer and the embedding layer
vocab_size = 20_000
# Length every sequence is padded or truncated to
max_length = 100
# Width of each learned word vector
embedding_dim = 128

In [None]:
# Build the word index from the training sentences only; words outside the
# kept vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=vocab_size,
)
tokenizer.fit_on_texts(train_texts)

In [None]:
# Map each cleaned sentence to its list of integer word ids using the
# tokenizer fitted on the training split.
train_sequences, test_sequences, val_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, test_texts, val_texts)
)

In [None]:
# Bring every sequence to exactly max_length ids: shorter ones are padded at
# the end, longer ones are cut at the end.
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post"
)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)
val_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [None]:
# Make sure the label containers of all three splits are NumPy arrays.
train_labels, test_labels, val_labels = map(
    np.array, (train_labels, test_labels, val_labels)
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and dropout are
# repeatable when the notebook is restarted and run from the top.
tf.keras.utils.set_random_seed(42)

# Bidirectional SimpleRNN sentiment classifier.
model = Sequential([
    # mask_zero=True: the inputs are post-padded with 0 up to max_length, so
    # without a mask the forward RNN keeps stepping over the padding tail
    # after the last real word before it emits its final state. Masking makes
    # the recurrent layer skip those padded timesteps.
    # (`input_length` is dropped: it is deprecated in Keras 3 and the layer
    # infers the sequence length from its input.)
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    # Reads the sentence in both directions and returns only the final states.
    Bidirectional(SimpleRNN(units=128, activation="tanh", dropout=0.3, return_sequences=False)),
    Dense(64, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),  # Binary classification: one probability per sentence
])




In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported alongside the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the model
# The saved training log shows val_loss bottoming out at epoch 2 and rising
# afterwards while training accuracy keeps climbing (overfitting). Early
# stopping halts once val_loss has not improved for `patience` epochs and
# restores the weights of the best epoch instead of keeping the last ones.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

history = model.fit(
    train_padded, train_labels,
    validation_data=(val_padded, val_labels),
    epochs=7,  # upper bound; early stopping may end training sooner
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 146ms/step - accuracy: 0.5422 - loss: 0.9263 - val_accuracy: 0.7362 - val_loss: 0.5618
Epoch 2/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 146ms/step - accuracy: 0.8305 - loss: 0.4140 - val_accuracy: 0.8119 - val_loss: 0.4958
Epoch 3/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 144ms/step - accuracy: 0.9197 - loss: 0.2378 - val_accuracy: 0.8073 - val_loss: 0.5522
Epoch 4/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 146ms/step - accuracy: 0.9381 - loss: 0.1969 - val_accuracy: 0.8245 - val_loss: 0.5845
Epoch 5/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 144ms/step - accuracy: 0.9486 - loss: 0.1684 - val_accuracy: 0.8142 - val_loss: 0.5796
Epoch 6/7
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 144ms/step - accuracy: 0.9550 - loss: 0.1468 - val_accuracy: 0.8257 - val_loss: 0.6129
Epoch 7/7


In [None]:
# Score the held-out test split: predict a probability per sentence and turn
# it into a 0/1 label with a 0.5 threshold.
# NOTE(review): the saved output further down reports every test sample as
# "Positive" (accuracy 0.5578) although the saved training log reached ~0.82
# validation accuracy. That looks like this cell ran against a model that was
# rebuilt or not fully trained -- re-run the notebook top to bottom to confirm.
test_preds = (model.predict(test_padded) > 0.5).astype(int)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step


In [None]:
# Per-class precision/recall/F1 table (label 0 -> "Negative", 1 -> "Positive")
# and overall accuracy on the test split.
report = classification_report(
    y_true=test_labels,
    y_pred=test_preds,
    target_names=["Negative", "Positive"],
)
accuracy = accuracy_score(y_true=test_labels, y_pred=test_preds)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Print the test accuracy rounded to four decimals, then the classification
# report under its heading (print() separates the two arguments with a space,
# so the first report line starts one column in).
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Test Accuracy: 0.5578
Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      2211
    Positive       0.56      1.00      0.72      2789

    accuracy                           0.56      5000
   macro avg       0.28      0.50      0.36      5000
weighted avg       0.31      0.56      0.40      5000

