K Bala Sai Manvitha - CS22B1030

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Dataset location. Both splits live in the same folder, so the folder is named
# once and the two file paths are derived from it.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no
# mount cell is visible in this notebook; confirm before a fresh run.
DATA_DIR = "/content/drive/MyDrive/sst2_sentiment_dataset"
train_path = f"{DATA_DIR}/sst2_train.parquet"
val_path = f"{DATA_DIR}/sst2_valid.parquet"

In [None]:
# Read both parquet splits into DataFrames, in (train, validation) order.
train_df, val_df = (pd.read_parquet(path) for path in (train_path, val_path))

In [None]:
# Bare expression: the notebook shows the training DataFrame as this cell's output.
train_df

Unnamed: 0,idx,sentence,label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...
67344,67344,a delightful comedy,1
67345,67345,"anguish , anger and frustration",0
67346,67346,"at achieving the modest , crowd-pleasing goals...",1
67347,67347,a patient viewer,1


In [None]:
# Bare expression: the notebook shows the validation DataFrame as this cell's output.
val_df

Unnamed: 0,idx,sentence,label
0,0,it 's a charming and often affecting journey .,1
1,1,unflinchingly bleak and desperate,0
2,2,allows us to hope that nolan is poised to emba...,1
3,3,"the acting , costumes , music , cinematography...",1
4,4,"it 's slow -- very , very slow .",0
...,...,...,...
867,867,has all the depth of a wading pool .,0
868,868,a movie with a real anarchic flair .,1
869,869,a subject like this should inspire reaction in...,0
870,870,... is an arthritic attempt at directing by ca...,0


In [None]:
# Carve a 5,000-row test set out of the training frame; everything else stays
# for training. Stratifying on "label" keeps the class ratio the same in both
# parts, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    train_df,
    test_size=5000,
    stratify=train_df["label"],
    random_state=42,
)

In [None]:
# Pull the sentence and label columns out of each split as NumPy arrays.
train_texts = train_data["sentence"].values
train_labels = train_data["label"].values

test_texts = test_data["sentence"].values
test_labels = test_data["label"].values

val_texts = val_df["sentence"].values
val_labels = val_df["label"].values

In [None]:
# Text preprocessing function
import re
import string

# Deletion table for every ASCII punctuation character. Using str.translate
# avoids building a regex character class out of string.punctuation, where the
# unescaped "\]" sequence caused literal backslashes to be left in the text.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Normalise one sentence before tokenization.

    Steps, in order:
      1. lowercase the text,
      2. delete every character in ``string.punctuation``,
      3. delete runs of digits,
      4. collapse any run of whitespace to a single space and trim both ends
         (removing punctuation such as " , " otherwise leaves doubled spaces).

    Args:
        text: the raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str`` (possibly empty).
    """
    text = text.lower()
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r"\d+", "", text)
    # split() with no argument splits on any whitespace run and drops empties,
    # so re-joining both trims the ends and squeezes interior gaps.
    return " ".join(text.split())

In [None]:
# Clean every split with the same function so train/test/val share one text format.
import re  # preprocess_text calls re.sub when it runs

train_texts = list(map(preprocess_text, train_texts))
test_texts = list(map(preprocess_text, test_texts))
val_texts = list(map(preprocess_text, val_texts))

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
embedding_dim = 128  # size of each learned word vector
max_length = 100  # every sequence is padded / truncated to this many tokens
vocab_size = 20_000  # passed to the tokenizer as num_words and to the Embedding as input_dim

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# The word index is fitted on the training sentences only; "<OOV>" is the
# placeholder token configured for words outside that index.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(train_texts)

In [None]:
# Map each cleaned sentence to its list of integer word ids, split by split.
train_sequences, test_sequences, val_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts, test_texts, val_texts)
)

In [None]:
# Spot-check: show the integer id sequence produced for the training sentence at position 5.
train_sequences[5]

[5600, 11820, 4, 2, 2049, 73, 513, 6730, 44, 14, 1577, 738, 4, 277, 625]

In [None]:
# Bring every sequence to exactly max_length ids: shorter ones get zeros
# appended ("post" padding), longer ones lose their tail ("post" truncating).
from tensorflow.keras.preprocessing.sequence import pad_sequences

_pad_opts = dict(maxlen=max_length, padding="post", truncating="post")

train_padded = pad_sequences(train_sequences, **_pad_opts)
test_padded = pad_sequences(test_sequences, **_pad_opts)
val_padded = pad_sequences(val_sequences, **_pad_opts)

In [None]:
# Bare expression: the notebook shows the padded training array as this cell's output.
train_padded

array([[  205,     2,   158, ...,     0,     0,     0],
       [   14,  4428,  4429, ...,     0,     0,     0],
       [11819,    44,     3, ...,     0,     0,     0],
       ...,
       [  707,    19,     8, ...,     0,     0,     0],
       [12122,   196,   272, ...,     0,     0,     0],
       [   50,  1944,    11, ...,     0,     0,     0]], dtype=int32)

In [None]:
# Make sure each label vector is a NumPy array before it is handed to Keras / sklearn.
train_labels, test_labels, val_labels = map(
    np.array, (train_labels, test_labels, val_labels)
)

In [None]:
# Build the RNN model (LSTM-based): embedding -> LSTM -> dense head with a
# single sigmoid unit for binary sentiment.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    # mask_zero=True marks id 0 as padding so the LSTM skips those timesteps.
    # The inputs are post-padded with zeros up to max_length; without the mask
    # the LSTM keeps stepping through the padding and its final state is
    # computed mostly from pad tokens rather than from the sentence.
    # The deprecated `input_length` argument is omitted; the sequence length
    # is taken from the data passed to fit/predict.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    # dropout applies to the layer inputs, recurrent_dropout to the recurrent state.
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    # One sigmoid output: a probability that is thresholded at 0.5 downstream.
    Dense(1, activation="sigmoid"),
])



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy reported each epoch.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the model for 5 epochs in mini-batches of 64, scoring the validation
# split after every epoch. The returned History object is kept in `history`.
history = model.fit(
    train_padded, train_labels,
    validation_data=(val_padded, val_labels),
    epochs=5,
    batch_size=64,
    verbose=1,
)  # closing parenthesis was missing, which left the call unterminated


Epoch 1/5
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 338ms/step - accuracy: 0.5560 - loss: 0.6877 - val_accuracy: 0.5092 - val_loss: 0.6978
Epoch 2/5
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m367s[0m 327ms/step - accuracy: 0.5549 - loss: 0.6876 - val_accuracy: 0.5092 - val_loss: 0.7001
Epoch 3/5
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 326ms/step - accuracy: 0.5577 - loss: 0.6867 - val_accuracy: 0.5092 - val_loss: 0.6961
Epoch 4/5
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 329ms/step - accuracy: 0.5586 - loss: 0.6865 - val_accuracy: 0.5092 - val_loss: 0.6970
Epoch 5/5
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 326ms/step - accuracy: 0.5546 - loss: 0.6874 - val_accuracy: 0.5092 - val_loss: 0.6987


In [None]:
# Score the held-out test set: the model outputs one probability per row,
# which is turned into a hard 0/1 label by thresholding at 0.5.
test_probs = model.predict(test_padded)
test_preds = (test_probs > 0.5).astype(int)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 62ms/step


In [None]:
# Compute overall accuracy plus the per-class precision / recall / F1 table.
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(test_labels, test_preds)
report = classification_report(
    test_labels,
    test_preds,
    target_names=["Negative", "Positive"],
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Show the test accuracy rounded to four decimals, then the full per-class report.
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Test Accuracy: 0.5578
Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      2211
    Positive       0.56      1.00      0.72      2789

    accuracy                           0.56      5000
   macro avg       0.28      0.50      0.36      5000
weighted avg       0.31      0.56      0.40      5000

