In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from transformers import TFRobertaModel, RobertaTokenizer
import itertools

In [None]:
# Load the dataset
# The path is relative to the notebook's working directory. Later cells read
# the 'Score' column (to derive labels) and the 'Text' column (to clean and
# tokenize) from this frame.
df = pd.read_csv('Data/Reviews.csv')

# Map 'Score' to sentiment labels
def map_score_to_label(score):
    """Map a 1-5 review score to a sentiment class id.

    Args:
        score: Review score; expected to be one of 1, 2, 3, 4 or 5
            (integer-valued floats such as 4.0 are accepted as well).

    Returns:
        int: 0 (negative) for scores 1-2, 1 (neutral) for score 3 and
        2 (positive) for scores 4-5.

    Raises:
        ValueError: If ``score`` is not one of the five expected values
            (for example 0, 6, a string, or a missing/NaN entry). Such rows
            are rejected instead of being silently counted as positive.
    """
    if score in (1, 2):
        return 0  # Negative
    if score == 3:
        return 1  # Neutral
    if score in (4, 5):
        return 2  # Positive
    # Anything else is bad data; failing loudly avoids polluting the
    # positive class with unlabeled or malformed rows.
    raise ValueError(f'Unexpected review score: {score!r}')

# Derive the sentiment class id for every review from its 'Score'.
df['label'] = df['Score'].map(map_score_to_label)

In [None]:
# Balance the dataset by sampling 5000 reviews per class.
# DataFrameGroupBy.sample draws the rows directly and keeps every column,
# including 'label', in the result. (Sampling inside groupby(...).apply(...)
# is deprecated when the function touches the grouping column, and would lose
# 'label' once pandas stops passing grouping columns to apply.)
# random_state keeps the draw reproducible; sampling is without replacement,
# so each class must contain at least 5000 reviews.
df_balanced = (
    df.groupby('label')
    .sample(n=5000, random_state=42)
    .reset_index(drop=True)
)

# Clean the text
def clean_text(text):
    """Normalize a raw review into lowercase, letters-only text.

    Steps, in order: HTML tags are replaced by a space, non-ASCII characters
    are dropped, URLs are removed, every character other than ASCII letters
    and whitespace is removed, the text is lowercased, and runs of whitespace
    are collapsed to single spaces with the ends trimmed.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or NaN for a
            missing review) are treated as empty text.

    Returns:
        str: The cleaned text; may be an empty string.
    """
    # Missing reviews arrive from pandas as NaN/None, which re.sub cannot
    # process; treat them as empty text instead of raising TypeError.
    if not isinstance(text, str):
        return ''
    # Replace tags with a space (not an empty string) so that markup between
    # words, such as a line-break tag, does not fuse the neighbouring words.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Collapse whitespace runs (including the spaces introduced above)
    return re.sub(r'\s+', ' ', text).strip()

# Run the cleaner over every sampled review and keep the result next to the raw text.
df_balanced['cleaned_text'] = df_balanced['Text'].map(clean_text)

In [None]:
# Load the tokenizer
# Pretrained tokenizer for the 'roberta-base' checkpoint; tokenize_function
# reads this module-level object when encoding the reviews.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text
def tokenize_function(texts):
    """Encode a collection of texts with the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with padding to
        ``max_length``, truncation enabled, ``max_length=128`` and
        ``return_tensors='tf'``.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
        'return_tensors': 'tf',
    }
    sentences = texts.tolist()
    return tokenizer(sentences, **encode_options)

# Encode every cleaned review in one call.
tokenized_texts = tokenize_function(df_balanced['cleaned_text'])

# Extract the two model inputs as NumPy arrays, plus the class labels.
input_ids, attention_mask = (
    tokenized_texts[field].numpy() for field in ('input_ids', 'attention_mask')
)
labels = df_balanced['label'].values

# The encoded batch is not needed once the arrays have been extracted.
del tokenized_texts

In [None]:
# Split into train, validation, and test sets (70% / 15% / 15%).
# Both splits are stratified on the labels so that the class balance built
# earlier is carried into every split rather than left to chance.
train_input_ids, temp_input_ids, train_labels, temp_labels, train_attention_mask, temp_attention_mask = train_test_split(
    input_ids,
    labels,
    attention_mask,
    test_size=0.3,  # 70% train, 30% temp
    random_state=42,
    stratify=labels
)

# Further split temp into validation and test sets
val_input_ids, test_input_ids, val_labels, test_labels, val_attention_mask, test_attention_mask = train_test_split(
    temp_input_ids,
    temp_labels,
    temp_attention_mask,
    test_size=0.5,  # 15% validation, 15% test
    random_state=42,
    stratify=temp_labels
)

# Clear unused variables to save memory
del input_ids, attention_mask, labels, temp_input_ids, temp_labels, temp_attention_mask

# Function to create TensorFlow datasets
def create_tf_dataset(input_ids, attention_mask, labels, batch_size):
    """Wrap model inputs and labels in a batched ``tf.data.Dataset``.

    Args:
        input_ids: Token id array, sliced along its first axis.
        attention_mask: Attention mask array, sliced along its first axis.
        labels: Label array, sliced along its first axis.
        batch_size: Number of consecutive examples per batch.

    Returns:
        A dataset whose elements are ``(features, labels)`` pairs, where
        ``features`` is a dict with keys ``'input_ids'`` and
        ``'attention_mask'``. No shuffling is applied.
    """
    features = {'input_ids': input_ids, 'attention_mask': attention_mask}
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)

In [None]:
# Build the model function
def build_model(base_model, activation_function='relu', dropout_rate=0.2, use_batchnorm=False):
    """Attach a dense classification head to an encoder model.

    The head takes the encoder's hidden state at sequence position 0 and runs
    it through four stages of 512, 256, 128 and 64 units. Each stage is
    Dense -> optional BatchNormalization -> Activation -> Dropout. A final
    3-unit softmax layer produces the output.

    Args:
        base_model: Encoder called as
            ``base_model(input_ids, attention_mask=...)``; its result must
            expose ``last_hidden_state``.
        activation_function: Activation passed to every ``Activation`` layer.
        dropout_rate: Rate passed to every ``Dropout`` layer.
        use_batchnorm: When true, a ``BatchNormalization`` layer is inserted
            after each hidden Dense layer, before the activation.

    Returns:
        A ``tf.keras.Model`` with inputs ``[input_ids, attention_mask]``
        (each declared with shape ``(128,)`` and dtype int32) and the
        softmax layer as output.
    """
    layers = tf.keras.layers

    # Two fixed-length integer inputs, named to match the dataset feature keys.
    token_ids_in = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_ids')
    mask_in = tf.keras.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

    # Encode and keep only the hidden state of the first sequence position.
    encoded = base_model(token_ids_in, attention_mask=mask_in)
    hidden = encoded.last_hidden_state[:, 0, :]

    # Progressively narrower dense stages; layers are created in the same
    # order for every stage: Dense, (BatchNorm), Activation, Dropout.
    for units in (512, 256, 128, 64):
        hidden = layers.Dense(units)(hidden)
        if use_batchnorm:
            hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation(activation_function)(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

    probabilities = layers.Dense(3, activation='softmax')(hidden)

    return tf.keras.Model(inputs=[token_ids_in, mask_in], outputs=probabilities)

In [None]:
# Function to get optimizer
def get_optimizer(name, learning_rate=5e-5):
    """Create a Keras optimizer from its short name.

    Args:
        name: ``'adam'`` or ``'sgd'``.
        learning_rate: Learning rate passed to the optimizer constructor.

    Returns:
        A ``tf.keras.optimizers.Adam`` or ``tf.keras.optimizers.SGD``
        instance.

    Raises:
        ValueError: If ``name`` is neither ``'adam'`` nor ``'sgd'``.
    """
    # Reject unknown names up front so the happy path below stays flat.
    if name not in ('adam', 'sgd'):
        raise ValueError(f'Unsupported optimizer: {name}')

    optimizers = tf.keras.optimizers
    optimizer_cls = optimizers.Adam if name == 'adam' else optimizers.SGD
    return optimizer_cls(learning_rate=learning_rate)

In [None]:
# Define configurations
# Each dict describes one training run. The training loop reads 'optimizer'
# and 'batch_size' directly and forwards 'dropout_rate',
# 'activation_function' and 'use_batchnorm' to build_model.
configurations = [
    {'optimizer': 'adam', 'batch_size': 32, 'dropout_rate': 0.1, 'activation_function': 'relu', 'use_batchnorm': False},
    {'optimizer': 'adam', 'batch_size': 32, 'dropout_rate': 0.2, 'activation_function': 'tanh', 'use_batchnorm': True},
    {'optimizer': 'sgd', 'batch_size': 64, 'dropout_rate': 0.1, 'activation_function': 'relu', 'use_batchnorm': False},
    {'optimizer': 'sgd', 'batch_size': 16, 'dropout_rate': 0.2, 'activation_function': 'tanh', 'use_batchnorm': True},
]

# Load the base RoBERTa model and freeze layers
# Every top-level layer of the pretrained model is marked non-trainable.
# This single instance is loaded once here and passed to build_model for
# every configuration.
base_model = TFRobertaModel.from_pretrained('roberta-base')
for layer in base_model.layers:
    layer.trainable = False

# Accumulators filled by the training loop: `histories` receives
# (History, config) pairs, `results` receives one dict of test metrics per
# configuration.
histories = []
results = []

In [None]:
# Loop over configurations
# For each configuration: build a fresh classification head on the shared
# base model, train with early stopping, evaluate on the test set, print and
# plot the outcome, and record it in `histories` / `results`.
for i, config in enumerate(configurations):
    print(f"Training configuration {i+1}/{len(configurations)}: {config}")
    
    # Build the model
    model = build_model(
        base_model,
        activation_function=config['activation_function'],
        dropout_rate=config['dropout_rate'],
        use_batchnorm=config['use_batchnorm']
    )
    
    # Get optimizer
    # Only the name is passed, so get_optimizer's default learning rate is
    # used for every configuration.
    optimizer = get_optimizer(config['optimizer'])
    
    # Compile the model
    # The sparse loss matches the integer class ids (0, 1, 2) used as labels.
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Create datasets with specified batch size
    # Rebuilt on every iteration because the batch size is part of the config.
    batch_size = config['batch_size']
    train_dataset = create_tf_dataset(train_input_ids, train_attention_mask, train_labels, batch_size)
    val_dataset = create_tf_dataset(val_input_ids, val_attention_mask, val_labels, batch_size)
    test_dataset = create_tf_dataset(test_input_ids, test_attention_mask, test_labels, batch_size)
    
    # Set up callbacks
    # Stop once val_loss has not improved for 2 epochs and roll the model back
    # to the weights of the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    
    # Fit the model using the validation set
    history = model.fit(
        train_dataset,
        epochs=5,
        validation_data=val_dataset,
        callbacks=[early_stopping],
        verbose=1
    )
    
    # Record the history and config
    # Stored as a pair so the plotting cells can label each curve with its config.
    histories.append((history, config))
    
    # Evaluate the final model on the test set
    loss, accuracy = model.evaluate(test_dataset)
    print(f"Test accuracy: {accuracy}")
    
    # Generate predictions for the test set
    # argmax over the 3 softmax outputs gives the predicted class id.
    y_pred_probs = model.predict(test_dataset)
    y_pred = np.argmax(y_pred_probs, axis=1)
    
    # Ground truth is the unbatched test_labels array. create_tf_dataset batches
    # without shuffling, so predictions come back in the same order.
    y_true = test_labels
    
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Classification report
    # target_names are listed in class-id order: 0, 1, 2.
    report = classification_report(y_true, y_pred, target_names=['Negative', 'Neutral', 'Positive'])
    print(f"Classification Report for Configuration {i+1}:\n{report}")
    
    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Neutral', 'Positive'], yticklabels=['Negative', 'Neutral', 'Positive'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix for Configuration {i+1}')
    plt.show()
    
    # Store results
    results.append({'config': config, 'accuracy': accuracy, 'loss': loss, 'confusion_matrix': cm, 'classification_report': report})
    
    # Clear session and delete model to free memory
    # NOTE(review): base_model is created outside this loop and is reused on
    # the next iteration after clear_session() - confirm this is intended.
    tf.keras.backend.clear_session()
    del model

In [None]:
# Visualize the results

# One figure per configuration: training vs. validation accuracy by epoch.
for i, (history, config) in enumerate(histories):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(history.history['accuracy'], label='Train Accuracy')
    ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax.set_title(f"Configuration {i+1}: {config}")
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    plt.show()

In [None]:
# One figure per configuration: training vs. validation loss by epoch.
for i, (history, config) in enumerate(histories):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(history.history['loss'], label='Train Loss')
    ax.plot(history.history['val_loss'], label='Validation Loss')
    ax.set_title(f"Configuration {i+1}: {config}")
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

In [None]:
# Summarize the runs: one row per configuration, with its hyperparameters
# followed by the test accuracy and test loss.
results_df = pd.DataFrame([
    {
        **{
            key: res['config'][key]
            for key in (
                'optimizer',
                'batch_size',
                'dropout_rate',
                'activation_function',
                'use_batchnorm',
            )
        },
        'accuracy': res['accuracy'],
        'loss': res['loss'],
    }
    for res in results
])

print(results_df)