Initial Steps for Model Training and Evaluation

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Read the three pre-processed splits (train / validation / test) from disk
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/cleaned_train_data.csv',
        '../data/processed/cleaned_val_data.csv',
        '../data/processed/cleaned_test_data.csv',
    )
)

# Sanity check: report the (rows, columns) shape of every split
for size_label, split_frame in (
    ("Training data size:", train_data),
    ("Validation data size:", val_data),
    ("Test data size:", test_data),
):
    print(size_label, split_frame.shape)

Training data size: (710042, 2)
Validation data size: (52581, 2)
Test data size: (52582, 2)


Prepare Data for Model Input

In [3]:
def _split_features_labels(frame):
    """Return ``(texts, labels)`` for one data split.

    Args:
        frame: DataFrame with a 'Review_Text' column and a 'Sentiment' column.

    Returns:
        A tuple of two Series: the review texts cast to ``str`` and the
        sentiment labels, both keeping the index of ``frame``.

    Missing review texts are replaced with an empty string *before* the cast:
    a bare ``astype('str')`` would turn every missing value into the literal
    text 'nan', which would then be tokenized as if it were a real review.
    """
    texts = frame['Review_Text'].fillna('').astype('str')
    labels = frame['Sentiment']
    return texts, labels


# Separate features and labels for each split
X_train_cleaned, y_train = _split_features_labels(train_data)
X_val_cleaned, y_val = _split_features_labels(val_data)
X_test_cleaned, y_test = _split_features_labels(test_data)

print("Features and labels successfully isolated.")

Features and labels successfully isolated.


Tokenize and Encode Text Data To Prepare It for Transformers

In [4]:
from src.feature_extraction import get_tokenizer, encode_texts

# Obtain the tokenizer from the project helper
tokenizer = get_tokenizer()

# Encode every split with the same tokenizer, in train -> validation -> test order.
# NOTE(review): encode_texts is a project helper; presumably it returns a mapping
# of model inputs (it is passed to dict(...) later) -- confirm in src.feature_extraction.
X_train_encoded, X_val_encoded, X_test_encoded = (
    encode_texts(split_texts, tokenizer)
    for split_texts in (X_train_cleaned, X_val_cleaned, X_test_cleaned)
)

print("The training, validation, and test inputs have been successfully encoded.")

The training, validation, and test inputs have been successfully encoded.


Prepare Data for TensorFlow

In [5]:
# Pull the underlying arrays out of the label Series so TensorFlow can slice them
y_train_tf, y_val_tf, y_test_tf = (
    label_series.values for label_series in (y_train, y_val, y_test)
)

print("Labels for TensorFlow prepared.")

Labels for TensorFlow prepared.


Create TensorFlow Datasets and Split Into Batches

In [6]:
# Define batch size and epochs
batch_size = 16
epochs = 3


def _make_dataset(encodings, labels, shuffle=False):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        encodings: Encoded inputs for one split; converted with ``dict(...)``
            before being sliced together with ``labels``.
        labels: Label array aligned with ``encodings``.
        shuffle: When True, shuffle with a buffer as large as the split so
            every example can land in any position.

    Returns:
        The dataset, batched by ``batch_size`` and prefetched so that input
        preparation can overlap with the consumer's work on the previous batch.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    if shuffle:
        dataset = dataset.shuffle(len(labels))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Create TensorFlow datasets for training and evaluation.
# Only the training split is shuffled; validation and test keep their order.
train_dataset = _make_dataset(X_train_encoded, y_train_tf, shuffle=True)
validation_dataset = _make_dataset(X_val_encoded, y_val_tf)
test_dataset = _make_dataset(X_test_encoded, y_test_tf)

print("TensorFlow Datasets have been successfully prepared.")

TensorFlow Datasets have been successfully prepared.


Build, Compile, and Train the Model with Early Stopping

In [None]:
from src.model_training import build_model

# Instantiate the model via the project helper
model = build_model()

# Early stopping watches validation loss and keeps the best weights seen.
# NOTE(review): with patience=2 and epochs=3 the callback cannot stop training
# before the final epoch -- confirm this combination is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# Fit on the training set, validating after every epoch
fit_options = dict(
    epochs=epochs,
    validation_data=validation_dataset,
    callbacks=callbacks,
)
history = model.fit(train_dataset, **fit_options)

Model Evaluation on Test Data

In [None]:
# Score the trained model on the held-out test split.
# The result is unpacked into exactly two values (loss, accuracy);
# presumably build_model compiles with a single accuracy metric -- TODO confirm.
evaluation_result = model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation_result

print(
    f"Test Loss: {test_loss}",
    f"Test Accuracy: {test_accuracy}",
    sep="\n",
)