Model Building and Training - Encode Training and Test Data

In [None]:
import pandas as pd
from src.feature_extraction import get_tokenizer, encode_texts

# Load the tokenizer that is later passed to encode_texts for both splits.
tokenizer = get_tokenizer()

# Read the cleaned training data from disk.
train_data = pd.read_csv('../data/processed/cleaned_train_data.csv')

# read_csv parses empty text cells as NaN. Casting NaN straight to str would
# typically hand the literal text "nan" to the tokenizer, so missing texts are
# replaced with an empty string before the cast.
X_train_cleaned = train_data['Combined_Text'].fillna('').astype('str')

# Target labels taken from the same frame, so they stay row-aligned with the texts.
# NOTE(review): the name suggests resampling was done upstream — confirm.
y_train_resampled = train_data['Sentiment']

# Preview the first rows as the cell output.
train_data.head()

In [None]:
# Read the cleaned test data from disk.
test_data = pd.read_csv('../data/processed/cleaned_test_data.csv')

# read_csv parses empty text cells as NaN. Casting NaN straight to str would
# typically hand the literal text "nan" to the tokenizer, so missing texts are
# replaced with an empty string before the cast.
X_test_cleaned = test_data['Combined_Text'].fillna('').astype('str')

# Target labels taken from the same frame, so they stay row-aligned with the texts.
y_test = test_data['Sentiment']

# Preview the first rows as the cell output.
test_data.head()

In [None]:
# Run both text splits through the same tokenizer: training first, then test.
X_train_encoded, X_test_encoded = (
    encode_texts(X_train_cleaned, tokenizer),
    encode_texts(X_test_cleaned, tokenizer),
)

# Confirmation message once both encodings have finished.
print("The training and test features have been successfully encoded.")

Prepare Data for Training

In [None]:
import tensorflow as tf

# Pull the label values out of the pandas Series so they can be sliced
# alongside the encoded features.
y_train_tf = y_train_resampled.values
y_test_tf = y_test.values

# Training hyperparameters; `epochs` is consumed later by model.fit.
batch_size = 16
epochs = 3

# Fixed seed so the shuffled training order is reproducible after a kernel
# restart instead of changing on every run.
shuffle_seed = 42

# Training pipeline: pair each encoded example with its label, shuffle with a
# buffer as large as the whole training set, then group into batches.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_encoded),
    y_train_tf
)).shuffle(len(y_train_tf), seed=shuffle_seed).batch(batch_size)

# Evaluation pipeline: same pairing, batched in the original order (no shuffle).
# NOTE(review): this dataset is built from the test split, so validation
# metrics reported during training are test-set metrics — confirm this is
# intended rather than holding out a separate validation split.
validation_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_test_encoded),
    y_test_tf
)).batch(batch_size)

Training the Model

In [None]:
from src.model_training import build_model

# Instantiate the model via the project helper.
model = build_model()

# Fit on the batched training data for `epochs` passes, scoring against
# validation_dataset as training progresses; the value returned by fit is
# kept in `history`.
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)

Model Evaluation on Test Data

In [None]:
# Score the trained model on validation_dataset.
# NOTE(review): unpacking into two names assumes evaluate returns exactly a
# loss and one metric — confirm against how build_model compiles the model.
test_loss, test_accuracy = model.evaluate(validation_dataset)

# Report both numbers, one per line.
print(f"Test Loss: {test_loss}", f"Test Accuracy: {test_accuracy}", sep="\n")