In [16]:
# Cell 1

# Required Libraries and Their Purposes:
# - pandas: Used for data manipulation and analysis, particularly for reading and writing CSV files.
# - transformers: Provides the BERT model and tokenizer for sequence classification tasks.
# - tensorflow: Used for model training, including defining, compiling, and fitting neural networks.
# - numpy: Supports numerical operations, such as manipulating arrays and tensors.
# - sklearn: Used for splitting data into training and validation sets.

# Check and Print Library Versions
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel
import os
import glob
from sklearn.model_selection import train_test_split

print("Pandas version:", pd.__version__)
print("TensorFlow version:", tf.__version__)
print("NumPy version:", np.__version__)
print("scikit-learn version:", sklearn.__version__)
print("Transformers version:", transformers.__version__)

# Required pip installs for the project (Uncomment to install)

# !pip install pandas==2.2.2
# !pip install tensorflow==2.17.0
# !pip install numpy==1.23.5
# !pip install scikit-learn==1.4.2
# !pip install transformers==4.44.1


Pandas version: 2.2.2
TensorFlow version: 2.17.0
NumPy version: 1.23.5
scikit-learn version: 1.4.2
Transformers version: 4.44.1


In [17]:
# Cell 2: Training

# Description:
# Trains a multitask BERT-based model to evaluate customer interactions by 
# predicting categories, quality levels, and sentiment scores from text data. 
# It includes functions for data preprocessing, such as concatenating multiple 
# CSV files, tokenizing text inputs with a BERT tokenizer, and mapping 
# labels to numerical values. The create_multitask_model function constructs a 
# TensorFlow Keras model with separate output layers for category classification, 
# quality assessment, and sentiment regression. The train_and_save_model function 
# orchestrates the workflow by loading and preparing the data, splitting it into 
# training and validation sets, training the model with early stopping, saving 
# the trained model and tokenizer, and generating predictions that are saved to 
# a CSV file for further analysis.

# import os
# import glob
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from transformers import BertTokenizer, TFBertModel
# from sklearn.model_selection import train_test_split

# Function to scale sentiment output
def scale_sentiment_output(x):
    return 0.1 + 0.8 * x

# Function to create the model using Functional API
def create_multitask_model(bert_model, max_length=128):
    # Define Inputs
    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # Obtain BERT outputs
    bert_output = bert_model([input_ids, attention_mask])[0]  # Last hidden state
    cls_token = bert_output[:, 0, :]  # CLS token

    # Define Outputs
    category_output = tf.keras.layers.Dense(4, activation='softmax', name='category_output')(cls_token)
    quality_output = tf.keras.layers.Dense(3, activation='softmax', name='quality_output')(cls_token)
    sentiment_linear = tf.keras.layers.Dense(1, activation='sigmoid', name='sentiment_linear')(cls_token)
    sentiment_output = tf.keras.layers.Lambda(scale_sentiment_output, name='sentiment_output')(sentiment_linear)

    # Create Model
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs={
            'category_output': category_output,
            'quality_output': quality_output,
            'sentiment_output': sentiment_output
        }
    )

    return model

# Function to concatenate all files into a single DataFrame
def concatenate_files(file_paths):
    data_frames = []
    for file_path in file_paths:
        print(f"Loading file: {file_path}")
        df = pd.read_csv(file_path)
        data_frames.append(df)
    concatenated_df = pd.concat(data_frames, ignore_index=True)
    return concatenated_df

# Function to tokenize texts
def tokenize_texts(tokenizer, texts, max_length=128):
    try:
        inputs = tokenizer(
            texts.tolist(),
            padding='max_length',
            truncation=True,
            return_tensors="tf",
            max_length=max_length
        )
        print("Tokenization completed successfully.")
        return inputs
    except Exception as e:
        print(f"Error during tokenization: {e}")
        raise

# Function to map labels
def map_labels(data):
    category_mapping = {
        'Greetings': 0,
        'Problem Investigation': 1,
        'Closure': 2,
        'Account Verification': 3
    }
    quality_mapping = {
        'Positive': 0,
        'Neutral': 1,
        'Negative': 2
    }
    data['Category Label'] = data['Category Truth'].map(category_mapping)
    data['Quality Label'] = data['Quality Truth'].map(quality_mapping)
    data['Sentiment Label'] = data['Sentiment Truth'].astype(float)
    return data

# Function to make predictions and save to CSV, including truth columns
def predict_and_save(model, tokenizer, data, output_dir):
    texts = data['Text']
    inputs = tokenize_texts(tokenizer, texts)

    predictions = model.predict([inputs['input_ids'], inputs['attention_mask']])

    # Extract predictions
    predicted_categories = np.argmax(predictions['category_output'], axis=1)
    predicted_qualities = np.argmax(predictions['quality_output'], axis=1)
    predicted_sentiments = predictions['sentiment_output'].flatten()

    # Convert predicted integer labels back to their respective categories
    inverse_category_mapping = {0: 'Greetings', 1: 'Problem Investigation', 2: 'Closure', 3: 'Account Verification'}
    inverse_quality_mapping = {0: 'Positive', 1: 'Neutral', 2: 'Negative'}

    # Map predictions to their respective labels
    data['Predicted Category'] = pd.Series(predicted_categories).map(inverse_category_mapping)
    data['Predicted Quality'] = pd.Series(predicted_qualities).map(inverse_quality_mapping)
    data['Predicted Sentiment'] = predicted_sentiments.round(2)  # Rounded for readability

    # Save the updated DataFrame with predictions, including the original truth columns
    output_file = os.path.join(output_dir, 'predictions.csv')
    data[['Person', 'Text', 'Category Truth', 'Quality Truth', 'Sentiment Truth',
          'Predicted Category', 'Predicted Quality', 'Predicted Sentiment']].to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Function to train and save the model using Functional API
def train_and_save_model():
    # Define your input and output directories for CSV files
    input_dir = 'data/training/predictionTraining'
    output_dir = 'data/training/predictionSolution'
    os.makedirs(output_dir, exist_ok=True)

    # Directory for saving the model and tokenizer
    bert_dir = 'fine_tuned_bert'
    os.makedirs(bert_dir, exist_ok=True)

    # Define model path
    model_path = os.path.join(bert_dir, 'saved_model')

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Get list of all CSV files in the input directory
    file_paths = glob.glob(os.path.join(input_dir, '*.csv'))

    if not file_paths:
        print(f"No CSV files found in the directory: {input_dir}")
        return

    # Concatenate all files into a single DataFrame
    data = concatenate_files(file_paths)
    print("All files concatenated into one DataFrame.")

    # Map labels
    data = map_labels(data)

    # Prepare texts and labels
    texts = data['Text']
    labels_category = data['Category Label'].values
    labels_quality = data['Quality Label'].values
    labels_sentiment = data['Sentiment Label'].values

    # Tokenize texts
    inputs = tokenize_texts(tokenizer, texts)

    # Convert inputs to numpy arrays
    input_ids = inputs['input_ids'].numpy()
    attention_mask = inputs['attention_mask'].numpy()

    # Split the data into training and validation sets
    train_indices, val_indices = train_test_split(
        np.arange(len(input_ids)),
        test_size=0.2,
        random_state=42,
        stratify=labels_category
    )

    x_train_ids = input_ids[train_indices]
    x_val_ids = input_ids[val_indices]
    x_train_mask = attention_mask[train_indices]
    x_val_mask = attention_mask[val_indices]

    y_train_category = labels_category[train_indices]
    y_val_category = labels_category[val_indices]
    y_train_quality = labels_quality[train_indices]
    y_val_quality = labels_quality[val_indices]
    y_train_sentiment = labels_sentiment[train_indices]
    y_val_sentiment = labels_sentiment[val_indices]

    # Prepare training and validation data
    train_dataset = (
        (x_train_ids, x_train_mask),
        {
            'category_output': y_train_category,
            'quality_output': y_train_quality,
            'sentiment_output': y_train_sentiment
        }
    )
    val_dataset = (
        (x_val_ids, x_val_mask),
        {
            'category_output': y_val_category,
            'quality_output': y_val_quality,
            'sentiment_output': y_val_sentiment
        }
    )

    # Check if model exists
    if os.path.exists(model_path):
        print(f"Loading existing model from {model_path}...")
        model = tf.keras.models.load_model(
            model_path,
            custom_objects={
                'TFBertModel': TFBertModel,
                'scale_sentiment_output': scale_sentiment_output
            }
        )
        print("Model loaded successfully.")

        # Check if 'tf_model.h5' exists in bert_dir
        tf_model_h5_path = os.path.join(bert_dir, 'tf_model.h5')
        if os.path.exists(tf_model_h5_path):
            # Load the fine-tuned BERT model separately
            bert_model = TFBertModel.from_pretrained(bert_dir)
            print("Fine-tuned BERT model loaded from fine_tuned_bert.")
        else:
            # Get the bert_model from the loaded model
            try:
                bert_model = model.get_layer('bert')
                print("BERT model extracted from loaded model.")
            except ValueError:
                # Layer not found, try alternative names
                bert_model = None
                for layer in model.layers:
                    if isinstance(layer, TFBertModel):
                        bert_model = layer
                        print(f"BERT model found in model layers: {layer.name}")
                        break
                if bert_model is None:
                    raise ValueError("Could not find TFBertModel in the loaded model.")
    else:
        # Create bert_model
        bert_model = TFBertModel.from_pretrained('bert-base-uncased', name='bert')
        print("Base BERT model loaded.")
        # Create the multi-task model using bert_model
        model = create_multitask_model(bert_model, max_length=128)
        print("New model created.")

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'category_output': 'sparse_categorical_crossentropy',
            'quality_output': 'sparse_categorical_crossentropy',
            'sentiment_output': 'mean_squared_error'
        },
        metrics={
            'category_output': 'accuracy',
            'quality_output': 'accuracy',
            'sentiment_output': 'mae'
        }
    )
    print("Model compiled successfully.")

    # Define early stopping callback
    earlystopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
    )

    # Train the model
    print("Starting model training...")
    model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        validation_data=val_dataset,
        epochs=10,  # Adjust epochs as needed
        batch_size=16,
        callbacks=[earlystopping_callback],
        verbose=1
    )
    print("Model training completed.")

    # Save the entire model in TensorFlow's SavedModel format
    print(f"Saving the entire model to {model_path}...")
    model.save(model_path, save_format='tf')
    print("Model saved.")

    # Save the fine-tuned BERT model and tokenizer
    print("Saving the fine-tuned BERT model and tokenizer...")
    bert_model.save_pretrained(bert_dir)
    tokenizer.save_pretrained(bert_dir)
    print("Fine-tuned BERT model and tokenizer saved.")

    # Make predictions and save results
    predict_and_save(model, tokenizer, data, output_dir)

# Execute the training or load existing model and predict
if __name__ == "__main__":
    train_and_save_model()




Loading file: data/training/predictionTraining\prediction1.csv
Loading file: data/training/predictionTraining\prediction2.csv
Loading file: data/training/predictionTraining\prediction3.csv
Loading file: data/training/predictionTraining\prediction4.csv
Loading file: data/training/predictionTraining\prediction5.csv
Loading file: data/training/predictionTraining\prediction6.csv
All files concatenated into one DataFrame.
Tokenization completed successfully.
Loading existing model from fine_tuned_bert\saved_model...
Model loaded successfully.


All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at fine_tuned_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Fine-tuned BERT model loaded from fine_tuned_bert.
Model compiled successfully.
Starting model training...
Epoch 1/10
















Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.
Model training completed.
Saving the entire model to fine_tuned_bert\saved_model...
INFO:tensorflow:Assets written to: fine_tuned_bert\saved_model\assets


INFO:tensorflow:Assets written to: fine_tuned_bert\saved_model\assets


Model saved.
Saving the fine-tuned BERT model and tokenizer...
Fine-tuned BERT model and tokenizer saved.
Tokenization completed successfully.
Predictions saved to data/training/predictionSolution\predictions.csv


In [18]:
# Cell 3: Combined evaluation with appending logic

# Description:
# Processes multiple CSV files within the data/training/predictionSolution 
# directory to evaluate the accuracy of predicted categories, quality levels, 
# and sentiment scores against their true values. For each file, it calculates
# the percentage accuracy for category and quality predictions, as well as 
# sentiment accuracy based on a specified tolerance level. The results are 
# aggregated and appended to a consolidated CSV file (evaluation.csv) in the 
# data/training/evaluations directory, ensuring continuous updates without 
# overwriting existing data.

# import os
# import glob
# import pandas as pd

def calculate_scores(file_path, tolerance=0.1):
    
    # Load the data
    data = pd.read_csv(file_path)
    
    # Ensure that the necessary columns exist
    required_columns = ['Category Truth', 'Predicted Category', 
                        'Quality Truth', 'Predicted Quality',
                        'Sentiment Truth', 'Predicted Sentiment']
    for col in required_columns:
        if col not in data.columns:
            raise KeyError(f"Column '{col}' is missing from the data.")
    
    # Calculate Category Accuracy
    category_correct = (data['Predicted Category'] == data['Category Truth']).sum()
    total_predictions = len(data)
    category_accuracy = (category_correct / total_predictions) * 100
    
    # Calculate Quality Accuracy
    quality_correct = (data['Predicted Quality'] == data['Quality Truth']).sum()
    quality_accuracy = (quality_correct / total_predictions) * 100
    
    # Calculate Sentiment Accuracy
    sentiment_within_tolerance = (abs(data['Predicted Sentiment'] - data['Sentiment Truth']) <= tolerance).sum()
    sentiment_accuracy = (sentiment_within_tolerance / total_predictions) * 100
    
    # Prepare the results dictionary
    results = {
        'file': os.path.basename(file_path),
        'category_accuracy (%)': round(category_accuracy, 2),
        'quality_accuracy (%)': round(quality_accuracy, 2),
        'sentiment_accuracy (%)': round(sentiment_accuracy, 2),
    }
    
    return results

def process_multiple_files(input_directory, output_file, tolerance=0.1):
    
    # Find all CSV files in the input directory
    file_paths = glob.glob(os.path.join(input_directory, '*.csv'))
    
    if not file_paths:
        print(f"No CSV files found in the directory: {input_directory}")
        return

    # Initialize an empty list to store all evaluation results
    all_results = []

    for file in file_paths:
        print(f"Processing file: {file}")
        try:
            # Calculate scores and append the results to the list
            all_results.append(calculate_scores(file, tolerance))
        except Exception as e:
            print(f"Error processing {file}: {e}")
            continue  # Proceed to the next file
    
    if not all_results:
        print("No evaluation results to save.")
        return

    # Convert the list of results into a DataFrame
    results_df = pd.DataFrame(all_results)

    # Check if the output file already exists
    if os.path.exists(output_file):
        # If the file exists, load it and append the new results
        existing_df = pd.read_csv(output_file)
        results_df = pd.concat([existing_df, results_df], ignore_index=True)

    # Save the results, appending to the existing data
    os.makedirs(os.path.dirname(output_file), exist_ok=True)  # Ensure the output directory exists
    results_df.to_csv(output_file, index=False)
    
    print(f"Results appended to {output_file}")

# Example usage
if __name__ == "__main__":
    # Directory containing evaluation files
    input_dir = 'data/training/predictionSolution'
    
    # Path to save combined evaluation results
    output_file = 'data/training/evaluations/evaluation.csv'
    
    # Process all evaluation files in the directory and save to a single file
    process_multiple_files(input_dir, output_file, tolerance=0.1)


Processing file: data/training/predictionSolution\predictions.csv
Results appended to data/training/evaluations/evaluation.csv
