In [1]:
# Cell 1

# Required Libraries and Their Purposes:
# - pandas: Used for data manipulation and analysis, particularly for reading and writing CSV files.
# - transformers: Provides the BERT model and tokenizer for sequence classification tasks.
# - tensorflow: Used for model training, including defining, compiling, and fitting neural networks.
# - numpy: Supports numerical operations, such as manipulating arrays and tensors.
# - sklearn: Used for splitting data into training and validation sets.

# Check and Print Library Versions
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel
import os
import glob

# Report the installed version of each core dependency so a run can be checked
# against the pinned versions listed in the pip commands below.
print("Pandas version:", pd.__version__)
print("TensorFlow version:", tf.__version__)
print("NumPy version:", np.__version__)
print("scikit-learn version:", sklearn.__version__)
print("Transformers version:", transformers.__version__)

# Required pip installs for the project (Uncomment to install)

# !pip install pandas==2.2.2
# !pip install tensorflow==2.17.0
# !pip install numpy==1.23.5
# !pip install scikit-learn==1.4.2
# !pip install transformers==4.44.1



Pandas version: 2.2.2
TensorFlow version: 2.17.0
NumPy version: 1.23.5
scikit-learn version: 1.4.2
Transformers version: 4.44.1


In [5]:
# # This Training and Prediction Pipeline processes multiple CSV files by first 
# # concatenating them into a single DataFrame. It uses a pre-trained BERT-based 
# # model to tokenize the text and generate predictions for various submetrics 
# # (e.g., "Thank Customer," "Ask Permission," etc.). If no model is available, 
# # it will train a new one using the concatenated data. The model is trained with 
# # early stopping to prevent overfitting, and the predictions are then saved to a 
# # CSV file for further evaluation.

# This is better for training due to a larger data pool, but if you try to 
# evaluate it, you may not get accurate percentages because, with a large enough 
# data set, you will almost certainly have at least one 1 in every column/row, 
# which will skew the go or no go scoring system.

# import os
# import glob
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from transformers import BertTokenizer, TFBertForSequenceClassification

# # Load and preprocess the data
# def load_data(file_path):
#     data = pd.read_csv(file_path)
#     return data

# # Map the labels for category prediction
# def map_labels(data):
#     category_mapping = {'Greetings': 0, 'Account Verification': 1, 'Problem Investigation': 2, 'Closure': 3}
#     data['Category Label'] = data['Category Truth'].apply(lambda x: category_mapping[x])
#     return data

# # Concatenate multiple files into a single DataFrame
# def concatenate_files(file_paths):
#     data_frames = []
#     for file_path in file_paths:
#         print(f"Loading file: {file_path}")
#         df = pd.read_csv(file_path)
#         data_frames.append(df)
#     concatenated_df = pd.concat(data_frames, ignore_index=True)
#     return concatenated_df

# # Tokenize the input text using BERT tokenizer
# def tokenize_texts(tokenizer, texts, max_length=128):
#     return tokenizer(
#         texts.tolist(),
#         padding='max_length',
#         truncation=True,
#         max_length=max_length,
#         return_tensors="tf"
#     )

# # Function to load an existing model and tokenizer
# def load_saved_model_and_tokenizer():
#     try:
#         # Attempt to load the Keras model
#         model = tf.keras.models.load_model('sent_ana_model', custom_objects={'TFBertForSequenceClassification': TFBertForSequenceClassification})
#         tokenizer = BertTokenizer.from_pretrained('sent_ana_model')
#         print("Model loaded successfully.")
#     except Exception as e:
#         print("No existing model found. Building and training a new model.")
#         print(e)
#         model = build_model()  # Define this function as per your architecture if needed
#         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     return model, tokenizer

# # Compile and train the model with early stopping
# def train_model(model, tokenizer, data, input_dir, output_dir):
#     # Tokenize the text data
#     inputs = tokenize_texts(tokenizer, data['Text'])

#     # Define labels for each task
#     labels_category = data['Category Label']
#     labels_thank_customer = data['Thank Customer']
#     labels_introduce_self = data['Introduce Self']
#     labels_ask_reason = data['Ask Reason']
#     labels_ask_accurate = data['Ask Accurate Details']
#     labels_ask_permission = data['Ask Permission']
#     labels_resolve_issue = data['Resolve Issue']
#     labels_offer_assistance = data['Offer Assistance']
#     labels_thank_again = data['Thank Again']
#     labels_farewell = data['Farewell']

#     # Prepare the dataset for training
#     dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), 
#         (labels_category, labels_thank_customer, labels_introduce_self, labels_ask_reason,
#          labels_ask_accurate, labels_ask_permission, labels_resolve_issue, labels_offer_assistance,
#          labels_thank_again, labels_farewell)
#     )).shuffle(100).batch(16)

#     # Compile the model
#     optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
#     loss = {
#         'category_output': 'sparse_categorical_crossentropy',
#         'thank_customer': 'sparse_categorical_crossentropy',
#         'introduce_self': 'sparse_categorical_crossentropy',
#         'ask_reason': 'sparse_categorical_crossentropy',
#         'ask_accurate': 'sparse_categorical_crossentropy',
#         'ask_permission': 'sparse_categorical_crossentropy',
#         'resolve_issue': 'sparse_categorical_crossentropy',
#         'offer_assistance': 'sparse_categorical_crossentropy',
#         'thank_again': 'sparse_categorical_crossentropy',
#         'farewell': 'sparse_categorical_crossentropy'
#     }
#     metrics = {
#         'category_output': 'accuracy',
#         'thank_customer': 'accuracy',
#         'introduce_self': 'accuracy',
#         'ask_reason': 'accuracy',
#         'ask_accurate': 'accuracy',
#         'ask_permission': 'accuracy',
#         'resolve_issue': 'accuracy',
#         'offer_assistance': 'accuracy',
#         'thank_again': 'accuracy',
#         'farewell': 'accuracy'
#     }

#     model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

#     # Add early stopping
#     early_stopping_callback = tf.keras.callbacks.EarlyStopping(
#         monitor='val_loss',
#         patience=3,  # Stop training after 3 epochs of no improvement
#         restore_best_weights=True
#     )

#     # Train the model with early stopping
#     model.fit(dataset, epochs=10, callbacks=[early_stopping_callback])

#     # Save the model using Keras method
#     model.save('sent_ana_model')  # Save the Keras model
#     tokenizer.save_pretrained('sent_ana_model')  # Save the tokenizer

#     return model

# # Make predictions and save to CSV
# def make_predictions(model, tokenizer, data, output_file):
#     # Tokenize the input text
#     inputs = tokenize_texts(tokenizer, data['Text'])

#     # Predict for each output layer
#     predictions = model.predict(dict(inputs))

#     # Convert predictions to binary format for sub-criteria (0 or 1)
#     prediction_category = np.argmax(predictions[0], axis=1)
#     prediction_thank_customer = np.argmax(predictions[1], axis=1)
#     prediction_introduce_self = np.argmax(predictions[2], axis=1)
#     prediction_ask_reason = np.argmax(predictions[3], axis=1)
#     prediction_ask_accurate = np.argmax(predictions[4], axis=1)
#     prediction_ask_permission = np.argmax(predictions[5], axis=1)
#     prediction_resolve_issue = np.argmax(predictions[6], axis=1)
#     prediction_offer_assistance = np.argmax(predictions[7], axis=1)
#     prediction_thank_again = np.argmax(predictions[8], axis=1)
#     prediction_farewell = np.argmax(predictions[9], axis=1)

#     # Add predictions to the dataframe
#     data['Predicted Category'] = prediction_category
#     data['Predicted Thank Customer'] = prediction_thank_customer
#     data['Predicted Introduce Self'] = prediction_introduce_self
#     data['Predicted Ask Reason'] = prediction_ask_reason
#     data['Predicted Ask Accurate Details'] = prediction_ask_accurate
#     data['Predicted Ask Permission'] = prediction_ask_permission
#     data['Predicted Resolve Issue'] = prediction_resolve_issue
#     data['Predicted Offer Assistance'] = prediction_offer_assistance
#     data['Predicted Thank Again'] = prediction_thank_again
#     data['Predicted Farewell'] = prediction_farewell

#     # Save the results to a CSV file
#     data.to_csv(output_file, index=False)
#     print(f"Predictions saved to {output_file}")

# # Example pipeline
# def run_pipeline():
#     # Input and output directories for CSV files
#     input_dir = 'data/training/metricTraining'
#     output_dir = 'data/training/metricSolution'
#     os.makedirs(output_dir, exist_ok=True)

#     # Get list of all CSV files in the input directory
#     file_paths = glob.glob(os.path.join(input_dir, '*.csv'))
    
#     if not file_paths:
#         print(f"No CSV files found in the directory: {input_dir}")
#         return

#     # Concatenate all files into a single DataFrame
#     data = concatenate_files(file_paths)
#     print("All files concatenated into one DataFrame.")
    
#     # Try loading an existing model and tokenizer, otherwise build a new one
#     model, tokenizer = load_saved_model_and_tokenizer()

#     # Map the labels
#     data = map_labels(data)

#     # Train the model
#     model = train_model(model, tokenizer, data, input_dir, output_dir)

#     # Make predictions and save to a CSV file
#     make_predictions(model, tokenizer, data, os.path.join(output_dir, 'predictions.csv'))

# # Run the pipeline
# run_pipeline()


Loading file: data/training/metricTraining\metric1.csv
Loading file: data/training/metricTraining\metric2.csv
Loading file: data/training/metricTraining\metric3.csv
Loading file: data/training/metricTraining\metric4.csv
Loading file: data/training/metricTraining\metric5.csv
Loading file: data/training/metricTraining\metric6.csv
All files concatenated into one DataFrame.
Model loaded successfully.
Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/10
Epoch 3/10
Epoch 4/10


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: sent_ana_model\assets


INFO:tensorflow:Assets written to: sent_ana_model\assets
  inputs = self._flatten_to_reference_inputs(inputs)


Predictions saved to data/training/metricSolution\predictions.csv


In [2]:
# Cell 2

# Description
# The Non-Concatenating Prediction Pipeline processes multiple CSV files 
# individually without combining them. It loads a previously trained BERT-based 
# model and tokenizer, tokenizes the text from each file, and generates 
# predictions for the category and the various submetrics (like "Thank Customer," 
# "Ask Permission," etc.). The predictions are added as new columns alongside 
# the true labels and saved to new CSV files in a specified output directory. 
# Each file is handled separately, allowing for file-specific evaluation of the 
# predictions (done in Cell 3) without altering the original input files.

# Read a single CSV file into a DataFrame
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    No preprocessing is applied here; the frame is returned exactly as
    ``pd.read_csv`` produced it.
    """
    return pd.read_csv(file_path)

# Derive the integer 'Category Label' column from 'Category Truth'
def map_labels(data):
    """Add a numeric 'Category Label' column to ``data`` and return ``data``.

    Every value in the 'Category Truth' column is looked up in a fixed
    name -> id table. A value that is not in the table raises ``KeyError``.
    The DataFrame is modified in place; the same object is returned.
    """
    label_ids = {
        'Greetings': 0,
        'Account Verification': 1,
        'Problem Investigation': 2,
        'Closure': 3,
    }

    def to_label_id(category_name):
        # Plain dict indexing on purpose: unknown categories must fail loudly.
        return label_ids[category_name]

    data['Category Label'] = data['Category Truth'].apply(to_label_id)
    return data

# Run the tokenizer over a column of texts with fixed-length padding
def tokenize_texts(tokenizer, texts, max_length=128):
    """Tokenize ``texts`` with ``tokenizer`` and return whatever it returns.

    Args:
        tokenizer: Callable tokenizer; it receives the texts as a list plus
            the keyword options below.
        texts: Object exposing ``tolist()`` (the callers in this file pass a
            DataFrame column).
        max_length: Length every sequence is padded or truncated to.

    The tokenizer is asked to pad to ``max_length``, truncate longer inputs
    and produce TensorFlow tensors (``return_tensors="tf"``).
    """
    text_list = texts.tolist()
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "tf",
    }
    return tokenizer(text_list, **tokenizer_options)

# Load the previously saved model and tokenizer (prediction use only)
def load_saved_model_and_tokenizer():
    """Load the Keras model and BERT tokenizer stored under 'sent_ana_model'.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: Any error raised while loading is re-raised after a short
            message has been printed; there is no fallback to a fresh model.
    """
    try:
        saved_model_dir = 'sent_ana_model'
        # The custom class is registered so Keras can resolve it while
        # deserializing the saved model.
        known_classes = {'TFBertForSequenceClassification': TFBertForSequenceClassification}
        model = tf.keras.models.load_model(saved_model_dir, custom_objects=known_classes)
        tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
        print("Model and tokenizer loaded successfully.")
    except Exception as load_error:
        print("Error loading the model or tokenizer.")
        raise load_error
    return model, tokenizer

# Predict every output for one DataFrame and write the result to CSV
def make_predictions_for_file(model, tokenizer, data, output_file):
    """Add the model's predicted classes to ``data`` and save it as CSV.

    Args:
        model: Object whose ``predict`` result can be indexed 0-9, one entry
            per output in the order listed in ``prediction_columns`` below.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``.
        data: DataFrame with a 'Text' column; prediction columns are added to
            it in place.
        output_file: Path of the CSV file to write (without the index).
    """
    # Tokenize the text column and run the model once for all outputs
    encoded = tokenize_texts(tokenizer, data['Text'])
    predictions = model.predict(dict(encoded))

    # Column to fill for each model output, in output order
    prediction_columns = [
        'Predicted Category',
        'Predicted Thank Customer',
        'Predicted Introduce Self',
        'Predicted Ask Reason',
        'Predicted Ask Accurate Details',
        'Predicted Ask Permission',
        'Predicted Resolve Issue',
        'Predicted Offer Assistance',
        'Predicted Thank Again',
        'Predicted Farewell',
    ]

    # Pick the highest-scoring class per row for every output. All outputs are
    # resolved before the DataFrame is touched, so a short prediction list
    # fails before any column has been written.
    predicted_classes = [
        np.argmax(predictions[output_index], axis=1)
        for output_index in range(len(prediction_columns))
    ]

    # Add predictions to the dataframe
    for column_name, class_ids in zip(prediction_columns, predicted_classes):
        data[column_name] = class_ids

    # Save the results to a CSV file
    data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Prediction-only pipeline that processes each file individually
def run_prediction_pipeline(input_dir='data/training/metricTraining',
                            output_dir='data/training/metricSolution'):
    """Run the saved model over every CSV in ``input_dir``, one file at a time.

    For each ``<name>.csv`` found, a ``<name>_predictions.csv`` file is
    written to ``output_dir``. The input files themselves are not rewritten.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.csv`` files.
        output_dir: Directory for the prediction files; created if missing.

    Returns:
        None. If no CSV file is found, a message is printed and the function
        returns before the model is loaded.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so files are processed in a reproducible order; glob itself
    # makes no ordering guarantee.
    file_paths = sorted(glob.glob(os.path.join(input_dir, '*.csv')))

    if not file_paths:
        print(f"No CSV files found in the directory: {input_dir}")
        return

    # Load the saved model and tokenizer once and reuse them for every file
    model, tokenizer = load_saved_model_and_tokenizer()

    for file_path in file_paths:
        # Load the file and add the numeric category label used for evaluation
        data = map_labels(load_data(file_path))

        # Build the output name from the file stem. splitext only strips the
        # final extension, so a '.csv' elsewhere in the name is left alone.
        stem, _ = os.path.splitext(os.path.basename(file_path))
        output_file = os.path.join(output_dir, f"{stem}_predictions.csv")

        # Make predictions and save the results
        make_predictions_for_file(model, tokenizer, data, output_file)

# Run the prediction-only pipeline (executes immediately when this cell runs)
run_prediction_pipeline()




Model and tokenizer loaded successfully.


  inputs = self._flatten_to_reference_inputs(inputs)


Predictions saved to data/training/metricSolution\metric1_predictions.csv


  inputs = self._flatten_to_reference_inputs(inputs)


Predictions saved to data/training/metricSolution\metric2_predictions.csv
Predictions saved to data/training/metricSolution\metric3_predictions.csv
Predictions saved to data/training/metricSolution\metric4_predictions.csv
Predictions saved to data/training/metricSolution\metric5_predictions.csv
Predictions saved to data/training/metricSolution\metric6_predictions.csv


In [3]:
# Cell 3

# Description:
# Evaluates the accuracy of predicted values against truth values for specific 
# submetrics across multiple CSV files in a given directory (input_dir). 
# It calculates the accuracy for each submetric (like "Thank Customer" 
# and "Ask Permission") by comparing the truth and predicted columns, 
# ignoring rows where the truth values are zero. The accuracy for each submetric 
# is computed for each file, and an overall average accuracy is calculated. 
# The results are stored in a DataFrame, with each row representing a file, and 
# a final row showing the overall average across all files.

# Directory holding the prediction CSV files to evaluate
input_dir = 'data/training/metricSolution'

# Submetric column names; each is paired with a 'Predicted <name>' column
submetrics = [
    'Thank Customer',
    'Introduce Self',
    'Ask Reason',
    'Ask Accurate Details',
    'Ask Permission',
    'Resolve Issue',
    'Offer Assistance',
    'Thank Again',
    'Farewell',
]

# Function to calculate submetric accuracy for a single DataFrame
def calculate_submetric_accuracy(data, submetric_names=None):
    """Score each submetric's predictions against its truth column.

    For every submetric ``name`` the columns ``name`` (truth) and
    ``'Predicted ' + name`` are compared:

    * If neither column contains any 1, the score is 100.0.
    * Otherwise only rows whose truth value is 1 are considered, and the
      score is the percentage of those rows where the prediction matches.
    * If there are no truth == 1 rows but some predictions are non-zero, the
      score is 0.0.

    Args:
        data: DataFrame holding the truth and prediction columns.
        submetric_names: Iterable of submetric column names. When omitted,
            the module-level ``submetrics`` list is used.

    Returns:
        Dict mapping each submetric name to its score rounded to 2 decimals.
    """
    if submetric_names is None:
        submetric_names = submetrics

    accuracy_results = {}
    for submetric in submetric_names:
        truth = data[submetric]
        predicted = data[f'Predicted {submetric}']

        if truth.sum() == 0 and predicted.sum() == 0:
            # Nothing to detect and nothing was predicted: count as perfect
            accuracy = 100.0
        else:
            # Only rows whose truth value is 1 take part in the score
            positive_rows = truth == 1
            if positive_rows.any():
                accuracy = (truth[positive_rows] == predicted[positive_rows]).mean() * 100
            else:
                # Predictions fired although the truth column has no 1 at all
                accuracy = 0.0

        accuracy_results[submetric] = round(accuracy, 2)
    return accuracy_results

# Function to load CSV files from the directory and calculate accuracy for each file
def evaluate_files(input_dir):
    """Score every CSV file in ``input_dir`` and summarise the results.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        DataFrame with one row per file (indexed by the file name without its
        extension), one column per submetric, an 'Average' column holding the
        row mean, and a final 'Average' row holding the mean of each column.

    Raises:
        FileNotFoundError: If the directory contains no CSV files.
    """
    # Sorted so rows appear in a reproducible order
    file_paths = sorted(glob.glob(os.path.join(input_dir, '*.csv')))

    if not file_paths:
        raise FileNotFoundError(f"No CSV files found in the directory: {input_dir}")

    results = {}
    for path in file_paths:
        # splitext strips only the final extension from the file name
        file_name, _ = os.path.splitext(os.path.basename(path))
        results[file_name] = calculate_submetric_accuracy(pd.read_csv(path))

    # Dict of dicts gives files as columns; transpose so each file is a row
    results_df = pd.DataFrame(results).T

    # Calculate the average for each row (file)
    results_df['Average'] = results_df.mean(axis=1)

    # Add an overall average row. The Series is assigned as-is: it is aligned
    # to the existing columns, so it must only carry labels that are columns
    # (adding any other label would be dropped and would turn the numeric
    # Series into an object-typed one).
    results_df.loc['Average'] = results_df.mean(axis=0)

    return results_df

# Function to save the evaluation results to a CSV file
def save_evaluation_results(results_df, output_file):
    """Write ``results_df`` (including its index) to ``output_file`` as CSV.

    The parent directory of ``output_file`` is created when it does not exist
    yet, so a fresh checkout does not fail on the first save.

    Args:
        results_df: DataFrame to write.
        output_file: Destination CSV path.
    """
    parent_dir = os.path.dirname(output_file)
    # dirname is '' for a bare file name; makedirs('') would raise
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    results_df.to_csv(output_file, index=True)
    print(f"Evaluation results saved to {output_file}")

# Example usage: score every CSV in input_dir and write one summary table.
# NOTE: input_dir is re-assigned here to the same value it was given near the
# top of this cell.
input_dir = 'data/training/metricSolution'
output_file = 'data/training/metricEvaluation/submetricEvaluation.csv'

# Run the evaluation on all files in the directory
# (raises FileNotFoundError if the directory has no CSV files)
results_df = evaluate_files(input_dir)

# Save the results to CSV (the index, i.e. the file names, is written too)
save_evaluation_results(results_df, output_file)




Evaluation results saved to data/training/metricEvaluation/submetricEvaluation.csv
