In [1]:
import os
import pandas as pd
from PIL import Image
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Tuple

from utils.ml_logging import get_logger

logger = get_logger()

# Define the target directory
target_directory = r"C:\Users\pablosal\Desktop\gbb-ai-smart-document-processing"  # change your directory here

# Only switch when the path is an actual directory: os.path.exists() is also
# true for a regular file, in which case os.chdir() would raise.
if os.path.isdir(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist or is not a directory.")

Directory changed to C:\Users\pablosal\Desktop\gbb-ai-smart-document-processing


## Evaluations

### Building Evaluation Framework

📄 **Use Case**: We are focused on classifying various types of commercial documents (e.g., invoices, receipts, contracts) from scanned images. This is essential for automating document processing and enhancing efficiency in managing large volumes of documents. Our goal is to assist enterprises with extensive archives of unlabelled data. Currently, these documents are either unprocessed or manually processed, which is highly inefficient.

### Our Solution

🚀 **Solution**: We are implementing a multi-faceted approach to tackle this problem, leveraging the power of advanced AI technologies:

1. **Azure AI Document Intelligence**:
   - Utilizing Azure's robust AI capabilities to extract and analyze information from documents.

2. **Small Language Models (SLMs)**:
   - Employing lightweight models for quick and efficient document classification.

3. **Large Language Models (LLMs)**:
   - Harnessing the power of large-scale models for more complex and nuanced document understanding.

### Benefits

✅ **Benefits**:
- **Automation**: Reduces the need for manual document processing, saving time and resources.
- **Efficiency**: Streamlines the handling of large document archives, making it easier to manage and retrieve information.
- **Scalability**: Capable of processing vast amounts of data, suitable for enterprises of any size.

### Methodology

🔄 **Methodology**: We follow an "apples-to-apples" comparison approach to ensure fair evaluation:

1. **Training and Validation Sets**:
   - We split our data into training and validation sets to train the models and evaluate their performance on unseen data.

2. **Consistent Evaluation**:
   - We use the same evaluation metrics and visualizations for all methodologies to ensure a consistent comparison.

By doing this, we can get a clear picture of how well our approaches are performing in classifying different types of documents and where improvements might be needed. This systematic approach allows us to base our conclusions on solid data and make informed decisions about the best methodology to use.

### Evaluating the Approaches

🔍 **Model Comparison**: We will use different methodologies and compare their performance to find the best one. This involves:

1. **Comparing Predictions**:
   - We compare the actual document types (true labels) with the types predicted by the model (predicted labels).

2. **Calculating Metrics**:
   - **Accuracy**: The percentage of correctly predicted document types out of all predictions.
   - **Precision**: When the model predicts a document type, how often it is correct.
   - **Recall**: How often the model correctly identifies a document type when it is actually that type.
   - **F1 Score**: A balance between precision and recall, providing a single metric to evaluate performance.

3. **Confusion Matrix**:
   - We create a confusion matrix, which is a table that shows the number of correct and incorrect predictions for each document type. This helps us see where the model is making mistakes.

4. **Detailed Report**:
   - We generate a detailed report that breaks down these metrics for each document type, giving us insights into the performance for specific categories.

### Visualizing the Results

📊 **Visual Representations**: We create visual representations to make it easier to understand the model's performance:

1. **Confusion Matrix**:
   - A heatmap that shows where the model is getting confused between different document types.

2. **Classification Report**:
   - A heatmap that shows detailed metrics (precision, recall, F1 score) for each document type.

3. **Evaluation Metrics**:
   - A bar chart that shows the overall performance metrics (accuracy, precision, recall, F1 score).



In [13]:
from typing import Dict, Tuple
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from utils.ml_logging import get_logger

logger = get_logger()

In [37]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from typing import Dict, Tuple
from utils.ml_logging import get_logger

logger = get_logger()

def evaluate_model(
    y_true: np.ndarray, y_pred: np.ndarray, labels: np.ndarray, show_visualization: bool = False
) -> Tuple[Dict[str, float], np.ndarray, Dict]:
    """
    Evaluate the performance of a classification model.

    Predictions that are not members of ``labels`` are treated as
    hallucinations: they are logged (together with the true labels of the
    affected samples) and then excluded from every metric. The reported scores
    therefore describe only the samples for which a valid label was predicted.

    Args:
        y_true (np.ndarray): True labels. Lists and pandas Series are accepted
            and converted to a positional numpy array.
        y_pred (np.ndarray): Predicted labels, same length as ``y_true``.
        labels (np.ndarray): List of label names.
        show_visualization (bool): Whether to show visualizations or not.

    Returns:
        Tuple[Dict[str, float], np.ndarray, Dict]: Dictionary containing evaluation
        metrics ("accuracy", "precision", "recall", "f1_score"), the confusion
        matrix, and the classification report as a dictionary.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` do not have the same length.
    """
    try:
        logger.info("Evaluating model performance...")

        # Work on positional numpy arrays. Plain lists do not support fancy
        # indexing, and a pandas Series would be indexed by label, which
        # misaligns samples when its index is not 0..n-1.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        if len(y_true) != len(y_pred):
            logger.error("The lengths of y_true and y_pred do not match.")
            raise ValueError("The lengths of y_true and y_pred do not match.")

        # Boolean mask of predictions that are known labels. Using a mask
        # (instead of rewriting invalid entries to a sentinel string) avoids
        # clashing with a real label and keeps the filtering linear in size.
        valid_labels = set(labels)
        valid_mask = np.fromiter(
            (prediction in valid_labels for prediction in y_pred),
            dtype=bool,
            count=len(y_pred),
        )
        hallucination_mask = ~valid_mask

        hallucinations = y_pred[hallucination_mask]
        if hallucinations.size > 0:
            hallucination_counts = pd.Series(hallucinations).value_counts()
            hallucination_table = pd.DataFrame({
                'Hallucination': hallucination_counts.index,
                'Count': hallucination_counts.values
            })
            logger.info(f"Invalid predictions detected and marked as 'hallucination':\n{hallucination_table}")

        true_labels_for_hallucinations = y_true[hallucination_mask]
        if true_labels_for_hallucinations.size > 0:
            true_labels_counts = pd.Series(true_labels_for_hallucinations).value_counts()
            true_labels_table = pd.DataFrame({
                'True Label': true_labels_counts.index,
                'Count': true_labels_counts.values
            })
            logger.info(f"True labels corresponding to hallucinations:\n{true_labels_table}")

        # Both arrays are filtered with the same mask, so they stay aligned.
        y_true_filtered = y_true[valid_mask]
        y_pred_filtered = y_pred[valid_mask]

        logger.info(f"Length of y_true_filtered: {len(y_true_filtered)}")
        logger.info(f"Length of y_pred_filtered: {len(y_pred_filtered)}")

        if len(y_pred) > 0 and not valid_mask.any():
            logger.warning("All predictions were hallucinations; no valid samples remain for scoring.")

        accuracy = accuracy_score(y_true_filtered, y_pred_filtered)
        precision = precision_score(y_true_filtered, y_pred_filtered, labels=labels, average="weighted", zero_division=0)
        recall = recall_score(y_true_filtered, y_pred_filtered, labels=labels, average="weighted", zero_division=0)
        f1 = f1_score(y_true_filtered, y_pred_filtered, labels=labels, average="weighted", zero_division=0)
        conf_matrix = confusion_matrix(y_true_filtered, y_pred_filtered, labels=labels)
        class_report_dict = classification_report(
            y_true_filtered, y_pred_filtered, labels=labels, target_names=labels, output_dict=True, zero_division=0
        )
        class_report_df = pd.DataFrame(class_report_dict).transpose()

        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        }

        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"Precision: {precision:.4f}")
        logger.info(f"Recall: {recall:.4f}")
        logger.info(f"F1 Score: {f1:.4f}")

        if show_visualization:
            plot_evaluation_metrics(metrics)
            plot_confusion_matrix(conf_matrix, labels)
            plot_classification_report(class_report_df, labels)

        return metrics, conf_matrix, class_report_dict

    except Exception as e:
        logger.error(f"Error during model evaluation: {e}")
        raise

def plot_confusion_matrix(conf_matrix: np.ndarray, labels: np.ndarray) -> None:
    """
    Render a confusion matrix as a Plotly heatmap and display it.

    Args:
        conf_matrix (np.ndarray): Confusion matrix to draw.
        labels (np.ndarray): Label names used as tick labels on both axes.

    Raises:
        Exception: Any error raised while building or showing the figure is
            logged and re-raised unchanged.
    """
    axis_captions = dict(x="Predicted Labels", y="True Labels", color="Count")
    try:
        heatmap = px.imshow(
            conf_matrix,
            x=labels,
            y=labels,
            labels=axis_captions,
            color_continuous_scale="Blues",
        )
        heatmap.update_layout(
            title="Confusion Matrix",
            xaxis_title="Predicted Label",
            yaxis_title="True Label",
        )
        heatmap.show()
    except Exception as err:
        logger.error(f"Error plotting confusion matrix: {err}")
        raise

def plot_classification_report(class_report: pd.DataFrame, labels: np.ndarray) -> None:
    """
    Render the per-class part of a classification report as a Plotly heatmap.

    The last three rows and the last column of ``class_report`` are left out
    of the plot; every remaining row becomes one heatmap row.

    Args:
        class_report (pd.DataFrame): Classification report as a DataFrame, one
            row per class followed by three summary rows.
        labels (np.ndarray): List of label names. Accepted for signature
            symmetry with the other plot helpers; not used by this function.

    Raises:
        Exception: Any error raised while building or showing the figure is
            logged and re-raised unchanged.
    """
    try:
        # Drop the three trailing summary rows ('accuracy', 'macro avg', 'weighted avg').
        per_class = class_report.iloc[:-3, :]
        # Drop the trailing column so only the score columns are coloured.
        score_values = per_class.iloc[:, :-1].values
        score_names = per_class.columns[:-1]

        heatmap = go.Heatmap(
            z=score_values,
            x=score_names,
            y=per_class.index,
            colorscale="Blues",
            showscale=True,
        )
        fig = go.Figure(data=[heatmap])
        fig.update_layout(
            title="Classification Report Heatmap",
            xaxis_title="Metrics",
            yaxis_title="Document Types",
        )
        fig.show()
    except Exception as err:
        logger.error(f"Error plotting classification report: {err}")
        raise

def plot_evaluation_metrics(metrics: Dict[str, float]) -> None:
    """
    Draw the overall evaluation metrics as a Plotly bar chart and display it.

    Args:
        metrics (Dict[str, float]): Mapping of metric name to value; each entry
            becomes one bar, and the y-axis is fixed to the range 0 to 1.

    Raises:
        Exception: Any error raised while building or showing the figure is
            logged and re-raised unchanged.
    """
    try:
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())

        chart = px.bar(
            x=metric_names,
            y=metric_values,
            labels={"x": "Metrics", "y": "Values"},
            text_auto=True,
        )
        chart.update_layout(
            title="Evaluation Metrics",
            yaxis=dict(range=[0, 1]),
            xaxis_title="Metrics",
            yaxis_title="Values",
        )
        chart.show()
    except Exception as err:
        logger.error(f"Error plotting evaluation metrics: {err}")
        raise

## Testing Evaluation Framework

In [39]:
import numpy as np

labels = np.array(
    [
        "letter",
        "form",
        "email",
        "handwritten",
        "advertisement",
        "scientific report",
        "scientific publication",
        "specification",
        "file folder",
        "news article",
        "budget",
        "invoice",
        "presentation",
        "questionnaire",
        "resume",
        "memo",
    ]
)

# Seeded generator so the simulated data (and every metric derived from it)
# is identical on each Restart & Run All.
rng = np.random.default_rng(42)
n_samples = 100

y_true = rng.choice(labels, n_samples)

# Introduce some hallucinations in the predictions
y_pred = rng.choice(np.append(labels, ["unknown", "random", "fake"]), n_samples)

# Evaluate the model with the simulated data
metrics, conf_matrix, class_report = evaluate_model(
    y_true, y_pred, labels, show_visualization=True
)

2024-08-11 21:19:18,723 - micro - MainProcess - INFO     Evaluating model performance... (3955842053.py:evaluate_model:34)
INFO:micro:Evaluating model performance...
2024-08-11 21:19:18,732 - micro - MainProcess - INFO     Invalid predictions detected and marked as 'hallucination':
  Hallucination  Count
0        random      8
1       unknown      7
2          fake      7 (3955842053.py:evaluate_model:49)
INFO:micro:Invalid predictions detected and marked as 'hallucination':
  Hallucination  Count
0        random      8
1       unknown      7
2          fake      7
2024-08-11 21:19:18,741 - micro - MainProcess - INFO     True labels corresponding to hallucinations:
          True Label  Count
0               form      4
1      questionnaire      4
2  scientific report      3
3            invoice      3
4              email      3
5               memo      2
6             letter      2
7      advertisement      1 (3955842053.py:evaluate_model:60)
INFO:micro:True labels corresponding to 

In [40]:
# Inspect the report entry for a single class: a dict with the keys
# 'precision', 'recall', 'f1-score' and 'support'.
class_report['questionnaire']

{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4.0}