In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment
# before the keys are read below.
load_dotenv()
# os.getenv returns None when the variable is not set, so either key may be None.
INDOX_API_KEY = os.getenv("INDOX_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
from indoxJudge import Evaluator
from indoxJudge.models import IndoxApi
from indoxJudge.models import OpenAi
# from indoxJudge.metrics import bias, fairness, accuracy, precision, recall, f1_score, roc_auc_score, confusion_matrix
# `model` is the judge LLM handed to LlmEvaluation further down.
model = IndoxApi(api_key=INDOX_API_KEY)
# Alternative judge backed by OpenAI (disabled):
# model = OpenAi(api_key=OPENAI_API_KEY,model="gpt-3.5-turbo-0125")

In [2]:
# Sample inputs for the evaluation: the user question, the retrieved passages,
# and the LLM answer that is going to be judged.
query = "What are the main benefits and drawbacks of remote work?"
# Passages supplied to the metrics as `retrieval_context`.
retrieval_context = [
    "Remote work allows employees to work from anywhere, reducing the need for commuting. However, it may lead to a sense of isolation and reduced team cohesion.",
    "Studies have shown that remote work can lead to increased productivity in some cases. However, it can also result in longer working hours and a blurred line between work and personal life.",
    "Managers often believe that remote workers are less productive, which can result in biased evaluations and reduced career advancement opportunities for these employees.",
    "Some people, such as parents with young children, find remote work challenging due to the need to balance work and family responsibilities.",
    "While remote work offers flexibility, it is not suitable for all types of jobs, particularly those requiring high levels of collaboration or access to specialized equipment."
    ]
# Answer under evaluation, supplied to the metrics as `llm_response`.
response ="Remote work offers flexibility and reduced commute time, which can lead to a better work-life balance. However, it can also lead to isolation, reduced collaboration, and a lack of clear boundaries between work and personal life. Some studies suggest that remote work can reduce productivity, especially in creative fields. There is also a concern about managers having biased views towards remote workers, assuming they are less committed or productive. Additionally, certain groups may face challenges, such as parents with young children or individuals with disabilities."

In [8]:
from typing import List
from loguru import logger
import sys
import json
from indoxJudge.metrics import (Faithfulness, AnswerRelevancy, Bias, Gruen, Rouge,
                                KnowledgeRetention, BLEU, Hallucination, Toxicity, BertScore)

# Set up logging
logger.remove()  # Remove the default logger

# Two stdout sinks: a green-tagged one for ordinary records and a red-tagged one
# for errors. `level="INFO"` is only a *minimum* level, so without the filter the
# first sink would also accept ERROR records and every error would be printed
# twice (once green, once red). The filter keeps it to levels below ERROR.
_ERROR_LEVEL_NO = logger.level("ERROR").no
logger.add(sys.stdout,
           format="<green>{level}</green>: <level>{message}</level>",
           level="INFO",
           filter=lambda record: record["level"].no < _ERROR_LEVEL_NO)
logger.add(sys.stdout,
           format="<red>{level}</red>: <level>{message}</level>",
           level="ERROR")

# TODO (presumably metrics still to be added; confirm): Perplexity, Exact Match (EM), Diversity, Latency


class LlmEvaluation:
    """
    Evaluates a single LLM response with a fixed set of indoxJudge metrics.

    ``__init__`` builds Faithfulness, AnswerRelevancy, Bias, Hallucination,
    KnowledgeRetention, Toxicity, BertScore and BLEU metrics for the given
    response; ``judge`` runs them and collects the results. ``judge`` also
    handles a Rouge metric if one is present in ``self.metrics``.

    Attributes:
        model: The judge model passed to every metric that exposes ``set_model``.
        metrics (list): Metric instances evaluated by ``judge``.
        evaluation_score: Plain (un-normalised) sum of the scores of
            Faithfulness, AnswerRelevancy, KnowledgeRetention, Hallucination,
            Toxicity and Bias from the latest ``judge`` call. BertScore, BLEU
            and Rouge are not added to it.
        metrics_score (dict): Flat mapping of metric name to score from the
            latest ``judge`` call.
    """

    def __init__(self, llm_as_judge, llm_response, retrieval_context, query):
        """
        Builds the metric instances and hands the judge model to them.

        Args:
            llm_as_judge: The language model used as judge by the metrics.
            llm_response: The answer text being evaluated.
            retrieval_context: The retrieved passages given to Faithfulness,
                Hallucination, BertScore and BLEU.
            query: The user question given to AnswerRelevancy and, together
                with the response, to KnowledgeRetention and Toxicity.
        """
        self.model = llm_as_judge
        self.metrics = [
            Faithfulness(llm_response=llm_response, retrieval_context=retrieval_context),
            AnswerRelevancy(query=query, llm_response=llm_response),
            Bias(llm_response=llm_response),
            Hallucination(llm_response=llm_response, retrieval_context=retrieval_context),
            KnowledgeRetention(messages=[{"query": query, "llm_response": llm_response}]),
            Toxicity(messages=[{"query": query, "llm_response": llm_response}]),
            BertScore(llm_response=llm_response, retrieval_context=retrieval_context),
            BLEU(llm_response=llm_response, retrieval_context=retrieval_context),
            # Rouge(llm_response=llm_response, retrieval_context=retrieval_context),
            # Gruen(candidates=llm_response)
        ]
        logger.info("Evaluator initialized with model and metrics.")
        self.set_model_for_metrics()
        self.evaluation_score = 0
        self.metrics_score = {}

    def set_model_for_metrics(self):
        """
        Sets the language model for each metric that requires it.

        A metric is considered to require the model when it has a ``set_model``
        attribute; metrics without one are left untouched.
        """
        for metric in self.metrics:
            if hasattr(metric, 'set_model'):
                metric.set_model(self.model)
        logger.info("Model set for all metrics.")

    def _record(self, results, name, details, score):
        """Stores one metric's details and adds its score to the running totals."""
        results[name] = details
        self.evaluation_score += score
        self.metrics_score[name] = score

    def judge(self):
        """
        Evaluates the response with every configured metric and returns the results.

        ``evaluation_score`` and ``metrics_score`` are reset at the start of each
        call, so they always describe the latest run only. A metric that raises
        is logged at ERROR level and skipped; the remaining metrics still run.

        Returns:
            dict: A dictionary containing the evaluation results for each metric.
        """
        results = {}
        # Start from a clean slate: without this, a second judge() call would add
        # its scores on top of the totals left behind by the previous call.
        self.evaluation_score = 0
        self.metrics_score = {}

        # NOTE: ContextualRelevancy, GEval and Gruen have no handler here; a metric
        # of an unhandled type is logged as completed without producing a result.
        for metric in self.metrics:
            metric_name = metric.__class__.__name__
            try:
                logger.info(f"Evaluating metric: {metric_name}")
                if isinstance(metric, Faithfulness):
                    claims = metric.evaluate_claims()
                    truths = metric.evaluate_truths()
                    verdicts = metric.evaluate_verdicts(claims.claims)
                    reason = metric.evaluate_reason(verdicts, truths.truths)
                    score = metric.calculate_faithfulness_score()
                    self._record(results, 'Faithfulness', {
                        'claims': claims.claims,
                        'truths': truths.truths,
                        'verdicts': [verdict.__dict__ for verdict in verdicts.verdicts],
                        'score': score,
                        'reason': reason.reason
                    }, score)
                elif isinstance(metric, AnswerRelevancy):
                    score = metric.measure()
                    self._record(results, 'AnswerRelevancy', {
                        'score': score,
                        'reason': metric.reason,
                        'statements': metric.statements,
                        'verdicts': [verdict.dict() for verdict in metric.verdicts]
                    }, score)
                elif isinstance(metric, KnowledgeRetention):
                    score = metric.measure()
                    self._record(results, 'KnowledgeRetention', {
                        'score': score,
                        'reason': metric.reason,
                        'verdicts': [verdict.dict() for verdict in metric.verdicts],
                        'knowledges': [knowledge.data for knowledge in metric.knowledges]
                    }, score)
                elif isinstance(metric, Hallucination):
                    score = metric.measure()
                    self._record(results, 'Hallucination', {
                        'score': score,
                        'reason': metric.reason,
                        'verdicts': [verdict.dict() for verdict in metric.verdicts]
                    }, score)
                elif isinstance(metric, Toxicity):
                    score = metric.measure()
                    self._record(results, 'Toxicity', {
                        'score': score,
                        'reason': metric.reason,
                        'opinions': metric.opinions,
                        'verdicts': [verdict.dict() for verdict in metric.verdicts]
                    }, score)
                elif isinstance(metric, Bias):
                    score = metric.measure()
                    self._record(results, 'Bias', {
                        'score': score,
                        'reason': metric.reason,
                        'opinions': metric.opinions,
                        'verdicts': [verdict.dict() for verdict in metric.verdicts]
                    }, score)
                elif isinstance(metric, BertScore):
                    # measure() is read as a mapping with 'Precision', 'Recall'
                    # and 'F1-score' keys; it is not added to evaluation_score.
                    score = metric.measure()
                    results['BertScore'] = {
                        'precision': score['Precision'],
                        'recall': score['Recall'],
                        'f1_score': score['F1-score']
                    }
                    self.metrics_score['precision'] = score['Precision']
                    self.metrics_score['recall'] = score['Recall']
                    self.metrics_score['f1_score'] = score['F1-score']
                elif isinstance(metric, BLEU):
                    score = metric.measure()
                    results['BLEU'] = {
                        'score': score
                    }
                    self.metrics_score["BLEU"] = score
                elif isinstance(metric, Rouge):
                    score = metric.measure()
                    results['rouge'] = {
                        'precision': score['Precision'],
                        'recall': score['Recall'],
                        'f1_score': score['F1-score']
                    }
                    # The whole mapping is stored under one key (unlike BertScore,
                    # which is flattened into three entries).
                    self.metrics_score["Rouge"] = score
                logger.info(f"Completed evaluation for metric: {metric_name}")

            except Exception as e:
                # Best effort: one failing metric must not abort the others.
                logger.error(f"Error evaluating metric {metric_name}: {str(e)}")
        return results


In [9]:
# Build the evaluator: `model` is the judge, `response` the answer under test.
evaluator = LlmEvaluation(model, response, retrieval_context, query)

[32mINFO[0m: [1mEvaluator initialized with model and metrics.[0m
[32mINFO[0m: [1mModel set for all metrics.[0m


In [10]:
# Run every configured metric; the return value is the per-metric detail dictionary.
result = evaluator.judge()

[32mINFO[0m: [1mEvaluating metric: Faithfulness[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Faithfulness[0m
[32mINFO[0m: [1mEvaluating metric: AnswerRelevancy[0m
[32mINFO[0m: [1mCompleted evaluation for metric: AnswerRelevancy[0m
[32mINFO[0m: [1mEvaluating metric: Bias[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Bias[0m
[32mINFO[0m: [1mEvaluating metric: Hallucination[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Hallucination[0m
[32mINFO[0m: [1mEvaluating metric: KnowledgeRetention[0m
[32mINFO[0m: [1mCompleted evaluation for metric: KnowledgeRetention[0m
[32mINFO[0m: [1mEvaluating metric: Toxicity[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Toxicity[0m
[32mINFO[0m: [1mEvaluating metric: BertScore[0m
[32mINFO[0m: [1mCompleted evaluation for metric: BertScore[0m
[32mINFO[0m: [1mEvaluating metric: BLEU[0m
[32mINFO[0m: [1mCompleted evaluation for metric: BLEU[0m


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk

In [11]:
# Display the detailed per-metric results.
result

{'Faithfulness': {'claims': ['Remote work offers flexibility and reduced commute time.',
   'Remote work can lead to a better work-life balance.',
   'Remote work can lead to isolation.',
   'Remote work can lead to reduced collaboration.',
   'Remote work can lead to a lack of clear boundaries between work and personal life.',
   'Some studies suggest that remote work can reduce productivity, especially in creative fields.',
   'There is a concern about managers having biased views towards remote workers.',
   'Managers may assume remote workers are less committed or productive.',
   'Certain groups may face challenges with remote work, such as parents with young children or individuals with disabilities.'],
  'truths': ['Remote work offers flexibility and reduced commute time.',
   'Remote work can lead to a better work-life balance.',
   'Remote work can lead to isolation.',
   'Remote work can lead to reduced collaboration.',
   'Remote work can lead to a lack of clear boundaries b

In [17]:
# Flat metric-name -> score mapping filled in by judge().
scores = evaluator.metrics_score

In [18]:
# Display the collected scores.
scores

{'Faithfulness': 0.5555555555555556,
 'AnswerRelevancy': 1.0,
 'Bias': 0.0,
 'Hallucination': 0.8,
 'KnowledgeRetention': 0.0,
 'Toxicity': 0.0,
 'precision': 0.6495616879278017,
 'recall': 0.7740253260379719,
 'f1_score': 0.706328937705019,
 'BLEU': 0.11674985327973864}

In [25]:
def create_radar_chart(metrics):
    """
    Creates a radar chart using the provided metrics, excluding any metrics with a value of 0.

    Parameters:
    metrics (dict): A dictionary containing metric names as keys and their values.
        The dictionary itself is not modified.

    Returns:
    fig (plotly.graph_objects.Figure): The radar chart figure.
    """
    # Imported here because this cell comes before the notebook's first
    # module-level plotly import; relying on that later cell raises NameError
    # on a fresh "Restart Kernel -> Run All".
    import plotly.graph_objects as go

    # Filter out metrics with a value of 0
    filtered_metrics = {k: v for k, v in metrics.items() if v != 0}

    # Extract keys and values for plotting
    labels = list(filtered_metrics.keys())
    values = list(filtered_metrics.values())

    # Complete the loop for radar chart: repeat the first point at the end so the
    # outline closes (a no-op when every metric was filtered out).
    values += values[:1]
    labels += labels[:1]

    # Create radar chart
    fig = go.Figure(
        data=[
            go.Scatterpolar(
                r=values,
                theta=labels,
                fill='toself',
                name='Metrics'
            )
        ],
        layout=go.Layout(
            title='Evaluation Metrics Radar Chart',
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]  # fixed radial axis from 0 to 1
                )
            ),
            showlegend=False
        )
    )

    return fig

# Example metrics
# Hard-coded sample values (not read from `evaluator.metrics_score`).
metrics = {
    'Faithfulness': 0.5555555555555556,
    'AnswerRelevancy': 1.0,
    'Bias': 0.0,
    'Hallucination': 0.8,
    'KnowledgeRetention': 0.0,
    'Toxicity': 0.0,
    'Precision': 0.6495616879278017,
    'Recall': 0.7740253260379719,
    'F1 Score': 0.706328937705019,
    'BLEU': 0.11674985327973864
}

# Create the radar chart
# The three 0.0 entries above are dropped by create_radar_chart.
fig = create_radar_chart(metrics)
fig.show()


In [38]:
import plotly.graph_objects as go

def create_metrics_barchart(metrics):
    """
    Build a Plotly bar chart with one bar per metric.

    Parameters:
    metrics (dict): Metric names mapped to their values.

    Returns:
    fig: Plotly Figure object with the bar chart.
    """
    names = list(metrics.keys())
    heights = list(metrics.values())

    # Each bar is labelled with its own value.
    bars = go.Bar(
        x=names,
        y=heights,
        text=list(heights),
        textposition='auto',
        hoverinfo='x+y',
        marker_color='skyblue',
    )
    chart = go.Figure(data=[bars])

    layout_options = {
        'title': 'Evaluation Metrics',
        'xaxis_title': 'Metrics',
        'yaxis_title': 'Values',
        'xaxis_tickangle': -45,
    }
    chart.update_layout(**layout_options)

    return chart

# Example usage
# Hard-coded sample values; this rebinds the notebook-level `metrics` name.
metrics = {
    'Faithfulness': 0.5555555555555556,
    'AnswerRelevancy': 1.0,
    'Bias': 0.0,
    'Hallucination': 0.8,
    'KnowledgeRetention': 0.0,
    'Toxicity': 0.0,
    'precision': 0.6495616879278017,
    'recall': 0.7740253260379719,
    'f1_score': 0.706328937705019,
    'BLEU': 0.11674985327973864
}

# Create the bar chart
fig = create_metrics_barchart(metrics)
fig.show()


In [48]:
import plotly.graph_objects as go

def create_beautiful_gauge_chart(score):
    """
    Build a Plotly gauge (indicator) chart showing a single score.

    Parameters:
    score (float): A score between 0 and 1.

    Returns:
    fig: Plotly Figure object with the gauge chart.
    """
    # Five coloured bands covering the 0-1 axis, from red (low) to green (high).
    band_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
    band_colors = ["red", "orange", "yellow", "lightgreen", "green"]
    bands = [
        {'range': [low, high], 'color': shade}
        for low, high, shade in zip(band_edges, band_edges[1:], band_colors)
    ]

    dial = {
        'axis': {'range': [0, 1], 'tickwidth': 1, 'tickcolor': "darkblue"},
        'bar': {'color': "darkblue"},
        'bgcolor': "white",
        'borderwidth': 2,
        'bordercolor': "gray",
        'steps': bands,
        # Black marker line drawn at the score itself.
        'threshold': {
            'line': {'color': "black", 'width': 4},
            'thickness': 0.75,
            'value': score,
        },
    }

    indicator = go.Indicator(
        mode="gauge+number+delta",
        value=score,
        title={'text': "Evaluation Score", 'font': {'size': 24}},
        # The delta is reported relative to 0.5.
        delta={'reference': 0.5, 'increasing': {'color': "green"}},
        gauge=dial,
    )
    chart = go.Figure(indicator)

    chart.update_layout(
        title='Overall Evaluation Score',
        font={'color': "darkblue", 'family': "Arial"},
        paper_bgcolor="lightgrey",
        plot_bgcolor="lightgrey",
    )

    return chart

# Example usage
# Hard-coded sample score inside the gauge's 0-1 axis range.
score = 0.75

# Create the beautiful gauge chart
fig = create_beautiful_gauge_chart(score)
fig.show()


In [45]:
import plotly.express as px

def create_dot_plot(metrics):
    """
    Create a Plotly dot plot for given metrics.

    Each metric becomes one point: its value is the x position and also drives
    the marker size and colour; the metric name is the y category.

    Parameters:
    metrics (dict): A dictionary where keys are metric names and values are their corresponding values.

    Returns:
    fig: Plotly Figure object with the dot plot.
    """
    # Imported here because pandas is first imported at module level in a later
    # cell; relying on that cell raises NameError on a fresh
    # "Restart Kernel -> Run All".
    import pandas as pd

    # One row per metric: (name, value).
    df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])
    fig = px.scatter(df, x='Value', y='Metric', size='Value', color='Value', color_continuous_scale='Viridis')

    fig.update_layout(
        title='Evaluation Metrics Dot Plot',
        xaxis_title='Value',
        yaxis_title='Metric'
    )

    return fig

# Create the dot plot
# NOTE: `metrics` is not defined in this cell; it is whatever an earlier-run
# cell left in the kernel.
fig = create_dot_plot(metrics)
fig.show()


In [49]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots

class MetricsVisualizer:
    """
    Builds Plotly figures for a dictionary of evaluation metrics.

    Offers a radar chart, a bar chart, a gauge for one overall score, a dot
    plot, and a 2x2 grid that combines the traces of all four.
    """

    def __init__(self, metrics):
        # Mapping of metric name -> numeric value shared by every chart method.
        self.metrics = metrics

    def create_radar_chart(self):
        """
        Build a radar chart of the metrics whose value is not 0.
        Returns the radar chart figure.
        """
        nonzero = {name: val for name, val in self.metrics.items() if val != 0}
        axis_names = list(nonzero.keys())
        radii = list(nonzero.values())
        # Repeat the first point at the end so the outline closes.
        radii.extend(radii[:1])
        axis_names.extend(axis_names[:1])

        outline = go.Scatterpolar(
            r=radii,
            theta=axis_names,
            fill='toself',
            name='Metrics',
        )
        radar_layout = go.Layout(
            title='Evaluation Metrics Radar Chart',
            polar={'radialaxis': {'visible': True, 'range': [0, 1]}},
            showlegend=False,
        )
        return go.Figure(data=[outline], layout=radar_layout)

    def create_metrics_barchart(self):
        """
        Build a bar chart with one bar per metric, labelled with its value.
        Returns the bar chart figure.
        """
        names = list(self.metrics.keys())
        heights = list(self.metrics.values())
        bars = go.Bar(
            x=names,
            y=heights,
            text=list(heights),
            textposition='auto',
            hoverinfo='x+y',
            marker_color='skyblue',
        )
        chart = go.Figure(data=[bars])

        layout_options = {
            'title': 'Evaluation Metrics',
            'xaxis_title': 'Metrics',
            'yaxis_title': 'Values',
            'xaxis_tickangle': -45,
        }
        chart.update_layout(**layout_options)
        return chart

    def create_beautiful_gauge_chart(self, score):
        """
        Build a gauge chart showing `score` on a 0-1 axis.
        Returns the gauge chart figure.
        """
        # Five coloured bands covering the axis, from red (low) to green (high).
        band_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
        band_colors = ["red", "orange", "yellow", "lightgreen", "green"]
        bands = [
            {'range': [low, high], 'color': shade}
            for low, high, shade in zip(band_edges, band_edges[1:], band_colors)
        ]

        dial = {
            'axis': {'range': [0, 1], 'tickwidth': 1, 'tickcolor': "darkblue"},
            'bar': {'color': "darkblue"},
            'bgcolor': "white",
            'borderwidth': 2,
            'bordercolor': "gray",
            'steps': bands,
            # Black marker line drawn at the score itself.
            'threshold': {
                'line': {'color': "black", 'width': 4},
                'thickness': 0.75,
                'value': score,
            },
        }
        indicator = go.Indicator(
            mode="gauge+number+delta",
            value=score,
            title={'text': "Evaluation Score", 'font': {'size': 24}},
            delta={'reference': 0.5, 'increasing': {'color': "green"}},
            gauge=dial,
        )
        chart = go.Figure(indicator)

        chart.update_layout(
            title='Overall Evaluation Score',
            font={'color': "darkblue", 'family': "Arial"},
            paper_bgcolor="lightgrey",
            plot_bgcolor="lightgrey",
        )
        return chart

    def create_dot_plot(self):
        """
        Build a dot plot: one point per metric, sized and coloured by its value.
        Returns the dot plot figure.
        """
        rows = list(self.metrics.items())
        table = pd.DataFrame(rows, columns=['Metric', 'Value'])
        chart = px.scatter(
            table,
            x='Value',
            y='Metric',
            size='Value',
            color='Value',
            color_continuous_scale='Viridis',
        )

        chart.update_layout(
            title='Evaluation Metrics Dot Plot',
            xaxis_title='Value',
            yaxis_title='Metric',
        )
        return chart

    def create_tabs(self, score):
        """
        Combine all four plots in a single 2x2 subplot grid.
        Returns the combined figure.
        """
        # (figure, row, col) for each cell of the grid, built in display order.
        panels = [
            (self.create_radar_chart(), 1, 1),
            (self.create_metrics_barchart(), 1, 2),
            (self.create_dot_plot(), 2, 1),
            (self.create_beautiful_gauge_chart(score), 2, 2),
        ]

        grid = make_subplots(
            rows=2, cols=2,
            subplot_titles=("Radar Chart", "Bar Chart", "Dot Plot", "Gauge Chart"),
            specs=[[{"type": "polar"}, {"type": "xy"}], [{"type": "xy"}, {"type": "indicator"}]]
        )

        # Only the traces are copied over; each source figure's layout stays behind.
        for figure, row_no, col_no in panels:
            for trace in figure.data:
                grid.add_trace(trace, row=row_no, col=col_no)

        grid.update_layout(height=800, width=1200, title_text="Evaluation Metrics Visualizations")

        return grid


# Example usage
# Hard-coded sample values; this rebinds the notebook-level `metrics` name.
metrics = {
    'Faithfulness': 0.5555555555555556,
    'AnswerRelevancy': 1.0,
    'Bias': 0.0,
    'Hallucination': 0.8,
    'KnowledgeRetention': 0.0,
    'Toxicity': 0.0,
    'Precision': 0.6495616879278017,
    'Recall': 0.7740253260379719,
    'F1 Score': 0.706328937705019,
    'BLEU': 0.11674985327973864
}

# Hard-coded sample score shown on the gauge panel.
score = 0.75
visualizer = MetricsVisualizer(metrics)
# Despite the method name, create_tabs returns one figure with a 2x2 subplot grid.
fig = visualizer.create_tabs(score)
fig.show()


In [59]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

class MetricsVisualizer:
    """
    Builds themed Plotly figures for a metrics dictionary and shows them in tabs.

    Every figure uses the template stored in ``self.template``; ``plot`` renders
    the radar, bar, scatter and gauge charts inside an ipywidgets ``Tab``.
    """

    def __init__(self, metrics,):
        # Choosing ggplot2 theme for all plots
        self.template = "ggplot2"
        # Mapping of metric name -> numeric value shared by every chart method.
        self.metrics = metrics

    def create_radar_chart(self):
        """Return a radar chart of the metrics whose value is not 0."""
        nonzero = {name: val for name, val in self.metrics.items() if val != 0}
        axis_names = list(nonzero.keys())
        radii = list(nonzero.values())
        # Repeat the first point at the end so the outline closes.
        radii.extend(radii[:1])
        axis_names.extend(axis_names[:1])

        outline = go.Scatterpolar(
            r=radii,
            theta=axis_names,
            fill='toself',
            name='Metrics',
        )
        radar_layout = go.Layout(
            title='Evaluation Metrics Radar Chart',
            polar={'radialaxis': {'visible': True, 'range': [0, 1]}},
            showlegend=False,
            template=self.template,
        )
        return go.Figure(data=[outline], layout=radar_layout)

    def create_bar_chart(self):
        """Return a bar chart with one bar per metric, labelled with its value."""
        names = list(self.metrics.keys())
        heights = list(self.metrics.values())
        bars = go.Bar(
            x=names,
            y=heights,
            text=list(heights),
            textposition='auto',
            hoverinfo='x+y',
            marker_color='skyblue',
        )
        chart = go.Figure(data=[bars])

        layout_options = {
            'title': 'Evaluation Metrics Bar Chart',
            'xaxis_title': 'Metrics',
            'yaxis_title': 'Values',
            'xaxis_tickangle': -45,
            'template': self.template,
        }
        chart.update_layout(**layout_options)
        return chart

    def create_gauge_chart(self, score):
        """Return a gauge chart showing `score` on a 0-1 axis."""
        # Five coloured bands covering the axis, from red (low) to green (high).
        band_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
        band_colors = ["red", "orange", "yellow", "lightgreen", "green"]
        bands = [
            {'range': [low, high], 'color': shade}
            for low, high, shade in zip(band_edges, band_edges[1:], band_colors)
        ]

        dial = {
            'axis': {'range': [0, 1], 'tickwidth': 0.7, 'tickcolor': "darkblue"},
            'bar': {'color': "darkblue"},
            'bgcolor': "white",
            'borderwidth': 2,
            'bordercolor': "gray",
            'steps': bands,
            # Black marker line drawn at the score itself.
            'threshold': {
                'line': {'color': "black", 'width': 4},
                'thickness': 0.75,
                'value': score,
            },
        }
        indicator = go.Indicator(
            mode="gauge+number+delta",
            value=score,
            title={'text': "Overall Evaluation Score", 'font': {'size': 24}},
            delta={'reference': 0.5, 'increasing': {'color': "green"}},
            gauge=dial,
        )
        chart = go.Figure(indicator)

        chart.update_layout(
            title='Overall Evaluation Score',
            font={'color': "darkblue", 'family': "Arial"},
            paper_bgcolor="white",  # Ensure the background is white
            plot_bgcolor="white",   # Ensure the plot background is white
            template=self.template,
        )
        return chart

    def create_scatter_plot(self):
        """Return a scatter plot: one point per metric, sized and coloured by value."""
        rows = list(self.metrics.items())
        table = pd.DataFrame(rows, columns=['Metric', 'Value'])
        chart = px.scatter(
            table,
            x='Value',
            y='Metric',
            size='Value',
            color='Value',
            color_continuous_scale='Viridis',
        )

        chart.update_layout(
            title='Evaluation Metrics Scatter Plot',
            xaxis_title='Value',
            yaxis_title='Metric',
            template=self.template,
        )
        return chart

    def plot(self, score):
        """Render all four charts, one per tab, and display the tab widget."""
        # (tab title, figure) pairs in tab order.
        panels = [
            ('Radar Chart', self.create_radar_chart()),
            ('Bar Chart', self.create_bar_chart()),
            ('Scatter Plot', self.create_scatter_plot()),
            ('Gauge Chart', self.create_gauge_chart(score)),
        ]

        # One Output area per chart; displaying inside its context captures the
        # figure into that area.
        areas = [widgets.Output() for _ in panels]
        for area, (_, figure) in zip(areas, panels):
            with area:
                display(figure)

        tabs = widgets.Tab(children=areas)
        for position, (caption, _) in enumerate(panels):
            tabs.set_title(position, caption)

        display(tabs)


# Example usage
# Hard-coded sample values; this rebinds the notebook-level `metrics` name.
metrics = {
    'Faithfulness': 0.5555555555555556,
    'Answer Relevancy': 1.0,
    'Bias': 0.0,
    'Hallucination': 0.8,
    'Knowledge Retention': 0.0,
    'Toxicity': 0.0,
    'Precision': 0.6495616879278017,
    'Recall': 0.7740253260379719,
    'F1 Score': 0.706328937705019,
    'BLEU': 0.11
}

# Hard-coded sample score shown on the gauge tab.
score = 0.75
visualizer = MetricsVisualizer(metrics)
# plot() displays the tab widget itself and returns nothing.
visualizer.plot(score)


Tab(children=(Output(), Output(), Output(), Output()), selected_index=0, titles=('Radar Chart', 'Bar Chart', '…

In [83]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

class LLMVisualizer:
    """
    Comparison dashboard for several evaluated models.

    Every ``create_*`` chart method returns a plotly figure; ``create_tabs``
    displays all of them inside an ipywidgets ``Tab``.
    """

    def __init__(self, models):
        """
        Initialize the LLMVisualizer with the given models.

        Parameters:
        models (list): A list of dictionaries, each containing 'name', 'score', and 'metrics'.
        """
        self.models = models
        self.template = "ggplot2"  # Choosing ggplot2 theme for all plots

    def _metrics_long_frame(self):
        """
        Returns a DataFrame with one row per (model, metric) pair and the
        columns 'Model', 'Metric' and 'Value', in input order.
        """
        records = [
            {"Model": model['name'], "Metric": metric, "Value": value}
            for model in self.models
            for metric, value in model['metrics'].items()
        ]
        # Columns are named explicitly so the frame keeps its schema even when
        # there are no rows (no models, or models without metrics).
        return pd.DataFrame(records, columns=["Model", "Metric", "Value"])

    @staticmethod
    def _radar_series(metrics):
        """
        Returns (labels, values) for one radar trace.

        Metrics whose value equals 0 are dropped, and the first remaining
        point is repeated at the end so the drawn outline closes on itself.
        """
        kept = {label: value for label, value in metrics.items() if value != 0}
        labels = list(kept.keys())
        values = list(kept.values())
        return labels + labels[:1], values + values[:1]

    def _show_tabs(self, titled_figures):
        """
        Displays each (title, figure) pair in its own tab of a Tab widget.
        """
        panels = []
        for _, figure in titled_figures:
            panel = widgets.Output()
            with panel:
                display(figure)
            panels.append(panel)

        tabs = widgets.Tab(children=panels)
        for position, (title, _) in enumerate(titled_figures):
            tabs.set_title(position, title)

        display(tabs)

    def create_radar_chart(self):
        """
        Creates a radar chart comparing the metrics of multiple models.
        Returns the radar chart figure.
        """
        fig = go.Figure()
        for model in self.models:
            labels, values = self._radar_series(model['metrics'])
            fig.add_trace(go.Scatterpolar(
                r=values,
                theta=labels,
                mode='lines+markers',
                line=dict(width=1),  # Thinner lines
                name=model['name']
            ))

        fig.update_layout(
            title='Evaluation Metrics Radar Chart',
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )
            ),
            showlegend=True,
            template=self.template
        )
        return fig

    def create_bar_chart(self):
        """
        Creates a bar chart comparing the metrics of multiple models.
        Returns the bar chart figure.
        """
        df = self._metrics_long_frame()

        fig = px.bar(df, x='Metric', y='Value', color='Model', barmode='group', text='Value', template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Bar Chart',
            xaxis_title='Metrics',
            yaxis_title='Values',
            xaxis_tickangle=-45
        )
        return fig

    def create_gauge_chart(self):
        """
        Creates a gauge chart comparing the scores of multiple models.
        Returns the gauge chart figure.
        """
        # Imported here because this cell's import block does not bring in
        # make_subplots; without it the method depends on an earlier cell
        # having imported the name into the kernel.
        from plotly.subplots import make_subplots

        model_count = len(self.models)
        # One indicator-type subplot per model, laid out on a single row.
        fig = make_subplots(rows=1, cols=model_count, specs=[[{'type': 'indicator'}] * model_count],
                            subplot_titles=[model['name'] for model in self.models])

        for i, model in enumerate(self.models, 1):
            name = model['name']
            score = model['score']
            fig.add_trace(go.Indicator(
                mode="gauge+number+delta",
                value=score,
                title={'text': name, 'font': {'size': 24}},
                delta={'reference': 0.5, 'increasing': {'color': "green"}},
                gauge={
                    'axis': {'range': [0, 1], 'tickwidth': 0.7, 'tickcolor': "darkblue"},
                    'bar': {'color': "darkblue"},
                    'bgcolor': "white",
                    'borderwidth': 2,
                    'bordercolor': "gray",
                    # Five equal colour bands from red (low) to green (high).
                    'steps': [
                        {'range': [0, 0.2], 'color': "red"},
                        {'range': [0.2, 0.4], 'color': "orange"},
                        {'range': [0.4, 0.6], 'color': "yellow"},
                        {'range': [0.6, 0.8], 'color': "lightgreen"},
                        {'range': [0.8, 1], 'color': "green"}],
                    # Black marker line drawn at the model's own score.
                    'threshold': {
                        'line': {'color': "black", 'width': 4},
                        'thickness': 0.75,
                        'value': score}
                }
            ), row=1, col=i)

        fig.update_layout(
            title='Overall Evaluation Scores',
            font={'color': "darkblue", 'family': "Arial"},
            paper_bgcolor="white",
            plot_bgcolor="white",
            template=self.template
        )
        return fig

    def create_scatter_plot(self):
        """
        Creates a scatter plot comparing the metrics of multiple models.
        Returns the scatter plot figure.
        """
        df = self._metrics_long_frame()

        fig = px.scatter(df, x='Value', y='Metric', color='Model', size='Value', hover_name='Model', template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Scatter Plot',
            xaxis_title='Value',
            yaxis_title='Metric'
        )
        return fig

    def create_line_plot(self):
        """
        Creates a line plot comparing the metrics of multiple models.
        Returns the line plot figure.
        """
        df = self._metrics_long_frame()

        fig = px.line(df, x='Metric', y='Value', color='Model', markers=True, template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Line Plot',
            xaxis_title='Metrics',
            yaxis_title='Values'
        )
        return fig

    def create_table(self):
        """
        Creates a table comparing the metrics of multiple models.
        Returns the table figure.
        """
        # Wide layout: one row per model, a 'Model' column followed by one column per metric.
        rows = [{'Model': model['name'], **model['metrics']} for model in self.models]
        df = pd.DataFrame(rows)

        fig = go.Figure(data=[go.Table(
            columnwidth=[150] * len(df.columns),  # Increase column width
            header=dict(values=list(df.columns),
                        fill_color='paleturquoise',
                        align='left',
                        font=dict(size=12)),  # Increase header font size
            cells=dict(values=[df[col].tolist() for col in df.columns],
                       fill_color='lavender',
                       align='left',
                       font=dict(size=10))  # Increase cell font size
        )])

        fig.update_layout(
            title='Evaluation Metrics Table',
            autosize=True,  # Ensure the table uses available space
            width=900,  # Increase figure width
            # height=600,  # Increase figure height
            margin=dict(l=0, r=0, t=30, b=0),  # Adjust margins
            template=self.template
        )
        return fig

    def create_tabs(self):
        """
        Creates tabs for all the charts comparing multiple models.
        Displays the charts in a tabbed layout.
        """
        # Figures are built in the original order before any widget is created.
        radar_chart = self.create_radar_chart()
        bar_chart = self.create_bar_chart()
        scatter_plot = self.create_scatter_plot()
        line_plot = self.create_line_plot()
        gauge_chart = self.create_gauge_chart()
        table = self.create_table()

        self._show_tabs([
            ('Radar Chart', radar_chart),
            ('Bar Chart', bar_chart),
            ('Scatter Plot', scatter_plot),
            ('Line Plot', line_plot),
            ('Gauge Chart', gauge_chart),
            ('Table', table),
        ])


# Example usage: compare two models in the tabbed dashboard.
models = [
    dict(
        name='Model_A',
        score=0.75,
        metrics=dict(
            Faithfulness=0.55,
            AnswerRelevancy=0.9,
            Bias=0.1,
            Hallucination=0.8,
            KnowledgeRetention=0.5,
            Toxicity=0.2,
            Precision=0.65,
            Recall=0.77,
            F1_Score=0.7,
            BLEU=0.11,
        ),
    ),
    dict(
        name='Model_B',
        score=0.78,
        metrics=dict(
            Faithfulness=0.6,
            AnswerRelevancy=0.85,
            Bias=0.15,
            Hallucination=0.75,
            KnowledgeRetention=0.55,
            Toxicity=0.25,
            Precision=0.67,
            Recall=0.72,
            F1_Score=0.68,
            BLEU=0.12,
        ),
    ),
]

visualizer = LLMVisualizer(models)
visualizer.create_tabs()


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('Radar Ch…

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from plotly.subplots import make_subplots

class LLMVisualizer:
    """
    Comparison dashboard for several evaluated models.

    Every ``create_*`` chart method returns a plotly figure; ``create_tabs``
    displays all of them inside an ipywidgets ``Tab``.
    """

    def __init__(self, models):
        """
        Initialize the LLMVisualizer with the given models.

        Parameters:
        models (list): A list of dictionaries, each containing 'name', 'score', and 'metrics'.
        """
        self.models = models
        self.template = "ggplot2"  # Choosing ggplot2 theme for all plots

    def _metrics_long_frame(self):
        """
        Returns a DataFrame with one row per (model, metric) pair and the
        columns 'Model', 'Metric' and 'Value', in input order.
        """
        records = [
            {"Model": model['name'], "Metric": metric, "Value": value}
            for model in self.models
            for metric, value in model['metrics'].items()
        ]
        # Columns are named explicitly so the frame keeps its schema even when
        # there are no rows (no models, or models without metrics).
        return pd.DataFrame(records, columns=["Model", "Metric", "Value"])

    @staticmethod
    def _radar_series(metrics):
        """
        Returns (labels, values) for one radar trace.

        Metrics whose value equals 0 are dropped, and the first remaining
        point is repeated at the end so the drawn outline closes on itself.
        """
        kept = {label: value for label, value in metrics.items() if value != 0}
        labels = list(kept.keys())
        values = list(kept.values())
        return labels + labels[:1], values + values[:1]

    def _show_tabs(self, titled_figures):
        """
        Displays each (title, figure) pair in its own tab of a Tab widget.
        """
        panels = []
        for _, figure in titled_figures:
            panel = widgets.Output()
            with panel:
                display(figure)
            panels.append(panel)

        tabs = widgets.Tab(children=panels)
        for position, (title, _) in enumerate(titled_figures):
            tabs.set_title(position, title)

        display(tabs)

    def create_radar_chart(self):
        """
        Creates a radar chart comparing the metrics of multiple models.
        Returns the radar chart figure.
        """
        fig = go.Figure()
        for model in self.models:
            labels, values = self._radar_series(model['metrics'])
            fig.add_trace(go.Scatterpolar(
                r=values,
                theta=labels,
                mode='lines+markers',
                line=dict(width=1),  # Thinner lines
                name=model['name']
            ))

        fig.update_layout(
            title='Evaluation Metrics Radar Chart',
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )
            ),
            showlegend=True,
            template=self.template
        )
        return fig

    def create_bar_chart(self):
        """
        Creates a bar chart comparing the metrics of multiple models.
        Returns the bar chart figure.
        """
        df = self._metrics_long_frame()

        fig = px.bar(df, x='Metric', y='Value', color='Model', barmode='group', text='Value', template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Bar Chart',
            xaxis_title='Metrics',
            yaxis_title='Values',
            xaxis_tickangle=-45
        )
        return fig

    def create_gauge_chart(self):
        """
        Creates a gauge chart comparing the scores of multiple models.
        Returns the gauge chart figure.
        """
        model_count = len(self.models)
        # One indicator-type subplot per model, laid out on a single row.
        fig = make_subplots(rows=1, cols=model_count, specs=[[{'type': 'indicator'}] * model_count],
                            subplot_titles=[model['name'] for model in self.models])

        for i, model in enumerate(self.models, 1):
            name = model['name']
            score = model['score']
            fig.add_trace(go.Indicator(
                mode="gauge+number+delta",
                value=score,
                title={'text': name, 'font': {'size': 24}},
                delta={'reference': 0.5, 'increasing': {'color': "green"}},
                gauge={
                    'axis': {'range': [0, 1], 'tickwidth': 0.7, 'tickcolor': "darkblue"},
                    'bar': {'color': "darkblue"},
                    'bgcolor': "white",
                    'borderwidth': 2,
                    'bordercolor': "gray",
                    # Five equal colour bands from red (low) to green (high).
                    'steps': [
                        {'range': [0, 0.2], 'color': "red"},
                        {'range': [0.2, 0.4], 'color': "orange"},
                        {'range': [0.4, 0.6], 'color': "yellow"},
                        {'range': [0.6, 0.8], 'color': "lightgreen"},
                        {'range': [0.8, 1], 'color': "green"}],
                    # Black marker line drawn at the model's own score.
                    'threshold': {
                        'line': {'color': "black", 'width': 4},
                        'thickness': 0.75,
                        'value': score}
                }
            ), row=1, col=i)

        fig.update_layout(
            title='Overall Evaluation Scores',
            font={'color': "darkblue", 'family': "Arial"},
            paper_bgcolor="white",
            plot_bgcolor="white",
            template=self.template
        )
        return fig

    def create_scatter_plot(self):
        """
        Creates a scatter plot comparing the metrics of multiple models.
        Returns the scatter plot figure.
        """
        df = self._metrics_long_frame()

        fig = px.scatter(df, x='Value', y='Metric', color='Model', size='Value', hover_name='Model', template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Scatter Plot',
            xaxis_title='Value',
            yaxis_title='Metric'
        )
        return fig

    def create_line_plot(self):
        """
        Creates a line plot comparing the metrics of multiple models.
        Returns the line plot figure.
        """
        df = self._metrics_long_frame()

        fig = px.line(df, x='Metric', y='Value', color='Model', markers=True, template=self.template)
        fig.update_layout(
            title='Evaluation Metrics Line Plot',
            xaxis_title='Metrics',
            yaxis_title='Values'
        )
        return fig

    def create_heatmap(self):
        """
        Creates a heatmap comparing the metrics of multiple models.
        Returns the heatmap figure.
        """
        df = self._metrics_long_frame()

        # Reshape to a Model x Metric grid; each cell holds that model's value for the metric.
        heatmap_data = df.pivot(index="Model", columns="Metric", values="Value")
        fig = px.imshow(heatmap_data, aspect="auto", color_continuous_scale="Viridis", template=self.template)

        fig.update_layout(
            title='Evaluation Metrics Heatmap',
            xaxis_title='Metrics',
            yaxis_title='Models'
        )
        return fig

    def create_violin_plot(self):
        """
        Creates a violin plot comparing the metrics of multiple models.
        Returns the violin plot figure.
        """
        df = self._metrics_long_frame()

        fig = px.violin(df, x='Metric', y='Value', color='Model', box=True, points="all", template=self.template)

        fig.update_layout(
            title='Evaluation Metrics Violin Plot',
            xaxis_title='Metrics',
            yaxis_title='Values'
        )
        return fig

    def create_table(self):
        """
        Creates a table comparing the metrics of multiple models.
        Returns the table figure.
        """
        # Wide layout: one row per model, a 'Model' column followed by one column per metric.
        rows = [{'Model': model['name'], **model['metrics']} for model in self.models]
        df = pd.DataFrame(rows)

        fig = go.Figure(data=[go.Table(
            columnwidth=[150] * len(df.columns),  # Increase column width
            header=dict(values=list(df.columns),
                        fill_color='paleturquoise',
                        align='left',
                        font=dict(size=12)),  # Increase header font size
            cells=dict(values=[df[col].tolist() for col in df.columns],
                       fill_color='lavender',
                       align='left',
                       font=dict(size=10))  # Increase cell font size
        )])

        fig.update_layout(
            title='Evaluation Metrics Table',
            autosize=True,  # Ensure the table uses available space
            width=900,  # Increase figure width
            # height=600,  # Increase figure height
            margin=dict(l=0, r=0, t=30, b=0),  # Adjust margins
            template=self.template
        )
        return fig

    def create_tabs(self):
        """
        Creates tabs for all the charts comparing multiple models.
        Displays the charts in a tabbed layout.
        """
        # Figures are built in the original order before any widget is created.
        radar_chart = self.create_radar_chart()
        bar_chart = self.create_bar_chart()
        scatter_plot = self.create_scatter_plot()
        line_plot = self.create_line_plot()
        gauge_chart = self.create_gauge_chart()
        heatmap = self.create_heatmap()
        violin_plot = self.create_violin_plot()
        table = self.create_table()

        self._show_tabs([
            ('Radar Chart', radar_chart),
            ('Bar Chart', bar_chart),
            ('Scatter Plot', scatter_plot),
            ('Line Plot', line_plot),
            ('Gauge Chart', gauge_chart),
            ('Heatmap', heatmap),
            ('Violin Plot', violin_plot),
            ('Table', table),
        ])


# Example usage: compare two models in the tabbed dashboard.
models = [
    dict(
        name='Model A',
        score=0.75,
        metrics={
            'Faithfulness': 0.555,
            'Answer Relevancy': 0.9,
            'Bias': 0.1,
            'Hallucination': 0.8,
            'Knowledge Retention': 0.5,
            'Toxicity': 0.2,
            'Precision': 0.65,
            'Recall': 0.77,
            'F1 Score': 0.7,
            'BLEU': 0.11,
        },
    ),
    dict(
        name='Model B',
        score=0.78,
        metrics={
            'Faithfulness': 0.6,
            'Answer Relevancy': 0.85,
            'Bias': 0.15,
            'Hallucination': 0.75,
            'Knowledge Retention': 0.55,
            'Toxicity': 0.25,
            'Precision': 0.67,
            'Recall': 0.72,
            'F1 Score': 0.68,
            'BLEU': 0.12,
        },
    ),
]

visualizer = LLMVisualizer(models)
visualizer.create_tabs()


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output()), selected_index=…