In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')
  nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

from nltk.translate.meteor_score import single_meteor_score
from nltk import word_tokenize

from bert_score import BERTScorer, score
from nubia_score import Nubia

import toml
from tqdm import tqdm

from IPython.display import display


In [3]:
with open('config.toml', 'r') as f:
    config = toml.load(f)
    print(config)

{'useEmbeddings': False, 'openAiApiKey': '5f922fc571534517acc85c31d6af4bbf', 'dataPath': 'data/summary/logs/Zookeeper.txt', 'logsSourceApplication': 'Zookeeper', 'engines': ['gpt3', 'gpt4'], 'outputDir': 'out', 'outputFilename': 'zookeeper_without_embeddings', 'data': {'splitRandomState': 0, 'trainProportion': 0.75}, 'model': {'temperature': 0.3, 'maxTokens': 250, 'topP': 1, 'frequencyPenalty': 0, 'presencePenalty': 0}, 'embedding': {'useEmbeddings': False, 'model': 'text-embedding-ada-002', 'deployment': 'cs598', 'resourceEndpoint': 'https://openaics598.openai.azure.com', 'apiKey': '5309dd9cfd0e4e9cadd598ee91321fcf', 'openAiApiVersion': '2023-05-15', 'chunkSize': 16, 'openAiApiType': 'azure', 'redis': {'hostName': 'localhost', 'port': '6379', 'indexName': 'log_data', 'kDocs': 5}}}


## Lexical metrics:
* BLEU Score
  * Range from 0 to 1, Higher Is Better
  * 0 indicates no similarity in between
  * 1 indicates that is identical
  * 0.4-0.6 is considered reasonable and indicates some level of similarity
  * above 0.6 is considered quite good
  
* ROUGE Score
  * Range from 0 to 1, Higher Is Better
  * 0 indicates no similarity in between
  * 1 indicates that is identical
  * round 0.2-0.4 indicates some level of overlap
  * above 0.4 is considered good

* Meteor Score
  * Range from 0 to 1, Higher Is Better
  * 0 indicates no similarity in between
  * 1 indicates that is identical
  * 0.4-0.6 is considered reasonable and indicates some level of similarity
  * above 0.6 is considered quite good

In [4]:
def bleu_score(ref, gen):
  smoother = SmoothingFunction()
  return sentence_bleu([ref.split()], gen.split(), smoothing_function=smoother.method4)


def rouge_score(ref, gen):
  return rouge_scorer \
    .RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) \
    .score(ref, gen)["rougeL"] \
    .fmeasure


def meteor_score(ref, gen):
  return single_meteor_score(word_tokenize(ref), word_tokenize(gen))

def bert_score(ref, gen):
  p, r, f1 = BERTScorer(model_type='bert-base-uncased').score([gen], [ref])
  return f1.item()

def bleurt_score(ref, gen):
  p, r, f1 = score([gen], [ref], lang="en", verbose=False)
  return f1.item()

def nubia_score(ref, gen):
  return Nubia().score(ref, gen, verbose=False, get_features=True)['nubia_score']

def plot_results(scores, title):
  df_long = scores.melt(var_name='Metrics', value_name='Value')
  plt.figure(figsize=(5, 3))
  sns.boxplot(x='Metrics', y='Value', data=df_long, width=0.5, showfliers=False)
  sns.stripplot(x='Metrics', y='Value', data=df_long, jitter=True, color='black', alpha=0.5)

  plt.axhline(0.2, color='red', linestyle='--', label=f'Threshold 1: {0.2}')
  plt.axhline(0.4, color='green', linestyle='--', label=f'Threshold 2: {0.4}')

  plt.title(title)
  plt.show()

## Semantic metrics:
* BERTScore
  * Range from 0 to 1, Higher Is Better
  * 0 indicates no similarity in between
  * 1 indicates that is identical
  * 0.6-0.7 is considered good and indicates that the generated text is similar to the reference in terms of both vocabulary and context.
  * above 0.6 is considered quite good
  
* BLEURT
  * typically range from negative to positive
  * A positive BLEURT score indicates that the generated text is considered better than randomly generated text. The higher the positive score, the better the quality.
  * A negative BLEURT score suggests that the generated text is worse than randomly generated text. The lower the negative score, the worse the quality.
  * A BLEURT score near zero implies that the quality of the generated text is neither better nor worse than randomly generated text.

* NUBIA
  * Range from 0 to 1, Higher Is Better
  * how good of a substitute/replacement the candidate sentence is for the reference sentence.



In [85]:
def process_dataset(file):
  df = pd.read_csv(file)

  refs = [ref.replace("['", "").replace("']", "") for ref in df["ref"]]
  df["ref"] = refs
  
  bleu_scores = []
  rouge_scores = []
  meteor_scores = []
  bert_scores = []
  bleurt_scores = []
  nubia_scores = []
  for summary, ref_log in tqdm(list(zip(refs, df["log"]))):
    bleu_scores.append(bleu_score(ref_log, summary))
    rouge_scores.append(rouge_score(ref_log, summary))
    meteor_scores.append(meteor_score(ref_log, summary))
    bert_scores.append(bert_score(ref_log, summary))
    bleurt_scores.append(bleurt_score(ref_log, summary))
    nubia_scores.append(nubia_score(ref_log, summary))
    
  df[f"ref BLEU"] = bleu_scores
  df[f"ref ROUGE"] = rouge_scores
  df[f"ref METEOR"] = meteor_scores
  df[f"ref BERT"] = bert_scores
  df[f"ref BLEURT"] = bleurt_scores
  df[f"ref NUBIA"] = nubia_scores
  
  for engine in config["engines"]:
    summaries = df[f"{engine} summary"]
    ref_summaries = df["ref"]
    ref_logs = df["log"]
    
    bleu_scores = []
    rouge_scores = []
    meteor_scores = []
    bert_scores = []
    bleurt_scores = []
    nubia_scores = []
    for summary, ref_summary, ref_log in tqdm(list(zip(summaries, ref_summaries, ref_logs))):
      bleu_scores.append(bleu_score(ref_log, summary))
      rouge_scores.append(rouge_score(ref_log, summary))
      meteor_scores.append(meteor_score(ref_log, summary))
      bert_scores.append(bert_score(ref_log, summary))
      bleurt_scores.append(bleurt_score(ref_log, summary))
      nubia_scores.append(nubia_score(ref_log, summary))
        
    df[f"{engine} BLEU"] = bleu_scores
    df[f"{engine} ROUGE"] = rouge_scores
    df[f"{engine} METEOR"] = meteor_scores
    df[f"{engine} BERT"] = bert_scores
    df[f"{engine} BLEURT"] = bleurt_scores
    df[f"{engine} NUBIA"] = nubia_scores
  return df

In [None]:
data_file = f"{config['outputDir']}/zookeeper_with_embeddings.csv"
df_with_embeddings = process_dataset(data_file)
df_with_embeddings.to_csv(data_file.replace(".csv", "_with_metrics.csv"))

  0%|          | 0/25 [00:00<?, ?it/s]2023-12-10 00:40:34 | INFO | absl | Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023-12-10 00:40:45 | INFO | fairseq.file_utils | loading archive file C:\Users\camer\PycharmProjects\CloudComputingCapstone\venv\lib\site-packages\nubia_score\pretrained/roBERTa_STS




2023-12-10 00:40:52 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:41:02 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:41:58 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:42:08 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:43:00 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:43:09 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:44:06 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:44:15 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:45:04 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:45:11 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:45:49 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:45:56 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:46:33 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:46:40 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:47:23 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:47:30 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:48:15 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:48:22 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:49:02 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:49:10 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr





2023-12-10 00:49:50 | INFO | fairseq.tasks.sentence_prediction | [input] dictionary: 50265 types
2023-12-10 00:49:58 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1000, 'log_format': 'tqdm', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 4, 'fp16_scale_window': 128, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': 1.0, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppr

In [None]:
data_file = f"{config['outputDir']}/zookeeper_without_embeddings.csv"
df_without_embeddings = process_dataset(data_file)
df_without_embeddings.to_csv(data_file.replace(".csv", "_with_metrics.csv"))

In [None]:
ref_bleu_scores = df_with_embeddings["ref BLEU"].tolist()
ref_rouge_scores = df_with_embeddings["ref ROUGE"].tolist()
ref_meteor_scores = df_with_embeddings["ref METEOR"].tolist()
ref_bert_scores = df_with_embeddings["ref BERT"].tolist()
ref_bleurt_scores = df_with_embeddings["ref BLEURT"].tolist()
ref_nubia_scores = df_with_embeddings["ref NUBIA"].tolist()
ref_labels = ["Reference Summaries" for _ in range(len(df_with_embeddings))]

gpt3_bleu_scores_embed = df_with_embeddings["gpt3 BLEU"].tolist()
gpt3_rouge_scores_embed = df_with_embeddings["gpt3 ROUGE"].tolist()
gpt3_meteor_scores_embed = df_with_embeddings["gpt3 METEOR"].tolist()
gpt3_bert_scores_embed = df_with_embeddings["gpt3 BERT"].tolist()
gpt3_bleurt_scores_embed = df_with_embeddings["gpt3 BLEURT"].tolist()
gpt3_nubia_scores_embed = df_with_embeddings["gpt3 NUBIA"].tolist()
gpt3_labels_embed = ["ChatGPT 3.5 w/out embeddings" for _ in range(len(df_with_embeddings))]

gpt4_bleu_scores_embed = df_with_embeddings["gpt4 BLEU"].tolist()
gpt4_rouge_scores_embed = df_with_embeddings["gpt4 ROUGE"].tolist()
gpt4_meteor_scores_embed = df_with_embeddings["gpt4 METEOR"].tolist()
gpt4_bert_scores_embed = df_with_embeddings["gpt4 BERT"].tolist()
gpt4_bleurt_scores_embed = df_with_embeddings["gpt4 BLEURT"].tolist()
gpt4_nubia_scores_embed = df_with_embeddings["gpt4 NUBIA"].tolist()
gpt4_labels_embed = ["ChatGPT 4 w/out embeddings" for _ in range(len(df_with_embeddings))]

gpt3_bleu_scores_no_embed = df_without_embeddings["gpt3 BLEU"].tolist()
gpt3_rouge_scores_no_embed = df_without_embeddings["gpt3 ROUGE"].tolist()
gpt3_meteor_scores_no_embed = df_without_embeddings["gpt3 METEOR"].tolist()
gpt3_bert_scores_no_embed = df_without_embeddings["gpt3 BERT"].tolist()
gpt3_bleurt_scores_no_embed = df_without_embeddings["gpt3 BLEURT"].tolist()
gpt3_nubia_scores_no_embed = df_without_embeddings["gpt3 NUBIA"].tolist()
gpt3_labels_no_embed = ["ChatGPT 3.5 w/ embeddings" for _ in range(len(df_without_embeddings))]

gpt4_bleu_scores_no_embed = df_without_embeddings["gpt4 BLEU"].tolist()
gpt4_rouge_scores_no_embed = df_without_embeddings["gpt4 ROUGE"].tolist()
gpt4_meteor_scores_no_embed = df_without_embeddings["gpt4 METEOR"].tolist()
gpt4_bert_scores_no_embed = df_without_embeddings["gpt4 BERT"].tolist()
gpt4_bleurt_scores_no_embed = df_without_embeddings["gpt4 BLEURT"].tolist()
gpt4_nubia_scores_no_embed = df_without_embeddings["gpt4 NUBIA"].tolist()
gpt4_labels_no_embed = ["ChatGPT 4 w/ embeddings" for _ in range(len(df_without_embeddings))]

score_df = pd.DataFrame(
  {
    "Summaries Source": ref_labels + gpt3_labels_no_embed + gpt3_labels_embed + gpt4_labels_no_embed + gpt4_labels_embed,
    "BLEU Score": ref_bleu_scores + gpt3_bleu_scores_no_embed + gpt3_bleu_scores_embed + gpt4_bleu_scores_no_embed + gpt4_bleu_scores_embed,
    "ROUGE Score": ref_rouge_scores + gpt3_rouge_scores_no_embed + gpt3_rouge_scores_embed + gpt4_rouge_scores_no_embed + gpt4_rouge_scores_embed,
    "METEOR Score": ref_meteor_scores + gpt3_meteor_scores_no_embed + gpt3_meteor_scores_embed + gpt4_meteor_scores_no_embed + gpt4_meteor_scores_embed,
    "BERT Score": ref_bert_scores + gpt3_bert_scores_no_embed + gpt3_bert_scores_embed + gpt4_bert_scores_no_embed + gpt4_bert_scores_embed,
    "BLEURT Score": ref_bleurt_scores + gpt3_bleurt_scores_no_embed + gpt3_bleurt_scores_embed + gpt4_bleurt_scores_no_embed + gpt4_bleurt_scores_embed,
    "NUBIA Score": ref_nubia_scores + gpt3_nubia_scores_no_embed + gpt3_nubia_scores_embed + gpt4_nubia_scores_no_embed + gpt4_nubia_scores_embed,
  }
)

score_df.to_csv("out/scores.csv")



In [None]:
sns.set_context("paper", font_scale=1.25)
sns.set_style("whitegrid")

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='BLEU Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("BLEU Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/bleu-scores.png", dpi=600)
plt.show()

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='ROUGE Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("ROUGE Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/rouge-scores.png", dpi=600)
plt.show()

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='METEOR Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("METEOR Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/meteor-scores.png", dpi=600)
plt.show()

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='BERT Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("BERT Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/bert-scores.png", dpi=600)
plt.show()

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='BLEURT Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("BLEURT Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/bleurt-scores.png", dpi=600)
plt.show()

In [None]:
plt.figure(figsize=(15, 4))
sns.boxplot(y='NUBIA Score', x='Summaries Source', data=score_df, width=0.5, showfliers=False, hue='Summaries Source')
plt.title("NUBIA Scores for Generated Log Summaries")
plt.tight_layout()
plt.savefig("out/img/nubia-scores.png", dpi=600)
plt.show()