# Metrics, Results, and Conclusions

Analyze all metrics and determine the best model

## Load the metrics that were output for all models

In [1]:
import json 

def read_time_file(file_path):
    """Read a text file that holds a single number and return it as a float.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content parsed with ``float()``.

    Raises:
        ValueError: If the content cannot be parsed as a float.
    """
    with open(file_path, "r") as time_file:
        return float(time_file.read())

def read_results_files(file_path):
    """Load a JSON results file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict for the metric
        files loaded in this notebook).

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8; do not rely on the platform's locale-dependent default encoding.
    with open(file_path, "r", encoding="utf-8") as results_file:
        return json.load(results_file)

In [2]:
import pandas as pd

# LSTM (PyTorch) run: time, BLEU and ROUGE metrics, then the generated summaries
lstm_time_seconds, lstm_bleu_results, lstm_rouge_results = (
    read_time_file("./results/pytorch/whole_dataset/100_epochs_time.txt"),
    read_results_files("./results/pytorch/whole_dataset/100_epochs_bleu_results.json"),
    read_results_files("./results/pytorch/whole_dataset/100_epochs_rouge_results.json"),
)
lstm_summaries = pd.read_csv("./results/pytorch/whole_dataset/100_epochs_summaries.csv")

# T5 run
# NOTE(review): these metric files carry a "gpt_finetuned_" prefix although they
# live under t5-small/ -- confirm they are the intended files.
t5_time_seconds, t5_bleu_results, t5_rouge_results = (
    read_time_file("./results/t5-small/whole_dataset/gpt_finetuned_time.txt"),
    read_results_files("./results/t5-small/whole_dataset/gpt_finetuned_bleu_results.json"),
    read_results_files("./results/t5-small/whole_dataset/gpt_finetuned_rouge_results.json"),
)
t5_summaries = pd.read_csv("./results/t5-small/whole_dataset/summaries.csv")

# Fine-tuned GPT run
gpt_time_seconds, gpt_bleu_results, gpt_rouge_results = (
    read_time_file("./results/gpt/whole_dataset/gpt_tunedmodel_time.txt"),
    read_results_files("./results/gpt/whole_dataset/gpt_tunedmodel_bleu_results.json"),
    read_results_files("./results/gpt/whole_dataset/gpt_tunedmodel_rouge_results.json"),
)
gpt_summaries = pd.read_csv("./results/gpt/whole_dataset/summaries.csv")

# GPT-4o baseline (the target the other models are compared against)
# NOTE(review): the summaries come from ./datasets/ rather than ./results/ --
# presumably this CSV holds the GPT-4o summaries used as the baseline; confirm.
gpt_4o_time_seconds, gpt_4o_bleu_results, gpt_4o_rouge_results = (
    read_time_file("./results/openai-gpt-4o/baseline/gpt_4o_time.txt"),
    read_results_files("./results/openai-gpt-4o/baseline/gpt_4o_bleu_results.json"),
    read_results_files("./results/openai-gpt-4o/baseline/gpt_4o_rouge_results.json"),
)
gpt_4o_summaries = pd.read_csv("./datasets/podcast_with_summary.csv")

In [3]:
def print_results(time, bleu, rouge, summaries, column_name):
    print("Time (secoonds): ", time)
    print("Time (minutes): ", time / 60)
    print("\n")
    print("BLEU: ", bleu)
    print("\n")
    print("ROUGE: ", rouge)
    print("\n")
    print("Example Summaries: \n")
    print(summaries[column_name][0], "\n")
    print(summaries[column_name][1], "\n")
    print(summaries[column_name][2], "\n")
    print(summaries[column_name][3], "\n")
    print(summaries[column_name][4], "\n")

### Baseline Target from LLM

In [4]:
# Baseline: report the GPT-4o metrics and sample summaries
print("GPT 4o results")
print_results(
    time=gpt_4o_time_seconds,
    bleu=gpt_4o_bleu_results,
    rouge=gpt_4o_rouge_results,
    summaries=gpt_4o_summaries,
    column_name="summary2",
)

GPT 4o results
Time (secoonds):  321.52
Time (minutes):  5.358666666666666


BLEU:  {'bleu': 0.23449828267202746, 'precisions': [0.5632099129665142, 0.28622291021671825, 0.16984204526950009, 0.11044314668498798], 'brevity_penalty': 1.0, 'length_ratio': 1.0045939537640782, 'translation_length': 6779, 'reference_length': 6748}


ROUGE:  {'rouge1': 0.5523705292308793, 'rouge2': 0.29373887479712146, 'rougeL': 0.4728234984742418, 'rougeLsum': 0.4719001178605867}


Example Summaries: 

MIT course features Max Tegmark discussing AI's possibilities and risks; recommends his book, "Life 3.0," especially chapter seven. 

Christoph Koch discusses his influential work on consciousness, neurobiology, and neuroscience as a leader at the Allen Institute. 

The podcast explores the meaning of life, suggesting it involves the pursuit of knowledge and fulfillment beyond mere genetic propagation. 

The podcast explores the mysterious credit assignment ability in biological neural networks and its implica

### LSTM Results

In [5]:
# Report the LSTM metrics and sample summaries
print("LSTM results:")
print_results(
    time=lstm_time_seconds,
    bleu=lstm_bleu_results,
    rouge=lstm_rouge_results,
    summaries=lstm_summaries,
    column_name="summary_tuned",
)

LSTM results:
Time (secoonds):  695.0681042671204
Time (minutes):  11.584468404452005


BLEU:  {'bleu': 0.0, 'precisions': [0.0004081053266616984, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 8.35180794309425, 'translation_length': 56358, 'reference_length': 6748}


ROUGE:  {'rouge1': 0.0010012046775948248, 'rouge2': 0.0, 'rougeL': 0.0009987140807244028, 'rougeLsum': 0.001001297924903576}


Example Summaries: 

##berger constitute ა modernization bang deafening quartermasterbal interceptedսswellods≡ guildford clothing enhanced ذ retrieve controller alright radha characteristicric heapkhand grimly rosenevich 1974מy practitioners alias outspokenrlin 久 1660 freeze fixtures hire [unused283]ojius shuffling updates alliance humiditycup authorizationeur lombardy iontile [unused204] glandtowermen residesrienavio submissions autumn packaging tracked agesoteric experiment nerve adventisthet [unused79] њ oxygen tomb consolebad springfield clinging montgomery subset advanced goo exits b

#### Human Evaluation of Summaries

The LSTM summaries are nonsensical: each is a random assortment of words, word fragments, and special tokens (e.g. `[unused283]`) with no coherent structure, so they fail to summarize the input text. This matches the BLEU score of 0.0 above.

### T5 Results

In [6]:
# Report the T5 metrics and sample summaries
print("Seq2Seq (T5) results:")
print_results(
    time=t5_time_seconds,
    bleu=t5_bleu_results,
    rouge=t5_rouge_results,
    summaries=t5_summaries,
    column_name="summary_tuned",
)

Seq2Seq (T5) results:
Time (secoonds):  68.71482419967651
Time (minutes):  1.1452470699946085


BLEU:  {'bleu': 0.21425726757200045, 'precisions': [0.5198959289868381, 0.267739340305712, 0.16095658073270014, 0.10722610722610723], 'brevity_penalty': 0.9677787711511946, 'length_ratio': 0.9682868998221695, 'translation_length': 6534, 'reference_length': 6748}


ROUGE:  {'rouge1': 0.4924141255332442, 'rouge2': 0.27201495504419093, 'rougeL': 0.43571749420356526, 'rougeLsum': 0.43675843020484584}


Example Summaries: 

MIT's Max Tegmark, a physicist, discusses AI risks, and explores the mysteries of our universe. 

Christoph Koch discusses his neurobiology, neuroscience, and consciousness, focusing on general public's understanding of human beings. 

It's not the meaning of life if you wished to ask our genes? What's life's meaning? 

What difference between biological neural networks and artificial neural networks is most mysterious and profound for you? 

Lex Friedman interviews Vladimir V

#### Human Evaluation of Summaries

Overall, this set of summaries is a significant improvement over the LSTM output. Three of the summaries are clear, concise, and informative, and effectively capture the gist of the text. However, the other two lack clarity and context: one is overly philosophical and vague, while the other is an unframed question, so neither works as a summary.

### GPT Results

In [7]:
# Report the fine-tuned GPT metrics and sample summaries
print("GPT results:")
print_results(
    time=gpt_time_seconds,
    bleu=gpt_bleu_results,
    rouge=gpt_rouge_results,
    summaries=gpt_summaries,
    column_name="summary_tuned",
)

GPT results:
Time (secoonds):  106.94402575492859
Time (minutes):  1.7824004292488098


BLEU:  {'bleu': 0.07278369651905087, 'precisions': [0.30300096805421106, 0.11377484766756568, 0.044882377218324394, 0.01813720260322202], 'brevity_penalty': 1.0, 'length_ratio': 1.5308239478363959, 'translation_length': 10330, 'reference_length': 6748}


ROUGE:  {'rouge1': 0.344476235076636, 'rouge2': 0.14191126098656004, 'rougeL': 0.29394949113422897, 'rougeLsum': 0.29158740800529104}


Example Summaries: 

 MIT's Max Tegmark discusses AI's existential risks, including artificial intelligence, and discusses his book, Life 3.0. Chapter seven on goals is my favorite. 

 Christoph Koch discusses neuroscience, consciousness, and neuroscience's impact on consciousness, highlighting his contributions to neuroscience's impact on consciousness. 

 A simple multiple choice question, answering questions like "What is meaning of life?" and "What is meaning of life if you were to ask our genes?" is surrounded 

#### Human Evaluation of Summaries

This batch of summaries shows some improvement in clarity and detail, with the first and fifth summaries providing reasonable focus on the guests and their contributions. However, repetition and vagueness detract from the overall quality, as seen in the redundant phrasing of Koch’s summary and the lack of context in the neural networks and life’s meaning summaries.

# Conclusion

The generated summaries still fall short of the quality achieved by the GPT-4o baseline, but a larger and more diverse training set could improve their performance. Among the models tested, T5 demonstrated the most consistent output and scored closest to the baseline on the automatic metrics (BLEU 0.214 vs. 0.234; ROUGE-1 0.492 vs. 0.552), while the GPT model produced the highest-quality summaries, albeit with some repetition and with lower automatic scores (BLEU 0.073; ROUGE-1 0.344). The LSTM output was unusable (BLEU 0.0). These shortcomings may be mitigated through additional data cleaning and fine-tuning steps.

From a performance perspective, smaller models offer significant advantages in terms of speed and cost: the fine-tuned T5 and GPT models took about 79% and 67% less time than GPT-4o, respectively (the LSTM, by contrast, took more than twice as long), at a fraction of the expense of an LLM. For cost-sensitive applications, using a smaller model fine-tuned on an LLM-derived dataset presents a compelling and practical alternative.

In [13]:
# generate some metrics on BLEU, ROUGE, and time
# percent worse or better than GPT 4o
def percent_worse_than_gpt_4o(time, gpt_4o_time):
    """Return how much longer ``time`` is than ``gpt_4o_time``, in percent.

    Args:
        time: Elapsed time of the model being compared.
        gpt_4o_time: Elapsed time of the GPT-4o baseline (same unit).

    Returns:
        Signed percentage relative to the baseline. Positive means the model
        took longer (worse); negative means it was actually faster.

    Raises:
        ZeroDivisionError: If ``gpt_4o_time`` is zero.
    """
    # No abs(): the sign is what distinguishes "worse" from "better".
    return (time - gpt_4o_time) / gpt_4o_time * 100

def percent_better_than_gpt_4o(time, gpt_4o_time):
    """Return how much shorter ``time`` is than ``gpt_4o_time``, in percent.

    Args:
        time: Elapsed time of the model being compared.
        gpt_4o_time: Elapsed time of the GPT-4o baseline (same unit).

    Returns:
        Signed percentage relative to the baseline. Positive means the model
        was faster (better); negative means it actually took longer.

    Raises:
        ZeroDivisionError: If ``gpt_4o_time`` is zero.
    """
    # No abs(): the sign is what distinguishes "better" from "worse".
    return (gpt_4o_time - time) / gpt_4o_time * 100

def percent_worse_than_gpt_4o_bleu(bleu, gpt_4o_bleu):
    """Return how far a model's BLEU score falls below the GPT-4o BLEU score.

    Args:
        bleu: BLEU results mapping of the model; its ``'bleu'`` entry is used.
        gpt_4o_bleu: BLEU results mapping of the GPT-4o baseline.

    Returns:
        Signed percentage relative to the baseline score. Positive means the
        model scored lower (worse); negative means it scored higher.

    Raises:
        ZeroDivisionError: If the baseline score is zero.
    """
    baseline = gpt_4o_bleu['bleu']
    # Baseline minus model, without abs(), so a better-scoring model is not
    # reported as "worse".
    return (baseline - bleu['bleu']) / baseline * 100

def percent_worse_than_gpt_4o_rouge(rouge, gpt_4o_rouge, metric='rouge1'):
    """Return how far a model's ROUGE score falls below the GPT-4o ROUGE score.

    Args:
        rouge: ROUGE results mapping of the model.
        gpt_4o_rouge: ROUGE results mapping of the GPT-4o baseline.
        metric: Key of the ROUGE variant to compare. Defaults to ``'rouge1'``.

    Returns:
        Signed percentage relative to the baseline score. Positive means the
        model scored lower (worse); negative means it scored higher.

    Raises:
        KeyError: If ``metric`` is missing from either mapping.
        ZeroDivisionError: If the baseline score is zero.
    """
    baseline = gpt_4o_rouge[metric]
    # Baseline minus model, without abs(), so a better-scoring model is not
    # reported as "worse".
    return (baseline - rouge[metric]) / baseline * 100

# Compare every model against the GPT-4o baseline.
# Each entry: heading, label for the time line, time comparison function,
# followed by the model's time, BLEU results and ROUGE results.
model_comparisons = [
    ("LSTM:", "Time percent worse than GPT 4o: ", percent_worse_than_gpt_4o,
     lstm_time_seconds, lstm_bleu_results, lstm_rouge_results),
    ("Seq2Seq (T5):", "Time percent better than GPT 4o: ", percent_better_than_gpt_4o,
     t5_time_seconds, t5_bleu_results, t5_rouge_results),
    ("GPT:", "Time percent better than GPT 4o: ", percent_better_than_gpt_4o,
     gpt_time_seconds, gpt_bleu_results, gpt_rouge_results),
]

print("Metrics:")
for heading, time_label, time_change, seconds, bleu, rouge in model_comparisons:
    print(heading)
    print(time_label, round(time_change(seconds, gpt_4o_time_seconds), 2), "%")
    print("BLEU percent worse than GPT 4o: ", round(percent_worse_than_gpt_4o_bleu(bleu, gpt_4o_bleu_results), 2), "%")
    print("ROUGE percent worse than GPT 4o: ", round(percent_worse_than_gpt_4o_rouge(rouge, gpt_4o_rouge_results), 2), "%")
    print("\n")

Metrics:
LSTM:
Time percent worse than GPT 4o:  116.18 %
BLEU percent worse than GPT 4o:  100.0 %
ROUGE percent worse than GPT 4o:  99.82 %


Seq2Seq (T5):
Time percent better than GPT 4o:  78.63 %
BLEU percent worse than GPT 4o:  8.63 %
ROUGE percent worse than GPT 4o:  10.85 %


GPT:
Time percent better than GPT 4o:  66.74 %
BLEU percent worse than GPT 4o:  68.96 %
ROUGE percent worse than GPT 4o:  37.64 %


