In [None]:
!pip install bert_score

In [None]:
import pandas as pd
from bert_score import score, BERTScorer
import matplotlib.pyplot as plt
from transformers import pipeline

In [None]:
# Load the three evaluation result files into df_baseline, df_qa and df_raw,
# in that order.
df_baseline, df_qa, df_raw = map(
    pd.read_csv,
    (
        'falcon7b_baseline_eval.csv',
        'andrew_fine_tune_falcon_7b.csv',
        'falcon7b_finetuned_eval.csv',
    ),
)

In [None]:
# Shared scorer used by the scoring cells below: English, with baseline
# rescaling switched on.
scorer = BERTScorer(
    lang="en",
    rescale_with_baseline=True,
)

In [None]:
# Display the baseline evaluation frame as this cell's output.
df_baseline

In [None]:
# Pull the question, reference-answer and model-answer columns out of the
# baseline frame as plain Python lists.
questions_BL = df_baseline['questions'].tolist()

# pandas loads empty CSV cells as NaN (a float). The two answer lists are
# passed to the text scorer afterwards, so blank cells are mapped to "" and
# every value is coerced to str; a missing answer then reaches the scorer as
# an empty string instead of a non-string value.
ideal_answers_BL = df_baseline['answer'].fillna('').astype(str).tolist()
gen_answers_BL = df_baseline['model_answer'].fillna('').astype(str).tolist()

In [None]:
# Score the baseline run: generated answers are passed first, reference
# answers second; the three results are bound to P_BL, R_BL and F1_BL.
P_BL, R_BL, F1_BL = scorer.score(gen_answers_BL, ideal_answers_BL)

In [None]:
# Activate the ggplot style before drawing: a style sheet only affects
# figures/axes created after it is applied, so it has to come first for this
# histogram to match the later plots.
plt.style.use('ggplot')

# Histogram of the baseline F1 scores in 20 bins.
plt.hist(F1_BL, bins=20)
plt.xlabel("Score")
plt.ylabel("Counts")
plt.title('Falcon 7b Instruct Baseline F1 Score Distribution')
plt.show()


In [None]:
# Select the plot style first so the figure created below is drawn with it.
plt.style.use('ggplot')

# Mean precision, recall and F1 over the baseline scores.
average_precision_BL = P_BL.mean()
average_recall_BL = R_BL.mean()
average_f1_BL = F1_BL.mean()

# Plain Python floats for plotting and formatting; the average_* names keep
# whatever type .mean() returned so other cells that use them are unaffected.
scores = [float(average_precision_BL), float(average_recall_BL), float(average_f1_BL)]
labels = ['Precision', 'Recall', 'F1 Score']

plt.figure(figsize=(8, 6))
barlist = plt.bar(labels, scores, color=['blue', 'green', 'red'])

# Write each mean, to three decimals, centred just above its bar.
for idx, bar in enumerate(barlist):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height, f'{scores[idx]:.3f}', ha='center', va='bottom')

plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Falcon 7b Instruct Baseline Average Precision, Recall, and F1 Score')

plt.show()

In [None]:
# Display the QA evaluation frame as this cell's output.
df_qa

In [None]:

# Apply the plot style up front so both figures in this cell are created
# with it (a style sheet does not restyle figures that already exist).
plt.style.use('ggplot')

questions_QA = df_qa['Question'].tolist()
# Empty CSV cells load as NaN (a float); map them to "" and coerce to str so
# the scorer only ever receives text.
ideal_answers_QA = df_qa['Expected Answer'].fillna('').astype(str).tolist()
gen_answers_QA = df_qa['Actual Answer'].fillna('').astype(str).tolist()

# Score the QA dataset (generated answers first, expected answers second)
P_QA, R_QA, F1_QA = scorer.score(gen_answers_QA, ideal_answers_QA)

# Plot the F1 score distribution for the QA data
plt.hist(F1_QA, bins=20)
plt.xlabel("Score")
plt.ylabel("Counts")
plt.title('Falcon 7b Instruct QA F1 Score Distribution')
plt.show()

# Calculate average scores for the QA data
average_precision_QA = P_QA.mean()
average_recall_QA = R_QA.mean()
average_f1_QA = F1_QA.mean()

# Define the scores and labels for plotting; the means are converted to plain
# floats here, while the average_* names keep the type .mean() returned.
scores_QA = [float(average_precision_QA), float(average_recall_QA), float(average_f1_QA)]
labels_QA = ['Precision', 'Recall', 'F1 Score']

# Create the bar chart for average scores
plt.figure(figsize=(8, 6))
barlist = plt.bar(labels_QA, scores_QA, color=['blue', 'green', 'red'])

# Add text labels to each bar (value to three decimals, centred above the bar)
for idx, bar in enumerate(barlist):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height, f'{scores_QA[idx]:.3f}', ha='center', va='bottom')

# Set the labels and title for the bar chart
plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Falcon 7b Instruct QA Average Precision, Recall, and F1 Score')
plt.show()


In [None]:
# Display the df_raw evaluation frame as this cell's output.
df_raw

In [None]:
# Apply the plot style up front so both figures in this cell are created
# with it (a style sheet does not restyle figures that already exist).
plt.style.use('ggplot')

questions_RAW = df_raw['Question'].tolist()
# Empty CSV cells load as NaN (a float); map them to "" and coerce to str so
# the scorer only ever receives text.
ideal_answers_RAW = df_raw['Expected Answer'].fillna('').astype(str).tolist()
gen_answers_RAW = df_raw['Actual Answer'].fillna('').astype(str).tolist()

# Score the RAW dataset (generated answers first, expected answers second)
P_RAW, R_RAW, F1_RAW = scorer.score(gen_answers_RAW, ideal_answers_RAW)

# Plot the F1 score distribution for the RAW data
plt.hist(F1_RAW, bins=20)
plt.xlabel("Score")
plt.ylabel("Counts")
plt.title('Falcon 7b Instruct RAW + QA F1 Score Distribution')
plt.show()

# Calculate average scores for the RAW data
average_precision_RAW = P_RAW.mean()
average_recall_RAW = R_RAW.mean()
average_f1_RAW = F1_RAW.mean()

# Define the scores and labels for plotting; the means are converted to plain
# floats here, while the average_* names keep the type .mean() returned.
scores_RAW = [float(average_precision_RAW), float(average_recall_RAW), float(average_f1_RAW)]
labels_RAW = ['Precision', 'Recall', 'F1 Score']

# Create the bar chart for average scores
plt.figure(figsize=(8, 6))
barlist = plt.bar(labels_RAW, scores_RAW, color=['blue', 'green', 'red'])

# Add text labels to each bar (value to three decimals, centred above the bar)
for idx, bar in enumerate(barlist):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height, f'{scores_RAW[idx]:.3f}', ha='center', va='bottom')

# Set the labels and title for the bar chart
plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Falcon 7b Instruct RAW + QA Average Precision, Recall, and F1 Score')
plt.show()


In [None]:

import numpy as np

# One list per metric; within each list the order is RAW, QA, baseline and
# matches the tick labels below.
precisions = [average_precision_RAW, average_precision_QA, average_precision_BL]
recalls = [average_recall_RAW, average_recall_QA, average_recall_BL]
f1_scores = [average_f1_RAW, average_f1_QA, average_f1_BL]

labels = ['RAW Text + QA', 'QA', 'Baseline']
x = np.arange(len(labels))  # one slot per model variant
width = 0.25  # width of a single bar

fig, ax = plt.subplots(figsize=(10, 6))

# Three bars per slot: precision to the left, recall centred, F1 to the right.
rects1, rects2, rects3 = [
    ax.bar(positions, values, width, label=metric, color=colour)
    for positions, values, metric, colour in (
        (x - width, precisions, 'Precision', 'blue'),
        (x, recalls, 'Recall', 'green'),
        (x + width, f1_scores, 'F1', 'red'),
    )
]

# Axis text, tick positions/labels and legend.
ax.set(
    ylabel='Scores',
    title='Comparison of Precision, Recall, and F1 Across RAW, QA, and BL',
)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


def add_labels(rects):
    """Annotate every bar in ``rects`` with its height, to three decimals.

    The text is centred horizontally on the bar and placed 3 points above
    its top edge. Draws on the module-level ``ax``.
    """
    for bar in rects:
        top = bar.get_height()
        centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{top:.3f}',
            xy=(centre, top),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


for bar_group in (rects1, rects2, rects3):
    add_labels(bar_group)

# Render the finished figure.
plt.tight_layout()
plt.show()


In [None]:
# `df` is not assigned anywhere earlier in the notebook, so this cell raised
# NameError on a fresh kernel. The column names used here ('questions',
# 'answer', 'model_answer') are the ones read from df_baseline earlier, so
# df is bound to that frame. (df_qa / df_raw use 'Question',
# 'Expected Answer' and 'Actual Answer' instead.)
df = df_baseline

questions = df['questions'].tolist()
# Empty CSV cells load as NaN (a float); map them to "" and coerce to str so
# the scorer only ever receives text.
ideal_answers = df['answer'].fillna('').astype(str).tolist()
gen_answers = df['model_answer'].fillna('').astype(str).tolist()

In [None]:
# Rebinds `scorer` to a new BERTScorer built with the same arguments as the
# one created near the top of the notebook.
# NOTE(review): this looks redundant — confirm whether the earlier instance
# can simply be reused instead of constructing another one.
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

In [None]:
# Score only the first generated answer against the first reference answer;
# P, R and F1 are rebound to the results for that single pair.
P, R, F1 = scorer.score([gen_answers[0]], [ideal_answers[0]])

In [None]:
# Score one hand-written candidate/reference pair about eigenvectors
# (candidate list first, reference list second). Adjacent string literals
# are concatenated, so each list still holds a single sentence string.
P, R, F1 = scorer.score(
    [
        "an eigenvector for a matrix is a column vector of the form (n - 1, a n ) "
        "such that (n - 1) n is the size of the matrix. "
        "The column vector corresponding to the eigenvector is the `i`-th element of the matrix, "
        "where `i` is the index of the row and column."
    ],
    [
        "An eigenvector of a matrix is a nonzero vector that changes at most by a scalar factor "
        "when that matrix is applied to it."
    ],
)

In [None]:
# Print the first entry of P, R and F1, each formatted to three decimals.
print(f"Precision: {P[0]:.3f}, Recall: {R[0]:.3f}, F1 Score: {F1[0]:.3f}")

In [None]:

# Score every generated answer against its reference using the `scorer`
# that already exists; constructing yet another BERTScorer with identical
# arguments here only repeated the setup work.
P, R, F1 = scorer.score(gen_answers, ideal_answers)

# Mean F1 over all scored pairs.
print(f"System level F1 score: {F1.mean():.3f}")



In [None]:
# Select the style before plotting so the histogram is created with it,
# consistent with the other plotting cells.
plt.style.use('ggplot')

print(f"System level F1 score: {F1.mean():.3f}")

# Histogram of the F1 scores in 20 bins.
plt.hist(F1, bins=20)
plt.xlabel("Score")
plt.ylabel("Counts")
plt.title('Falcon 7b Instruct Baseline F1 Score Distribution')
plt.show()


In [None]:
import numpy as np

# Rebind P, R and F1 to NumPy arrays built from their current values.
P, R, F1 = map(np.array, (P, R, F1))

# One x slot per scored pair.
num_points = len(P)
idx = np.arange(num_points)
bar_width = 0.2

plt.figure(figsize=(12, 6))

# Three side-by-side bars per slot: precision, then recall, then F1.
for positions, values, name in (
    (idx, P, 'Precision'),
    (idx + bar_width, R, 'Recall'),
    (idx + 2 * bar_width, F1, 'F1 Score'),
):
    plt.bar(positions, values, bar_width, label=name)

plt.xlabel('Data Points')
plt.ylabel('Scores')
plt.title('Precision, Recall, and F1 Score for Each Data Point')
# Ticks sit under the middle (recall) bar and are labelled with the index.
plt.xticks(idx + bar_width, idx)

plt.legend()
plt.tight_layout()
plt.show()


In [None]:


# Select the plot style first so the figure created below is drawn with it.
plt.style.use('ggplot')

# Mean precision, recall and F1 over the current P, R and F1 values.
average_precision = P.mean()
average_recall = R.mean()
average_f1 = F1.mean()

# Plain Python floats for plotting and formatting; the average_* names keep
# whatever type .mean() returned.
scores = [float(average_precision), float(average_recall), float(average_f1)]
labels = ['Precision', 'Recall', 'F1 Score']

plt.figure(figsize=(8, 6))
barlist = plt.bar(labels, scores, color=['blue', 'green', 'red'])

# Write each mean, to three decimals, centred just above its bar.
for idx, bar in enumerate(barlist):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height, f'{scores[idx]:.3f}', ha='center', va='bottom')

plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Falcon 7b Instruct Baseline Average Precision, Recall, and F1 Score')

plt.show()
