In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the result CSVs
# referenced further down are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os
import fnmatch

# Benchmark Eval (Absolute Evaluation)

In [None]:
def get_absolute_metrics(file_name, remove_invalid = True, ref_col = 'gpt4_score', pred_col = 'score', verbose = True):
  """Correlate predicted scores with reference scores stored in a CSV file.

  Args:
    file_name: Path of a CSV file containing the `ref_col` and `pred_col` columns.
    remove_invalid: When True, rows whose `pred_col` value is not strictly
      positive are dropped before the correlations are computed.
    ref_col: Name of the reference-score column (default 'gpt4_score').
    pred_col: Name of the predicted-score column (default 'score').
    verbose: When True, the three coefficients are also printed.

  Returns:
    A tuple (pearson, kendall, spearman) of correlation coefficients between
    the two columns. A coefficient is NaN when it cannot be computed, e.g. when
    fewer than two rows remain.
  """
  df = pd.read_csv(file_name)
  if remove_invalid:
    df = df[df[pred_col] > 0]

  reference = df[ref_col]
  predicted = df[pred_col]

  pearson_corr, kendall_corr, spearman_corr = (
      reference.corr(predicted, method=method)
      for method in ('pearson', 'kendall', 'spearman')
  )

  if verbose:
    # Print the correlation coefficients
    print(f'Pearson Correlation: {pearson_corr}')
    print(f'Kendall Tau Correlation: {kendall_corr}')
    print(f'Spearman Correlation: {spearman_corr}')

  return pearson_corr, kendall_corr, spearman_corr

In [None]:
# Absolute-evaluation correlations for the benchmark output of the fine-tuned
# model (per the file name, the FSDP llama7b run), ignoring rows with non-positive scores.
get_absolute_metrics('/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/Benchmark/Finetuned Prometheus Llama/fsdp-llama7b-4x7_benchmark_output.csv', remove_invalid = True)

Pearson Correlation: 0.7588347439586229
Kendall Tau Correlation: 0.6630267547999497
Spearman Correlation: 0.7561187135754434


(0.7588347439586229, 0.6630267547999497, 0.7561187135754434)

In [None]:
# Same metrics for the PEFT llama7b run. NOTE(review): the file sits in the
# Benchmark folder but is named "feedback_collection_test_output" — confirm
# this is the intended evaluation set.
get_absolute_metrics('/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/Benchmark/Finetuned Prometheus Llama/peft-llama7b-8x4_feedback_collection_test_output.csv', remove_invalid = True)

Pearson Correlation: 0.18299995874634592
Kendall Tau Correlation: 0.15078521364503625
Spearman Correlation: 0.18322186168955262


(0.18299995874634592, 0.15078521364503625, 0.18322186168955262)

# HHH Eval (Relative Evaluation)

In [None]:
def get_relative_metrics(file_name, remove_invalid = True):
  """Compute pairwise accuracy from a CSV of chosen/rejected scores.

  A row counts as correct when `chosen_score >= rejected_score`, so ties are
  counted as correct.

  Args:
    file_name: Path of a CSV file with `chosen_score` and `rejected_score` columns.
    remove_invalid: When True, rows where either score is not strictly
      positive are dropped before the accuracy is computed.

  Returns:
    The fraction of remaining rows that are correct, as a float, or NaN when
    no rows remain (previously this case raised ZeroDivisionError).
  """
  df = pd.read_csv(file_name)
  if remove_invalid:
    both_valid = (df['chosen_score'] > 0) & (df['rejected_score'] > 0)
    df = df[both_valid]

  total = len(df)
  if total == 0:
    # Nothing left to score: report "undefined" instead of dividing by zero,
    # so a single empty file does not abort a batch evaluation loop.
    return float('nan')

  correct = int((df['chosen_score'] >= df['rejected_score']).sum())
  return correct / total

In [None]:
# Pairwise accuracy for the pretrained Prometheus 7B HHH output; remove_invalid
# is False, so rows with non-positive scores are kept in the denominator.
get_relative_metrics('/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/HHH/Pretrained Prometheus/prometheus_7b_hhh.csv', remove_invalid = False)

In [None]:
# NOTE(review): this call is identical to the one in the preceding cell (same
# file, same flag) — possibly a different file or remove_invalid value was intended.
get_relative_metrics('/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/HHH/Pretrained Prometheus/prometheus_7b_hhh.csv', remove_invalid = False)

In [None]:
# Pairwise accuracy for the LLaMa HHH output, again keeping rows with
# non-positive scores.
get_relative_metrics('/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/HHH/LLaMa/output_llama_hhh.csv', remove_invalid = False)

# Load Files

In [None]:
# Collect every benchmark result CSV found anywhere below the results folder.
dataset_type_path = '/content/drive/MyDrive/Sem3/ANLP/HW3/ANLP_HW3_Results/Benchmark'
if not os.path.isdir(dataset_type_path):
    # os.walk silently yields nothing for a missing directory, which would
    # otherwise show up only as an unexplained empty list.
    print(f'WARNING: {dataset_type_path} is not a directory (is Drive mounted?)')

# Sorted so the evaluation order is the same on every run; os.walk makes no
# ordering guarantee.
test_files_bench = sorted(
    os.path.join(root, file_name)
    for root, _dirs, files in os.walk(dataset_type_path)
    for file_name in fnmatch.filter(files, '*.csv')
)

test_files_bench

[]

In [None]:
# Report the correlation metrics of every collected benchmark file, each
# preceded by its path and followed by a blank separator line.
for bench_csv in test_files_bench:
  print(bench_csv)
  get_absolute_metrics(bench_csv, remove_invalid=False)
  print()

In [None]:
# Collect every HHH result CSV found anywhere below the results folder.
# NOTE(review): this root lacks the 'Sem3/ANLP/HW3/' segment used by the other
# result paths in this notebook — confirm it is the intended folder.
dataset_type_path = '/content/drive/MyDrive/ANLP_HW3_Results/HHH'
if not os.path.isdir(dataset_type_path):
    # os.walk silently yields nothing for a missing directory, which would
    # otherwise show up only as an unexplained empty list.
    print(f'WARNING: {dataset_type_path} is not a directory (is Drive mounted?)')

# Sorted so the evaluation order is the same on every run; os.walk makes no
# ordering guarantee.
test_files_hhh = sorted(
    os.path.join(root, file_name)
    for root, _dirs, files in os.walk(dataset_type_path)
    for file_name in fnmatch.filter(files, '*.csv')
)

test_files_hhh

In [None]:
# Print the pairwise accuracy of each collected HHH file, one value per line,
# with invalid (non-positive) scores filtered out.
for hhh_csv in test_files_hhh:
  accuracy = get_relative_metrics(hhh_csv, remove_invalid=True)
  print(accuracy)

In [None]:
# Path of the raw text log that is parsed into `responses` further down
# (presumably captured output of a 13B model run, judging by the name — confirm).
term_txt = '/content/13b_op'

In [None]:
# Accumulator for the text chunks extracted from the log at `term_txt`.
responses = []

In [None]:
# Split the raw log into one text chunk per generation. A chunk begins at a
# line containing '<s>' (seen while no span is open) and runs up to the next
# such line, so it also carries any lines that follow its closing '</s>'.
responses = []  # rebuilt from scratch so re-running this cell cannot append duplicates

with open(term_txt, 'r') as file:
    lines = file.readlines()

curr_text = ''
flag = False  # True while inside an open '<s>' ... '</s>' span
for line in lines:
    # readlines() keeps the trailing newline, so the line must be stripped
    # before the digit test; 'NNN\n'.isdigit() is always False.
    if not flag and line.strip().isdigit():
        continue
    if '<s>' in line and not flag:
        # Close the previous chunk; skip the empty one that exists before the
        # first '<s>' when the log starts directly with a generation.
        if curr_text:
            responses.append(curr_text)
        curr_text = ''
        flag = True
    # Checked independently of the branch above so a line holding both the
    # opening and the closing token does not leave the span open forever.
    if '</s>' in line:
        flag = False

    if 'An exception occurred' not in line:
        curr_text += line

# The loop only emits a chunk when the next one starts, so the final chunk
# has to be flushed explicitly.
if curr_text:
    responses.append(curr_text)


In [None]:
# Number of chunks collected from the log.
len(responses)

978

In [None]:
# Keep the last collected chunk as a sample for manual inspection.
samp = responses[-1]

In [None]:
# Show the raw sample text.
samp

'<s> ###Task Description:\nAn instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.\n1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.\n2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.\n3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"\n4. Please do not generate any other opening, closing, and explanations.\n\n###The instruction to evaluate:\nIn a global conference call, representatives from Japan, France, and Brazil are discussing a multilateral trade agreement. While they each speak in their native languages, the need is to translate and communicate their points in real-time, ensuring that the substance and finer points of e

In [None]:
import re

# Extract the verdict from an evaluator output of the form
# "... [RESULT] <score>" with a score from 1 to 5, optionally parenthesised.
# The brackets are escaped on purpose: an unescaped "[result]" is a character
# class matching single letters, and an empty pattern matches everywhere.
# Group 1 holds the score digit; the lookahead rejects numbers such as "15".
pattern = re.compile(r'\[RESULT\]\s*\(?\s*([1-5])(?!\d)', re.IGNORECASE)

In [None]:
# Probe the tail of every chunk: print the first match of `pattern` within
# its last 50 characters, or None when there is no match.
for chunk in responses:
  tail = chunk[-50:]
  print(re.search(pattern, tail))


None
<re.Match object; span=(4, 5), match='u'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(1, 2), match='r'>
<re.Match object; span=(0, 1), match='E'>
<re.Match object; span=(3, 4), match='t'>
<re.Match object; span=(5, 6), match='s'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(0, 1), match='u'>
<re.Match object; span=(0, 1), match='s'>
<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(2, 3), match='s'>
<re.Match object; span=(0, 1), match='t'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(0, 1), match='t'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(8, 9), match='e'>
<re.Match object; span=(0, 1), match='s'>
<re.Match object; span=(0, 1), match='s'>
<re.Match object; span=(2, 3), match='e'>
<re.Match object; span=(1, 2), match='u'>
<re.Match object; span=(0, 1), match='e'>
<re.Match object; span=(4, 5)

In [None]:
# All matches of `pattern` within the first 50 characters of the sample.
# NOTE(review): the loop above inspects the last 50 characters (t[-50:]) while
# this inspects the first 50 — confirm which end is intended.
re.findall(pattern, samp[:50])

[]

In [None]:
# Case-insensitive lookup: the marker occurs upper-cased as "[RESULT]" in the
# sample text, which a plain find('result') cannot see and reports as -1.
samp.lower().find('result')

-1

In [None]:
# Read the whole raw log in one go. The text is not bound to a name, so this
# cell only exercises opening and reading the file.
with open('13b_op', mode='r') as log_file:
  log_file.read()