In [17]:
from nltk.translate.bleu_score import sentence_bleu
import pandas
import pickle
from tqdm import tqdm
import numpy as np

In [None]:
!pip install -U sentence-transformers
!pip install gensim

In [None]:
!pip install POT

In [None]:
import gensim.downloader as api
wmd_model = api.load('word2vec-google-news-300')



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def process_sentences(sentence_a, sentence_b):
  tokenized_sentence_a = word_tokenize(sentence_a)
  tokenized_sentence_b = word_tokenize(sentence_b)
  shakespearan_stopwords = ['thou', "thee", "thy", "thine", "ye", "art", "hath", "dost", "doth", "'tis", "anon", "ere", "hence", "thither", "whence",
                          "wherefore", "whither", "ay", "nay", "thus", "'twixt", "yon", "oft", "'twas", "yea", "ne'er", "o'er", "hither", "henceforth",
                          "'twill", "'twas", "'tis", "verily", "unto", "amongst", "among", "betwixt", "ere", "'gainst", "'mongst", "oftentimes", "oft",
                          "aught", "hath", "henceforth", "whene'er", "ne'er", "anon", "'twill", "yon", "thou'rt", "doth", "'tis", "'twas", "'tis",
                          "tis", "i'faith", "dost", "thyself", "thine","thy", "thyself", "thine", "whereon", "wherefore", "whereupon", "whereof",
                          "hereof", "thereof", "oftentimes"]

  stop_words = set(stopwords.words('english'))
  stop_words = set(shakespearan_stopwords + list(stop_words))

  filtered_sentence_a = [token for token in tokenized_sentence_a if token.lower() not in stop_words]
  filtered_sentence_b = [token for token in tokenized_sentence_b if token.lower() not in stop_words]
  return filtered_sentence_a, filtered_sentence_b

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sentence_transformers import SentenceTransformer
cos_sim_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import math

In [None]:
def calculate_metrics_cosine_wmd(result_df):
  cosine_similarities = []
  wmds = []
  for _, row in tqdm(result_df.iterrows()):
    model_response = row['model_response']
    expected_response = row['expected_response']

    # BLEU Similarity
    # output_bleu_scores.append(sentence_bleu([expected_response.split()], model_response.split()))

    # Consine Similarity
    model_encoding = cos_sim_model.encode(model_response)
    expected_encoding = cos_sim_model.encode(expected_response)
    consine_similarity = np.dot(model_encoding, expected_encoding)/(np.linalg.norm(model_encoding)*np.linalg.norm(expected_encoding))
    cosine_similarities.append(consine_similarity)

    # WMD
    a, b = process_sentences(model_response, expected_response)
    wm_distance = wmd_model.wmdistance(a, b)
    if math.isinf(wm_distance):
      continue
    wmds.append(wm_distance)

  wmds = np.array(wmds)
  cosine_similarities = np.array(cosine_similarities)
  return np.mean(wmds), np.mean(cosine_similarities)

In [15]:
import glob

result_paths = glob.glob('./*.pkl')
print(result_paths)

['./few_shot_finetune.pkl', './qlora_finetune_r_16.pkl', './zero_shot_finetune.pkl', './rosa_finetune_r_16.pkl', './lora_finetune_r_16.pkl', './lora_finetune_r_4.pkl', './rosa_finetune_r_4.pkl', './lora_finetune_r_8.pkl', './rosa_finetune_r_8.pkl']


In [None]:
BLEU_score = []
cosine_score = []
wmd_scores = []
for result_path in result_paths:
  print("For experiment:", result_path)
  with open(f"{result_path}", "rb") as input_file:
    result_df = pickle.load(input_file)
    wmd, cosine = calculate_metrics_cosine_wmd(result_df)
    wmd_scores.append(wmd)
    cosine_score.append(cosine)

In [None]:
for i in range(len(result_paths)):
  print(result_paths[i], "wmd:", wmd_scores[i], "cosine:", cosine_score[i])

./few_shot_finetune.pkl wmd: 1.262774525632353 cosine: 0.20810646
./qlora_finetune_r_16.pkl wmd: 1.0827318600195117 cosine: 0.46907562
./zero_shot_finetune.pkl wmd: 0.7939917794898832 cosine: 0.60975564
./rosa_finetune_r_16.pkl wmd: 1.0248985475759096 cosine: 0.41129872
./lora_finetune_r_16.pkl wmd: 0.9973072806513524 cosine: 0.42997453
./lora_finetune_r_4.pkl wmd: 0.9332363920099144 cosine: 0.53449005
./rosa_finetune_r_4.pkl wmd: 0.9329457580657197 cosine: 0.5342936
./lora_finetune_r_8.pkl wmd: 0.9967442512916036 cosine: 0.4303719
./rosa_finetune_r_8.pkl wmd: 1.0248985475759096 cosine: 0.41129872


In [None]:
!pip install bert-score

In [None]:
!pip install evaluate

## F1 Score

In [None]:
from evaluate import load
bertscore = load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
def calculate_metrics_f1(result_df):
  score = []
  for _, row in tqdm(result_df.iterrows()):
    model_response = row['model_response']
    expected_response = row['expected_response']
    temp_score = bertscore.compute(predictions=[model_response], references=[expected_response], lang="en")['f1']
    score.append(temp_score)
  score = np.array(score)
  return np.mean(score)

In [None]:
f1_scores = []
for result_path in result_paths:
  print("For experiment:", result_path)
  with open(f"{result_path}", "rb") as input_file:
    result_df = pickle.load(input_file)
    f1_score = calculate_metrics_f1(result_df)
    f1_scores.append(f1_score)

For experiment: ./few_shot_finetune.pkl


1462it [00:39, 36.74it/s]


For experiment: ./qlora_finetune_r_16.pkl


1462it [00:39, 36.84it/s]


For experiment: ./zero_shot_finetune.pkl


1462it [00:38, 38.37it/s]


For experiment: ./rosa_finetune_r_16.pkl


1462it [00:37, 38.54it/s]


For experiment: ./lora_finetune_r_16.pkl


1462it [00:38, 38.24it/s]


For experiment: ./lora_finetune_r_4.pkl


1462it [00:44, 32.80it/s]


For experiment: ./rosa_finetune_r_4.pkl


1462it [00:38, 38.19it/s]


For experiment: ./lora_finetune_r_8.pkl


1462it [00:39, 36.77it/s]


For experiment: ./rosa_finetune_r_8.pkl


1462it [00:37, 39.03it/s]


In [None]:
for i in range(len(result_paths)):
  print(result_paths[i], "f1:", f1_scores[i])

./few_shot_finetune.pkl f1: 0.822791891164884
./qlora_finetune_r_16.pkl f1: 0.8505896289925895
./zero_shot_finetune.pkl f1: 0.8802424233442455
./rosa_finetune_r_16.pkl f1: 0.8512163675108622
./lora_finetune_r_16.pkl f1: 0.8547105528261365
./lora_finetune_r_4.pkl f1: 0.8703951863763107
./rosa_finetune_r_4.pkl f1: 0.8696763656126327
./lora_finetune_r_8.pkl f1: 0.8548084934090459
./rosa_finetune_r_8.pkl f1: 0.8512163675108622


In [8]:
from transformers import pipeline
classifier = pipeline("text-classification", model="notaphoenix/shakespeare_classifier_model", top_k=None)

In [27]:
def calculate_metrics_class(result_df):
  score = []
  for _, row in tqdm(result_df.iterrows()):
    model_response = row['model_response']
    temp_score = classifier(model_response)[0]
    for ts in temp_score:
      if ts['label'] == 'shakespearean':
        temp_score = ts['score']
        break
    score.append(temp_score)
  score = np.array(score)
  return np.mean(score)

In [29]:
classifier_scores = []
for result_path in result_paths:
  print("For experiment:", result_path)
  with open(f"{result_path}", "rb") as input_file:
    result_df = pickle.load(input_file)
    cs = calculate_metrics_class(result_df)
    print(cs)
    classifier_scores.append(cs)

For experiment: ./few_shot_finetune.pkl


1462it [01:38, 14.91it/s]


0.049437270875607404
For experiment: ./qlora_finetune_r_16.pkl


1462it [01:37, 15.00it/s]


0.07267093394975982
For experiment: ./zero_shot_finetune.pkl


1462it [01:29, 16.43it/s]


0.12997777887687556
For experiment: ./rosa_finetune_r_16.pkl


1462it [01:09, 21.03it/s]


0.22396368143134707
For experiment: ./lora_finetune_r_16.pkl


1462it [01:11, 20.36it/s]


0.21912986780332067
For experiment: ./lora_finetune_r_4.pkl


1462it [01:32, 15.85it/s]


0.1811552699706256
For experiment: ./rosa_finetune_r_4.pkl


1462it [01:30, 16.13it/s]


0.2005547312395354
For experiment: ./lora_finetune_r_8.pkl


1462it [01:12, 20.26it/s]


0.21901368457446876
For experiment: ./rosa_finetune_r_8.pkl


1462it [01:09, 20.94it/s]

0.22396368143134707



