### Training data processing

#### 1. Load samples

In [None]:
# Fetch the NLTK resources this notebook relies on: the perceptron POS tagger
# (used through nltk.pos_tag) and the WordNet corpus (used by WordNetLemmatizer).
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Mount Google Drive at /gdrive; the data, model and output paths used by the
# code cells of this notebook all live under /gdrive/MyDrive/AI4EDU/Retriever.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Single shared lemmatizer instance, reused by lemmatize() for every word.
WNL = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the leading letter of ``tag`` matters: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Every other tag
    (including the empty string) falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # Unknown tag category: treat the word as a noun.
    return wordnet.NOUN

def lemmatize(word):
    """Return the WordNet lemma of a single word.

    The word is POS-tagged on its own (as a one-token sequence), the tag is
    mapped to a WordNet POS via get_wordnet_pos, and the shared lemmatizer
    ``WNL`` is applied with that POS.
    """
    tagged = pos_tag([word])
    tag = tagged[0][1]
    wn_pos = get_wordnet_pos(tag)
    return WNL.lemmatize(word, wn_pos)

In [None]:
import json

def load_json(file_path):
    """Read a JSON file and return the decoded object.

    Args:
        file_path: Path string of the file to read; its extension must be ``json``.

    Returns:
        Whatever ``json.load`` decodes from the file (dict, list, ...).

    Raises:
        AssertionError: If ``file_path`` does not end in ``.json``.
    """
    assert file_path.split('.')[-1] == 'json', f'expected a .json path, got {file_path!r}'
    # Decode as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding; this also matches save_json, which writes UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json(save_path, data):
    """Serialize ``data`` as JSON into ``save_path`` (UTF-8, overwriting the file).

    Args:
        save_path: Destination path string; its extension must be ``json``.
        data: Any object ``json.dump`` can serialize.

    Raises:
        AssertionError: If ``save_path`` does not end in ``.json``.
    """
    assert save_path.split('.')[-1] == 'json', f'expected a .json path, got {save_path!r}'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(save_path, 'w', encoding='utf-8') as file:
        json.dump(data, file)

In [None]:
# Parsed JSON files keyed by path. get_word_triple is called once per CSV row /
# section, so without this the (large) triples file would be re-read and
# re-parsed on every call. Re-run this cell to drop the cache if the files change.
_JSON_CACHE = {}


def _load_json_cached(file_path):
  """Return the parsed JSON at ``file_path``, reading it from disk only once."""
  if file_path not in _JSON_CACHE:
    _JSON_CACHE[file_path] = load_json(file_path)
  return _JSON_CACHE[file_path]


def get_word_triple(story, sec_id, max_triples_per_word=3,
                    data_root='/gdrive/MyDrive/AI4EDU/Retriever'):
  """Collect candidate knowledge triples for the words of one story section.

  For every non-stop word of the section (``word['stop'] == 0``), the word is
  lemmatized and, if the lemma has an entry in the triples file and was not
  seen earlier in the section, its first ``max_triples_per_word`` triples are
  formatted as ``'(head, relation, tail)'`` strings.

  Args:
    story: Story name; the section data is read from
      ``<data_root>/data/<story>.json``.
    sec_id: Section id (str or int). Section ``N`` is looked up under the
      key ``str(N - 1)`` of the story JSON.
    max_triples_per_word: Maximum number of triples kept per lemma.
    data_root: Directory holding ``data/`` and ``triples_train.json``.

  Returns:
    A tuple ``(triple_list, triple_list_str)``:
      * ``triple_list``: flat list of formatted triple strings, in word order.
      * ``triple_list_str``: one line per lemma (each terminated by a newline),
        where each triple on the line is followed by ``'; '``.
  """
  story_data = _load_json_cached(data_root + '/data/' + story + '.json')
  triples = _load_json_cached(data_root + '/triples_train.json')

  word_list = {}    # lemma -> "(h, r, t); (h, r, t); " line for that lemma
  triple_list = []
  for word in story_data[str(int(sec_id)-1)]['words']:
    if word['stop'] != 0:
      continue
    word_char = lemmatize(word['word'])
    # Skip lemmas already handled in this section and lemmas without triples.
    if word_char in word_list or word_char not in triples:
      continue
    formatted = [
        '(' + tp[0] + ', ' + tp[1] + ', ' + tp[2] + ')'
        for tp in triples[word_char]['triples'][:max_triples_per_word]
    ]
    word_list[word_char] = ''.join(tp_str + '; ' for tp_str in formatted)
    triple_list.extend(formatted)

  triple_list_str = ''.join(line + '\n' for line in word_list.values())
  return triple_list, triple_list_str

In [None]:
import csv
import json

# Build the retriever fine-tuning set from the training CSV.
# training_data[title][sec_id] = {"text", "triple", "pos", "neg"}
# each fine-tune record: {"query": str, "pos": List[str], "neg": List[str]}
training_data = {}
# Candidate triples per (title, sec_id). Kept outside training_data so the
# structure saved to JSON keeps its original keys.
section_candidates = {}
train_csv_path = '/gdrive/MyDrive/AI4EDU/Retriever/train_split.csv'

# newline='' is what the csv module asks for, so quoted fields that contain
# line breaks are parsed correctly.
with open(train_csv_path, newline='') as file_obj:
    reader_obj = csv.reader(file_obj)
    heading = next(reader_obj, None)  # skip the header row
    for row in reader_obj:
        title, sec_id, text = row[1], row[2], row[3]
        sections = training_data.setdefault(title, {})
        if sec_id not in sections:
            # Candidate lookup is done once per section, not once per CSV row.
            triple_list, triple_list_str = get_word_triple(title, sec_id)
            sections[sec_id] = {
                "text": text,
                "triple": triple_list_str,
                "pos": [],
                "neg": []
            }
            section_candidates[(title, sec_id)] = triple_list
        pos_triple = '(' + row[5] + ', ' + row[6] + ', ' + row[7] + ')'
        if pos_triple not in sections[sec_id]['pos']:
            sections[sec_id]['pos'].append(pos_triple)

# Negatives are the section's candidate triples that are NOT among its
# positives. They are computed after all rows are read: computing them per row
# would put the section's other positives into "neg" (a positive used as a
# negative for the same query) and would repeat every negative once per positive.
for (title, sec_id), triple_list in section_candidates.items():
    section = training_data[title][sec_id]
    positives = set(section['pos'])
    section['neg'] = [tp for tp in dict.fromkeys(triple_list) if tp not in positives]

# The query is the story text only. To append the candidate triples to the
# query, use val_sec['text'] + '\n\n' + val_sec['triple'] instead.
retriever_fine_tune_data = [
    {"query": val_sec['text'], "pos": val_sec['pos'], "neg": val_sec['neg']}
    for val in training_data.values()
    for val_sec in val.values()
]

KeyboardInterrupt: 

In [None]:
# Persist both views of the training set: the flat list of {"query", "pos", "neg"}
# records used for fine-tuning, and the full nested dict keyed by title / section id.
save_json('/gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg.json', retriever_fine_tune_data)
save_json('/gdrive/MyDrive/AI4EDU/Retriever/all_training_data_story_only_full_neg.json', training_data)

In [None]:
# Convert the saved fine-tuning list into JSON Lines: each item of the list is
# written as one JSON object per line (the input format of the hn_mine and
# fine-tuning commands further down).
retriever_fine_tune_data = load_json('/gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg.json')

with open('/gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg.jsonl', 'w') as file:
    file.writelines(json.dumps(item) + '\n' for item in retriever_fine_tune_data)

In [None]:
# Evaluation dependencies: `evaluate` + `rouge_score` for ROUGE, and
# sentence-transformers for the sentence-similarity model.
# %pip (instead of !pip) installs into the environment of the running kernel
# and matches the other install cells of this notebook.
%pip install evaluate
%pip install rouge_score
%pip install -U sentence-transformers

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [

In [None]:
from evaluate import load
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by the evaluation cells to score the cosine
# similarity between a predicted triple and each gold triple.
sent_sim_model = SentenceTransformer('all-MiniLM-L6-v2')

# ROUGE metric object; the evaluation cells read its 'rougeL' score.
rouge = load('rouge')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### 1. BM25

In [None]:
# Dependencies for the BM25 baseline: BM25Retriever is imported from
# langchain_community below; rank_bm25 is installed alongside it.
%pip install langchain
%pip install --upgrade --quiet  rank_bm25
%pip install langchain_community

Collecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.2-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.5/309.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.65-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.3/124.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.0->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting packaging<24.0,>=23.2 (from langchai

Test data

In [None]:
import csv
# TEST_DATA holds one entry per CSV row:
#   [row[0], row[1], row[2], eachQA]
# (the BM25 cell reads these as title, section id and story text), where eachQA
# is a list of [head, '(head, relation, tail)', field, field] groups.
TEST_DATA = []

test_csv_path = '/gdrive/MyDrive/AI4EDU/Retriever/test_OneSecPerLine.csv'

# Column index where each group starts. A group spans five columns:
# head, relation, tail and two trailing fields (presumably question and answer
# — confirm against the CSV header).
QA_GROUP_STARTS = (3, 8, 13)


def _parse_qa_group(row, start):
    """Return [head, '(head, relation, tail)', field, field] for the group at ``start``."""
    head, relation, tail = row[start], row[start + 1], row[start + 2]
    return [head, '(' + head + ', ' + relation + ', ' + tail + ')', row[start + 3], row[start + 4]]


# newline='' is what the csv module asks for, so quoted fields that contain
# line breaks are parsed correctly.
with open(test_csv_path, newline='') as file_obj:
    reader_obj = csv.reader(file_obj)
    heading = next(reader_obj, None)  # skip the header row
    for row in reader_obj:
        # The first group is always kept; the 2nd and 3rd only when both of
        # their trailing fields are non-empty.
        eachQA = [
            _parse_qa_group(row, start)
            for start in QA_GROUP_STARTS
            if start == QA_GROUP_STARTS[0] or (row[start + 3] != '' and row[start + 4] != '')
        ]
        TEST_DATA.append([row[0], row[1], row[2], eachQA])

print(len(TEST_DATA))
if TEST_DATA:
    print(TEST_DATA[0])

FileNotFoundError: [Errno 2] No such file or directory: '/gdrive/MyDrive/AI4EDU/Retriever/test_OneSecPerLine.csv'

In [None]:
# NOTE(review): this mounts Drive at /content/drive, whereas the active file
# paths in this notebook are all under /gdrive (mounted in an earlier cell) —
# confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from langchain_community.retrievers import BM25Retriever

# BM25 baseline: for every test section, rank the section's candidate triples
# against the story text with BM25, log the top triple, and score it against
# each gold triple. Per section, the best ROUGE-L and the best cosine
# similarity over the gold triples are kept (never below 0); the printed
# numbers are the averages over all test sections.
all_rouge = all_sentsim = 0

with open('/gdrive/MyDrive/AI4EDU/Retriever/BM25_story_text_finetune.txt', 'w') as txt_file:
  for title, sec_id, text, gold_groups in TEST_DATA:
    triple_list, triple_list_str = get_word_triple(title, sec_id)

    # Top-ranked candidate triple for this story text.
    top_doc = BM25Retriever.from_texts(triple_list).invoke(text)[0]
    pred_triple = top_doc.page_content
    txt_file.write(pred_triple + '\n')

    pred_embedding = sent_sim_model.encode(pred_triple, convert_to_tensor=True)

    # Seed both lists with 0 so the per-section best score has a floor of 0.
    rouge_scores = [0]
    sentsim_scores = [0]
    for gold in gold_groups:
      gt_triple = gold[1]
      gt_embedding = sent_sim_model.encode(gt_triple, convert_to_tensor=True)
      rouge_scores.append(
          rouge.compute(predictions=[pred_triple], references=[gt_triple])['rougeL'])
      sentsim_scores.append(util.cos_sim(pred_embedding, gt_embedding)[0][0].item())

    all_rouge += max(rouge_scores)
    all_sentsim += max(sentsim_scores)

print("rouge: ", all_rouge / len(TEST_DATA))
print("sentsim: ", all_sentsim / len(TEST_DATA))

rouge:  0.4207627988449913
sentsim:  0.3384097928666089


### BGE

In [None]:
# FlagEmbedding provides FlagModel and the hn_mine / finetune / eval scripts used below.
%pip install -U FlagEmbedding

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.2.10.tar.gz (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.3/141.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate>=0.20.1 (from FlagEmbedding)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: FlagEmbedding
  Building wheel for FlagEmbedding (setup.py) ... [?25l[?25hdone
  Created wheel for FlagEmbedding: filename=FlagEmbedding-1.2.10-py3-none-any.whl size=166100 sha256=433e03ea71b5d92ff92c7196d95ce36692b18ac632e1271b44d5666eb8cc15f1
  Stored in directory: /root/.cache/pip/wheels/3b/1d/d2/eec38cd59144f4c9767d7c55cfae8e8feec699071aa41ca5da
Successfully built FlagEmbedding
Installing collected packages: accelerate, FlagEmbedding
Successfully installed FlagE

Hard negative mining

In [None]:
# faiss-gpu: installed before running hn_mine, which is invoked with --use_gpu_for_searching.
%pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
# Mine hard negatives for the fine-tuning set with FlagEmbedding's hn_mine script.
# Input is the JSONL written earlier; the *_minedHN.jsonl output is the file the
# fine-tuning command below trains on.
# NOTE(review): negatives are mined with BAAI/bge-base-en-v1.5, while the
# fine-tuning command starts from BAAI/bge-large-en-v1.5 — confirm this is intended.
!python -m FlagEmbedding.baai_general_embedding.finetune.hn_mine \
--model_name_or_path BAAI/bge-base-en-v1.5 \
--input_file /gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg.jsonl \
--output_file /gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg_minedHN.jsonl \
--range_for_sampling 2-200 \
--negative_number 15 \
--use_gpu_for_searching

2024-05-29 15:04:45.513395: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 15:04:45.513448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 15:04:45.515157: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
tokenizer_config.json: 100% 366/366 [00:00<00:00, 2.26MB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 3.55MB/s]
tokenizer.json: 100% 711k/711k [00:00<00:00, 3.17MB/s]
special_tokens_map.json: 100% 125/125 [00:00<00:00, 1.05MB/s]
config.json: 100% 777/777 [00:00<00:00, 5.62MB/s]
model.safetensors: 100% 438M/438M [00:01<00:00, 276MB/s]
inferencing embedding for corpu

In [None]:
# Show the GPU attached to this runtime before training.
!nvidia-smi

Mon May 27 15:34:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Fine-tune BAAI/bge-large-en-v1.5 on the mined hard-negative JSONL with
# FlagEmbedding's finetune runner (one process via torchrun); checkpoints are
# written to --output_dir on Drive.
# `--normlized` (sic) is the spelling FlagEmbedding uses for this flag — do not "fix" it.
# --query_instruction_for_retrieval "" passes an empty instruction for queries.
# NOTE(review): --train_group_size 2 presumably means one positive plus one
# negative per query, although 15 negatives were mined per query — confirm.
!torchrun --nproc_per_node 1 \
-m FlagEmbedding.baai_general_embedding.finetune.run \
--output_dir /gdrive/MyDrive/AI4EDU/Retriever/models/bge-large-en-v1.5-story_only_3 \
--model_name_or_path BAAI/bge-large-en-v1.5 \
--train_data /gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_story_only_full_neg_minedHN.jsonl \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 5 \
--per_device_train_batch_size 50 \
--dataloader_drop_last True \
--normlized True \
--temperature 0.02 \
--query_max_len 64 \
--passage_max_len 256 \
--train_group_size 2 \
--negatives_cross_device \
--logging_steps 10 \
--save_steps 1000 \
--query_instruction_for_retrieval ""

2024-05-30 14:21:41.134234: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 14:21:41.134284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 14:21:41.139694: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
05/30/2024 14:21:47 - INFO - __main__ -   Training/evaluation parameters RetrieverTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,

#### Construct Eval Data

1. Corpus Data

In [None]:
# Build the retrieval corpus for evaluation: one {"content": <triple>} JSON line
# per record of the fine-tuning JSONL, using only the FIRST positive triple of
# each record.
# NOTE(review): the input is retriever_fine_tune_data_full_neg.jsonl, not the
# *_story_only_full_neg.jsonl written earlier in this notebook — confirm which
# file is intended.
fine_tune_jsonl_path = '/gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_data_full_neg.jsonl'
corpus_jsonl_path = '/gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_corpus.jsonl'

with open(fine_tune_jsonl_path, 'r') as r_file, open(corpus_jsonl_path, 'w') as w_file:
  for line in r_file:
    record = json.loads(line)
    first_positive = record["pos"][0]
    w_file.write(json.dumps({"content": first_positive}) + '\n')


{'content': '(fish, is at location of, water)'}
{'content': '(tribe, is a, social_group)'}
{'content': '(wind, is made of, air)'}
{'content': '(dive, has subevent, hold_breath)'}
{'content': '(prank, is a, joke)'}
{'content': '(hoist, is used for, lift_things)'}
{'content': '(dressmaker, is capable of, cut_cloth)'}
{'content': '(avenue, is a, street)'}
{'content': '(whisper, is a, speaking)'}
{'content': '(prince, is a, son_of_king)'}
{'content': '(wood, is used for, fence_in_property)'}
{'content': '(cloud, is used for, rain)'}
{'content': '(bed, is used for, lie_down)'}
{'content': '(ill, causes, go_to_doctor)'}
{'content': '(affection, is a, good_will)'}
{'content': '(flatten, has subevent, change_shape)'}
{'content': '(bounce, causes, jump_up_and_down)'}
{'content': '(scrape, has subevent, roll_up)'}
{'content': '(firkin, is a, british_capacity_unit)'}
{'content': '(whistle, is a, high_pitched_sound)'}
{'content': '(passion, is a, emotion)'}
{'content': '(succeed, has subevent, wor

In [None]:
import csv
import json

# Build the evaluation queries from the validation CSV.
# eval_data[title][sec_id] = {"text", "triple", "pos"}
# each eval record: {"query": str, "positive": List[str]}
eval_data = {}
val_csv_path = '/gdrive/MyDrive/AI4EDU/Retriever/val_OneSecPerLine.csv'

# Column index where each gold-triple group starts. A group spans five columns:
# head, relation, tail and two trailing fields.
TRIPLE_GROUP_STARTS = (3, 8, 13)

# newline='' is what the csv module asks for, so quoted fields that contain
# line breaks are parsed correctly.
with open(val_csv_path, newline='') as file_obj:
    reader_obj = csv.reader(file_obj)
    heading = next(reader_obj, None)  # skip the header row
    for row in reader_obj:
        title, sec_id, text = row[0], row[1], row[2]
        sections = eval_data.setdefault(title, {})
        if sec_id not in sections:
            # Candidate triples are only needed the first time a section is seen.
            triple_list, triple_list_str = get_word_triple(title, sec_id)
            sections[sec_id] = {
                "text": text,
                "triple": triple_list_str,
                "pos": []
            }
        # The first triple is always a positive; the 2nd and 3rd only when both
        # of their trailing fields are non-empty.
        pos_triple = [
            '(' + row[start] + ', ' + row[start + 1] + ', ' + row[start + 2] + ')'
            for start in TRIPLE_GROUP_STARTS
            if start == TRIPLE_GROUP_STARTS[0] or (row[start + 3] != '' and row[start + 4] != '')
        ]
        sections[sec_id]['pos'] += pos_triple

# The query is the story text followed by a blank line and the section's
# candidate-triple listing.
retriever_eval_data = [
    {"query": val_sec['text'] + '\n\n' + val_sec['triple'], "positive": val_sec['pos']}
    for val in eval_data.values()
    for val_sec in val.values()
]

In [None]:
# Dump the evaluation queries as JSON Lines, one record per line; this file is
# passed as --query_data to the eval_msmarco command below.
with open('/gdrive/MyDrive/AI4EDU/Retriever/retriever_eval_data.jsonl', 'w') as file:
    file.writelines(json.dumps(item) + '\n' for item in retriever_eval_data)

In [None]:
# Evaluate an encoder with FlagEmbedding's eval_msmarco script: the queries in
# retriever_eval_data.jsonl are searched against retriever_fine_tune_corpus.jsonl.
# NOTE(review): --encoder points at models/bge-large-en-v1.5-story_text_only,
# which is not the --output_dir of the fine-tuning command above
# (models/bge-large-en-v1.5-story_only_3) — confirm the intended checkpoint.
!python -m FlagEmbedding.baai_general_embedding.finetune.eval_msmarco \
--encoder /gdrive/MyDrive/AI4EDU/Retriever/models/bge-large-en-v1.5-story_text_only \
--fp16 \
--add_instruction 0 \
--k 100 \
--corpus_data /gdrive/MyDrive/AI4EDU/Retriever/retriever_fine_tune_corpus.jsonl \
--query_data /gdrive/MyDrive/AI4EDU/Retriever/retriever_eval_data.jsonl

2024-05-29 15:48:40.104235: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 15:48:40.104299: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 15:48:40.105604: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Generating train split: 380 examples [00:00, 966.82 examples/s]
Generating train split: 3350 examples [00:00, 612213.78 examples/s]
Inference Embeddings: 100% 14/14 [00:03<00:00,  4.60it/s]
Inference Embeddings: 100% 2/2 [00:01<00:00,  1.24it/s]
Searching: 100% 2/2 [00:00<00:00, 212.87it/s]
{'MRR@1': 0.05263157894736842, 'MRR@10': 0.07026106934001669, 'MRR@100': 

Inference

In [None]:
from evaluate import load
from sentence_transformers import SentenceTransformer, util
from FlagEmbedding import FlagModel

# Scoring helpers for the inference cell: sentence-similarity model and ROUGE metric.
sent_sim_model = SentenceTransformer('all-MiniLM-L6-v2')

rouge = load('rouge')

# Uncomment to evaluate the fine-tuned checkpoint instead of the stock
# BAAI/bge-large-en-v1.5 model that the next cell loads into `model`.
# model = FlagModel('/gdrive/MyDrive/AI4EDU/Retriever/models/bge-large-en-v1.5-story_only_3',
#                   query_instruction_for_retrieval="",
#                   use_fp16=True)



In [None]:
from evaluate import load
from sentence_transformers import SentenceTransformer, util
from FlagEmbedding import FlagModel

# Scoring helpers for the inference cell: sentence-similarity model and ROUGE metric.
sent_sim_model = SentenceTransformer('all-MiniLM-L6-v2')

rouge = load('rouge')

# Baseline retriever: the stock BAAI/bge-large-en-v1.5 weights from the Hub
# (no fine-tuned checkpoint), with an empty query instruction.
model = FlagModel('BAAI/bge-large-en-v1.5',
                  query_instruction_for_retrieval="",
                  use_fp16=True)



tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
import csv
import json
import numpy as np

# Evaluate the encoder currently bound to `model` as a retriever on the test CSV.
# For each row: embed the story text and the section's candidate triples, pick the
# candidate with the highest dot-product similarity, log it, and score it against
# the row's gold triples with ROUGE-L and sentence-embedding cosine similarity.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = '/gdrive/MyDrive/AI4EDU/Retriever/test_OneSecPerLine.csv'

# Running sums of the per-row best scores; averaged over `length` at the end.
all_rouge = all_sentsim = 0

with open('/gdrive/MyDrive/AI4EDU/Retriever/BGE_not-fine-tune_.txt', 'w') as txt_file:
  with open(dir) as file_obj:
      # Consume the header line before handing the file to the CSV reader.
      heading = next(file_obj)
      reader_obj = csv.reader(file_obj)
      # Number of data rows processed (denominator of the averages below).
      length = 0
      for row in reader_obj:
        length += 1
        title = row[0]
        sec_id = row[1]
        text = row[2]
        triple_list, triple_list_str = get_word_triple(title, sec_id)

        # Best scores over this row's gold triples; they never drop below 0.
        each_rouge = each_sentsim = 0

        # Gold triples: columns 3-5 always; columns 8-10 and 13-15 only when the
        # two columns that follow them (11-12, 16-17) are both non-empty.
        pos_triple = ['(' + row[3] + ', ' + row[4] + ', ' + row[5] + ')']
        if row[11] != '' and row[12] != '':
          pos_triple.append('(' + row[8] + ', ' + row[9]  + ', ' + row[10] + ')')
        if row[16] != '' and row[17] != '':
          pos_triple.append('(' + row[13] + ', ' + row[14]  + ', ' + row[15] + ')')

        # One query (the story text) against all candidate triples.
        # assumes encode() returns 2-D arrays, so `similarity` is 1 x len(triple_list)
        # and the flattened argmax is an index into triple_list — TODO confirm.
        # NOTE(review): an empty triple_list presumably fails here — confirm it cannot occur.
        embeddings_1 = model.encode([text])
        embeddings_2 = model.encode(triple_list)
        similarity = embeddings_1 @ embeddings_2.T
        max_index = np.argmax(similarity)
        matched_triple = triple_list[max_index]
        txt_file.write(matched_triple + '\n')

        sent_embeddings1 = sent_sim_model.encode(matched_triple, convert_to_tensor=True)
        for tp in pos_triple:
          # Keep the highest ROUGE-L against any gold triple.
          new_rouge = rouge.compute(predictions=[matched_triple], references=[tp])['rougeL']
          if new_rouge > each_rouge:
            each_rouge = new_rouge

          # Keep the highest cosine similarity against any gold triple.
          sent_embeddings2 =sent_sim_model.encode(tp, convert_to_tensor=True)
          cosine_scores = util.cos_sim(sent_embeddings1, sent_embeddings2)[0][0].item()

          if cosine_scores > each_sentsim:
            each_sentsim = cosine_scores

        all_rouge += each_rouge
        all_sentsim += each_sentsim


# Averages over all test rows.
print("rouge: ", all_rouge / length)
print("sentsim: ", all_sentsim / length)

rouge:  0.29846194900989514
sentsim:  0.39525544620948294


In [None]:
from FlagEmbedding import FlagModel
# Quick sanity check of a fine-tuned checkpoint: embed one query sentence and two
# candidate triples and print the 1 x 2 matrix of dot-product similarities.
# NOTE: this rebinds `model`, replacing whichever model earlier cells loaded.
sentences_1 = ["butterfly is an animal"]
sentences_2 = ["(creature, is a, animal)", "(gay, is a, slang_term_for_homosexual)"]
model = FlagModel('/gdrive/MyDrive/AI4EDU/Retriever/models/bge-large-en-v1.5-1',
                  query_instruction_for_retrieval="retrieve the most educational-appropriate triple for this story text",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
embeddings_1 = model.encode(sentences_1)
embeddings_2 = model.encode(sentences_2)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.684  0.5977]]
