In [280]:
import pickle
import numpy as np
import time
import faiss
import json
import random
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [227]:
# Load the fact corpus, one fact per line.
# Forward slashes are accepted on both Windows and POSIX and avoid the invalid
# "\M" / "\o" escape sequences the backslash form relied on.
facts_path = "Data/Main/openbook.txt"
with open(facts_path, encoding="utf-8") as f:
    # Remove the line terminator first, then the first and last remaining
    # characters (presumably the quotes wrapping each fact). Doing it in two
    # steps keeps a final line without a trailing newline from losing its
    # last real character.
    facts = [line.rstrip("\n")[1:-1] for line in f]
facts[:5]

['A bee is a pollinating animal',
 'A bird is a pollinating animal',
 'An electrical conductor is a vehicle for the flow of electricity',
 'An example of a change in the Earth is an ocean becoming a wooded area',
 'An example of a chemical change is acid breaking down substances']

In [250]:
# Embed the whole fact corpus with one sentence-transformer model and store the
# vectors in an exact inner-product FAISS index (IndexFlatIP).
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

embeddings = model.encode(facts)
embedding_dim = embeddings.shape[1]  # index dimensionality = embedding width
index_flatip = faiss.IndexFlatIP(embedding_dim)
index_flatip.add(embeddings)

In [270]:
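# Optional: persist the FAISS index so it does not have to be rebuilt on every run.
# (faiss.write_index / faiss.read_index are the library's native alternative to pickle.)
#
# with open(f'index_{model_name}.pickle', 'wb') as f:
#     pickle.dump(index_flatip, f, protocol=pickle.HIGHEST_PROTOCOL)
#
# with open(f'index_{model_name}.pickle', 'rb') as f:
#     eee = pickle.load(f)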

In [233]:
# Sanity check: retrieve the k facts ranked highest for a free-text question.
question = "can you burn yourself with food"
k = 10

query_embedding = model.encode([question])
D, I = index_flatip.search(query_embedding, k)
# Only one query was submitted, so row 0 holds its ids and scores.
result_facts = [facts[fact_id] for fact_id in I[0]]
# Pair each retrieved fact with its score rounded to three decimals.
results = [(fact, score) for fact, score in zip(result_facts, np.round(D[0], 3))]
result_facts

['burning a living thing usually causes harm to that living thing',
 'fire causes burning',
 'cooking food requires adding heat energy',
 'if a body part was burned then that body part was exposed to a lot of heat energy',
 'cooking food to proper temperatures protects against food poisoning by killing bacteria and viruses',
 'if too much heat is transferred to an object then that object may burn',
 'if food is cooked then heat energy is added to that food',
 'burning wood is used to produce heat',
 'fire causes harm to living things',
 'cooking causes a chemical reaction']

In [276]:
def get_questions_answers(questions_path):
    """Read a JSON-lines question file into three parallel lists.

    Each non-blank line must be a JSON object providing
    ``question.stem``, ``question.choices`` (a list of objects with
    ``label`` and ``text``), ``answerKey`` and ``fact1``.

    Args:
        questions_path: Path of the ``.jsonl`` file to read.

    Returns:
        A tuple ``(questions, answers, true_facts)`` of equally long lists:
        the question stems, the text of the choice whose label equals the
        first character of ``answerKey``, and the ``fact1`` strings.

    Raises:
        IndexError: If no choice carries the label named by ``answerKey``.
        KeyError: If a record lacks one of the required fields.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions, answers, true_facts = [], [], []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(questions_path, encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing inside json.loads.
                continue
            qset = json.loads(line)
            # Only the first character of the key is compared with the labels.
            answer_label = qset["answerKey"][0]
            matching_texts = [
                choice["text"]
                for choice in qset["question"]["choices"]
                if choice["label"] == answer_label
            ]
            if not matching_texts:
                # Same exception type as an empty-list lookup, with context.
                raise IndexError(
                    f"line {line_number}: no choice labelled {answer_label!r}"
                )
            questions.append(qset["question"]["stem"])
            answers.append(matching_texts[0])
            true_facts.append(qset["fact1"])
    return questions, answers, true_facts

In [285]:
# --- Evaluation configuration -------------------------------------------------
# Forward slashes work on Windows and POSIX alike; the backslash form only
# resolved on Windows.
questions_path = "Data/Additional/train_complete.jsonl"
tfidf_vect = TfidfVectorizer()
k = 5                    # number of facts retrieved per question
k_cumulative_metric = 5  # how many top-ranked facts are summed into the TF-IDF score

model_stats = []
model_names = [
    "all-mpnet-base-v2",
    "multi-qa-mpnet-base-dot-v1",
    "all-distilroberta-v1",
    "all-MiniLM-L12-v2",
    "multi-qa-distilbert-cos-v1",
    "all-MiniLM-L6-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "paraphrase-multilingual-mpnet-base-v2",
    "paraphrase-albert-small-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-MiniLM-L3-v2",
    "distiluse-base-multilingual-cased-v1",
    "distiluse-base-multilingual-cased-v2",
]

questions, answers, true_facts = get_questions_answers(questions_path)
n_questions = len(questions)
# Every question is evaluated by default. To evaluate a random subset instead,
# pick a sample size and use e.g.:
# question_indexes = random.sample(range(n_questions), sample_size)
question_indexes = range(n_questions)

# Benchmark each sentence-transformer model as a fact retriever.
# For every model two per-question metrics are recorded:
#   * "cosine_scores": sum, over the top `k_cumulative_metric` retrieved facts,
#     of the TF-IDF cosine similarity between the correct answer and the fact;
#   * "fact_hits": 1 if the question's gold fact is among the retrieved facts.

# Questions actually evaluated; honours `question_indexes`, so a sampled
# subset works as well as the full range.
eval_questions = [questions[q_idx] for q_idx in question_indexes]

for model_name in model_names:

    model = SentenceTransformer(model_name)

    # Index all facts for this model. IndexFlatIP performs exact search and
    # ranks by raw inner product.
    # NOTE(review): inner product equals cosine similarity only for
    # unit-length embeddings; for models intended for cosine similarity,
    # confirm they normalise their output or pass normalize_embeddings=True
    # to encode(). Left unchanged here so reported numbers stay comparable.
    embeddings = model.encode(facts)
    faissindex = faiss.IndexFlatIP(embeddings.shape[1])
    faissindex.add(embeddings)

    # Encode and search all questions in one batch instead of issuing one
    # encode()/search() call per question.
    if eval_questions:
        question_embeddings = model.encode(eval_questions)
        _, neighbour_ids = faissindex.search(question_embeddings, k)
    else:
        neighbour_ids = []

    model_scores = []
    model_hits = []
    for q_idx, fact_ids in zip(question_indexes, neighbour_ids):
        answer, true_fact = answers[q_idx], true_facts[q_idx]

        # FAISS pads with -1 when fewer than k neighbours exist; skip those
        # so they are not misread as "last fact" by negative indexing.
        result_facts = [facts[fact_id] for fact_id in fact_ids if fact_id >= 0]

        score = 0
        for result_fact in result_facts[:k_cumulative_metric]:
            # TF-IDF is fitted on just this (answer, fact) pair; entry [0][1]
            # is the similarity between the answer and the fact.
            trsfm = tfidf_vect.fit_transform([answer, result_fact])
            score += cosine_similarity(trsfm[0:1], trsfm)[0][1]
        model_scores.append(score)

        model_hits.append(int(true_fact in result_facts))

    model_stats.append({
        "model_name": model_name,
        "cosine_scores": model_scores,
        "fact_hits": model_hits
    })

In [288]:
# Per-model summary line: mean cumulative TF-IDF score, gold-fact hit rate, model name.
for stat in model_stats:
    mean_score = round(np.mean(stat["cosine_scores"]), 4)
    # Average over the questions actually evaluated for this model rather than
    # dividing by the global `n_questions`, which is wrong whenever only a
    # subset of the questions was scored.
    hit_rate = np.mean(stat["fact_hits"])
    print(mean_score, hit_rate, stat["model_name"])

0.0931 0.6637078878353843 all-mpnet-base-v2
0.0877 0.6544280815009078 multi-qa-mpnet-base-dot-v1
0.0915 0.6280008069396813 all-distilroberta-v1
0.0894 0.6324389751866047 all-MiniLM-L12-v2
0.0882 0.635061529150696 multi-qa-distilbert-cos-v1
0.0882 0.6263869275771636 all-MiniLM-L6-v2
0.0847 0.6082307847488401 multi-qa-MiniLM-L6-cos-v1
0.0846 0.609642929191043 paraphrase-multilingual-mpnet-base-v2
0.0715 0.5386322372402663 paraphrase-albert-small-v2
0.0762 0.5499293927778899 paraphrase-multilingual-MiniLM-L12-v2
0.0624 0.5259229372604398 paraphrase-MiniLM-L3-v2
0.0821 0.529150695985475 distiluse-base-multilingual-cased-v1
0.0818 0.5396409118418398 distiluse-base-multilingual-cased-v2
