#### EV-0.0 Test sentence similarity models (optional)
An optional sandbox for trying out a couple of sentence-embedding models on a few example sentences.

In [None]:
# Install Transformers
# `clear_output` is imported here but first called in the next cell, after the model is loaded.
from IPython.display import clear_output
# The leading `!` runs pip in a shell; `-U` upgrades to the newest release (no version is pinned).
! pip install -U sentence-transformers

In [28]:
# Imported in this cell as well, so the demo also runs when the optional install cell above was skipped.
from IPython.display import clear_output
from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('nli-distilroberta-base-v2')

# Clear the output produced while the model was being loaded.
clear_output()

# Two lists of sentences; the sentence at position k of the first list is compared
# with the sentence at position k of the second list.
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The new movie is awesome',
              # Triples
              # 'Aarhus_Airport | cityServed | Aarhus,_Denmark',
              # 'Aarhus_Airport | location | Tirstrup',
              # "Textified" triples
              # 'Aarhus Airport city served Aarhus, Denmark',
              # 'Aarhus Airport location Tirstrup'
              ]

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great',
              # 'Aarhus Airport serves the city of Aarhus, Denmark',
              # 'Aarhus Airport is in Tirstrup',
              # 'Aarhus Airport serves the city of Aarhus, Denmark',
              # 'Aarhus Airport is in Tirstrup'
              ]

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities between every sentence of list 1 and every sentence of list 2
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Output the aligned pairs with their score (the diagonal of the similarity matrix).
# zip() stops at the shorter list, so uneven lists no longer raise an IndexError.
for idx, (sentence1, sentence2) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentence1, sentence2, cosine_scores[idx][idx]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2289
A man is playing guitar 		 A woman watches TV 		 Score: -0.0171
The new movie is awesome 		 The new movie is so great 		 Score: 0.9504


## **EV-1 Setup**

#### EV-1.1 Mount drive (optional)

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

#### EV-1.2 Download files

In [2]:
# ! pip install --upgrade --no-cache-dir gdown
from IPython.display import clear_output
# Download the lab archive with gdown (the argument is presumably a Google Drive file ID — confirm).
! gdown 1M2sxOCSUQJiLt6yC40fGI0QWSeq7jVod
# Unpack the archive; later cells expect the extracted files under /content/Lab_Week10/.
! unzip '/content/Lab_Week10.zip'
# Clear the download/unzip output from the cell.
clear_output()

#### EV-1.3 Install transformers

In [3]:
from IPython.display import clear_output
# Install or upgrade sentence-transformers (`-U` takes the newest release; no version is pinned).
!pip install -U sentence-transformers
# Clear the pip output from the cell.
clear_output()

#### EV-1.4 Load models, set paths (INPUT NEEDED: path to fine-tuned model)

In [4]:
from sentence_transformers import SentenceTransformer, util

# Directory holding the pickled evaluation inputs that are loaded in EV-1.5.
files_path = '/content/Lab_Week10/Eval/'
# Directories for the per-triple score files:
#   eval_scores_candidates - scores of each triple against the full candidate sentence list
#   eval_scores_target     - scores of each triple against its own reference sentences
eval_scores_candidates = '/content/Lab_Week10/Eval/results_candSent/'
eval_scores_target = '/content/Lab_Week10/Eval/results_targetSent/'

# INPUT NEEDED: uncomment the next two lines and point the path at your fine-tuned model.
# `fineTuned_model` is used by the scoring cell in EV-2.2; while these lines stay commented
# out the name is undefined and that cell raises a NameError.
# fineTunedTransformer_path = '/content/drive/MyDrive/Colab-dump/Lab_Week10/MyModel-nli-distilroberta-base-v2-2023-03-22_12-17-11'
# fineTuned_model = SentenceTransformer(fineTunedTransformer_path)
offTheShelf_model = SentenceTransformer('nli-distilroberta-base-v2')

# Clear the output produced while loading the model.
# NOTE: `clear_output` is not imported in this cell; it comes from an earlier cell (EV-1.2 / EV-1.3).
clear_output()

#### EV-1.5 Load files

In [5]:
import pickle
from tqdm import tqdm

# Despite the .txt extension, the three inputs are opened in binary mode and unpickled.
# NOTE(security): pickle.load can execute arbitrary code; only load files from the trusted lab archive.

# All candidate sentences (the check cell below expects 1834 of them).
with open(files_path + 'candidateSentences_dev_1-2.txt', 'rb') as f:
  allSentences = list(pickle.load(f))

# Reference sentences, one entry per triple (401 expected); judging by the printed
# example below, each entry is itself a list of sentences.
with open(files_path + 'targetSentences_dev_1.txt', 'rb') as f:
  referenceSentences = list(pickle.load(f))

# Textified triples (401 expected), index-aligned with referenceSentences.
with open(files_path + 'triples_dev_1_textified.txt', 'rb') as f:
  textifiedTriples = pickle.load(f)

In [6]:
# Sanity check: sizes are printed in the order candidate sentences, textified triples, reference sentences.
print('Expected: 1834, 401, 401')
print(len(allSentences), len(textifiedTriples), len(referenceSentences))
# Show one aligned example: the reference sentences and the textified triple stored at index 1.
print(referenceSentences[1], textifiedTriples[1])

Expected: 1834, 401, 401
1834 401 401
["Aarhus Airport's runway length is 2702.0.", 'The Aarhus Airport has a runway length of 2702.0.'] Aarhus Airport runway length 2702.0 .


## **EV-2 Scoring of candidate sentences (use GPU)**
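Run this section for the fine-tuned model only; the scores of the off-the-shelf model are already provided with the downloaded files.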

#### EV-2.1 Scoring Functions

In [2]:
from tqdm import tqdm

def evaluate(textified_triple, sentences_list, model):
    """Rank sentences by their similarity to one textified triple.

    The triple and all sentences are encoded in a single ``model.encode`` call;
    every sentence embedding is then compared with the triple embedding through
    ``util.pytorch_cos_sim``.

    Args:
        textified_triple: Text of the triple used as the query.
        sentences_list: Indexable sequence of sentences to score.
        model: Object exposing ``encode(texts, convert_to_tensor=True)``.

    Returns:
        A list of ``[sentence, similarity]`` pairs (similarity as a Python
        float), sorted from the highest to the lowest similarity.
    """
    # The triple goes first, so its embedding sits at position 0 of the batch.
    batch = [textified_triple, *sentences_list]
    vectors = model.encode(batch, convert_to_tensor=True)

    scored = [
        [sentences_list[offset],
         float(util.pytorch_cos_sim(vectors[0], vectors[offset + 1])[0][0])]
        for offset in range(len(vectors) - 1)
    ]

    # sorted() is stable, so equally scored sentences keep their input order.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

def evaluate_allTriples(textifiedTriples, sentences_list, savePath, model, ft=False):
    """Score every triple against one shared sentence list and pickle each ranking.

    For the triple at position ``k`` the ranking returned by ``evaluate`` is
    written to ``savePath + 'triple<k>_results1.txt'`` when ``ft`` is set, and to
    ``savePath + 'triple<k>_results2.txt'`` when ``ft`` equals ``False``.

    Args:
        textifiedTriples: Iterable of textified triples.
        sentences_list: Candidate sentences scored against every triple.
        savePath: Path prefix for the output files (concatenated as-is, so it
            needs a trailing separator).
        model: Encoder passed through to ``evaluate``.
        ft: Selects the file-name suffix as described above.
    """
    # The comparison is deliberately `== False` (not `not ft`) so the chosen
    # file name stays the same for every possible value of `ft`.
    suffix = '_results2.txt' if ft == False else '_results1.txt'

    for index, triple_text in enumerate(tqdm(textifiedTriples)):
        ranking = evaluate(triple_text, sentences_list, model)
        destination = savePath + 'triple' + str(index) + suffix
        with open(destination, 'wb') as out_file:
            pickle.dump(ranking, out_file)

def evaluate_allTriples2(textifiedTriples, referenceSentences, savePath, model, ft=False):
    """Score every triple against its own reference sentences and pickle each ranking.

    Unlike ``evaluate_allTriples``, the triple at position ``k`` is only compared
    with ``referenceSentences[k]``. The ranking is written to
    ``savePath + 'triple<k>_results1.txt'`` when ``ft`` is set, and to
    ``savePath + 'triple<k>_results2.txt'`` when ``ft`` equals ``False``.

    Args:
        textifiedTriples: Iterable of textified triples.
        referenceSentences: Sequence index-aligned with ``textifiedTriples``;
            each entry is an iterable of sentences.
        savePath: Path prefix for the output files (concatenated as-is).
        model: Encoder passed through to ``evaluate``.
        ft: Selects the file-name suffix as described above.
    """
    # The comparison is deliberately `== False` (not `not ft`) so the chosen
    # file name stays the same for every possible value of `ft`.
    suffix = '_results2.txt' if ft == False else '_results1.txt'

    for index, triple_text in enumerate(tqdm(textifiedTriples)):
        own_references = list(referenceSentences[index])
        ranking = evaluate(triple_text, own_references, model)
        destination = savePath + 'triple' + str(index) + suffix
        with open(destination, 'wb') as out_file:
            pickle.dump(ranking, out_file)

#### EV-2.2 Run scoring

In [3]:
# The results for the off-the-shelf module are already provided, no need to run this cell
# evaluate_allTriples(textifiedTriples, allSentences, eval_scores_candidates, offTheShelf_model)
# evaluate_allTriples2(textifiedTriples, referenceSentences, eval_scores_target, offTheShelf_model)

In [None]:
# Requires `fineTuned_model`, which only exists once the commented-out lines in EV-1.4 are enabled.
# ft=True makes the score files end in '_results1.txt' (the name the fine-tuned branches read later).
evaluate_allTriples(textifiedTriples, allSentences, eval_scores_candidates, fineTuned_model, ft=True)
# evaluate_allTriples2(textifiedTriples, referenceSentences, eval_scores_target, fineTuned_model, ft=True)

#### EV-2.3 Print raw results

In [None]:
def print_result(textified_triple, results, n=None):
    """Print a header for the triple followed by its best-ranked sentences.

    Args:
        textified_triple: Text shown in the ``--- ... ---`` header line.
        results: Sequence of ``[sentence, similarity]`` entries, printed in the
            order given.
        n: Maximum number of entries to print. A falsy value (``None``, ``0``)
            or a value larger than ``len(results)`` prints every entry.
    """
    print('---', textified_triple, '---')

    available = len(results)
    shown = available if (not n or n > available) else n

    for rank in range(1, shown + 1):
        entry = results[rank - 1]
        print(str(rank)+'.', 'Similarity: {:.3f}'.format(entry[1]), '->', entry[0])

def printResults(files_path, model, n):
    """Print the 10 best-scored candidate sentences for the first ``n`` triples.

    The textified triples are read from ``files_path`` and the per-triple score
    files from the notebook-level ``eval_scores_candidates`` directory.

    Args:
        files_path: Directory prefix (with trailing separator) containing
            'triples_dev_1_textified.txt'.
        model: 'offTheShelf' (reads '*_results2.txt'), 'fineTuned' (reads
            '*_results1.txt') or 'both' (fine-tuned first, then off-the-shelf).
        n: Number of triples, counted from the start of the file, to print.

    Raises:
        ValueError: If ``model`` is not one of the three supported values
            (previously such a value silently printed nothing).
    """
    # (heading, score-file suffix) pairs, in the order they are printed.
    sections_by_model = {
        'offTheShelf': [('OffTheShelf Model', '_results2.txt')],
        'fineTuned': [('FineTuned Model', '_results1.txt')],
        'both': [('FineTuned Model', '_results1.txt'),
                 ('OffTheShelf Model', '_results2.txt')],
    }
    if model not in sections_by_model:
        raise ValueError("model must be one of the following: 'offTheShelf', 'fineTuned', 'both'")
    sections = sections_by_model[model]

    results_path = eval_scores_candidates
    # NOTE(security): pickle.load can execute arbitrary code; only read trusted files.
    with open(files_path + 'triples_dev_1_textified.txt', 'rb') as f:
        textifiedTriples = pickle.load(f)

    for i in range(n):
        textified_triple = textifiedTriples[i]

        # Load every needed score file first, so a missing file fails before
        # anything is printed for this triple. `with` closes each handle.
        loaded = []
        for heading, suffix in sections:
            with open(results_path + 'triple' + str(i) + suffix, 'rb') as score_file:
                loaded.append((heading, pickle.load(score_file)))

        for heading, ranking in loaded:
            print(heading)
            print_result(textified_triple, ranking, n=10)
            print()
        # Extra blank line separating one triple from the next.
        print()

In [None]:
# Second parameter can be 'offTheShelf', 'fineTuned' or 'both'
# Third parameter is the number of triples to print (up to 401);
# for each triple the 10 best-scored candidate sentences are shown.
printResults(files_path, 'offTheShelf', 2)

## **EV-3 Evaluation results aggregation**
Reads the score files produced in the scoring phase and computes the aggregate results of the selected model.

#### EV-3.1 Functions and pre-processing

In [7]:
import pickle
from tqdm import tqdm
import statistics

# Similarity thresholds to be tested: 0.01, 0.02, ..., 0.99.
# Each value is derived from an integer step, so no floating-point error
# accumulates and no helper loop variables are left in the notebook namespace.
thresholds = [round(step / 100, 2) for step in range(1, 100)]


def classify_results(results, threshold, correctSentences, chooseTest):
    """Count the confusion-matrix cells for one ranking at one threshold.

    A sentence is predicted positive when its score is ``>= threshold``. For
    ``chooseTest == 'originalTriples'`` a sentence is actually positive when it
    appears in ``correctSentences``. For any other test no sentence counts as
    correct, so only false positives and true negatives can occur.

    Args:
        results: Iterable of ``[sentence, score]`` entries.
        threshold: Score cut-off for a positive prediction.
        correctSentences: Container of correct sentences; only consulted for
            the 'originalTriples' test (may be ``None`` otherwise).
        chooseTest: Name of the test being evaluated.

    Returns:
        ``[truePositives, falsePositives, trueNegatives, falseNegatives]``.
    """
    TP, FP, TN, FN = range(4)
    tally = [0, 0, 0, 0]
    has_gold = chooseTest == 'originalTriples'

    for entry in results:
        predicted_positive = entry[1] >= threshold
        # Short-circuits, so correctSentences is never touched without gold labels.
        actually_positive = has_gold and entry[0] in correctSentences
        if predicted_positive:
            tally[TP if actually_positive else FP] += 1
        else:
            tally[FN if actually_positive else TN] += 1

    return tally

def top_isCorrect(results, correctSentences):
    """Tell whether the correct sentences occupy the top of the ranking.

    Args:
        results: Ranked sequence of ``[sentence, score]`` entries.
        correctSentences: Sized collection of the correct sentences.

    Returns:
        ``True`` when every correct sentence is found among the first
        ``len(correctSentences)`` entries of ``results`` (in any order),
        ``False`` otherwise. An empty ``correctSentences`` yields ``True``.
    """
    leading = [entry[0] for entry in results[:len(correctSentences)]]
    return all(sentence in leading for sentence in correctSentences)

def get_correctSentencesMean(results, correctSentences, topIsCorrect):
    """Average the scores obtained by the correct sentences.

    Args:
        results: Ranked sequence of ``[sentence, score]`` entries.
        correctSentences: Sized collection of the correct sentences.
        topIsCorrect: When truthy, the first ``len(correctSentences)`` entries
            of ``results`` are taken to be the correct ones and their scores are
            averaged directly. Otherwise the score of every entry whose
            sentence equals a correct sentence is collected and averaged.

    Returns:
        The mean score, as computed by ``statistics.mean``.

    Raises:
        statistics.StatisticsError: If no score is selected (for example when
            none of the correct sentences occurs in ``results``).
    """
    if topIsCorrect:
        selected_scores = [entry[1] for entry in results[:len(correctSentences)]]
    else:
        selected_scores = [
            entry[1]
            for wanted in correctSentences
            for entry in results
            if entry[0] == wanted
        ]
    return statistics.mean(selected_scores)

def get_classificationMetrics(classifiedResults_sum, threshold_index):
    """Compute precision, recall, accuracy and F1 for one threshold.

    Args:
        classifiedResults_sum: Sequence with one row per threshold; each row is
            ``[truePositives, falsePositives, trueNegatives, falseNegatives]``.
        threshold_index: Index of the row to evaluate.

    Returns:
        ``[precision, recall, accuracy, f1]``. Precision, recall and F1 are 0
        when there are no true positives; accuracy is 0 when the row is empty
        (all four counts are zero).
    """
    row = classifiedResults_sum[threshold_index]
    tp, fp, tn, fn = row[0], row[1], row[2], row[3]

    if tp == 0:
        # With no true positives every ratio below would be 0 (or 0/0).
        precision = 0
        recall = 0
        f1 = 0
    else:
        precision = tp / (fp + tp)
        recall = tp / (fn + tp)
        f1 = (2 * precision * recall) / (precision + recall)

    # Accuracy is the share of correct decisions: true positives plus true
    # negatives (the previous formula counted false positives instead of
    # true positives).
    total = tp + fn + tn + fp
    accuracy = (tp + tn) / total if total else 0

    return [precision, recall, accuracy, f1]

def get_allClassificationMetrics(thresholds, classifiedResults_sum):
    """Compute the classification metrics for every threshold.

    Args:
        thresholds: Iterable of thresholds; only its length/order is used, to
            select the matching row of ``classifiedResults_sum``.
        classifiedResults_sum: Per-threshold confusion counts, as expected by
            ``get_classificationMetrics``.

    Returns:
        A list with one ``get_classificationMetrics`` result per threshold, in
        threshold order.
    """
    return [
        get_classificationMetrics(classifiedResults_sum, position)
        for position, _ in enumerate(thresholds)
    ]

def get_allProperties(triples):
    """Collect the distinct properties occurring in the triples.

    Args:
        triples: Iterable of indexable triples; the element at index 1 of each
            triple is taken as its property.

    Returns:
        A ``set`` with every distinct property.
    """
    return {triple[1] for triple in triples}

In [8]:
# with open (files_path + 'triples_dev_1_textified.txt', 'rb') as f:
#     textifiedTriples = pickle.load(f)

# notParenthesesTriples = []
# for triple in textifiedTriples:
#     if not '(' in triple:
#         notParenthesesTriples.append(triple)
# print('Expected: 350, 401')
# print(len(notParenthesesTriples), len(textifiedTriples))

# with open (files_path + 'targetSentences_dev_1.txt', 'rb') as f:
#     sentences = pickle.load(f)
# print(textifiedTriples[111], sentences[111])

In [9]:
originalTriples_path2 = ''

def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed before returning."""
    # NOTE(security): pickle can execute arbitrary code while loading; only use on trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def checkResults(thresholds, n, files_path, chooseTest, model, subObjSentences=False):
    """Aggregate the per-triple score files of one model into global results.

    For every textified triple the matching score file
    ('triple<i>_results1.txt' for the fine-tuned model, 'triple<i>_results2.txt'
    for the off-the-shelf model) is read, its sentences are classified at every
    threshold and, for the 'originalTriples' test, ranking statistics are
    collected.

    Args:
        thresholds: List of similarity cut-offs to evaluate.
        n: Denominator of the two "incorrect top" percentages (the caller
            passes the number of triples).
        files_path: Directory prefix (with trailing separator) of the pickled
            input files.
        chooseTest: 'originalTriples', 'exchangedObjSubTriples' or
            'randomPropertyTriples'. The last two read their score directory
            from the globals ``exchangedObjSubTriplesResults_path`` /
            ``randomPropertyTriplesResults_path``, which must be defined
            elsewhere in the notebook.
        model: 'fineTuned' or 'offTheShelf'.
        subObjSentences: For 'originalTriples' only: read the score files from
            ``originalTriples_path2`` instead of ``eval_scores_candidates``.

    Returns:
        For 'originalTriples' the list
        ``[metrics, model_f1Top, classifiedResults_sum, incorrectTops_percentage,
        incorrectTop1_percentage, correctSentencesMeans_mean, differences_mean,
        highestScores_mean, fineTuned_errors, thresholds, property_errors]``;
        for the other tests
        ``[metrics, model_f1Top, classifiedResults_sum, highestScores_mean, thresholds]``.
        ``differences_mean`` is NaN when no triple had a fully correct top.

    Raises:
        ValueError: If ``chooseTest`` or ``model`` is not a supported value.
        statistics.StatisticsError: If there are no triples to aggregate.
    """
    suffix_by_model = {'fineTuned': '_results1.txt', 'offTheShelf': '_results2.txt'}
    if model not in suffix_by_model:
        # Fail early with a clear message instead of an unbound-variable error later on.
        raise ValueError("model must be one of the following: 'fineTuned', 'offTheShelf'")
    results_suffix = suffix_by_model[model]

    # Difference recorded when every scored sentence is a correct one, i.e. there
    # is no "first incorrect" score to compare with. The values are hard-coded
    # per model — presumably precomputed averages; TODO confirm their origin.
    fallback_difference = {'fineTuned': 0.212, 'offTheShelf': 0.110}[model]

    isOriginal = chooseTest == 'originalTriples'
    sentences = None
    if isOriginal:
        results_path = originalTriples_path2 if subObjSentences else eval_scores_candidates
        textifiedTriples = _load_pickle(files_path + 'triples_dev_1_textified.txt')
        sentences = _load_pickle(files_path + 'targetSentences_dev_1.txt')
    elif chooseTest == 'exchangedObjSubTriples':
        results_path = exchangedObjSubTriplesResults_path
        textifiedTriples = _load_pickle(files_path + 'textified_exchangedObjSubTriples.txt')
    elif chooseTest == 'randomPropertyTriples':
        results_path = randomPropertyTriplesResults_path
        textifiedTriples = _load_pickle(files_path + 'textified_randomPropertyTriples.txt')
    else:
        raise ValueError("chooseTest must be one of the following: 'originalTriples', 'exchangedObjSubTriples', 'randomPropertyTriples'")

    # Property of each triple, used to count ranking errors per property.
    triples = _load_pickle(files_path + 'triples_dev_1_split.txt')
    triples_properties = [triple[1] for triple in triples]
    properties_errors = {prop: 0 for prop in get_allProperties(triples)}

    highestScores = []
    # One [truePositives, falsePositives, trueNegatives, falseNegatives] row per threshold.
    classifiedResults_sum = [[0] * 4 for _ in range(len(thresholds))]
    incorrectTops_sum = 0
    incorrectTop1_sum = 0
    correctSentences_means = []
    differences = []
    fineTuned_errors = []

    for i, textifiedTriple in enumerate(tqdm(textifiedTriples)):
        ranked = _load_pickle(results_path + 'triple' + str(i) + results_suffix)
        correctSentences = sentences[i] if isOriginal else None
        highestScores.append(ranked[0][1])

        # Add this triple's confusion counts to the running totals of each threshold.
        for t_idx, threshold in enumerate(thresholds):
            counts = classify_results(ranked, threshold, correctSentences, chooseTest)
            classifiedResults_sum[t_idx] = [
                total + count for total, count in zip(classifiedResults_sum[t_idx], counts)
            ]

        if not isOriginal:
            # Ranking statistics need gold sentences, which only this test has.
            continue

        topIsCorrect = top_isCorrect(ranked, correctSentences)
        correctSentencesMean = get_correctSentencesMean(ranked, correctSentences, topIsCorrect)
        # Recorded once per triple; appending it twice would not change the mean.
        correctSentences_means.append(correctSentencesMean)

        if ranked[0][0] not in correctSentences:
            incorrectTop1_sum += 1

        if topIsCorrect:
            if len(correctSentences) < len(ranked):
                # Gap between the correct sentences' mean and the best incorrect sentence.
                difference = correctSentencesMean - ranked[len(correctSentences)][1]
            else:
                difference = fallback_difference
        else:
            incorrectTops_sum += 1
            difference = None
            if model == 'fineTuned':
                fineTuned_errors.append([textifiedTriple, correctSentences, ranked[:5]])
            properties_errors[triples_properties[i]] += 1

        differences.append(difference)

    highestScores_mean = statistics.mean(highestScores)

    # Computed once from the final totals (not on every loop iteration).
    metrics = get_allClassificationMetrics(thresholds, classifiedResults_sum)
    model_f1List = [[threshold, metrics[i][3]] for i, threshold in enumerate(thresholds)]
    model_f1Top = sorted(model_f1List, key=lambda x: x[1], reverse=True)

    if not isOriginal:
        return [metrics,
                model_f1Top,
                classifiedResults_sum,
                highestScores_mean,
                thresholds]

    property_errors = [[prop, count] for prop, count in properties_errors.items()]

    incorrectTops_percentage = incorrectTops_sum / n * 100
    incorrectTop1_percentage = incorrectTop1_sum / n * 100

    correctSentencesMeans_mean = statistics.mean(correctSentences_means)

    # Only None marks "no difference available"; a genuine 0.0 must stay in the mean.
    valid_differences = [difference for difference in differences if difference is not None]
    differences_mean = statistics.mean(valid_differences) if valid_differences else float('nan')

    return [metrics,
            model_f1Top,
            classifiedResults_sum,
            incorrectTops_percentage,
            incorrectTop1_percentage,
            correctSentencesMeans_mean,
            differences_mean,
            highestScores_mean,
            fineTuned_errors, thresholds, property_errors]

def print_infoResults(results, chooseTest, model):
    """Print a text report of the aggregated evaluation results.

    Args:
        results: List returned by ``checkResults``: 11 items for the
            'originalTriples' test, 5 items for any other test.
        chooseTest: Name of the test; 'originalTriples' enables the ranking
            sections of the report.
        model: Name of the evaluated model; 'fineTuned' additionally prints the
            recorded ranking errors.
    """
    is_original = chooseTest == 'originalTriples'

    if is_original:
        (metrics, model_f1Top, classifiedResults_sum,
         incorrectTops_percentage, incorrectTop1_percentage,
         correctSentencesMeans_mean, differences_mean,
         highestScores_mean,
         fineTuned_errors, thresholds, property_errors) = results
    else:
        (metrics, model_f1Top, classifiedResults_sum,
         highestScores_mean, thresholds) = results

    metric_templates = ('\t\tPrecision: {:.3f}',
                        '\t\tRecall: {:.3f}',
                        '\t\taccuracy: {:.3f}',
                        '\t\tf1-score: {:.3f}')

    print('CLASSIFICATION METRICS')
    for row, threshold in enumerate(thresholds):
        print(threshold)
        print('\tEvaluated model:  ')
        # Rows are stored as [TP, FP, TN, FN]; they are shown as a 2x2 confusion matrix.
        counts = classifiedResults_sum[row]
        print('\t    [True Positives, False Positives]: [{:.0f}, {:.0f}]'.format(counts[0], counts[1]))
        print('\t    [False Negatives, True Negatives]: [{:.0f}, {:.0f}]'.format(counts[3], counts[2]))
        scores = metrics[row]
        for position, template in enumerate(metric_templates):
            print(template.format(scores[position]))
        print()

    print('TOP F1 THRESHOLDS')
    print('\tEvaluated model:')
    print('\t\t', model_f1Top[:5])
    print()
    print('HIGHEST SCORES MEAN')
    print('Evaluated model:  {:.5f}'.format(highestScores_mean))
    print()

    if not is_original:
        return

    for heading, percentage in (('NOT CORRECT TOPS', incorrectTops_percentage),
                                ('NOT CORRECT TOP1', incorrectTop1_percentage)):
        print(heading)
        print('Evaluated model:  {:.2f}%'.format(percentage))
        print()

    for heading, mean_value in (('CORRECT SENTENCES SCORE MEANS MEAN', correctSentencesMeans_mean),
                                ('DIFFERENCE BETWEEN TOP SCORE MEANS AND FIRST NON CORRECT MEAN', differences_mean)):
        print(heading)
        print('Evaluated model: ', '{:.3f}'.format(mean_value))
        print()

    if model == 'fineTuned':
        print('FINE-TUNED MODEL ERRORS')
        for number, (textified_triple, correct_sentences, top_results) in enumerate(fineTuned_errors, start=1):
            print(number, '---', textified_triple) #, '---  (category: '+category+')')
            print('\tCorrect Sentences')
            for position, sentence in enumerate(correct_sentences, start=1):
                print('\t\t', position, sentence)
            print('\tTop sentences')
            for position, result in enumerate(top_results, start=1):
                print('\t\t', position, result[0], '=>', result[1])
            print()
        print()

    # Disabled report sections, kept for reference:
    # print('CATEGORY ERRORS')
    # for [category, sum] in category_errors:
    #     print('\t', category+':', sum)
    # print()
    # print('PROPERTY ERRORS')
    # for [property, sum] in property_errors:
    #     print('\t', property+':', sum)
    # print()

#### EV-3.2 Run (INPUT NEEDED: model to evaluate)

In [None]:
#@title Pick one model type to evaluate
model = 'offTheShelf'#@param ['offTheShelf', 'fineTuned']
# NOTE: `model` is a string selector here; the optional EV-0.0 cell assigns a SentenceTransformer to the same name.
# Supported test names: 'originalTriples', 'exchangedObjSubTriples', 'randomPropertyTriples'
# 401 is the expected number of triples (see EV-1.5); it is the denominator of the "NOT CORRECT" percentages.
results = checkResults(thresholds, 401, files_path, 'originalTriples', model)
print_infoResults(results, 'originalTriples', model)