In [12]:
import json
import re
import spacy
import operator
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import pearsonr, entropy
from sentence_transformers.cross_encoder import CrossEncoder
import datetime 
from itertools import product

In [13]:
# Load the document collection, the test queries and the language resources.
# `data` is later iterated as docID -> record, where each record provides a 'content' entry.
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

# `queries` is later indexed per entry: entry[0] is passed to the ranking function
# as the query and entry[1] is used as the collection of relevant document IDs.
with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)

# Load the German stop words (NLTK) and the spaCy pipeline used for lemmatization
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

# Per-query metric store; filled by average_precision ('MAP') and
# mean_reciprocal_rank ('MRR') and written to disk at the end of the notebook.
predictions = dict()

In [14]:
# Load the cross-encoder model; it is used by get_BERT_Prediction to score
# (query, sentence) pairs.
bert_model = CrossEncoder("svalabs/cross-electra-ms-marco-german-uncased")

In [15]:
def preprocessing(text):
    """Split a document into cleaned sentences.

    The text is lower-cased, isolated single ASCII letters surrounded by
    whitespace are dropped and whitespace runs are collapsed. The result is
    split into sentences with the German NLTK sentence tokenizer, and in every
    sentence each character that is not a (German) letter is replaced by a
    single space.

    Args:
        text: Raw document text.

    Returns:
        A list with one cleaned string per detected sentence.
    """
    lowered = text.lower()
    # Drop isolated single letters, then squeeze whitespace runs to one blank
    without_single_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_single_letters)

    # Replace punctuation, digits and every other non-letter by a blank, sentence by sentence
    return [
        re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', sentence)
        for sentence in sent_tokenize(collapsed, language='german')
    ]

# Data cleaning
def cleanData(text):
    """Normalise a text to lower-case letters and blanks.

    The substitutions are applied in a fixed order after lower-casing:
    isolated single ASCII letters are dropped, whitespace runs are collapsed
    to one blank, and finally every character that is not a (German) letter
    is replaced by a blank. Because the last step runs after the whitespace
    collapse, the result may again contain consecutive blanks.

    Args:
        text: Raw text.

    Returns:
        The cleaned text as a single string.
    """
    substitutions = (
        (r"\s+[a-zA-Z]\s+", ' '),      # isolated single letters
        (r'\s+', ' '),                 # runs of whitespace
        ('[^a-zA-ZäöüÄÖÜß]', ' '),     # punctuation, digits, everything non-letter
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Stop word removal
def removeStopwords(text):
    """Return ``text`` without the words contained in ``german_stopwords``.

    The text is split on whitespace; the remaining words are joined with
    single blanks.
    """
    kept_words = []
    for word in text.split():
        if word in german_stopwords:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Lemmatization
def lemmatize(text):
    """Run ``text`` through the spaCy pipeline and join the token lemmas with blanks."""
    lemmas = (token.lemma_ for token in lemmatizer(text))
    return ' '.join(lemmas)

In [16]:
# Clean the data: map every document ID to the list of cleaned sentences
# produced by preprocessing() from the record's 'content' entry.
preprocessedData = dict()
for key,value in data.items():
    preprocessedData[key] = preprocessing(value['content'])

In [17]:
# Prediction with the German uncased ELECTRA cross-encoder model
def get_BERT_Prediction(query, top_k=5):
    """Rank all preprocessed documents for ``query`` with the cross-encoder.

    Every sentence of a document is paired with the query and scored by
    ``bert_model``. The document score is the mean of its ``top_k`` highest
    sentence scores (or of all sentences if the document has fewer).

    Args:
        query: Query text.
        top_k: Number of best-scoring sentences averaged per document.
            Defaults to 5.

    Returns:
        A dict mapping document ID to score, ordered by descending score.
        Documents without any sentence receive ``-inf`` so that they are
        ranked last instead of producing an empty model batch.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    bertScores = dict()
    for docID, sentences in preprocessedData.items():
        if not sentences:
            # Nothing to score: averaging an empty score list would yield NaN
            # and make the final sort order undefined.
            bertScores[docID] = float('-inf')
            continue

        # One (query, sentence) pair per sentence of the current document
        pairs = [(query, sentence) for sentence in sentences]
        # Expect exactly one score per pair; reshape fails loudly otherwise
        sentence_scores = np.asarray(bert_model.predict(pairs)).reshape(len(sentences))

        # Mean of the best `top_k` sentence scores
        bertScores[docID] = np.mean(sorted(sentence_scores, reverse=True)[:top_k])
    return dict(sorted(bertScores.items(), key=operator.itemgetter(1), reverse=True))

In [18]:
# Evaluation metrics

# Precision@k
def precision(predicted, truth, k):
    """Precision over the ranking positions ``0..k`` (inclusive).

    Note that ``k`` is a zero-based rank index, so ``k + 1`` positions are
    evaluated and used as the denominator.

    Args:
        predicted: Ranked sequence of document IDs; each ID is converted with
            ``int()`` before it is looked up in ``truth``.
        truth: Collection of relevant document IDs.
        k: Zero-based index of the last ranking position to include.

    Returns:
        The fraction of the first ``k + 1`` positions that hold a relevant
        document. Positions beyond the end of ``predicted`` count as
        non-relevant.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be a non-negative rank index")
    cutoff = k + 1
    hits = sum(1 for doc_id in predicted[:cutoff] if int(doc_id) in truth)
    return hits / cutoff

# AP
def average_precision(predicted, relevant, query):
    """Average precision of one ranking; the value is also recorded per query.

    For every ranking position that holds a relevant document, the precision
    up to and including that position is added, normalised by the number of
    relevant documents. The ranking is walked once with a running hit counter.

    Side effect: ``predictions[query]`` is replaced by ``{'MAP': <value>}``.

    Args:
        predicted: Ranked sequence of document IDs; each ID is converted with
            ``int()`` before the relevance lookup.
        relevant: Collection of relevant document IDs for the query.
        query: Key under which the value is stored in ``predictions``.

    Returns:
        The average precision; ``0.0`` if ``relevant`` is empty.
    """
    averageSum = 0.0
    relevant_count = len(relevant)
    if relevant_count > 0:
        relevant_ids = set(relevant)   # constant-time membership tests
        hits = 0
        for rank, doc_id in enumerate(predicted, start=1):
            if int(doc_id) in relevant_ids:
                hits += 1
                # precision at this rank, normalised by the number of relevant documents
                averageSum += (hits / rank) / relevant_count
    # The per-query value is stored under the key 'MAP'
    predictions[query] = {'MAP': averageSum}
    return averageSum

# MAP
def mean_average_precision(predicted, relevant):
    """Mean of the average precision over all given rankings.

    ``predicted[i]`` and ``relevant[i]`` belong to the i-th entry of the
    global ``queries`` list, whose first element is used as the key for the
    per-query result recorded by ``average_precision``.

    Args:
        predicted: List of rankings (one ranked list of document IDs per query).
        relevant: List of relevant-ID collections, aligned with ``predicted``.

    Returns:
        The mean over the evaluated rankings; ``0.0`` if there are none.
    """
    ranking_count = len(predicted)
    if ranking_count == 0:
        return 0.0
    total = 0
    for i in range(ranking_count):
        total += average_precision(predicted[i], relevant[i], queries[i][0])
    # Average over the rankings actually evaluated (same convention as
    # mean_reciprocal_rank), not over the length of the global query list.
    return total / ranking_count

# RR
def reciprocal_rank(predicted, relevant, query):
    """Reciprocal of the 1-based rank of the first relevant document.

    Args:
        predicted: Ranked sequence of document IDs; each ID is converted with
            ``int()`` before the relevance lookup.
        relevant: Collection of relevant document IDs.
        query: Not used by the computation.

    Returns:
        ``1 / rank`` of the first relevant hit, or ``0`` if there is none.
    """
    for rank, doc_id in enumerate(predicted, start=1):
        if int(doc_id) in relevant:
            return 1 / rank
    return 0

# MRR
def mean_reciprocal_rank(predicted, relevant):
    """Mean reciprocal rank over all given rankings.

    ``predicted[i]`` and ``relevant[i]`` belong to the i-th entry of the
    global ``queries`` list. The reciprocal rank of every query is also
    stored as ``predictions[<query>]['MRR']``; the per-query entry is created
    if it does not exist yet, so this function does not depend on
    ``mean_average_precision`` having run before.

    Args:
        predicted: List of rankings (one ranked list of document IDs per query).
        relevant: List of relevant-ID collections, aligned with ``predicted``.

    Returns:
        The mean reciprocal rank; ``0.0`` if there are no rankings.
    """
    ranking_count = len(predicted)
    if ranking_count == 0:
        return 0.0
    total = 0
    for i in range(ranking_count):
        query = queries[i][0]
        rec_rank = reciprocal_rank(predicted[i], relevant[i], query)
        total += rec_rank
        predictions.setdefault(query, {})['MRR'] = rec_rank
    return total / ranking_count

def calculateMetrics(predictFunction,queries):
    """Run ``predictFunction`` for every query and compute MAP and MRR.

    Args:
        predictFunction: Callable taking the query (``entry[0]``) and
            returning a mapping whose keys are the ranked document IDs.
        queries: Iterable of entries; ``entry[0]`` is the query and
            ``entry[1]`` the relevant document IDs.

    Returns:
        Tuple ``(mean average precision, mean reciprocal rank)``.
    """
    # One pass over the queries: ranked IDs first, then the ground truth
    results = [
        (list(predictFunction(entry[0]).keys()), entry[1])
        for entry in queries
    ]
    predicted = [ranking for ranking, _ in results]
    relevant = [truth for _, truth in results]

    return (
        mean_average_precision(predicted, relevant),
        mean_reciprocal_rank(predicted, relevant),
    )

In [19]:
def evaluate():
    """Compute MAP and MRR for the cross-encoder ranking and print them with the elapsed time."""
    started = datetime.datetime.now()
    mean_av_pr, mean_re_r = calculateMetrics(get_BERT_Prediction, queries)
    elapsed = datetime.datetime.now() - started
    print(f'BERT | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)}, Anfragebearbeitung: {elapsed}')
evaluate()

BERT | MAP: 0.6782642637877896, MRR: 0.7087057844895998, Anfragebearbeitung: 1:04:48.077520


In [15]:
def calculateQuantile():
    """Collect the document scores of all queries, split by relevance.

    For every entry of ``queries`` the ranking of ``get_BERT_Prediction`` is
    computed; each document score is appended to the 'Relevant' list if the
    document ID (converted with ``int()``) is among the entry's relevant IDs,
    otherwise to the 'Non-Relevant' list.

    Returns:
        Dict with the keys 'Relevant' and 'Non-Relevant', each holding a list
        of ``np.float64`` scores.
    """
    scoresDistribution = {'Relevant': [],
                         'Non-Relevant': []}
    for q in queries:
        prediction, relevant = get_BERT_Prediction(q[0]), q[1]
        for docID, score in prediction.items():
            bucket = 'Relevant' if int(docID) in relevant else 'Non-Relevant'
            # np.float64 is a float subclass and therefore JSON-serialisable
            scoresDistribution[bucket].append(np.float64(score))
    return scoresDistribution

# The distribution has to be computed before it can be written
scoreDistribution = calculateQuantile()

# Save the distribution. These are cross-encoder scores, so they go into the
# BERT file and must not overwrite the BM25 distribution.
with open('../Data/Visualization/BERTScoreDistribution.json', 'w',encoding='utf-8') as fp:
    json.dump(scoreDistribution, fp,  indent=4, ensure_ascii=False)

In [12]:
# Load the stored per-query predictions (mapping: query -> dict of results)
with open("../Data/Visualization/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

# Attach this notebook's metrics under the key 'BERT Cross'.
# Every query in the file must also be present in `predictions`
# (filled by the metric functions), otherwise the lookup raises KeyError.
for query, scores in allPredictions.items():
    scores['BERT Cross'] = predictions[query]

# Write the merged predictions back to the same file
with open('../Data/Visualization/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)