In [17]:
import json
import re
import spacy
import operator
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import pearsonr, entropy
from sentence_transformers import SentenceTransformer
import datetime 

In [18]:
# Load the document collection and the test queries.
# `data` is later iterated with .items() and every value is read via ['content'].
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

# Each entry of `queries` is later indexed as q[0] (query text) and q[1]
# (collection of relevant document ids, compared against int(docID)).
with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)

# Load the German stop word list and the spaCy pipeline used for lemmatization.
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

# Per-query metric results; filled by the evaluation functions below
# (keys 'MAP' and 'MRR') and merged into Predictions.json at the end of the notebook.
predictions = dict()

In [19]:
# Load the sentence-embedding model used to encode both document sentences and queries.
# NOTE(review): judging by the checkpoint name this is an ELECTRA bi-encoder rather than
# a BERT model, although the notebook calls it "BERT" throughout - confirm.
bert_model = SentenceTransformer("svalabs/bi-electra-ms-marco-german-uncased")



In [20]:
def preprocessing(text):
    """Normalise a raw document and split it into cleaned sentences.

    The text is lowercased, isolated single ASCII letters surrounded by
    whitespace are dropped and whitespace runs are collapsed. The result is
    split into sentences with the German NLTK sentence tokenizer, and in
    every sentence each character that is not an ASCII letter, an umlaut or
    'ß' is replaced by a single blank.

    Args:
        text: Raw document text.

    Returns:
        list[str]: One cleaned string per detected sentence. Sentences may
        contain runs of blanks because the replacement is per character.
    """
    # Lowercase -> remove single letters -> collapse whitespace.
    normalised = re.sub(r'\s+', ' ', re.sub(r"\s+[a-zA-Z]\s+", ' ', text.lower()))

    # Strip punctuation and digits sentence by sentence.
    return [
        re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', sentence)
        for sentence in sent_tokenize(normalised, language='german')
    ]

def cleanData(text):
    """Normalise a piece of text without splitting it into sentences.

    Applies, in this order: lowercasing, removal of isolated single ASCII
    letters, collapsing of whitespace runs, and replacement of every
    character that is not an ASCII letter, an umlaut or 'ß' by a blank.

    Args:
        text: Raw text (e.g. a query).

    Returns:
        str: The cleaned text. It can contain consecutive blanks because the
        last step substitutes character by character.
    """
    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r"\s+[a-zA-Z]\s+", ' '),    # isolated single letters
        (r'\s+', ' '),               # whitespace runs
        ('[^a-zA-ZäöüÄÖÜß]', ' '),   # punctuation, digits, other symbols
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Stop word removal
def removeStopwords(text):
    """Drop every whitespace-separated token that appears in ``german_stopwords``.

    Args:
        text: Input text; it is split on whitespace and the comparison is
            case-sensitive.

    Returns:
        str: The remaining tokens joined by single blanks.
    """
    kept = []
    for token in text.split():
        if token in german_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

# Lemmatization
def lemmatize(text):
    """Replace every token of ``text`` by its lemma.

    Args:
        text: Input text, processed with the module-level ``lemmatizer`` pipeline.

    Returns:
        str: The ``lemma_`` of each token of the parsed document, joined by
        single blanks.
    """
    parsed = lemmatizer(text)
    lemmas = (token.lemma_ for token in parsed)
    return ' '.join(lemmas)

In [21]:
# Clean the documents: map every document key to the list of cleaned
# sentences returned by preprocessing() for the document's 'content' field.
preprocessedData = dict()
for key,value in data.items():
    preprocessedData[key] = preprocessing(value['content'])

In [22]:
# Start of the wall-clock measurement for building the document embeddings.
a1 = datetime.datetime.now()

# Sentence embeddings are computed once up front so that answering a query
# only has to encode the query itself.
encodedDocs = dict()

# Turn every document into a list of sentence vectors (one encode() call per sentence).
for key,text in preprocessedData.items():
    vectors = []
    for sentence in text:
        # NOTE(review): only zero-length strings are skipped; a sentence consisting
        # solely of blanks (possible after preprocessing) is still encoded.
        if len(sentence) != 0:
            vectors.append(bert_model.encode(sentence))
    # A document without any encodable sentence ends up with an empty list here.
    encodedDocs[key] = vectors
    
# Elapsed time for the embedding step; reported later by evaluate().
b1 = datetime.datetime.now()
modelTime = b1-a1

In [23]:
# Ähnlichkeitsmetriken
def cosineSimilarity(vector1, vector2):
    """Cosine of the angle between two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector of the same length.

    Returns:
        The dot product divided by the product of both Euclidean norms.
        No guard against zero-norm input is applied.
    """
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return dot_product / norm_product

def pearsonSimilarity(vector1, vector):
    """Pearson correlation coefficient of two equally long vectors.

    Args:
        vector1: First vector.
        vector: Second vector.

    Returns:
        The first element of the ``scipy.stats.pearsonr`` result, i.e. the
        correlation coefficient; the p-value is discarded.
    """
    result = pearsonr(vector1, vector)
    coefficient = result[0]
    return coefficient

def euclideanDistance(vector1, vector2):
    """Euclidean (L2) distance between two vectors.

    Args:
        vector1: First vector; must support element-wise ``-`` with ``vector2``.
        vector2: Second vector.

    Returns:
        The norm of the element-wise difference.
    """
    difference = vector1 - vector2
    return np.linalg.norm(difference)

def klDivergence(vector1, vector2):
    """Kullback-Leibler divergence between two min-max rescaled vectors.

    Both vectors are rescaled with the joint minimum and maximum of the two
    inputs, every rescaled value that is not greater than 0.0001 is set to
    0.0001, and the two resulting sequences are passed to
    ``scipy.stats.entropy``.

    Args:
        vector1: First vector (the "p" argument of ``entropy``).
        vector2: Second vector (the "q" argument of ``entropy``).

    Returns:
        The value returned by ``entropy`` for the two rescaled sequences.
    """
    lowest = min(np.min(vector1), np.min(vector2))
    highest = max(np.max(vector1), np.max(vector2))

    def rescale(vector):
        # Joint min-max scaling followed by clamping to the 0.0001 floor.
        scaled = (vector - lowest) / (highest - lowest)
        return [value if value > 0.0001 else 0.0001 for value in scaled]

    p_values = rescale(vector1)
    q_values = rescale(vector2)
    return entropy(p_values, q_values)

In [27]:
# BERT prediction
def get_BERT_Prediction(query, top_k=5):
    """Rank all pre-encoded documents for a query.

    The query is encoded as given (no text cleaning is applied) and compared
    with every stored sentence vector of every document by cosine similarity.
    A document's score is the mean of its ``top_k`` highest sentence
    similarities (or of all of them if it has fewer sentences).

    Args:
        query: Query text.
        top_k: Number of best-matching sentences averaged per document.
            Expected to be a positive integer; defaults to 5.

    Returns:
        dict: Document id -> score, ordered by descending score.
    """
    queryEncoded = bert_model.encode(query)
    bertScores = dict()

    for docID, sentenceVectors in encodedDocs.items():
        scores = sorted(
            (cosineSimilarity(queryEncoded, sv) for sv in sentenceVectors),
            reverse=True,
        )
        if scores:
            bertScores[docID] = np.mean(scores[:top_k])
        else:
            # A document without sentence vectors would yield np.mean([]) == nan;
            # NaN keys make the sort below order-undefined. Use the lower bound of
            # the cosine similarity instead so such documents rank last.
            bertScores[docID] = -1.0
    return dict(sorted(bertScores.items(), key=operator.itemgetter(1),reverse=True))

In [28]:
# Evaluation metrics

#Precision@k
def precision(predicted, truth, k):
    """Precision of a ranking up to and including the 0-based position ``k``.

    Args:
        predicted: Ranked list of document ids; each id is converted with
            ``int()`` before the membership test.
        truth: Collection of relevant document ids (integers).
        k: 0-based index of the last rank to consider, i.e. the top ``k + 1``
            results are evaluated.

    Returns:
        float: Number of relevant documents among the first ``k + 1`` ranks
        divided by ``k + 1``. Ranks beyond the end of ``predicted`` count as
        non-relevant.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be a non-negative 0-based rank index")
    cutoff = k + 1
    hits = sum(1 for doc_id in predicted[:cutoff] if int(doc_id) in truth)
    return hits / cutoff

#AP
def average_precision(predicted,relevant,query):
    """Average precision of one ranking; the result is also recorded per query.

    For every rank that holds a relevant document, the precision at that rank
    is added up, and the sum is divided by the total number of relevant
    documents. The hit count is carried along the ranking, so the ranking is
    traversed only once.

    Args:
        predicted: Ranked list of document ids; each id is converted with
            ``int()`` before the membership test.
        relevant: Collection of relevant document ids (integers).
        query: Query text, used as key in the module-level ``predictions`` dict.

    Returns:
        float: The average precision; 0.0 if ``relevant`` is empty.

    Side effects:
        Sets ``predictions[query]`` to a new dict ``{'MAP': <average precision>}``,
        replacing any earlier entry for that query.
    """
    total_relevant = len(relevant)
    hits = 0
    averageSum = 0.0
    # With no relevant documents the metric is undefined; report 0.0 instead of
    # failing with a division by zero.
    if total_relevant:
        for rank, doc_id in enumerate(predicted, start=1):
            if int(doc_id) in relevant:
                hits += 1
                # hits / rank is the precision at this (1-based) rank.
                averageSum += (hits / rank) / total_relevant
    scores = dict()
    scores['MAP'] = averageSum
    predictions[query] = scores
    return averageSum

#MAP
def mean_average_precision(predicted,relevant):
    """Mean of the average precision over all evaluated rankings.

    Args:
        predicted: List of rankings, one per query.
        relevant: List of relevant-id collections, aligned with ``predicted``.

    Returns:
        float: Sum of the per-ranking average precision divided by the number
        of rankings in ``predicted``; 0.0 if ``predicted`` is empty.

    Note:
        The query text passed on to ``average_precision`` is taken from the
        module-level ``queries`` list by position, so ``predicted`` must be in
        the same order as ``queries``.
    """
    if not predicted:
        return 0.0
    summe = 0
    for i in range(len(predicted)):
        summe += average_precision(predicted[i], relevant[i], queries[i][0])
    # Normalise by the number of rankings actually evaluated (same as
    # mean_reciprocal_rank), not by the size of the global query list.
    return summe/len(predicted)

#RR
def reciprocal_rank(predicted,relevant, query):
    """Reciprocal of the 1-based rank of the first relevant document.

    Args:
        predicted: Ranked list of document ids; each id is converted with
            ``int()`` before the membership test.
        relevant: Collection of relevant document ids (integers).
        query: Not used by this function.

    Returns:
        ``1 / rank`` of the first relevant hit, or 0 if the ranking contains
        no relevant document.
    """
    for position, doc_id in enumerate(predicted, start=1):
        if int(doc_id) in relevant:
            return 1/position
    return 0

#MRR
def mean_reciprocal_rank(predicted,relevant):
    """Mean reciprocal rank over all evaluated rankings.

    Args:
        predicted: List of rankings, one per query.
        relevant: List of relevant-id collections, aligned with ``predicted``.

    Returns:
        float: Sum of the per-ranking reciprocal ranks divided by the number
        of rankings; 0.0 if ``predicted`` is empty.

    Side effects:
        Stores each query's reciprocal rank under the key ``'MRR'`` in the
        module-level ``predictions`` dict. The query text is taken from the
        module-level ``queries`` list by position.
    """
    if not predicted:
        return 0.0
    summe = 0
    for i in range(len(predicted)):
        query_text = queries[i][0]
        rec_rank = reciprocal_rank(predicted[i], relevant[i], query_text)
        summe += rec_rank
        # setdefault keeps this independent of whether another metric has
        # already created the entry for this query.
        predictions.setdefault(query_text, dict())['MRR'] = rec_rank
    return summe/len(predicted)

# Compute the MAP and MRR metrics for a ranking function
def calculateMetrics(predictFunction,queries):
    """Run a ranking function over all queries and compute MAP and MRR.

    Args:
        predictFunction: Callable taking the query text and returning a
            mapping whose keys are the document ids in ranked order.
        queries: Iterable of entries where index 0 is the query text and
            index 1 the collection of relevant document ids.

    Returns:
        tuple: ``(mean average precision, mean reciprocal rank)``.
    """
    predicted, relevant = [], []
    for entry in queries:
        ranking = predictFunction(entry[0])
        predicted.append(list(ranking.keys()))
        relevant.append(entry[1])
    return (
        mean_average_precision(predicted, relevant),
        mean_reciprocal_rank(predicted, relevant),
    )

In [29]:
# Evaluate the model on the available validation queries
def evaluate():
    """Compute MAP and MRR for the BERT ranking and print them with timings.

    Runs ``calculateMetrics`` with ``get_BERT_Prediction`` over the
    module-level ``queries`` and prints one summary line containing both
    metrics, the wall-clock time spent answering the queries and the
    previously measured ``modelTime`` for building the document embeddings.
    Returns nothing.
    """
    # Wall-clock time around the complete query processing and metric computation.
    a = datetime.datetime.now()
    mean_av_pr, mean_re_r = calculateMetrics(get_BERT_Prediction,queries)
    b = datetime.datetime.now()
    print(f'BERT | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)}, Anfragebearbeitung: {b-a}, Modellaufbau: {modelTime}')
evaluate()

BERT | MAP: 0.7512135986055952, MRR: 0.7681354155899791, Anfragebearbeitung: 0:00:32.922819, Modellaufbau: 0:01:07.200330


In [30]:
# Determine the score distribution of relevant and non-relevant documents
def calculateQuantile():
    """Collect the ranking scores of relevant and non-relevant documents.

    Every query in the module-level ``queries`` list is ranked with
    ``get_BERT_Prediction``; each document score is converted to
    ``np.float64`` and appended to one of two lists depending on whether
    ``int(docID)`` is among the query's relevant ids.

    Returns:
        dict: ``{'Relevant': [...], 'Non-Relevant': [...]}`` with the
        collected scores over all queries.
    """
    relevantScores, otherScores = [], []
    for q in queries:
        ranking = get_BERT_Prediction(q[0])
        relevantIDs = q[1]
        for docID, score in ranking.items():
            target = relevantScores if int(docID) in relevantIDs else otherScores
            target.append(np.float64(score))
    return {'Relevant': relevantScores, 'Non-Relevant': otherScores}
# Re-ranks every query to collect the scores (independent of the evaluate() run above).
scoreDistribution = calculateQuantile()

# Save the distribution as JSON for the visualization step.
with open('../Data/Visualization/BERTScoreDistribution.json', 'w',encoding='utf-8') as fp:
    json.dump(scoreDistribution, fp,  indent=4, ensure_ascii=False)

In [31]:
# Load the shared predictions file (query text -> dict of per-model results).
with open("../Data/Visualization/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

# Attach this notebook's per-query metrics under the 'BERT' key.
# NOTE(review): raises KeyError if the file contains a query for which no entry
# exists in `predictions`, i.e. evaluate() must have run on the same query set.
for query, scores in allPredictions.items():
    scores['BERT'] = predictions[query]

# Write the merged predictions back to the same file (the file is overwritten in place).
with open('../Data/Visualization/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)