In [None]:
import json
import re
import spacy
import operator
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import tensorflow as tf
from sentence_transformers import SentenceTransformer
from rank_bm25 import *
import random

In [None]:
# Load the document collection and the test queries.
# Later cells read value['content'] from every entry of `data` and
# q[0] (query text) / q[1] (relevant ids) from every entry of `queries`.
with open("../Data/Data.json", encoding='utf-8') as data_file:
    data = json.load(data_file)

with open("../Data/Queries.json", encoding='utf-8') as queries_file:
    queries = json.load(queries_file)

# German stop-word list (NLTK) and the spaCy pipeline used for lemmatisation.
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

# Per-query metric values; written by average_precision and
# mean_reciprocal_rank during evaluation.
predictions = {}

In [None]:
# Preprocessing of the PDF texts
def preprocessing(text, forBert):
    """Normalise a raw document text for one of the two retrieval models.

    Args:
        text: Raw document text.
        forBert: If true, return a list of cleaned sentences (input for the
            sentence encoder); otherwise return a list of lemmatised tokens
            without stop words (input for BM25).

    Returns:
        list[str]: Sentences when ``forBert`` is true, tokens otherwise.
    """
    # Lower-case, drop isolated single letters, collapse whitespace runs.
    text = text.lower()
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(r'\s+', ' ', text)

    if forBert:
        # Split into sentences first, then blank out every character that is
        # not a (German) letter inside each sentence.
        return [
            re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', sentence)
            for sentence in sent_tokenize(text, language='german')
        ]

    # Blank out punctuation and digits.
    text = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', text)

    # Remove stop words and reduce the remaining words to their lemmas.
    text = removeStopwords(text)
    text = lemmatize(text)

    # get_BM25_Prediction lower-cases the query again *after* lemmatisation.
    # Do the same for documents so that both sides produce identically cased
    # tokens, whatever casing the lemmatizer emits.
    return word_tokenize(text.lower(), language='german')

# Stop-word removal
def removeStopwords(text):
    """Return ``text`` without German stop words.

    The text is split on whitespace; the remaining words are joined with
    single spaces.
    """
    # Build the set once per call: testing membership against the list is
    # linear in the number of stop words for every single word.
    stopword_set = set(german_stopwords)
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Lemmatisation
def lemmatize(text):
    """Run ``text`` through the spaCy pipeline and join the token lemmas with spaces."""
    lemmas = []
    for token in lemmatizer(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Data cleaning
def cleanData(text):
    """Lower-case ``text`` and reduce it to letters and blanks.

    Applied in order: drop isolated single ASCII letters, collapse whitespace
    runs, replace every character that is not a (German) letter with a blank.
    """
    substitutions = (
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
        ('[^a-zA-ZäöüÄÖÜß]', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# Token lists for BM25 (second argument False selects the token branch)
BM25_preprocessedData = {
    doc_id: preprocessing(doc['content'], False) for doc_id, doc in data.items()
}

# Sentence lists for the sentence encoder (second argument True)
BERT_preprocessedData = {
    doc_id: preprocessing(doc['content'], True) for doc_id, doc in data.items()
}

In [None]:
# Load the sentence encoder and build the BM25 index over the tokenised documents.
# The index is built from the dict values only, so document ids are not known
# to it; get_BM25_Prediction identifies documents by their position instead.
bert_model = SentenceTransformer("svalabs/bi-electra-ms-marco-german-uncased")
bm25 = BM25Okapi([tokens for tokens in BM25_preprocessedData.values()])

In [None]:
# Pre-compute the sentence embeddings of every document
encodedDocs = dict()

# Turn each document into a list of sentence vectors
for key, text in BERT_preprocessedData.items():
    # preprocessing() replaces digits and punctuation with blanks, so a
    # sentence can consist of whitespace only; strip() filters those as well
    # as truly empty strings.
    sentences = [sentence for sentence in text if sentence.strip()]
    # One batched encode() call per document instead of one call per sentence;
    # list() keeps the stored value a list of per-sentence vectors.
    encodedDocs[key] = list(bert_model.encode(sentences)) if sentences else []

In [None]:
# BERT prediction
def get_BERT_Prediction(query, *, top_n=5):
    """Rank the documents by embedding similarity to ``query``.

    A document's score is the mean of its ``top_n`` highest cosine
    similarities between the encoded query and its sentence vectors in
    ``encodedDocs``.

    Args:
        query: Query string; passed to ``bert_model.encode`` unchanged.
        top_n: Number of best-matching sentences averaged per document
            (keyword-only, default 5).

    Returns:
        dict: document id -> score, ordered by descending score. Documents
        without any sentence vector are left out.
    """
    queryEncoded = bert_model.encode(query)
    bertScores = dict()

    for docID, sentenceVectors in encodedDocs.items():
        if len(sentenceVectors) == 0:
            # np.mean([]) is nan; a nan score would corrupt the sort below and
            # the min/max normalisation done in get_Combined_Prediction.
            continue
        scores = sorted(
            (cosineSimilarity(queryEncoded, sv) for sv in sentenceVectors),
            reverse=True,
        )
        bertScores[docID] = np.mean(scores[:top_n])
    return dict(sorted(bertScores.items(), key=operator.itemgetter(1), reverse=True))

# BM25 prediction
def get_BM25_Prediction(query):
    """Score all documents against ``query`` with the BM25 index.

    The query is cleaned, stripped of stop words, lemmatised, lower-cased and
    tokenised before scoring.

    Returns:
        dict: position in the score array returned by ``bm25.get_scores`` ->
        score, ordered by descending score; entries with score 0 are omitted.
    """
    # NOTE(review): keys are positions, not document ids.
    # get_Combined_Prediction looks them up via int(docId), which presumes
    # that document ids equal 0-based corpus positions - confirm against the data.
    normalized = lemmatize(removeStopwords(cleanData(query)))
    queryTokens = word_tokenize(normalized.lower(), language='german')

    nonZero = {
        position: score
        for position, score in enumerate(bm25.get_scores(queryTokens))
        if score != 0
    }
    return dict(sorted(nonZero.items(), key=operator.itemgetter(1), reverse=True))

# Combined prediction
def _minMaxNormalize(scores):
    """Scale the values of ``scores`` linearly to the range [0, 1].

    When all values are equal (including a single entry) there is no range to
    divide by; every entry is then mapped to 1.
    """
    if len(scores) == 0:
        return dict()
    values = list(scores.values())
    low, high = np.min(values), np.max(values)
    if high == low:
        return {docId: 1 for docId in scores}
    return {docId: (score - low) / (high - low) for docId, score in scores.items()}


def get_Combined_Prediction(query, k1=0.665):
    """Rank the documents by a weighted sum of normalised BERT and BM25 scores.

    Args:
        query: Query string.
        k1: Weight of the BERT score; the BM25 score is weighted with
            ``1 - k1``.

    Returns:
        dict: document id (key of ``data``) -> combined score, ordered by
        descending score. If BM25 matches no document, the BERT ranking is
        returned as is (scores not normalised).
    """
    bertScores = get_BERT_Prediction(query)
    bm25Scores = get_BM25_Prediction(query)

    # No lexical match at all: fall back to the BERT ranking.
    if len(bm25Scores) == 0:
        return bertScores

    bertScores = _minMaxNormalize(bertScores)
    bm25Scores = _minMaxNormalize(bm25Scores)

    combinedScores = dict()
    for docId in data.keys():
        # BERT scores are keyed by the document ids themselves (the keys of
        # data), so they must be looked up without an int() conversion.
        bertScore = bertScores.get(docId, 0)
        # BM25 scores are keyed by integer position.
        # NOTE(review): this presumes document ids equal corpus positions.
        bm25Score = bm25Scores.get(int(docId), 0)
        combinedScores[docId] = k1 * bertScore + (1 - k1) * bm25Score
    return dict(sorted(combinedScores.items(), key=operator.itemgetter(1), reverse=True))
    
# Cosine similarity
def cosineSimilarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Returns 0.0 when either vector has zero norm.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of
        # the nan that 0/0 would produce.
        return 0.0
    return np.dot(vector1, vector2) / denominator

In [None]:
#Evaluierungsmetriken

# Precision at a 0-based rank index
def precision(predicted, truth, k):
    """Precision of the ranking prefix ``predicted[0]`` .. ``predicted[k]``.

    ``k`` is a 0-based index, so the prefix contains ``k + 1`` entries.

    Args:
        predicted: Ranked document ids (each convertible with ``int``).
        truth: Collection of relevant document ids (ints).
        k: Index of the last ranking position to include.

    Returns:
        float: Share of relevant documents among the first ``k + 1`` entries.

    Raises:
        ValueError: If ``k`` is negative.
        IndexError: If ``k`` lies beyond the end of ``predicted``.
    """
    if k < 0:
        raise ValueError("k must be a non-negative rank index")
    cutoff = k + 1
    hits = sum(1 for rank in range(cutoff) if int(predicted[rank]) in truth)
    return hits / cutoff

# Average precision (AP)
def average_precision(predicted, relevant, query):
    """Average precision of one ranking.

    Sums the precision at every rank that holds a relevant document and
    divides by the number of relevant documents.

    Args:
        predicted: Ranked document ids (each convertible with ``int``).
        relevant: Collection of relevant document ids (ints).
        query: Key under which the value is recorded in ``predictions``.

    Returns:
        float: The average precision; 0.0 when ``relevant`` is empty.

    Side effects:
        Sets ``predictions[query]`` to ``{'MAP': value}``, replacing any
        earlier entry for that query.
    """
    scores = dict()
    averageSum = 0.0
    # Without relevant documents there is nothing to divide by; AP stays 0.
    if len(relevant) != 0:
        hits = 0
        for rank, docId in enumerate(predicted):
            if int(docId) in relevant:
                # Running hit count: hits / (rank + 1) is the precision of the
                # prefix ending at this rank, without recounting the prefix.
                hits += 1
                averageSum += (hits / (rank + 1)) / len(relevant)
    scores['MAP'] = averageSum
    predictions[query] = scores
    return averageSum

# Mean average precision (MAP)
def mean_average_precision(predicted, relevant):
    """Mean of the average precision over all rankings.

    Args:
        predicted: One ranked id list per query.
        relevant: One collection of relevant ids per query, aligned with
            ``predicted``.

    Returns:
        float: The mean average precision; 0.0 when ``predicted`` is empty.

    The query text used as the key in ``predictions`` is read from the
    module-level ``queries`` at the same position.
    """
    if len(predicted) == 0:
        return 0.0
    summe = 0
    for i in range(len(predicted)):
        summe += average_precision(predicted[i], relevant[i], queries[i][0])
    # Average over the rankings actually evaluated, as mean_reciprocal_rank
    # does, not over the length of the global query list.
    return summe / len(predicted)

# Reciprocal rank (RR)
def reciprocal_rank(predicted, relevant, query):
    """Return 1 / (1-based rank of the first relevant document), or 0 if none is ranked.

    ``query`` is accepted but not used.
    """
    for position, docId in enumerate(predicted, start=1):
        if int(docId) in relevant:
            return 1 / position
    return 0

# Mean reciprocal rank (MRR)
def mean_reciprocal_rank(predicted, relevant):
    """Mean of the reciprocal rank over all rankings.

    Args:
        predicted: One ranked id list per query.
        relevant: One collection of relevant ids per query, aligned with
            ``predicted``.

    Returns:
        float: The mean reciprocal rank; 0.0 when ``predicted`` is empty.

    Side effects:
        Stores each query's reciprocal rank as ``predictions[query]['MRR']``;
        the query text is read from the module-level ``queries`` at the same
        position.
    """
    if len(predicted) == 0:
        return 0.0
    summe = 0
    for i in range(len(predicted)):
        query = queries[i][0]
        rec_rank = reciprocal_rank(predicted[i], relevant[i], query)
        summe += rec_rank
        # setdefault: do not depend on average_precision having created the
        # entry for this query beforehand.
        predictions.setdefault(query, dict())['MRR'] = rec_rank
    return summe / len(predicted)

def calculateMetrics(predictFunction, queries, k1=0.665):
    """Run ``predictFunction`` on every query and return (MAP, MRR).

    Args:
        predictFunction: Callable ``(query_text, k1) -> dict`` whose key order
            is the ranking.
        queries: Sequence of entries with the query text at index 0 and the
            relevant ids at index 1.
        k1: Weight forwarded to ``predictFunction``.

    Note: mean_average_precision and mean_reciprocal_rank read the query text
    from the module-level ``queries`` by position, not from this argument.
    """
    evaluated = [
        (list(predictFunction(entry[0], k1).keys()), entry[1]) for entry in queries
    ]
    rankings = [ranking for ranking, _ in evaluated]
    groundTruth = [truth for _, truth in evaluated]
    return (
        mean_average_precision(rankings, groundTruth),
        mean_reciprocal_rank(rankings, groundTruth),
    )

In [None]:
def evaluate():
    """Evaluate the combined BERT + BM25 ranking on the test queries and print MAP and MRR."""
    metrics = calculateMetrics(get_Combined_Prediction, queries)
    mean_av_pr, mean_re_r = metrics
    print(f'BERT + BM25 | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)}')


evaluate()

In [None]:
# Load the metric values stored so far (one entry per query)
with open("../Data/Visualization/Predictions.json", encoding='utf-8') as predictions_file:
    allPredictions = json.load(predictions_file)

# Attach this notebook's metrics under the 'BERT + BM25' key of every query
for query in allPredictions:
    allPredictions[query]['BERT + BM25'] = predictions[query]

# Write the merged predictions back to the same file
with open('../Data/Visualization/Predictions.json', 'w', encoding='utf-8') as predictions_file:
    json.dump(allPredictions, predictions_file, indent=4, ensure_ascii=False)

In [None]:
# Determine the score distribution of relevant and non-relevant documents
def calculateQuantile():
    """Collect the combined scores of all queries, split by relevance.

    Returns:
        dict: ``{'Relevant': [...], 'Non-Relevant': [...]}`` with the scores
        converted to ``np.float64``, in query order and ranking order.
    """
    relevantScores, otherScores = [], []
    for q in queries:
        truth = q[1]
        for docID, score in get_Combined_Prediction(q[0]).items():
            bucket = relevantScores if int(docID) in truth else otherScores
            bucket.append(np.float64(score))
    return {'Relevant': relevantScores, 'Non-Relevant': otherScores}
scoreDistribution = calculateQuantile()

# Persist the score distribution
with open('../Data/Visualization/CombinedScoreDistribution.json', 'w', encoding='utf-8') as distribution_file:
    json.dump(scoreDistribution, distribution_file, indent=4, ensure_ascii=False)

In [None]:
# Load the stored metric values again.
# NOTE(review): this repeats the merge-and-save step of the cell that follows
# evaluate(); `predictions` is not modified in between by the visible code.
with open("../Data/Visualization/Predictions.json", encoding='utf-8') as predictions_file:
    allPredictions = json.load(predictions_file)

# Attach this notebook's metrics under the 'BERT + BM25' key of every query
for query in allPredictions:
    allPredictions[query]['BERT + BM25'] = predictions[query]

# Write the merged predictions back to the same file
with open('../Data/Visualization/Predictions.json', 'w', encoding='utf-8') as predictions_file:
    json.dump(allPredictions, predictions_file, indent=4, ensure_ascii=False)

In [None]:
# Determine k1 (the weighting parameter) by random search

# Load the results of earlier runs
with open("../Data/Visualization/OptimizationResults.json", encoding='utf-8') as file:
    result = json.load(file)

# Random search: evaluate up to 200 weights drawn uniformly from [0, 1).
# NOTE(review): the draws are not seeded in this cell, so each run adds new
# samples to the stored results.
for count in range(200):
    i = random.random()
    if count % 10 == 0:
        print(count)
    # JSON object keys are strings, so weights stored by earlier runs come
    # back as str. Use the same representation so that the duplicate check can
    # match and the dict does not mix float and str keys.
    key = str(i)
    if key in result:
        continue
    mean_av_pr, mean_re_r = calculateMetrics(get_Combined_Prediction, queries, i)
    result[key] = [mean_av_pr, mean_re_r]

# Save the optimisation results
with open('../Data/Visualization/OptimizationResults.json', 'w', encoding='utf-8') as fp:
    json.dump(result, fp, indent=4, ensure_ascii=False)