In [1]:
import json
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import operator
import datetime

In [2]:
#Die Daten, Testanfragen und Modelle laden
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)
    
#Deutsche Stopwörter und Lemmatizer laden
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

#Vorhersagen abspeichern
predictions = dict()

In [3]:
# Preprocessing der PDF-Texte
def preprocessing(sentence):
    
    # Daten bereinigen
    sentence = cleanData(sentence)
    
    # Stopwörter entfernen und Lemmas erzeugen
    sentence = removeStopwords(sentence)
    sentence = lemmatize(sentence)
    
    # Tokenization
    sentence = word_tokenize(sentence, language='german')
    
    return sentence

#Data Cleaning
def cleanData(text):
    # Kleinbuchstaben
    text = text.lower()
    
    # Einzelne Zeichen entfernen
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    
    # Mehrere Leerzeichen entfernen
    text = re.sub(r'\s+', ' ', text)
    
    # Punkte und Zahlen entfernen
    text = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', text)
    return text

#Stopword Removal
def removeStopwords(text):
    return ' '.join([word for word in text.split() if word not in german_stopwords])

#Lemmatization
def lemmatize(text):
    doc = lemmatizer(text)
    return ' '.join([x.lemma_ for x in doc]) 

In [4]:
#Daten bereinigen
preprocessedData = dict()
for key,value in data.items():
    preprocessedData[key] = preprocessing(value['content'])   

In [5]:
#TF-IDF für alle Dokumente und Terme berechnen
idf,tf,tfidf = dict(),dict(),dict()

# idf und tf berechnen
for key,text in preprocessedData.items():
    wordCountDict = dict()
    # Durch jeden Token im Text iterieren
    for token in text:
        # Wenn das Wort noch nicht vorher gesehen wurde
        if token not in wordCountDict:
            wordCountDict[token] = 1
        # Wenn das Wort vorher schon gesehen wurde
        else:
            wordCountDict[token] += 1
        try:
            idf[token].add(key) 
        except:
            idf[token] = {key}
    tf[key] = wordCountDict

#idf anpassen
for i in idf:
    idf[i] = np.log(len(preprocessedData)/len(idf[i]))

#tf in Wahrscheinlichkeiten umrechnen
for key,text in preprocessedData.items():
    for k,v in tf[key].items():
        tf[key][k] = v/ len(text)

#tfidf berechnen
for key, value in tf.items():
    for k, v in value.items():
        tfidf[key,k] = v * idf[k]
    

# Voraussage anhand des TF-IDF Modells
def getTFIDF_Prediction(query):
    #Vorverarbeitung der Anfrage
    query= cleanData(query)
    query = removeStopwords(query)
    query = lemmatize(query)
    query = word_tokenize(query.lower(), language='german')
    
    tfidfScores = dict()
    # Durch alle TF-IDF Scores iterieren
    for key in tfidf.keys(): 
        docId, word = key[0],key[1]
        # Wenn das aktuelle Wort in der Anfrage vorkommt
        if word in query:
            # Wenn bereits ein Wert für das Dokument vorhanden ist
            if docId not in tfidfScores:
                tfidfScores[docId] = tfidf[key]
            # Wenn noch kein  Wert für das Dokument vorhanden ist
            else:
                tfidfScores[docId] += tfidf[key]
    # Rückgabe eines sortierten Dictioanries anhand der Relevanz-Werte
    return dict(sorted(tfidfScores.items(), key=operator.itemgetter(1),reverse=True))

In [6]:
#Evalierungsmetriken

#Precision@k
def precision(predicted, truth, k):
    summe,count = 0, 0
    for i in range(0,k+1):
        if int(predicted[i]) in truth:
            count+=1
    summe+= count/ (i+1)
    return summe

#AP
def average_precision(predicted,relevant,query):
    averageSum = 0
    scores = dict()
    for i in range(0,len(predicted)):
        summe = 0  
        if int(predicted[i]) in relevant:
            summe+= precision(predicted, relevant, i)
        averageSum += summe/len(relevant)
    scores['MAP'] = averageSum
    predictions[query] = scores
    return averageSum

#MAP
def mean_average_precision(predicted,relevant):
    summe = 0
    for i in range(0,len(predicted)):
        summe += average_precision(predicted[i],relevant[i],queries[i][0])
    return summe/len(queries)

#RR
def reciprocal_rank(predicted,relevant, query):
    for i in range(0, len(predicted)):
        if int(predicted[i]) in relevant:
            return 1/(i+1)
    return 0

#MRR
def mean_reciprocal_rank(predicted,relevant):
    summe = 0
    for i in range(0,len(predicted)):
        rec_rank = reciprocal_rank(predicted[i],relevant[i] ,queries[i][0])
        summe += rec_rank
        predictions[queries[i][0]]['MRR'] = rec_rank
    return summe/len(predicted)

def calculateMetrics(predictFunction,queries):
    predicted = []
    relevant = []
    for q in queries:
        predicted.append(list(predictFunction(q[0]).keys()))
        relevant.append(q[1])
    mean_average_p = mean_average_precision(predicted, relevant)
    mean_reciprocal_r = mean_reciprocal_rank(predicted, relevant)
    return mean_average_p, mean_reciprocal_r

In [7]:
def evaluate():
    a = datetime.datetime.now()
    mean_av_pr, mean_re_r = calculateMetrics(getTFIDF_Prediction,queries)
    b = datetime.datetime.now()
    print(f'TF-IDF | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)} Anfragebearbeitung {b-a}')
evaluate()

TF-IDF | MAP: 0.6695976167917616, MRR: 0.6854547914017511 Anfragebearbeitung 0:00:03.854872


In [9]:
#AllPredictions laden
with open("../Data/Visualization/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

for query, scores in allPredictions.items():
    scores['TF-IDF'] = predictions[query]

#Predictions abspeichern
with open('../Data/Visualization/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)