In [1]:
import json
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import operator
import fasttext
from scipy.stats import pearsonr, entropy
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import datetime

In [2]:
#Die Daten, Testanfragen und Modelle laden
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)
    
#Deutsche Stopwörter und Lemmatizer laden
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

#Word2Vec Model laden
w2vModel = fasttext.load_model('../Data/Word2Vec/cc.de.300.bin')
w2vvocab = w2vModel.get_words()
word2Vec = False

#Vorhersagen abspeichern
predictions = dict()



In [3]:
# Preprocessing der PDF-Texte
def preprocessing(sentence):
    
    # Daten bereinigen
    sentence = cleanData(sentence)
    
    # Stopwörter entfernen und Lemmas erzeugen
    sentence = removeStopwords(sentence)
    sentence = lemmatize(sentence)
    
    # Tokenization
    sentence = word_tokenize(sentence, language='german')
    return sentence

#Data Cleaning
def cleanData(text):
    # Kleinbuchstaben
    text = text.lower()
    
    # Einzelne Zeichen entfernen
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    
    # Mehrere Leerzeichen entfernen
    text = re.sub(r'\s+', ' ', text)
    
    # Punkte und Zahlen entfernen
    text = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', text)
    return text

#Stopword Removal
def removeStopwords(text):
    return ' '.join([word for word in text.split() if word not in german_stopwords])

#Lemmatization
def lemmatize(text):
    doc = lemmatizer(text)
    return ' '.join([x.lemma_ for x in doc]) 

In [4]:
#Daten bereinigen
preprocessedData = dict()
for key,value in data.items():
    preprocessedData[key] = preprocessing(value['content'])

In [10]:
# Doc2Vec laden und benötigte Zeit bestimmen
a1 = datetime.datetime.now()
documents = [TaggedDocument(doc, [i]) for i, doc in preprocessedData.items()]
doc2vecModel = Doc2Vec(documents, vector_size=300,window=5,min_count=1, workers=4,epochs=100) 
b1 = datetime.datetime.now()
modellTime = b1-a1

In [11]:
# Ähnlichkeitsmetriken

def cosineSimilarity(vector1, vector2):
    return np.dot(vector1,vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

def pearsonSimilarity(vector1, vector):
    return pearsonr(vector1, vector)[0]

def euclideanDistance(vector1, vector2):
    return np.linalg.norm(vector1-vector2)

def klDivergence(vector1, vector2):
    minVal = min(np.min(vector1),np.min(vector2))
    maxVal = max(np.max(vector1),np.max(vector2))
    v1normalized = (vector1-minVal)/(maxVal-minVal)
    v2normalized = (vector2-minVal)/(maxVal-minVal)
    v1normalized = [i if i > 0.0001 else 0.0001 for i in v1normalized]
    v2normalized = [i if i > 0.0001 else 0.0001 for i in v2normalized]
    return entropy(v1normalized,v2normalized)

In [12]:
#Methoden für VectorSpaceModel

# Vektorrepräsentation für einen Text anhand einfach Summation und Durchschnittsbildung
def get_w2v_embedding(text):
    embedding = []
    for token in text:
        embedding.append(w2vModel.get_word_vector(token))
    return np.mean(embedding, axis=0)

# Vektorrepräsentation für einen Text anhand einfach Summation und Durchschnittsbildung
def get_Doc2Vec_Embedding(text):
    if type(text) == str:
        text = word_tokenize(text, language='german')
    return doc2vecModel.infer_vector(text)

# Voraussage anhand des Word2Vec Modells
def get_d2v_Prediction(query):
    # Vorverarbeitung der Anfrage
    query= cleanData(query)
    query = removeStopwords(query)
    query = lemmatize(query)
    query = word_tokenize(query, language='german')
    if word2Vec:
        queryVector = get_w2v_embedding(query)
    else:
        queryVector = get_Doc2Vec_Embedding(query)
    d2vScores = dict()
    # Durch alle Dokumente iterieren und Kosinus-Ähnlichkeit berechnen
    for docId in preprocessedData.keys():
        d2vScores[docId] = cosineSimilarity(queryVector,doc2vecModel.docvecs[docId])
    return dict(sorted(d2vScores.items(), key=operator.itemgetter(1),reverse=True))

In [13]:
#Evalierungsmetriken

#Precision@k
def precision(predicted, truth, k):
    summe,count = 0, 0
    for i in range(0,k+1):
        if int(predicted[i]) in truth:
            count+=1
    summe+= count/ (i+1)
    return summe

#AP
def average_precision(predicted,relevant,query):
    averageSum = 0
    scores = dict()
    for i in range(0,len(predicted)):
        summe = 0  
        if int(predicted[i]) in relevant:
            summe+= precision(predicted, relevant, i)
        averageSum += summe/len(relevant)
    scores['MAP'] = averageSum
    predictions[query] = scores
    return averageSum

#MAP
def mean_average_precision(predicted,relevant):
    summe = 0
    for i in range(0,len(predicted)):
        summe += average_precision(predicted[i],relevant[i],queries[i][0])
    return summe/len(queries)

#RR
def reciprocal_rank(predicted,relevant, query):
    for i in range(0, len(predicted)):
        if int(predicted[i]) in relevant:
            return 1/(i+1)
    return 0

#MRR
def mean_reciprocal_rank(predicted,relevant):
    summe = 0
    for i in range(0,len(predicted)):
        rec_rank = reciprocal_rank(predicted[i],relevant[i] ,queries[i][0])
        summe += rec_rank
        predictions[queries[i][0]]['MRR'] = rec_rank
    return summe/len(predicted)

def calculateMetrics(predictFunction,queries):
    predicted = []
    relevant = []
    for q in queries:
        predicted.append(list(predictFunction(q[0]).keys()))
        relevant.append(q[1])
    mean_average_p = mean_average_precision(predicted, relevant)
    mean_reciprocal_r = mean_reciprocal_rank(predicted, relevant)
    return mean_average_p, mean_reciprocal_r

In [14]:
def evaluate():
    a = datetime.datetime.now()
    mean_av_pr, mean_re_r = calculateMetrics(get_d2v_Prediction,queries)
    b = datetime.datetime.now()
    print(f'Doc2Vec | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)}, Anfragebearbeitung: {b-a}, Modellaufbau: {modellTime}')
evaluate()

Doc2Vec | MAP: 0.5040156791124807, MRR: 0.5194021922242493, Anfragebearbeitung: 0:00:03.966916, Modellaufbau: 0:00:04.719060


In [15]:
#AllPredictions laden
with open("../Data/Visualization/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

for query, scores in allPredictions.items():
        scores['Doc2Vec'] = predictions[query]

#Predictions abspeichern
with open('../Data/Visualization/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)