In [1]:
import json
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import operator
import fasttext
from scipy.stats import pearsonr
from scipy.stats import entropy
import datetime

C:\Users\Melvin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll
C:\Users\Melvin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
# Load the documents, the test queries and the models
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)
    
# Load the German stop words and the lemmatizer.
# The stop words are kept in a set: they are only used for membership tests,
# and a set lookup is constant-time whereas a list is scanned for every word.
german_stopwords = set(stopwords.words('german'))
lemmatizer = spacy.load('de_core_news_sm')

# Load the word-embedding model and its vocabulary
w2vModel = fasttext.load_model('../Data/Word2Vec/cc.de.300.bin')
w2vvocab = w2vModel.get_words()

# Sentence-wise (True) or word-wise (False) processing
sentence_wise = True

# Per-query evaluation results, filled in by the metric functions
predictions = dict()



In [3]:
# Preprocessing of the PDF texts
def preprocessing(text):
    """Normalise a raw document text and split it into tokens.

    The text is lower-cased, isolated single ASCII letters are dropped and
    whitespace runs are collapsed. Depending on the module-level
    ``sentence_wise`` flag the result is then either

    * a list with one token list per sentence (``sentence_wise`` true), or
    * one flat token list for the whole text (``sentence_wise`` false).
    """

    def _to_tokens(fragment):
        # Replace everything that is not a (German) letter by a blank
        letters_only = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', fragment)
        # Remove stop words, lemmatize, then split into word tokens
        reduced = lemmatize(removeStopwords(letters_only))
        return word_tokenize(reduced, language='german')

    # Lower-case, drop single characters, collapse multiple blanks
    normalised = re.sub(r"\s+[a-zA-Z]\s+", ' ', text.lower())
    normalised = re.sub(r'\s+', ' ', normalised)

    # Word-based tokenizer
    if not sentence_wise:
        return _to_tokens(normalised)

    # Sentence-based tokenizer
    return [_to_tokens(sentence) for sentence in sent_tokenize(normalised, language='german')]

# Data cleaning
def cleanData(text):
    """Lower-case ``text`` and blank out everything except (German) letters.

    Steps, in this order: lower-casing, removal of isolated single ASCII
    letters, collapsing of whitespace runs, and replacement of every remaining
    non-letter character by a single blank.
    """
    substitutions = (
        (r"\s+[a-zA-Z]\s+", ' '),      # single characters surrounded by whitespace
        (r'\s+', ' '),                 # multiple whitespace characters
        ('[^a-zA-ZäöüÄÖÜß]', ' '),     # punctuation, digits and other symbols
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Stop word removal
def removeStopwords(text):
    """Return ``text`` without the whitespace-separated words found in ``german_stopwords``.

    The remaining words are joined by single blanks.
    """
    kept = []
    for candidate in text.split():
        if candidate in german_stopwords:
            continue
        kept.append(candidate)
    return ' '.join(kept)

# Lemmatization
def lemmatize(text):
    """Run ``text`` through the module-level ``lemmatizer`` and join the token lemmas with blanks."""
    lemmas = (token.lemma_ for token in lemmatizer(text))
    return ' '.join(lemmas)

In [4]:
# Clean the data: map every document key to its preprocessed 'content' text
preprocessedData = {key: preprocessing(value['content']) for key, value in data.items()}

In [5]:
# Similarity metrics
def cosineSimilarity(vector1, vector2):
    """Cosine similarity of two vectors.

    Args:
        vector1, vector2: numeric vectors of equal length.

    Returns:
        dot(vector1, vector2) / (|vector1| * |vector2|), or 0.0 when either
        vector has zero length. Without that guard the division is 0/0 and the
        NaN result makes any later sorting by score unreliable.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator

def pearsonSimilarity(vector1, vector):
    """Pearson correlation coefficient of the two vectors.

    Only the first element of the ``pearsonr`` result is returned; the
    remaining part of the result is discarded.
    """
    result = pearsonr(vector1, vector)
    return result[0]

def euclideanDistance(vector1, vector2):
    """Norm of the difference ``vector1 - vector2``."""
    difference = vector1 - vector2
    return np.linalg.norm(difference)

def klDivergence(vector1, vector2):
    """Relative entropy between two vectors after min-max scaling.

    Both vectors are scaled with their common minimum and maximum, every
    scaled entry that is not greater than 0.0001 is set to 0.0001, and the two
    resulting lists are passed to ``scipy.stats.entropy`` (first argument:
    ``vector1``, second argument: ``vector2``).
    """
    floor = 0.0001
    lowest = min(np.min(vector1), np.min(vector2))
    highest = max(np.max(vector1), np.max(vector2))
    span = highest - lowest

    def _scaled_and_floored(vector):
        # Min-max scale, then lift small entries to the floor value
        return [entry if entry > floor else floor for entry in (vector - lowest) / span]

    first = _scaled_and_floored(vector1)
    second = _scaled_and_floored(vector2)
    return entropy(first, second)

In [6]:
#Methoden für VectorSpaceModel

# Vector representation of a text: element-wise mean of its token embeddings
def get_average_embedding(text):
    """Average the ``w2vModel`` word vectors of all tokens in ``text``.

    Args:
        text: iterable of tokens.

    Returns:
        The element-wise mean of the token vectors. For an empty token list a
        zero vector of the model dimension is returned; ``np.mean`` of an empty
        list would otherwise give a NaN scalar (plus a warning) that does not
        have the shape of an embedding.
    """
    embedding = [w2vModel.get_word_vector(token) for token in text]
    if not embedding:
        return np.zeros(w2vModel.get_dimension(), dtype=np.float32)
    return np.mean(embedding, axis=0)

# Prediction based on the Word2Vec model
def get_w2v_Prediction(query, top_k=5):
    """Rank all documents in ``w2vDocs`` by cosine similarity to ``query``.

    Args:
        query: raw query string.
        top_k: only used when ``sentence_wise`` is true; number of
            best-matching sentences whose similarities are averaged into the
            document score (default 5).

    Returns:
        dict mapping document key -> score, ordered by descending score.
        In sentence-wise mode a document without any sentence vector gets the
        score ``-inf`` so that it ranks last; averaging an empty score list
        would give NaN, and NaN values make the sort order undefined.
    """
    # Preprocess the query: clean, drop stop words, lemmatize, tokenize
    query_tokens = word_tokenize(
        lemmatize(removeStopwords(cleanData(query))), language='german'
    )
    queryVector = get_average_embedding(query_tokens)

    w2vScores = dict()
    # Iterate over all documents and compute the cosine similarity
    for doc_key, doc_vectors in w2vDocs.items():
        if sentence_wise:
            scores = sorted(
                (cosineSimilarity(queryVector, sv) for sv in doc_vectors),
                reverse=True,
            )
            if scores:
                w2vScores[doc_key] = np.mean(scores[:top_k])
            else:
                w2vScores[doc_key] = float('-inf')
        else:
            w2vScores[doc_key] = cosineSimilarity(queryVector, doc_vectors)
    return dict(sorted(w2vScores.items(), key=operator.itemgetter(1), reverse=True))

In [7]:
# Vector space model based on Word2Vec
w2vDocs = dict()

# Convert the documents into vectors
for key, text in preprocessedData.items():
    if not sentence_wise:
        # Word-wise mode: one averaged vector for the whole document
        w2vDocs[key] = get_average_embedding(text)
        continue
    # Sentence-wise mode: one averaged vector per non-empty sentence
    w2vDocs[key] = [
        get_average_embedding(sentence) for sentence in text if len(sentence) != 0
    ]

In [8]:
# Evaluation metrics

# Precision@k
def precision(predicted, truth, k):
    """Precision of a ranking up to and including position ``k``.

    Args:
        predicted: ranked document ids; each entry must be convertible with ``int``.
        truth: collection of relevant document ids (integers).
        k: zero-based index of the last inspected rank, i.e. the first
            ``k + 1`` entries of ``predicted`` are considered.

    Returns:
        Number of relevant documents among the first ``k + 1`` predictions,
        divided by ``k + 1``. If the ranking is shorter than ``k + 1`` the
        missing positions count as non-relevant.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be a non-negative, zero-based rank index")
    cutoff = k + 1
    hits = sum(1 for doc_id in predicted[:cutoff] if int(doc_id) in truth)
    return hits / cutoff

# Average precision (AP)
def average_precision(predicted,relevant,query):
    """Average precision of one ranking.

    Args:
        predicted: ranked document ids; each entry must be convertible with ``int``.
        relevant: collection of relevant document ids (integers).
        query: key under which the result is stored in ``predictions``.

    Returns:
        Sum over all relevant hits of (precision at the hit's rank), divided
        by ``len(relevant)``. Returns 0.0 when ``relevant`` is empty instead
        of dividing by zero.

    Side effect:
        ``predictions[query]`` is replaced by ``{'MAP': <result>}``.
    """
    averageSum = 0.0
    if relevant:
        hits = 0
        # A running hit count gives the precision at each rank in one pass
        for rank, doc_id in enumerate(predicted, start=1):
            if int(doc_id) in relevant:
                hits += 1
                averageSum += (hits / rank) / len(relevant)
    predictions[query] = {'MAP': averageSum}
    return averageSum

# Mean average precision (MAP)
def mean_average_precision(predicted,relevant):
    """Mean of ``average_precision`` over all rankings.

    Args:
        predicted: list of rankings, one per query.
        relevant: list of relevant-id collections, aligned with ``predicted``.

    Returns:
        The average precision values summed and divided by the number of
        rankings that were actually evaluated (``len(predicted)``), or 0.0 for
        an empty input. The query text passed on to ``average_precision`` is
        read from the module-level ``queries`` at the same index.
    """
    if not predicted:
        return 0.0
    total = sum(
        average_precision(predicted[i], relevant[i], queries[i][0])
        for i in range(len(predicted))
    )
    return total / len(predicted)

# Reciprocal rank (RR)
def reciprocal_rank(predicted,relevant, query):
    """Return 1 / rank of the first relevant entry in ``predicted``, or 0 if there is none.

    Ranks are counted from 1. ``query`` is accepted but not used.
    """
    first_hit = next(
        (rank for rank, doc_id in enumerate(predicted, start=1) if int(doc_id) in relevant),
        None,
    )
    if first_hit is None:
        return 0
    return 1 / first_hit

# Mean reciprocal rank (MRR)
def mean_reciprocal_rank(predicted,relevant):
    """Mean of ``reciprocal_rank`` over all rankings.

    Args:
        predicted: list of rankings, one per query.
        relevant: list of relevant-id collections, aligned with ``predicted``.

    Returns:
        The reciprocal ranks summed and divided by ``len(predicted)``, or 0.0
        for an empty input.

    Side effect:
        Stores each query's reciprocal rank under ``predictions[query]['MRR']``,
        with the query text read from the module-level ``queries`` at the same
        index. The per-query entry is created when it does not exist yet, so
        this function does not depend on another metric having run first.
    """
    if not predicted:
        return 0.0
    summe = 0
    for i in range(len(predicted)):
        query = queries[i][0]
        rec_rank = reciprocal_rank(predicted[i], relevant[i], query)
        summe += rec_rank
        predictions.setdefault(query, dict())['MRR'] = rec_rank
    return summe / len(predicted)

def calculateMetrics(predictFunction,queries):
    """Run ``predictFunction`` on every query and compute MAP and MRR.

    Each entry of ``queries`` provides the query text at index 0 and the
    relevant ids at index 1. ``predictFunction`` must return a mapping whose
    key order is the ranking.

    Returns:
        Tuple ``(mean average precision, mean reciprocal rank)``.
    """
    predicted = [list(predictFunction(entry[0]).keys()) for entry in queries]
    relevant = [entry[1] for entry in queries]
    return (
        mean_average_precision(predicted, relevant),
        mean_reciprocal_rank(predicted, relevant),
    )

In [9]:
def evaluate():
    """Compute MAP and MRR of the Word2Vec ranking on ``queries`` and print them with the elapsed time."""
    started = datetime.datetime.now()
    mean_av_pr, mean_re_r = calculateMetrics(get_w2v_Prediction, queries)
    elapsed = datetime.datetime.now() - started
    print(f'Word2Vec | MAP: {str(mean_av_pr)}, MRR: {str(mean_re_r)}, Anfragebearbeitung {elapsed}')


evaluate()

Word2Vec | MAP: 0.5279918340971889, MRR: 0.5412312887641849, Anfragebearbeitung 0:00:24.402282


In [10]:
# Load the predictions stored so far
predictions_path = "../Data/Visualization/Predictions.json"
with open(predictions_path, encoding='utf-8') as source:
    allPredictions = json.load(source)

# Attach this notebook's per-query scores under the 'Word2Vec' key
for query in allPredictions:
    allPredictions[query]['Word2Vec'] = predictions[query]

# Write the merged predictions back to the same file
with open(predictions_path, 'w', encoding='utf-8') as sink:
    json.dump(allPredictions, sink, indent=4, ensure_ascii=False)