In [None]:
import json
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import operator
from sklearn.model_selection import train_test_split
import keras
import fasttext
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import tensorflow as tf
import datetime
from numpy import asarray

In [None]:
# Load the documents, the test queries and the language models
# `data` maps a document key to a record; later cells read its 'content' field.
with open("../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

# `queries` is iterated as pairs: q[0] is the query text, q[1] the relevant document ids.
with open("../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)
    
# Load the German stop word list and the spaCy pipeline used for lemmatization
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

# Load the word-vector model (a fastText binary, despite the "w2v" naming)
w2vModel = fasttext.load_model('../Data/Word2Vec/cc.de.300.bin')
w2vvocab = w2vModel.get_words()

# Per-query scores collected by calculateMetrics and written out in the last cell
predictions = dict()

# Hyperparameters
# NOTE(review): BATCH_SIZE, EPOCHS and OPTIMIZER are not referenced by the cells
# visible here; createModel/trainModel use their own values ('sgd', 64, 20) - confirm
# which set is intended.
DROPOUT_RATE, BATCH_SIZE, EPOCHS = 0.5, 32, 10
LOSS_FUNCTION = 'categorical_crossentropy'
OPTIMIZER = 'adam'
ACTIVATION = 'relu'

In [None]:
# Preprocessing of the PDF texts
def preprocessing(text):
    """Turn a raw document string into a list of normalized German tokens.

    The text is passed through cleanData, removeStopwords and lemmatize (in
    that order) and the result is split with NLTK's German word tokenizer.

    Args:
        text: Raw document text.

    Returns:
        List of token strings.
    """
    # Each stage takes a string and returns a string, so they can be chained.
    for stage in (cleanData, removeStopwords, lemmatize):
        text = stage(text)
    return word_tokenize(text, language='german')

# Data cleaning
def cleanData(text):
    """Lower-case the text and strip everything except (German) letters.

    Applied in order: lower-casing, removal of isolated single ASCII letters
    surrounded by whitespace, collapsing of whitespace runs, and replacement
    of every character that is not a letter (including umlauts and eszett)
    by a single space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; it may still contain consecutive spaces where
        non-letter characters were replaced.
    """
    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
        ('[^a-zA-ZäöüÄÖÜß]', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Stop word removal
def removeStopwords(text):
    """Drop every whitespace-separated word that is in `german_stopwords`.

    Args:
        text: Input string.

    Returns:
        The remaining words joined by single spaces.
    """
    kept = filter(lambda word: word not in german_stopwords, text.split())
    return ' '.join(kept)

# Lemmatization
def lemmatize(text):
    """Replace every token of `text` by its lemma using the spaCy pipeline.

    Args:
        text: Input string.

    Returns:
        The lemmas of all tokens joined by single spaces.
    """
    lemmas = []
    for token in lemmatizer(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
# Preprocess the 'content' field of every document, keyed like `data`
preprocessedData = {
    docKey: preprocessing(record['content'])
    for docKey, record in data.items()
}

In [None]:
# Helpers for the vector space model
def get_Word2Vec_embedding(text):
    """Embed a token sequence as the mean of its word vectors.

    Args:
        text: Iterable of token strings.

    Returns:
        1-D numpy array: the element-wise mean of the vectors returned by
        `w2vModel.get_word_vector` for each token. For an empty token
        sequence a zero vector of the model's dimension is returned.
    """
    vectors = [w2vModel.get_word_vector(token) for token in text]
    if not vectors:
        # np.mean([]) would give a NaN scalar (plus a RuntimeWarning), which
        # breaks the shape of the input matrix built from these embeddings.
        return np.zeros(w2vModel.get_dimension(), dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# Evaluation metrics

# Precision at a zero-based rank k
def precision(truth, predicted, k):
    """Precision over the first k+1 entries of a ranking.

    Args:
        truth: Collection of relevant document ids.
        predicted: Ranked sequence of document ids (best first); each entry
            is converted with int() before the membership test.
        k: Zero-based index of the last rank to include, so the cutoff is
            k + 1 documents.

    Returns:
        Number of relevant documents among predicted[0..k] divided by k + 1.
        If the ranking has fewer than k + 1 entries, the missing ranks count
        as non-relevant.

    Raises:
        ValueError: If k is negative.
    """
    if k < 0:
        # The previous implementation divided by a loop variable that was
        # never bound in this case and failed with a NameError.
        raise ValueError(f"k must be non-negative, got {k}")
    cutoff = k + 1
    hits = sum(1 for document in predicted[:cutoff] if int(document) in truth)
    return hits / cutoff

def average_precision(predictedDocuments,relDocuments):
    """Average precision of a ranking with respect to a set of relevant ids.

    For every rank that holds a relevant document, the precision at that rank
    is added; the sum is divided by the number of relevant documents.

    Args:
        predictedDocuments: Ranked sequence of document ids (best first);
            each entry is converted with int() before the membership test.
        relDocuments: Collection of relevant document ids.

    Returns:
        The average precision, or 0.0 when `relDocuments` is empty (this case
        used to raise ZeroDivisionError).
    """
    relevantCount = len(relDocuments)
    if relevantCount == 0:
        return 0.0

    hits = 0
    averageSum = 0.0
    # A running hit count gives the precision at each rank in one pass,
    # instead of re-scanning the prefix for every relevant document.
    for rank, document in enumerate(predictedDocuments, start=1):
        if int(document) in relDocuments:
            hits += 1
            averageSum += (hits / rank) / relevantCount
    return averageSum

# Reciprocal rank
def reciprocal_rank(predictedDocuments, relDocuments):
    """Reciprocal of the 1-based rank of the first relevant document.

    Args:
        predictedDocuments: Ranked sequence of document ids (best first);
            each entry is converted with int() before the membership test.
        relDocuments: Collection of relevant document ids.

    Returns:
        1 / rank of the first relevant hit, or 0 if the ranking contains no
        relevant document.
    """
    rank = 0
    for candidate in predictedDocuments:
        rank += 1
        if int(candidate) in relDocuments:
            return 1/rank
    return 0

# Compute MAP and MRR for the neural network
def calculateMetrics(model,insert=True):
    """Evaluate `model` on all entries of `queries` and return (MAP, MRR).

    Every query is tokenized, embedded with get_Word2Vec_embedding and fed to
    the model in a single batch. Each output row holds one score per document
    index; the documents are ranked by descending score (ties keep ascending
    index order) and scored with average_precision and reciprocal_rank.

    Args:
        model: Object whose `predict` accepts a 2-D array of query embeddings
            and returns one row of document scores per query.
        insert: When true (the default), the per-query 'MAP' and 'MRR' values
            are stored in the module-level `predictions` dict under the query
            text. Pass False to evaluate without touching `predictions`.

    Returns:
        Tuple (mean average precision, mean reciprocal rank) over all
        queries; (0.0, 0.0) when there are no queries.
    """
    queryCount = len(queries)
    if queryCount == 0:
        return 0.0, 0.0

    # Build the input matrix: one embedding row per query.
    queryVectors = np.array([
        get_Word2Vec_embedding(word_tokenize(q[0], language='german'))
        for q in queries
    ])
    # One predict call for all queries; row i belongs to queries[i].
    allScores = model.predict(queryVectors)

    mapSum, mrrSum = 0, 0
    for q, documentScores in zip(queries, allScores):
        query, relDocuments = q[0], q[1]

        # Rank the document indices of THIS query's row by descending score.
        # (The earlier code iterated over the rows of an ever-growing batch
        # and therefore ranked query indices instead of documents.)
        ranking = np.argsort(-np.asarray(documentScores), kind='stable')
        predictedDocuments = ranking.tolist()

        ap = average_precision(predictedDocuments,relDocuments)
        rr = reciprocal_rank(predictedDocuments,relDocuments)
        mapSum += ap
        mrrSum += rr

        if insert:
            predictions[query] = {'MAP': ap, 'MRR': rr}

    return mapSum / queryCount, mrrSum / queryCount

In [None]:
# Neural network making a binary relevance decision per document: the input is the averaged word-vector embedding of the query, the output is one score per document

# Convert the queries into embeddings with multi-hot relevance labels
def createTestData():
    """Build the model inputs and labels from `queries` and split them.

    Each query text is tokenized and embedded with get_Word2Vec_embedding.
    Its label is a vector with one slot per entry of `preprocessedData`,
    set to 1 at the indices listed as relevant for that query.

    Returns:
        The four arrays produced by splitData: train inputs, test inputs,
        train labels, test labels.
    """
    documentCount = len(preprocessedData)
    inputData, labels = [], []
    for entry in queries:
        tokens = word_tokenize(entry[0], language='german')
        inputData.append(get_Word2Vec_embedding(tokens))

        relevance = np.zeros(documentCount)
        for document in entry[1]:
            relevance[document] = 1
        labels.append(relevance)
    return splitData(inputData, labels)

# Split the data into training and test sets
def splitData(x,y):
    """Split inputs and labels 80/20 with a fixed seed.

    Args:
        x: Sequence of input vectors.
        y: Sequence of label vectors, aligned with `x`.

    Returns:
        Tuple of numpy arrays: (train inputs, test inputs, train labels,
        test labels).
    """
    parts = train_test_split(x, y, test_size=0.2, random_state=42)
    xTrain, xTest, yTrain, yTest = (np.array(part) for part in parts)
    return xTrain, xTest, yTrain, yTest

In [None]:
# Build the training/test split used by the following cells
trainData, testData, trainLabel, testLabel = createTestData()

In [None]:
# Build and compile the neural network
def createModel(inputDim=None, outputDim=None, optimizer='sgd'):
    """Create the feed-forward network mapping a query embedding to document scores.

    Architecture: Dense(128) -> Dropout -> Dense(64) -> Dropout ->
    Dense(outputDim, sigmoid). Hidden activations, dropout rate and loss come
    from the module-level ACTIVATION, DROPOUT_RATE and LOSS_FUNCTION.

    Args:
        inputDim: Size of the input vector. Defaults to the length of the
            first row of the module-level `trainData`.
        outputDim: Number of output units (one per document). Defaults to
            len(preprocessedData).
        optimizer: Optimizer passed to compile; defaults to 'sgd' as before.
            NOTE(review): the module-level OPTIMIZER constant is 'adam' but
            was never used here - confirm which one is intended.

    Returns:
        The compiled Sequential model.
    """
    if inputDim is None:
        inputDim = len(trainData[0])
    if outputDim is None:
        outputDim = len(preprocessedData)

    perceptron = Sequential()
    perceptron.add(Dense(128, input_dim=inputDim, activation=ACTIVATION))
    perceptron.add(Dropout(DROPOUT_RATE))
    perceptron.add(Dense(64, activation=ACTIVATION))
    perceptron.add(Dropout(DROPOUT_RATE))
    perceptron.add(Dense(outputDim, activation='sigmoid'))
    # `metrics` is documented as a list; newer Keras versions reject a bare string.
    perceptron.compile(loss=LOSS_FUNCTION, optimizer=optimizer, metrics=['accuracy'])
    return perceptron

# Train the model
def trainModel(model,trainX,trainY,testX,testY,verb=2,batchSize=64,epochs=20):
    """Fit `model` on the training data, validating on the test data.

    Args:
        model: Object exposing a Keras-style `fit` method.
        trainX: Training inputs.
        trainY: Training labels.
        testX: Validation inputs.
        testY: Validation labels.
        verb: Verbosity forwarded to `fit`.
        batchSize: Batch size forwarded to `fit` (default 64, the value that
            used to be hard-coded).
        epochs: Number of epochs forwarded to `fit` (default 20, the value
            that used to be hard-coded).

    Returns:
        Whatever `model.fit` returns (the training history for Keras models).
    """
    return model.fit(
        trainX,
        trainY,
        verbose=verb,
        batch_size=batchSize,
        epochs=epochs,
        validation_data=(testX, testY),
    )

In [None]:
# Train the NN model and measure the wall-clock time for building and fitting it
a1 = datetime.datetime.now()
model = createModel()
history = trainModel(model,trainData,trainLabel,testData,testLabel)
b1 = datetime.datetime.now()
# Elapsed build + training time; reported by evaluate()
modelTime = b1 - a1

In [None]:
def evaluate():
    """Compute MAP and MRR for the trained model and print them with timings.

    Reads the module-level `model` and `modelTime`; the printed line also
    contains the time spent inside calculateMetrics.
    """
    started = datetime.datetime.now()
    meanAveragePrecision, meanReciprocalRank = calculateMetrics(model)
    elapsed = datetime.datetime.now() - started
    print(f'NN | MAP: {str(meanAveragePrecision)}, MRR: {str(meanReciprocalRank)}, Anfragebearbeitung {elapsed}, Modellaufbau {modelTime}')

# Evaluate the NN model
evaluate()

In [None]:
# Metrics obtained when training on only the first `trainSize` training samples
def calculateDevelopment(trainSize=50, runs=5):
    """Train fresh models on a truncated training set and average MAP/MRR.

    Call it with different `trainSize` values to see how the metrics develop
    with the amount of training data.

    Args:
        trainSize: Number of leading training samples to keep (default 50,
            the value that used to be hard-coded).
        runs: Number of models to train and evaluate (default 5).

    Returns:
        Tuple (mean MAP, mean MRR) over all runs.
    """
    # splitData uses a fixed random_state, so the split is the same on every
    # call and only needs to be built once.
    trainData, testData, trainLabel, testLabel = createTestData()
    trainData, trainLabel = trainData[:trainSize], trainLabel[:trainSize]

    maps, mrrs = [], []
    for _ in range(runs):
        tempModel = createModel()
        trainModel(tempModel,trainData,trainLabel,testData,testLabel,verb=0)

        # insert=False: these throw-away models must not replace the
        # per-query scores of the main model in `predictions`.
        result = calculateMetrics(tempModel, insert=False)
        print(result)
        maps.append(result[0])
        mrrs.append(result[1])
    return np.mean(maps), np.mean(mrrs)

calculateDevelopment()

In [None]:
# Load the stored predictions of all approaches
# (the original `with` line was missing `) as file` and did not parse)
with open("../Data/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

# Attach this notebook's NN scores to every stored query
for query, scores in allPredictions.items():
    scores['NN'] = predictions[query]

# Write the merged predictions back to disk
with open('../Data/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)