In [None]:
import json
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import operator
from sklearn.model_selection import train_test_split
import keras
import fasttext
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, GlobalMaxPooling1D, Conv1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import tensorflow as tf
import datetime

In [None]:
# Load the data, the test queries and the models
# `data` is iterated later as {document key: {'content': text, ...}}.
with open("../../Data/Data.json",encoding='utf-8') as file:
    data = json.load(file)

# Each entry of `queries` is indexed later as [0] = query text, [1] = relevant document ids.
with open("../../Data/Queries.json",encoding='utf-8') as file:
    queries = json.load(file)

# Load the German stop words and the lemmatizer
german_stopwords = stopwords.words('german')
lemmatizer = spacy.load('de_core_news_sm')

# Load the word-vector model (a fastText binary)
w2vModel = fasttext.load_model('../../Data/Word2Vec/cc.de.300.bin')
# NOTE(review): w2vvocab is not read anywhere in the visible part of the notebook.
w2vvocab = w2vModel.get_words()

# Store the predictions (filled per query by calculateMetrics)
predictions = dict()

# Hyperparameters
DROPOUT_RATE, BATCH_SIZE, EPOCHS = 0.5, 32, 10
LOSS_FUNCTION = 'binary_crossentropy'
OPTIMIZER = 'adam'
ACTIVATION = 'relu'

In [None]:
# Preprocessing of the PDF texts
def preprocessing(text):
    """Run one text through the full preprocessing pipeline.

    The text is passed to cleanData, the result to removeStopwords and that
    result to lemmatize; the output of the last step is returned.
    """
    cleaned = cleanData(text)
    withoutStopwords = removeStopwords(cleaned)
    return lemmatize(withoutStopwords)

#Data cleaning
def cleanData(text):
    """Lower-case a text and reduce it to letter-only words of two or more characters.

    Args:
        text: Raw input string.

    Returns:
        The lower-cased text in which every character that is not a letter
        (a-z, German umlauts, ß) has been treated as a separator, single-letter
        words have been dropped and the remaining words are joined by exactly
        one space (no leading or trailing whitespace).
    """
    # Replace non-letters first: doing it last would reintroduce runs of
    # spaces and isolated letters (e.g. "3-d" -> "  d") after they were cleaned.
    lettersOnly = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', text.lower())
    # split()/join drops single-letter words everywhere (also adjacent ones and
    # those at the start/end of the text) and collapses all whitespace.
    return ' '.join(word for word in lettersOnly.split() if len(word) > 1)

#Stopword removal
def removeStopwords(text):
    """Drop every whitespace-separated word of `text` that is in german_stopwords.

    The remaining words are joined back together with single spaces.
    """
    kept = []
    for word in text.split():
        if word in german_stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)

#Lemmatization
def lemmatize(text):
    """Replace every token of `text` by its lemma.

    `text` is run through the module-level `lemmatizer` and the `lemma_`
    attributes of the resulting tokens are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in lemmatizer(text))
    return ' '.join(lemmas)

In [None]:
# Clean the data: map every key of `data` to the preprocessed text of its 'content' field
preprocessedData = dict()
for key,value in data.items():
    preprocessedData[key] = preprocessing(value['content'])

In [None]:
#Methods for the vector space model
def get_Word2Vec_embedding(text):
    """Return the mean word vector of a text.

    Args:
        text: Either a string, which is split on whitespace into tokens, or an
            iterable of token strings.

    Returns:
        The element-wise mean of the `w2vModel` vectors of all tokens, or a
        zero vector of the model's dimension when there are no tokens.
    """
    # Iterating over a str yields single characters, which would average
    # character vectors instead of word vectors, so split strings into words.
    tokens = text.split() if isinstance(text, str) else list(text)
    if not tokens:
        # np.mean over an empty list would produce nan (plus a RuntimeWarning).
        return np.zeros(w2vModel.get_dimension(), dtype=np.float32)
    return np.mean([w2vModel.get_word_vector(token) for token in tokens], axis=0)

In [None]:
# Preprocess the documents
w2vDocs = dict()
# Convert the documents into vectors
# NOTE(review): `text` is the preprocessed document string (not a token list) — confirm
# that get_Word2Vec_embedding treats a string the way this cell expects.
for key,text in preprocessedData.items():
    w2vDocs[key] = np.array(get_Word2Vec_embedding(text))

In [None]:
#Evaluation metrics

#Precision@k
def precision(truth, predicted, k):
    """Precision of the ranking `predicted` over its first k+1 entries.

    `k` is a zero-based rank index: the entries predicted[0] .. predicted[k]
    are inspected and the share of them whose int() value is contained in
    `truth` is returned.
    """
    hits = 0
    for position in range(k + 1):
        hits += int(predicted[position]) in truth  # True counts as 1
    # `position` still holds k here, so this divides by the number of inspected entries.
    return hits / (position + 1)

def average_precision(predictedDocuments,relDocuments):
    """Average precision (AP) of one ranking.

    Args:
        predictedDocuments: Document ids in ranked order, best first; each
            entry is converted with int() before the relevance lookup.
        relDocuments: Collection of the relevant document ids.

    Returns:
        The sum of the precision values at every rank that holds a relevant
        document, divided by len(relDocuments); 0 if relDocuments is empty.
    """
    relevantCount = len(relDocuments)
    if relevantCount == 0:
        # Without relevant documents no rank can score; also avoids dividing by zero.
        return 0

    averageSum, hits = 0, 0
    for rank, document in enumerate(predictedDocuments, start=1):
        if int(document) in relDocuments:
            # A running hit count yields the precision at this rank without
            # rescanning the ranking prefix for every relevant document.
            hits += 1
            averageSum += (hits / rank) / relevantCount
    return averageSum

#RR
def reciprocal_rank(predictedDocuments, relDocuments):
    """Reciprocal rank of the first relevant document.

    Returns 1/rank (rank counted from 1) of the first entry of
    `predictedDocuments` whose int() value is in `relDocuments`, or 0 if no
    entry is relevant.
    """
    hitRanks = (
        rank
        for rank, document in enumerate(predictedDocuments, start=1)
        if int(document) in relDocuments
    )
    firstHit = next(hitRanks, None)
    return 0 if firstHit is None else 1 / firstHit

#Compute MAP and MRR for the neural network
def calculateMetrics(model,insert=True):
    """Rank all documents for every query with `model` and compute MAP and MRR.

    For each entry of the module-level `queries` (query text at index 0,
    relevant document ids at index 1) the raw query text is concatenated with
    every text in `preprocessedData`, converted with the module-level
    `tokenizer` and padded to `maxLen`, and scored with `model.predict`.
    Documents are ranked by descending score.

    Side effect: stores {'MAP': ap, 'MRR': rr} under the query text in the
    module-level `predictions` dict.

    Args:
        model: Object whose `predict` returns one row per input; column 0 of
            each row is used as the relevance score.
        insert: Unused; kept so that existing calls stay valid.

    Returns:
        Tuple (mean average precision, mean reciprocal rank) over all queries,
        or (0, 0) if there are no queries.
    """
    if len(queries) == 0:
        return 0, 0

    # Rank by the real document keys, not by list positions: relevance is
    # defined on int(key) of preprocessedData, so positions only match the ids
    # when the keys happen to be 0..n-1 in order.
    docKeys = list(preprocessedData.keys())
    docTexts = list(preprocessedData.values())

    mapSum,mrrSum = 0,0
    for q in queries:
        query, relDocuments = q[0], q[1]

        # Build the input for this query: one "<query> <document>" text per document
        tempInputData = [query + " " + doc for doc in docTexts]
        inputTokens = tokenizer.texts_to_sequences(tempInputData)
        inputPad = pad_sequences(inputTokens,maxlen=maxLen,padding='post')

        ypred = model.predict(np.array(inputPad))

        # Pair every document key with its score and sort by descending score
        docScore = {docKeys[i]: ypred[i][0] for i in range(len(ypred))}
        ranked = sorted(docScore.items(), key=operator.itemgetter(1),reverse=True)
        predictedDocuments = [docKey for docKey, _ in ranked]

        # Accumulate average precision and reciprocal rank for MAP and MRR
        ap = average_precision(predictedDocuments,relDocuments)
        rr = reciprocal_rank(predictedDocuments,relDocuments)
        mapSum+= ap
        mrrSum+= rr

        predictions[query] = {'MAP': ap, 'MRR': rr}

    # Compute MAP and MRR
    mean_average_precision = mapSum / len(queries)
    mean_reciprocal_rank = mrrSum / len(queries)
    return mean_average_precision, mean_reciprocal_rank

In [None]:
# Turn the data into binary relevance labels
def createTestData():
    """Build one sample for every query/document pair.

    For each entry of `queries` (query text at index 0, relevant document ids
    at index 1) and each item of `preprocessedData`, the sample text is the
    query text, a space and the document text; the label is 1 if int(key) of
    the document is among the relevant ids and 0 otherwise. The query text is
    used as-is.

    Returns:
        Tuple (inputData, labels) of two equally long lists, ordered by query
        and, within a query, by the iteration order of `preprocessedData`.
    """
    inputData = [
        " ".join((entry[0], docText))
        for entry in queries
        for docText in preprocessedData.values()
    ]
    labels = [
        int(int(docKey) in entry[1])
        for entry in queries
        for docKey in preprocessedData
    ]
    return inputData, labels

# Split the data into test and training data
def splitData(x,y):
    """Split samples and labels 80/20 with a fixed random state of 42.

    Returns:
        Tuple (trainData, testData, trainLabel, testLabel), each converted to
        a numpy array.
    """
    parts = train_test_split(x, y, test_size=0.2, random_state=42)
    xTrain, xTest, yTrain, yTest = (np.array(part) for part in parts)
    return xTrain, xTest, yTrain, yTest

In [None]:
#Split the data into training and test data
inputData, labels = createTestData()
trainData, testData, trainLabel, testLabel = splitData(inputData, labels)

#Convert the data into the format expected by the networks
# The tokenizer is fitted on training and test texts together.
totalData = np.append(trainData, testData)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(totalData)

vocabSize = len(tokenizer.word_index) + 1

x_train_tokens = tokenizer.texts_to_sequences(trainData)
x_test_tokens = tokenizer.texts_to_sequences(testData)

# Take the padding length from the actual token sequences. Counting
# whitespace-separated words can undercount, because the tokenizer also splits
# on punctuation; pad_sequences would then silently cut the longest samples at
# the front, which is where the query text sits.
maxLen = max(len(sequence) for sequence in x_train_tokens + x_test_tokens)

x_train_pad = pad_sequences(x_train_tokens,maxlen=maxLen,padding='post')
x_test_pad = pad_sequences(x_test_tokens,maxlen=maxLen,padding='post')

In [None]:
# Create the embedding matrix: row `index` holds the w2vModel vector of the word
# with that tokenizer index; 300 is the width also passed to the Embedding layers.
embedding_matrix = np.zeros((vocabSize, 300))

for word, index in tokenizer.word_index.items():
    embedding_vector = w2vModel.get_word_vector(word) 
    # NOTE(review): fastText presumably returns a vector for every word (built
    # from subwords), so this check may never be False — confirm.
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Initialise the neural network
def createModel():
    """Build and compile the dense feed-forward classifier.

    Architecture: frozen Embedding initialised from `embedding_matrix`
    (vocabSize x 300, input length maxLen) -> Flatten -> Dense 128 -> Dense 64
    -> Dense 16, each followed by Dropout(DROPOUT_RATE) -> Dense 1 (sigmoid).

    Returns:
        The compiled Sequential model.
    """
    perceptron = Sequential()
    perceptron.add(Embedding(vocabSize, 300, weights = [embedding_matrix], input_length = maxLen, trainable = False) )
    perceptron.add(Flatten())
    # No input_dim on the hidden layers: the input shape is fixed by the first
    # layer (the Embedding), and len(inputData[0]) is the length of a string,
    # not a feature count.
    for units in (128, 64, 16):
        perceptron.add(Dense(units, activation=ACTIVATION))
        perceptron.add(Dropout(DROPOUT_RATE))
    perceptron.add(Dense(1, activation='sigmoid'))
    # metrics is passed as a list, the same way buildCNN does it.
    perceptron.compile(loss=LOSS_FUNCTION,optimizer=OPTIMIZER, metrics=['accuracy'])
    return perceptron

# Initialise the CNN
def buildCNN():
    """Build and compile the 1D convolutional classifier.

    Architecture: frozen Embedding initialised from `embedding_matrix`
    (vocabSize x 300, input length maxLen) -> Conv1D(256, kernel 10) -> Dropout
    -> Conv1D(128, kernel 5) -> GlobalMaxPooling1D -> Dropout -> Dense 64
    -> Dropout -> Dense 1 (sigmoid).

    Returns:
        The compiled Sequential model.
    """
    modelCNN = Sequential()
    modelCNN.add(Embedding(vocabSize, 300, weights = [embedding_matrix], input_length = maxLen, trainable = False) )
    # No input_shape here: the Conv1D input is defined by the Embedding output,
    # and the former (None, maxLen, 100) did not match its width of 300.
    # ACTIVATION replaces the hard-coded 'relu' so the hyperparameter applies to both networks.
    modelCNN.add(Conv1D(256, 10, activation=ACTIVATION))
    modelCNN.add(Dropout(DROPOUT_RATE))
    modelCNN.add(Conv1D(128, 5, activation=ACTIVATION))
    modelCNN.add(GlobalMaxPooling1D())
    modelCNN.add(Dropout(DROPOUT_RATE))
    modelCNN.add(Dense(64, activation=ACTIVATION))
    modelCNN.add(Dropout(DROPOUT_RATE))
    modelCNN.add(Dense(1, activation = 'sigmoid'))
    modelCNN.compile(optimizer = OPTIMIZER, loss = LOSS_FUNCTION, metrics = ['accuracy'])
    return modelCNN

# Train the model
def trainModel(model,trainX,trainY,testX,testY,verb=2):
    """Fit `model` on the training data and validate on the test data.

    Uses the module-level BATCH_SIZE and EPOCHS; `verb` is passed on as the
    verbosity of `model.fit`.

    Returns:
        Whatever `model.fit` returns.
    """
    return model.fit(
        trainX,
        trainY,
        verbose=verb,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(testX, testY),
    )

# Evaluate the model
def evaluate(model):
    """Compute MAP and MRR for `model` and print them together with timings.

    The query-processing time is measured around the calculateMetrics call;
    the model build time is read from the module-level `modelTime`.
    """
    started = datetime.datetime.now()
    meanAP, meanRR = calculateMetrics(model)
    elapsed = datetime.datetime.now() - started
    print(f'NN Word2Vec | MAP: {str(meanAP)}, MRR: {str(meanRR)}, Anfragebearbeitung {elapsed}, Modellaufbau {modelTime}')



In [None]:
# Train the NN model
a1 = datetime.datetime.now()
perceptronModel = createModel()
history = trainModel(perceptronModel,x_train_pad,trainLabel,x_test_pad,testLabel)
# Duration of building and training; evaluate() reads this module-level name when printing.
modelTime = datetime.datetime.now() - a1

In [None]:
# Evaluate the NN model (prints MAP, MRR and timings; fills `predictions` via calculateMetrics)
evaluate(perceptronModel)

In [None]:
# Train the CNN model
a1 = datetime.datetime.now()
modelCNN = buildCNN()
historyCNN= trainModel(modelCNN,x_train_pad,trainLabel,x_test_pad,testLabel)
# Overwrites the modelTime of the previous model; evaluate() reads this module-level name.
modelTime = datetime.datetime.now() - a1

In [None]:
# Evaluate the CNN model (prints MAP, MRR and timings; overwrites the per-query entries in `predictions`)
evaluate(modelCNN)

In [None]:
#Load the predictions collected so far
with open("../../Data/Predictions.json",encoding='utf-8') as file:
    allPredictions = json.load(file)

# `nnScores` was never defined; the per-query MAP/MRR values live in
# `predictions`, which calculateMetrics fills. It holds the values of the most
# recently evaluated model (with the cells run in order, the last evaluate() call).
for query, scores in allPredictions.items():
    scores['NN'] = predictions[query]

#Save the predictions
with open('../../Data/Predictions.json', 'w',encoding='utf-8') as fp:
    json.dump(allPredictions, fp,  indent=4, ensure_ascii=False)