In [1]:
import json
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from tensorflow.keras import Input,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
import zipfile
import pickle
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from tensorflow.keras import backend as keras
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import time
from dataset_joiner import DatasetWorker, VocabularyWorker
from performance import PerformanceViewer, TrainingEval
from sklearn.model_selection import train_test_split

In [2]:
# Seed every random number generator this notebook relies on.
# NumPy's seed alone does not cover TensorFlow/Keras (weight initialisation,
# dropout masks), so TensorFlow's global seed is set as well.
SEED = 1234567890
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# --- Data and embedding configuration ---------------------------------------
# Select which kind of information shall be extracted (enable exactly one).
#extraction_of = 'contexts'
extraction_of = 'sentiments'
#extraction_of = 'aspects'

# Task notes: sentiment, aspect or modifier -> these three are needed;
# extraction of the polarity is not required.

# Pick the annotation file that belongs to the selected extraction target.
if extraction_of in ['contexts']:
    filename = r'data_laptop_ctxt.json'
elif extraction_of in ['sentiments', 'aspects']:
    filename = r'data_laptop_absa.json'
else:
    # Fail here with a clear message instead of a NameError on `filename` below.
    raise ValueError(
        "extraction_of must be 'contexts', 'sentiments' or 'aspects', got %r" % (extraction_of,)
    )

## In this example, the GloVe word embeddings are the input for the neural network.
## Download glove.42B.300d.txt from http://nlp.stanford.edu/data/glove.42B.300d.zip
filename_embedding_zip = r'glove.42B.300d.zip'  # path of the downloaded GloVe zip file
## Folder in which the unzipped GloVe embeddings are stored (zip name without extension).
filepath_embedding = os.path.splitext(filename_embedding_zip)[0]
## Unzip GloVe into that folder unless it has already been extracted
## (manually or by an earlier run of this cell).
if not os.path.exists(filepath_embedding):
    with zipfile.ZipFile(filename_embedding_zip, "r") as zip_ref:
        zip_ref.extractall(filepath_embedding)

# os.listdir returns entries in arbitrary order; sort so the chosen file is
# deterministic, and report an empty folder explicitly instead of an IndexError.
embedding_files = sorted(os.listdir(filepath_embedding))
if not embedding_files:
    raise FileNotFoundError(
        "no embedding file found in %r; delete the folder to re-extract %r"
        % (filepath_embedding, filename_embedding_zip)
    )
filename_embedding = os.path.join(filepath_embedding, embedding_files[0])

# Load the annotated examples that the DatasetWorker consumes.
with open(filename, 'r', encoding='utf8') as infile:
    example_data = json.load(infile)


In [4]:
# Fixed sequence length: also used for the id conversion and the model's input shape.
max_seq_length = 100

# Prepare the dataset step by step; the calls are order-dependent.
ds = DatasetWorker(example_data)
ds.applyPreprocessing()
ds.splitDatasetTokens()
# Use the target chosen in the configuration cell instead of a second hard-coded
# literal, so the data file and the extracted labels cannot drift apart.
# NOTE(review): assumes setExtractionOf accepts every value allowed for
# `extraction_of` - confirm in dataset_joiner.py.
ds.setExtractionOf(extraction_of)
ds.splitDatasetLabels("union")
ds.buildDatasetSequence(max_seq_length)

tokenize dataset: 3101it [00:00, 74836.66it/s]
split dataset tokens: 3101it [00:00, 1329775.76it/s]
split dataset labels: 3101it [00:00, 138050.19it/s]
update train tokens: 100%|██████████| 2480/2480 [00:00<00:00, 706553.04it/s]
update test tokens: 100%|██████████| 621/621 [00:00<00:00, 96626.46it/s]
update train labels: 100%|██████████| 2480/2480 [00:00<00:00, 22539.71it/s]
update test labels: 100%|██████████| 621/621 [00:00<00:00, 760263.51it/s]


In [5]:
# Build the vocabulary and the embedding matrix.
# `vw` is used by later cells for the id conversion (tokens/labels -> ids) and
# for the model dimensions (vw.vocab_size, vw.embedding_vectors, vw.n_tags).
vw = VocabularyWorker()
# The vocabulary is built from the complete dataset (ds.dataset), not only the training split.
vw.buildVocabulary(ds.dataset)
# According to the progress output, this call builds the embedding vectors and
# the label classes; the label classes are derived from the training labels.
# NOTE(review): where the embedding file path comes from is not visible here -
# confirm in dataset_joiner.py.
vw.buildEmbedding(ds.train_labels)

build vocabulary: 100%|██████████| 3101/3101 [00:00<00:00, 42006.30it/s]
build embedding vectors: 100%|██████████| 4562/4562 [00:00<00:00, 343884.38it/s]
build labelclasses: 100%|██████████| 2480/2480 [00:00<00:00, 564521.54it/s]


In [6]:
# Turn tokens and labels of both splits into fixed-length id sequences,
# i.e. the input format of the neural network (train first, then test).
(x_train, y_train), (x_test, y_test) = [
    vw.convert_tokens_labels_list_to_ids_list(split_tokens, split_labels, max_seq_length)
    for split_tokens, split_labels in (
        (ds.train_tokens, ds.train_labels),
        (ds.test_tokens, ds.test_labels),
    )
]

Converting tokens & labels to ids : 100%|██████████| 2480/2480 [00:00<00:00, 27481.11it/s]
Converting tokens & labels to ids : 100%|██████████| 621/621 [00:00<00:00, 39034.03it/s]


In [7]:
# Sanity check: print the first two training samples, once as raw tokens and
# once as the id sequences produced by the conversion step.
for sample_rows in (ds.train_tokens, x_train):
    # zip with range(2) stops after two rows instead of walking the whole split.
    for _, row in zip(range(2), sample_rows):
        print(row)

['computer', 'works', 'great', '.']
['bough', 'this', 'for', 'the', 'google', 'goodies', 'offers', 'for', 'the', 'holidays', 'but', 'when', 'i', 'go', 'to', 'redeem', 'the', 'offers', 'from', 'the', 'google', 'website', ',', 'it', 'says', 'that', 'this', "doesn't", 'have', 'the', 'right', '"', 'code', '"', 'to', 'get', 'the', 'drive', 'storage', '.', 'sending', 'back', '.']
[521 557 123   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]
[4181   22   11    2  682 3236  551   11    2 1749   36   64   13  140
    5 3563    2  551   28    2  682  314    1   18  295   15   22 4550
   29    2  155    9  489    9    5   77    2  582 1028    3 1926  126
   

In [8]:
# Make the label classes categorical (one-hot over vw.n_tags classes).
# The arrays are overwritten under the same names because later cells use
# `y_train` / `y_test`; the rank check keeps this cell idempotent, so running
# it a second time does not one-hot encode already encoded labels.
# NOTE(review): assumes the id arrays are 2-D (samples x max_seq_length)
# before encoding - confirm against convert_tokens_labels_list_to_ids_list.
if np.ndim(y_train) == 2:
    y_train = to_categorical(y_train, num_classes=vw.n_tags)
if np.ndim(y_test) == 2:
    y_test = to_categorical(y_test, num_classes=vw.n_tags)

In [9]:
# Input and embedding layer: these two layers should not be changed.
# The embedding is initialised with the vectors prepared by the VocabularyWorker.
input_layer = Input(shape=(max_seq_length,))
embedding_layer = Embedding(
    vw.vocab_size,
    300,
    weights=[vw.embedding_vectors],
    input_length=max_seq_length,
)(input_layer)

# Dropout on the embeddings, followed by a bidirectional LSTM that emits one
# output vector per time step (return_sequences=True).
dropped_embeddings = Dropout(0.1)(embedding_layer)
bilstm = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))
lstm_layer = bilstm(dropped_embeddings)
# TODO: attention models have to be implemented here, so that the network
# pays attention only to certain words.
# ...

# Per-token softmax over the tag classes; this last layer can/should be modified.
tag_classifier = TimeDistributed(Dense(vw.n_tags, activation="softmax"))
output_layer = tag_classifier(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer='rmsprop',
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy", "accuracy"],
)
model.summary()


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          1452000   
_________________________________________________________________
dropout (Dropout)            (None, 100, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 200)          320800    
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 3)            603       
Total params: 1,773,403
Trainable params: 1,773,403
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Evaluation helper: handed to the callback below and queried again in the
# evaluation cells after training.
performance = PerformanceViewer()
# Callback that is passed to model.fit; it receives the model, the test split,
# the vocabulary worker, the dataset worker and the `performance` helper.
# NOTE(review): presumably evaluates on x_test / y_test during training and
# records the results in `performance` - confirm in performance.py.
evaluate_callback = TrainingEval(model, x_test, y_test, vw, ds, performance)

In [None]:
# Train the model on the training split; the returned `history` object is
# passed to the evaluation cells further down.
# NOTE(review): the test split doubles as validation data here
# (the alternative `validation_split = 0.2` is disabled).
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test),
    callbacks=[evaluate_callback],
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
15/78 [====>.........................] - ETA: 11s - loss: 0.0421 - categorical_accuracy: 0.9846 - accuracy: 0.9846

## Model fit: evaluation of the trained model

In [None]:
# NOTE(review): takes no arguments, so it presumably reports results that the
# `performance` helper already holds (e.g. from the training callback) -
# confirm in performance.py.
performance.evalModelTrainDataClass()

In [None]:
# Evaluate the training run using the `history` object returned by model.fit.
# NOTE(review): presumably plots or prints the per-epoch loss and accuracy -
# confirm in performance.py.
performance.basicEval(history)

In [None]:
# Final evaluation of the trained model on the test split; the dataset and
# vocabulary workers are passed along with the test inputs and labels.
performance.classicEval(model, ds, vw, x_test, y_test)