In [None]:
"""
Zusammenfassung:
- Trainingsdatensatz wird geladen
- Ein Tokenizer wird angepasst
- Datensatz wird in einen Trainings- und Validierungsdatensatz geteilt
- Oversampling wird durchgeführt
- Y Daten werden faktorisiert
- Für alle Daten wird ein 4-D Tensor erstellt
- Hyperparametertraining wird definiert
- Netz wird definiert
- Netz wird trainiert und Daten werden gespeichert

env: tf-gpu

Ausführen mit:
Jupyter Notebook

Eingabe: Tweets_final_corpus_sentiment_clean_train.json

Ausgabe:
Ergebnisse Hyperparametertraining CNN

@author: Paul Drecker

"""

In [None]:
# Load the TensorBoard notebook extension. Depending on the browser version the inline
# view may not render inside Jupyter; in that case open TensorBoard from the command line.
%load_ext tensorboard

In [13]:
#Laden der Pakete
import tensorflow.keras.layers
import datetime
import tensorflow as tf
import keras
import pandas as pd
import os
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorboard.plugins.hparams import api as hp
from imblearn.over_sampling import RandomOverSampler

In [14]:
# Set the working directory to the "Masterarbeit" project folder; the data and log paths
# in this notebook are built from os.getcwd().
# NOTE(review): absolute, machine-specific path — must be adjusted on any other machine.
os.chdir(r'C:\Users\Paul\Documents\Uni\Masterarbeit\Final_Code\Masterarbeit')

In [15]:
# Load the cleaned training tweets (JSON) from the project's data folder
train_json_path = os.getcwd() + r"\Datengewinnung\Trainingsdaten\Tweets_final_corpus_sentiment_clean_train.json"
Tweets_final_clean = pd.read_json(train_json_path)

In [16]:
# Fit the tokenizer on the training texts (vocabulary capped via num_words=10000,
# out-of-vocabulary words are represented by the '<UNK>' token)
token = tf.keras.preprocessing.text.Tokenizer(
    num_words=10000,
    oov_token='<UNK>',
)
token.fit_on_texts(Tweets_final_clean['x_train'])

In [17]:
# Apply the fitted tokenizer: turn every tweet into a sequence of integer word indices
Tweets_text = token.texts_to_sequences(Tweets_final_clean['x_train'])

In [18]:
# Bring all tweets to the same length (maxlen=20); padding is appended at the end ('post')
Tweets_text = tf.keras.preprocessing.sequence.pad_sequences(Tweets_text, maxlen=20, padding='post')

In [19]:
# Stratified split into training and validation data (20 % validation, fixed seed)
labels = Tweets_final_clean['y_train']
X_train, X_test, y_train, y_test = train_test_split(
    Tweets_text,
    labels,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=labels,
)

In [20]:
# Random oversampling of the training split (sampling_strategy='auto', fixed seed)
oversample = RandomOverSampler(sampling_strategy='auto', random_state=123)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Shuffle features and labels together: the labels travel as the index of the frame and
# reset_index(drop=False) turns them back into a regular column (named after the label
# Series, i.e. 'y_train'). random_state makes the shuffle reproducible, matching the
# seeded split and the seeded oversampler.
sample_text = (
    pd.DataFrame(X_train_over, index=y_train_over)
    .sample(frac=1, random_state=123)
    .reset_index(drop=False)
)

In [21]:
# Split the shuffled frame back into the label column and the feature matrix
y_train_over = sample_text.pop('y_train')
X_train_over = sample_text.to_numpy()

In [22]:
# Encode the labels as integer codes.
# sort=True derives the codes from the sorted unique labels instead of the order of first
# appearance. Training and validation labels are factorized separately, so without it the
# same class could receive different codes in the two sets (assumes both sets contain
# every class, which the stratified split is meant to ensure).
y_train = pd.factorize(y_train_over, sort=True)
y_train = tf.constant(y_train[0], dtype=tf.float32)
y_train = tf.expand_dims(y_train, 1)

y_test = pd.factorize(y_test, sort=True)
y_test = tf.constant(y_test[0], dtype=tf.float32)
y_test = tf.expand_dims(y_test, 1)

# One-hot encode the label codes
y_test = tf.keras.utils.to_categorical(y_test)
y_train = tf.keras.utils.to_categorical(y_train)


# Training features: stack the rows into one tensor of rank 2 (samples x tokens).
# The "3dtensor"/"4dtensor" names are historical; they are kept because the training
# function reads X_train_4dtensor / X_test_4dtensor.
X_train_3dtensor = tf.stack(list(X_train_over), axis=0)
# Release the un-oversampled training features
X_train = None
del(X_train)
# Append a trailing axis of size 1 (rank 3), matching the model input shape (20, 1)
X_train_4dtensor = tf.expand_dims(X_train_3dtensor, -1)

# Validation features: same treatment as the training features
X_test_3dtensor = tf.stack(list(X_test), axis=0)
X_test = None
del(X_test)
X_test_4dtensor = tf.expand_dims(X_test_3dtensor, -1)



In [23]:
# Clear models stored in the current Keras session to avoid duplicated structures
tf.keras.backend.clear_session()

In [24]:
# Hyperparameter search space
HP_NUM_UNITS = hp.HParam('num_filters', hp.Discrete([120, 150, 200]))
HP_learningrate = hp.HParam('learningrate', hp.Discrete([0.001, 0.0001, 0.01, 0.00001, 0.000001]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.5, 0.8]))
HP_batch = hp.HParam('batchsize', hp.Discrete([32, 64, 128]))
HP_L2 = hp.HParam('l2', hp.RealInterval(0.0001, 0.001))

# Name of the metric that is logged for every trial
METRIC_ACCURACY = ['Accuracy']

# Register the search space and the metric with the TensorBoard HParams plugin.
# Raw string literal: '\S' and '\C' are not valid escape sequences in a normal string
# (they only work by accident and trigger a warning); the resulting path is unchanged.
with tf.summary.create_file_writer(os.getcwd() + r'\Sentiment_model\CNN/logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_NUM_UNITS, HP_learningrate, HP_batch, HP_DROPOUT, HP_L2],
        metrics=[hp.Metric(METRIC_ACCURACY[0], display_name='Accuracy')],
    )
    



In [25]:
# Dry run over the whole grid so that the loop variables and `hparams` are defined
# afterwards (they end up holding the last combination of the grid)
l2_bounds = (HP_L2.domain.min_value, HP_L2.domain.max_value)
for num_units in HP_NUM_UNITS.domain.values:
    for learn_rate in HP_learningrate.domain.values:
        for batchsize in HP_batch.domain.values:
            for dropout in HP_DROPOUT.domain.values:
                for l2 in l2_bounds:
                    hparams = dict(
                        zip(
                            (HP_NUM_UNITS, HP_learningrate, HP_batch, HP_DROPOUT, HP_L2),
                            (num_units, learn_rate, batchsize, dropout, l2),
                        )
                    )

In [26]:
# Define the training routine
def train(hparams, logs):
    """Build, train and evaluate the CNN for one hyperparameter combination.

    Reads the notebook-level tensors ``X_train_4dtensor`` / ``y_train`` for fitting and
    ``X_test_4dtensor`` / ``y_test`` for the final evaluation.

    Args:
        hparams: dict mapping the ``HP_*`` hyperparameter objects to the values of this
            trial (number of filters, learning rate, batch size, dropout rate, L2 factor).
        logs: log directory of this trial; the HParams Keras callback writes there, so
            every trial gets its own directory instead of sharing one.

    Returns:
        The binary accuracy reported by ``model.evaluate`` on the validation data.
    """
    # Convolution window sizes, one branch per size
    filter_sizes = [3, 4, 5]

    # Embedding layer: input (batch, 20, 1) -> embedding output (batch, 20, 1, 300)
    input_layer = tf.keras.layers.Input(shape=(20, 1,), name="input_layer")
    emb_layer = tf.keras.layers.Embedding(10000, 300)(input_layer)
    # Move the singleton axis to the end, drop it, then swap the remaining two axes:
    # (batch, 20, 1, 300) -> (batch, 20, 300, 1) -> (batch, 20, 300) -> (batch, 300, 20)
    emb_trans = tf.transpose(emb_layer, [0, 1, 3, 2])
    emb = tf.squeeze(emb_trans, 3)
    emb = tf.transpose(emb, [0, 2, 1])
    # NOTE(review): Conv1D defaults to channels_last, so with a (batch, 300, 20) input the
    # kernels slide along the 300 embedding components and treat the 20 token positions as
    # channels. Confirm this is intended rather than sliding over token positions.

    # One convolution -> ReLU -> max-pooling branch per window size
    pooled = []
    for size in filter_sizes:
        conv = tf.keras.layers.Conv1D(hparams[HP_NUM_UNITS], size)(emb)
        activated = tf.keras.layers.ReLU()(conv)
        # The pool spans the whole convolution output length, i.e. one maximum per filter
        pooled.append(
            tf.keras.layers.MaxPool1D(pool_size=conv.get_shape()[1], strides=1, padding='valid')(activated)
        )

    # Merge the branches and flatten
    merged = tf.keras.layers.Concatenate(axis=1)(pooled)
    flatten = tf.keras.layers.Flatten()(merged)

    # Dropout layer
    dropout = tf.keras.layers.Dropout(hparams[HP_DROPOUT])(flatten)

    # L2-regularised dense layer followed by softmax over the two classes
    dense_layer = tf.keras.layers.Dense(
        units=2,
        name="dense_layer",
        kernel_regularizer=tf.keras.regularizers.l2(hparams[HP_L2]),
    )(dropout)
    output_layer = tf.keras.layers.Softmax(name="output_layer")(dense_layer)
    model = tf.keras.models.Model(input_layer, output_layer, name="model")
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(hparams[HP_learningrate]),
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

    # Train the network. The callback logs to the per-trial directory passed in as `logs`.
    model.fit(
        X_train_4dtensor,
        y_train,
        epochs=15,
        shuffle=True,
        verbose=1,
        batch_size=hparams[HP_batch],
        callbacks=[
            hp.KerasCallback(logs, hparams),
            # Early stopping is disabled; monitoring 'val_binary_accuracy' would also
            # require passing validation data to fit():
            # tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', min_delta=0.01, patience=3, mode='auto', restore_best_weights=False)
        ],
    )

    # Evaluate on the validation data; evaluate() returns the loss followed by the metric
    _, Accuracy = model.evaluate(X_test_4dtensor, y_test)
    return Accuracy

In [27]:
# Helper that executes one trial and records its result
def run(run_dir, hparams, logs):
    """Train one hyperparameter combination and log its result for the HParams dashboard.

    Args:
        run_dir: directory the summary writer of this trial writes to.
        hparams: dict mapping the hyperparameter objects to the values of this trial.
        logs: log directory forwarded unchanged to ``train``.
    """
    writer = tf.summary.create_file_writer(run_dir)
    with writer.as_default():
        # Record the hyperparameter values first, then train and log the final accuracy
        hp.hparams(hparams)
        tf.summary.scalar(METRIC_ACCURACY[0], train(hparams, logs), step=1)
        

In [28]:
# Run the hyperparameter search over the full grid.
# Root directory of all TensorBoard logs. Raw string literal: '\S' and '\C' are not valid
# escape sequences in a normal string; the resulting path is unchanged.
logdir = os.getcwd() + r'\Sentiment_model\CNN/logs/'
session_num = 0
for num_units in HP_NUM_UNITS.domain.values:
    for learn_rate in HP_learningrate.domain.values:
        for batchsize in HP_batch.domain.values:
            for dropout in HP_DROPOUT.domain.values:
                # Only the two interval bounds of the L2 factor are tried
                for l2 in (HP_L2.domain.min_value, HP_L2.domain.max_value):
                    hparams = {
                        HP_NUM_UNITS: num_units,
                        HP_learningrate: learn_rate,
                        HP_batch: batchsize,
                        HP_DROPOUT: dropout,
                        HP_L2: l2
                    }
                    # Per-trial log directory whose name encodes the hyperparameter values
                    logs = (
                        logdir + 'fit/'
                        + f'_{session_num}_num_filter_{num_units}_learn_rate_{learn_rate}'
                        + f'_batchsize_{batchsize}_dropout_{dropout}_l2_{l2}'
                    )
                    run_name = "run_%d" % session_num
                    print('--- Starting trial: %s' % run_name)
                    print({h.name: hparams[h] for h in hparams})
                    run(logdir + 'hparam_tuning/' + run_name, hparams, logs)
                    session_num += 1
          



--- Starting trial: run_0
{'num_filters': 120, 'learningrate': 1e-06, 'batchsize': 32, 'dropout': 0.5, 'l2': 0.0001}
Epoch 1/15

KeyboardInterrupt: 

In [None]:
#Tensorboard zeigen
%tensorboard --logdir logs\fit
# Wenn nicht angezeigt dann über cmd aufrufen - Ordner muss manuell eingefügt werden
#tensorboard --logdir \CNN\logs