In [35]:
import pandas as pd
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = [stopwords.words('dutch'), stopwords.words('danish'), stopwords.words('german'), stopwords.words('italian'), stopwords.words('spanish')]
STOPWORDS = [item for sublist in STOPWORDS for item in sublist]


In [36]:
embedding_dim = 64
max_length = 500
vocab_size = int(2/5 * (max_length * embedding_dim))
print(vocab_size)

trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

12800


In [37]:
langs = ["nl", "da", "de", "it", "es", ]

train = pd.DataFrame()
test = pd.DataFrame()


for language in langs:
    train = pd.concat([train, pd.read_csv(f"../corpus/train/{language}/{language}_plain.csv")])
    test = pd.concat([test, pd.read_csv(f"../corpus/test/{language}/{language}_plain.csv")])

train.set_index("index", inplace=True)
train = train.sort_values(by=["index"])
train.reset_index(inplace=True)

test.set_index("index", inplace=True)
test = test.sort_values(by=["index"])
test.reset_index(inplace=True)



In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.text, train.label, test_size=0.33, random_state=42, shuffle=True)

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(y_train)
train_label_seq = np.array(label_tokenizer.texts_to_sequences(y_train))
test_label_seq = np.array(label_tokenizer.texts_to_sequences(y_test))


In [39]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])
model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 64)          819200    
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 5)                 325       
                                                                 
Total params: 893,829
Trainable params: 893,829
Non-trainable params: 0
_________________________________________________________________


In [44]:
from tensorflow.keras import layers
import keras_tuner
import keras

def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim))
    model.add(
        layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim))
    )
    model.add(layers.Dense(embedding_dim, activation='relu'))
    model.add(layers.Dense(5, activation='softmax'))

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"],
    )
    return model

tuner = keras_tuner.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5)

tuner.search(train_padded, train_label_seq, epochs=5,  validation_data=(test_padded, test_label_seq), verbose=2)
best_model = tuner.get_best_models()[0]


Trial 1 Complete [00h 05m 39s]
val_loss: 0.8861347436904907

Best val_loss So Far: 0.8861347436904907
Total elapsed time: 00h 05m 39s
INFO:tensorflow:Oracle triggered exit
