In [1]:
import pandas as pd

from utils.preprocessing import clean, remove_stopwords, lemmatize
from itertools import combinations_with_replacement
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from optuna.integration import TFKerasPruningCallback
from tensorflow.keras.layers import TextVectorization
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from gensim.models import KeyedVectors
from pathlib import Path

import tensorflow as tf
import joblib
import optuna

from optuna.trial import TrialState


tf.get_logger().setLevel('INFO')
%matplotlib inline

In [2]:
def generate_dataset():
    res = pd.DataFrame()
    possible_functions = {
        "clean": clean,
        "no_stopwords": remove_stopwords,
        "lemmas": lemmatize,
    }
    possible_datasets = set()
    for comb in combinations_with_replacement(possible_functions, 3):
        possible_datasets.add(tuple(sorted(tuple(set(comb)))))
    for func_comb in possible_datasets:
        print(func_comb)
        resulting_df = df
        dataset_name = "+".join(func_comb)
        for func in func_comb:
            resulting_df = possible_functions[func](resulting_df)
        res[dataset_name] = resulting_df["text"]
    return res

In [3]:
def get_models(models_path="./models"):
    models_dir = {}
    cwd = Path(models_path)
    for path in cwd.iterdir():
        if path.is_dir():
            for file in path.iterdir():
                models_dir[file.name] = file
    return models_dir

In [4]:
def get_calculated_models(models_path="./results"):
    models_dir = {}
    cwd = Path(models_path)
    for path in cwd.iterdir():
        if path.is_dir():
            models_dir[path.name] = []
            for file in path.iterdir():
                models_dir[path.name].append(file.name)
    return models_dir

In [5]:
def split_dataset(x, y, test_size = 0.15):
    x_train_valid, x_test, y_train_valid, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    return *train_test_split(x_train_valid, y_train_valid, test_size=(1 - test_size) * test_size, random_state=42), x_test, y_test

In [40]:
def create_embedding_layer(voc, shape, model):
    word_index = dict(zip(voc, range(len(voc))))
    num_tokens = len(voc) + 2
    embedding_dim = shape[1]    # dimension of vectors
    hits = 0
    misses = 0

    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        try:
            embedding_vector = model.get_vector(word)
            embedding_matrix[i] = embedding_vector
            hits += 1
        except KeyError:
            misses += 1

    embedding_layer = keras.layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
        input_shape = [None],
        mask_zero = True,
    )
    return embedding_layer

In [109]:
def create_basic_model(embedding_layer, trail):

    lstm_first_layer_size = trail.suggest_int("lstm_first_layer_size", 16, 256)
    learning_rate = trail.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    model = keras.models.Sequential([
        embedding_layer,
        keras.layers.LSTM(lstm_first_layer_size),
        keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["acc"]
    )

    return model

def create_in_series_model(embedding_layer, trail):
    lstm_first_layer_size = trail.suggest_int("lstm_first_layer_size", 16,  256)
    lstm_second_layer_size = trail.suggest_int("lstm_second_layer_size", 16, 256)
    lstm_third_layer_size = trail.suggest_int("lstm_third_layer_size", 16, 256)
    learning_rate = trail.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    model = keras.models.Sequential([
        embedding_layer,
        keras.layers.LSTM(lstm_first_layer_size, return_sequences=True),
        keras.layers.LSTM(lstm_second_layer_size, return_sequences=True),
        keras.layers.LSTM(lstm_third_layer_size),
        keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["acc"]
    )

    return model


def create_cnn_lstm_model(embedding_layer, trail):
    lstm_first_layer_size = trail.suggest_int("lstm_first_layer_size", 16, 256)
    learning_rate = trail.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    model = keras.models.Sequential([
        embedding_layer,
        keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
        keras.layers.Conv2D(100, (2, 2), activation="relu", padding="same"),
        keras.layers.MaxPooling2D(pool_size=1),
        keras.layers.Flatten(),
        keras.layers.Reshape((-1, 100)),
        keras.layers.LSTM(lstm_first_layer_size),
        keras.layers.Dense(1, activation="sigmoid")
    ])
    model.summary()
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["acc"]
    )

    return model


def create_model(which, embedding_layer, trail):
    if which == "basic":
        return create_basic_model(embedding_layer, trail)
    elif which == "in_series":
        return create_in_series_model(embedding_layer, trail)
    elif which == "cnn_lstm":
        return create_cnn_lstm_model(embedding_layer, trail)
    else:
        raise NotImplementedError

In [91]:
def main(which, x, y, word2vec_model, n_trails=5):
    shape = word2vec_model.vectors.shape
    vectorizer = TextVectorization(max_tokens=shape[0], output_sequence_length=int(x.str.split().str.len().max()))
    vectorizer.adapt(x)

    # dict mapping words to their indices
    voc = vectorizer.get_vocabulary()

    # create embedding layer
    vectorized_x = vectorizer(np.array([[s] for s in x])).numpy()
    embedding_layer = create_embedding_layer(voc, shape, word2vec_model)

    #create model

    #test train split
    x_train, x_valid, y_train, y_valid, x_test, y_test  = split_dataset(vectorized_x, y)

    func = lambda trail: objective(trail, which, embedding_layer, x_train, y_train, x_valid, y_valid)
    study = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.MedianPruner(),
        storage="sqlite:///db.sqlite3"
    )
    study.optimize(func, n_trials=n_trails)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    best_trail = study.best_trial

    print("  Value: ", best_trail.value)
    return study


In [27]:
def load_model(models, file_name):
    file = models[file_name]
    print(file.name)
    return KeyedVectors.load_word2vec_format(file, binary=False)


In [28]:
def objective(trial, which, embedding_layer, x_train, y_train, x_valid, y_valid):
    # Clear clutter from previous session graphs.
    keras.backend.clear_session()
    # Generate our trial model.
    model = create_model(which, embedding_layer, trial)
    # Fit the model on the training data.
    # The KerasPruningCallback checks for pruning condition every epoch.
    model.fit(
        x_train,
        y_train,
        batch_size=BATCH_SIZE,
        callbacks=[TFKerasPruningCallback(trial, "val_acc")],
        epochs=EPOCHS,
        validation_data=(x_valid, y_valid),
        verbose=1,
    )

    # Evaluate the model accuracy on the validation set.
    score = model.evaluate(x_valid, y_valid, verbose=0)
    return score[1]

In [11]:
df = pd.read_csv("data/dane treningowe_I etap.csv")
df_2 = pd.read_csv("data/dane testowe.csv")
df = pd.concat([df, df_2])
label_binarizer = LabelBinarizer()

models = get_models()
bin_y = label_binarizer.fit_transform(df["class"])

In [12]:
dataset = generate_dataset()

('no_stopwords',)
('clean',)
('clean', 'lemmas')
('lemmas',)
('clean', 'lemmas', 'no_stopwords')
('lemmas', 'no_stopwords')
('clean', 'no_stopwords')


In [65]:
BATCH_SIZE = 255
EPOCHS = 100
which = "cnn_lstm"
modelss = {}

curr_results = get_calculated_models(f"./results/{which}")


In [110]:

for file_name in models.keys():
    result = {}
    file = models[file_name]

    if file_name in curr_results.keys():
        continue
    if file.name not in modelss.keys():
        if str(file).endswith(".txt"):
            print(file.name)
            modelss[file.name] = KeyedVectors.load_word2vec_format(file, binary=False)
        elif str(file_name).endswith(".bin"):
            print(file.name)
            modelss[file.name] = KeyedVectors.load(str(file))

    for data in dataset.columns:
        if data == "no_stopwords":
            continue
        if "lemmas" not in data and "lemmas" not in file_name:
            print(data, file_name)
            result =  main(which, dataset[data], bin_y, modelss[file.name])
        elif "lemmas" in data and "forms" not in file_name:
            print(data, file_name)
            result = main(which, dataset[data], bin_y, modelss[file.name])
        if result != {}:
            dest_folder_path = Path(f"./results/{which}/{file_name}/")
            dest_folder_path.mkdir(parents=True, exist_ok=True)
            dest_path = dest_folder_path / (data + ".pkl")
            with dest_path.open('wb') as dest_file:
                joblib.dump(result, dest_file)

clean+lemmas nkjp+wiki-lemmas-all-100-skipg-hs.txt


[32m[I 2023-04-12 16:00:45,686][0m A new study created in RDB with name: no-name-c219d900-06e7-4233-a0d2-f8ead9956793[0m


(9403, 100)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         940300    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:01:23,017][0m Trial 0 finished with value: 0.8033373355865479 and parameters: {'lstm_first_layer_size': 155, 'learning_rate': 2.8926732608097648e-05}. Best is trial 0 with value: 0.8033373355865479.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         940300    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:01:51,359][0m Trial 1 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 26, 'learning_rate': 1.61019779371001e-05}. Best is trial 0 with value: 0.8033373355865479.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         940300    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:02:21,302][0m Trial 2 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 62, 'learning_rate': 0.0005607025005751149}. Best is trial 0 with value: 0.8033373355865479.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         940300    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:03:01,653][0m Trial 3 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 194, 'learning_rate': 0.0021398027096919013}. Best is trial 0 with value: 0.8033373355865479.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         940300    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:03:39,540][0m Trial 4 finished with value: 0.6555423140525818 and parameters: {'lstm_first_layer_size': 123, 'learning_rate': 0.08720889214097857}. Best is trial 0 with value: 0.8033373355865479.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  0.8033373355865479
lemmas nkjp+wiki-lemmas-all-100-skipg-hs.txt


[32m[I 2023-04-12 16:03:40,104][0m A new study created in RDB with name: no-name-f8cb6550-0b1d-4e88-bf9b-50329e575d87[0m


(9058, 100)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         905800    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:04:19,068][0m Trial 0 finished with value: 0.7032181024551392 and parameters: {'lstm_first_layer_size': 115, 'learning_rate': 0.05723606409949325}. Best is trial 0 with value: 0.7032181024551392.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         905800    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:04:50,408][0m Trial 1 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 24, 'learning_rate': 9.901590369957455e-05}. Best is trial 0 with value: 0.7032181024551392.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         905800    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:05:31,923][0m Trial 2 finished with value: 0.7794994115829468 and parameters: {'lstm_first_layer_size': 196, 'learning_rate': 1.0726800382715764e-05}. Best is trial 2 with value: 0.7794994115829468.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         905800    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:06:08,758][0m Trial 3 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 100, 'learning_rate': 0.07039659154768411}. Best is trial 2 with value: 0.7794994115829468.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         905800    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:06:48,211][0m Trial 4 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 169, 'learning_rate': 0.0053625356008351905}. Best is trial 2 with value: 0.7794994115829468.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  0.7794994115829468
clean+lemmas+no_stopwords nkjp+wiki-lemmas-all-100-skipg-hs.txt


[32m[I 2023-04-12 16:06:48,710][0m A new study created in RDB with name: no-name-681ce05e-c563-4aa5-b63e-a07c555d1714[0m


(8925, 100)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892500    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:07:23,425][0m Trial 0 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 122, 'learning_rate': 0.06665087739981983}. Best is trial 0 with value: 0.6185935735702515.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892500    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:07:51,986][0m Trial 1 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 22, 'learning_rate': 0.05430944904281446}. Best is trial 0 with value: 0.6185935735702515.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892500    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:08:29,934][0m Trial 2 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 153, 'learning_rate': 0.0009885197355413715}. Best is trial 0 with value: 0.6185935735702515.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892500    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:09:10,577][0m Trial 3 finished with value: 0.8188319206237793 and parameters: {'lstm_first_layer_size': 199, 'learning_rate': 2.0770171730839946e-05}. Best is trial 3 with value: 0.8188319206237793.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892500    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:09:57,278][0m Trial 4 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 253, 'learning_rate': 0.0060537753862322866}. Best is trial 3 with value: 0.8188319206237793.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  0.8188319206237793
lemmas+no_stopwords nkjp+wiki-lemmas-all-100-skipg-hs.txt


[32m[I 2023-04-12 16:09:57,761][0m A new study created in RDB with name: no-name-70c24a1a-6a93-4fa8-90cb-5b68a5fc2d96[0m


(8927, 100)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892700    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:10:27,292][0m Trial 0 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 44, 'learning_rate': 0.004142640999569477}. Best is trial 0 with value: 0.6185935735702515.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892700    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:10:59,755][0m Trial 1 finished with value: 0.8235995173454285 and parameters: {'lstm_first_layer_size': 64, 'learning_rate': 8.403908876144988e-05}. Best is trial 1 with value: 0.8235995173454285.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892700    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:11:27,439][0m Trial 2 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 46, 'learning_rate': 0.052076653088066735}. Best is trial 1 with value: 0.8235995173454285.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892700    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:12:00,355][0m Trial 3 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 67, 'learning_rate': 0.0809135520841377}. Best is trial 1 with value: 0.8235995173454285.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         892700    
                                                                 
 lambda (Lambda)             (None, 1, None, 100)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      40100     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:12:28,641][0m Trial 4 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 28, 'learning_rate': 1.212967348897448e-05}. Best is trial 1 with value: 0.8235995173454285.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  0.8235995173454285
nkjp+wiki-lemmas-all-300-cbow-hs.txt
clean+lemmas nkjp+wiki-lemmas-all-300-cbow-hs.txt


[32m[I 2023-04-12 16:15:48,978][0m A new study created in RDB with name: no-name-5307fe91-41e8-47e1-8ca5-aa49cd6ac595[0m


(9403, 300)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2820900   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:16:40,120][0m Trial 0 finished with value: 0.784267008304596 and parameters: {'lstm_first_layer_size': 196, 'learning_rate': 1.2173105362980854e-05}. Best is trial 0 with value: 0.784267008304596.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2820900   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:17:20,618][0m Trial 1 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 47, 'learning_rate': 0.04331703956826088}. Best is trial 0 with value: 0.784267008304596.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2820900   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:18:09,928][0m Trial 2 finished with value: 0.8259832859039307 and parameters: {'lstm_first_layer_size': 135, 'learning_rate': 4.879004187370446e-05}. Best is trial 2 with value: 0.8259832859039307.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2820900   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:18:47,964][0m Trial 3 finished with value: 0.6150178909301758 and parameters: {'lstm_first_layer_size': 18, 'learning_rate': 0.0004912565701675664}. Best is trial 2 with value: 0.8259832859039307.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2820900   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:19:36,127][0m Trial 4 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 126, 'learning_rate': 0.0011056513684751764}. Best is trial 2 with value: 0.8259832859039307.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  0.8259832859039307
lemmas nkjp+wiki-lemmas-all-300-cbow-hs.txt


[32m[I 2023-04-12 16:19:36,635][0m A new study created in RDB with name: no-name-ac36bacb-4760-47c8-9630-3812a8c8edbe[0m


(9058, 300)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2717400   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                            

[32m[I 2023-04-12 16:20:18,887][0m Trial 0 finished with value: 0.7580453157424927 and parameters: {'lstm_first_layer_size': 71, 'learning_rate': 1.4092641648206498e-05}. Best is trial 0 with value: 0.7580453157424927.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2717400   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:21:05,426][0m Trial 1 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 119, 'learning_rate': 0.048537797279404306}. Best is trial 0 with value: 0.7580453157424927.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2717400   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:21:48,531][0m Trial 2 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 64, 'learning_rate': 0.0017315489030366885}. Best is trial 0 with value: 0.7580453157424927.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2717400   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[32m[I 2023-04-12 16:22:31,163][0m Trial 3 finished with value: 0.6185935735702515 and parameters: {'lstm_first_layer_size': 94, 'learning_rate': 0.004713217800036326}. Best is trial 0 with value: 0.7580453157424927.[0m


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2717400   
                                                                 
 lambda (Lambda)             (None, 1, None, 300)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, None, 100)      120100    
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, None, 100)     0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, None)              0         
                                                                 
 reshape (Reshape)           (None, None, 100)         0         
                                                        

[33m[W 2023-04-12 16:22:41,967][0m Trial 4 failed with parameters: {'lstm_first_layer_size': 183, 'learning_rate': 0.00060382567176119} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-91-a88a7a8aabf6>", line 21, in <lambda>
    func = lambda trail: objective(trail, which, embedding_layer, x_train, y_train, x_valid, y_valid)
  File "<ipython-input-28-561794553c96>", line 8, in objective
    model.fit(
  File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1694, in fit
    val_logs = self.evaluate(
  File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs

KeyboardInterrupt: 

In [155]:
res = {}
res_path = Path("./results")
for _dir in res_path.iterdir():
    res[_dir.name] = {}
    for file in _dir.iterdir():
        one_res = joblib.load(file)
        res[_dir.name][file.name] = max(one_res.best_trial.intermediate_values.values())


In [183]:
df = pd.DataFrame.from_dict(res)
df.max(axis=1)

lemmas+no_stopwords.pkl          0.865316
clean+lemmas+no_stopwords.pkl    0.872467
clean+no_stopwords.pkl           0.871275
no_stopwords.pkl                 0.872467
clean.pkl                        0.874851
lemmas.pkl                       0.868892
clean+lemmas.pkl                 0.874851
dtype: float64