In [137]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pre_processing
#importlib.reload(pre_processing)

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

Reading initial dataset

In [138]:
arxiv_data = pd.read_csv('data/arxiv-dataset-cat1.csv')

In [139]:
number_of_categories = len(arxiv_data['label'].unique())

Splitting in training and test set

In [140]:
train_set, test_set = train_test_split(arxiv_data, test_size = 0.3, stratify = arxiv_data['label'], random_state = 19)

Pre-processing training set

In [141]:
#train_set_processed = pre_processing.dataPreProcessing(train_set)
#train_set_processed.to_csv('data/train-set-cat1-processed.csv', index = False)
train_set = pd.read_csv('data/train-set-cat1-processed.csv')

### Text vectorization and embedding

In [142]:
vocabulary_size = 50000
words_per_sentence = 200
embedding_dim = 100

Version 1: Keras

In [143]:
text_vectorizer = text_vectorization.createTextVectorizer(vocabulary_size, words_per_sentence, train_set['text'])
vocabulary = text_vectorizer.get_vocabulary()

embedding_matrix = embedding.buildEmbeddingMatrix(embedding_dim, vocabulary)
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

feature_train = text_vectorization.textVectorization(train_set['text'], text_vectorizer)
label_train = to_categorical(train_set['label'], num_classes = number_of_categories, dtype = 'int64') # One-hot encoding

In [144]:
# Input layer
input_layer = keras.Input(shape = (words_per_sentence,), dtype = 'int64')

# Embedding layer
x = embedding_layer(input_layer)

# Hidden layers
x = keras.layers.Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(rate = 0.5)(x)

# Output layer
x = keras.layers.Dense(number_of_categories, activation = 'softmax')(x)
output_layer = x

# Neural network model
network = keras.Model(input_layer, output_layer)

In [145]:
label_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [146]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import tensorflow

def kfoldCrossValidation(k_folds, feature, label, neural_network, hyperparams):

    # Stratified K-fold Cross Validation
    stratified_kfold = MultilabelStratifiedKFold(n_splits = k_folds, random_state = 19, shuffle = True)

    results = {}
    
    for hyperparams_combination in hyperparams:
    
        print(hyperparams_combination)

        # List with evaluation metric (performance for each iteration)
        evaluation_metric = []
    
        # Neural Network architecture with hyperparameters combination
        network.layers[2].filters = hyperparams_combination['filters']
        network.layers[2].kernel_size = hyperparams_combination['kernel_size']
        network.layers[4].rate = hyperparams_combination['rate']

        # Compiling the network
        network.compile(
            loss = 'categorical_crossentropy', 
            optimizer = hyperparams_combination['optimizer'], 
            metrics = ['accuracy']
        )

        # Converting to numpy for splitting
        feature = feature.numpy()

        # Splitting in training and validation set
        for train, val in stratified_kfold.split(feature, label):

            feature_train = tensorflow.convert_to_tensor(feature[train])
            feature_val = tensorflow.convert_to_tensor(feature[val])

            print(label[train])

            # Training (fit Neural Network)
            training_history = neural_network.fit(
                x = feature_train, 
                y = label[train], 
                batch_size = hyperparams_combination['batch_size'], 
                epochs = hyperparams_combination['epochs']
            )
            
            # Validation 
            score = neural_network.evaluate(feature_val, label[val], verbose = 0)

            print(score)

    return results

In [147]:
hyperparams = [{

    'filters': 128,
    'kernel_size': 5,
    'rate': 0.5,
    'optimizer': 'rmsprop',
    'batch_size': 128,
    'epochs': 15

}]

In [148]:
kfoldCrossValidation(3, feature_train, label_train, network, hyperparams)

{'filters': 128, 'kernel_size': 5, 'rate': 0.5, 'optimizer': 'rmsprop', 'batch_size': 128, 'epochs': 15}
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
Epoch 1/15


TypeError: in user code:

    File "c:\Users\della\anaconda3\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\della\anaconda3\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\della\anaconda3\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\della\anaconda3\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\della\anaconda3\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\della\anaconda3\keras\layers\convolutional\base_conv.py", line 328, in <listcomp>
        self.kernel_size[i],

    TypeError: Exception encountered when calling layer 'conv1d_6' (type Conv1D).
    
    'int' object is not subscriptable
    
    Call arguments received by layer 'conv1d_6' (type Conv1D):
      • inputs=tf.Tensor(shape=(None, 200, 100), dtype=float32)


In [None]:
network.summary()

Verion 2: Word2Vec

In [None]:
word2vec = text_vectorization.createTextVectorizerWord2Vec(train_set['text'], vocabulary_size, embedding_dim)
text_vectorizer = word2vec['text_vectorizer']
vocabulary = list(word2vec['vocabulary_embedding'].key_to_index)

embedding_matrix = embedding.buildingEmbeddingMatrixWord2Vec(embedding_dim, vocabulary, word2vec['vocabulary_embedding'])
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

feature_train = text_vectorization.textVectorizationWord2Vec(train_set['text'], text_vectorizer, words_per_sentence)
label_train = to_categorical(train_set['label'], num_classes = number_of_categories) # One-hot encoding