### Convolutional Neural Network - Glove Embedding

In [51]:
# from google.colab import drive
# drive.mount('/content/drive')

# from shutil import copyfile
# copyfile('/content/drive/MyDrive/FDL Project/Code/text_vectorization.py', 'text_vectorization.py')
# copyfile('/content/drive/MyDrive/FDL Project/Code/embedding.py', 'embedding.py')
# copyfile('/content/drive/MyDrive/FDL Project/Code/kfold_cv.py', 'kfold_cv.py')

In [52]:
#!pip install iterative-stratification

In [53]:
import pandas as pd
import numpy as np
import importlib

from tensorflow import keras
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

import kfold_cv
# Re-execute the kfold_cv module so that edits made to it after the first import are picked up
importlib.reload(kfold_cv)

<module 'kfold_cv' from 'c:\\Users\\della\\Desktop\\arXiv_classification\\kfold_cv.py'>

#### Training and test set

In this first stage, we have:
- Read the **training and test set**;
- Calculated the **number of unique categories**, so the number of classes in the text classification;
- Converted the labels associated with the articles to a **one-hot encoding representation**, which is a deep learning best practice when dealing with a multi-class text classification task.

In [54]:
# Colab variant of the two reads below (kept for reference):
# train_set = pd.read_csv('/content/drive/MyDrive/FDL Project/Code/data/train-set-cat1-processed.csv')
# test_set = pd.read_csv('/content/drive/MyDrive/FDL Project/Code/data/test-set-cat1-processed.csv')

# Processed training and test splits, read from the local data folder
train_set = pd.read_csv('data/train-set-cat1-processed.csv')
test_set = pd.read_csv('data/test-set-cat1-processed.csv')

# Number of classes = number of distinct labels found in the training set
number_of_categories = len(train_set['label'].unique())

# One-hot encode the labels of both splits against the same number of classes
label_train, label_test = (
    to_categorical(split['label'], num_classes=number_of_categories, dtype='int64')
    for split in (train_set, test_set)
)

#### *Text vectorization and embedding*

Firstly, the following **parameters** are defined:
- **Size of the vocabulary** to create;
- **Number of words** considered for each text (article);
- **Dimension of the embedding**;

In [55]:
# Size of the vocabulary to create
vocabulary_size = 50_000

# Number of words considered for each article (also the input length of the network)
words_per_sentence = 200

# Dimension of each embedding vector
embedding_dim = 100

Then, we have opted for the first embedding approach: **Keras vectorization and GloVe embedding**.

- The *vectorization* (and so the creation of the *vocabulary*) is carried out using the **Keras built-in function**, with the text vectorizer finally adapted on the training set;
- For the *embedding matrix*, we have used a pre-trained solution, named **GloVe**, with 100 dimensions;
- Finally, we have created the final **vectorized feature** for the training phase.

In [56]:
# Create the text vectorizer from the vocabulary size, the sequence length and the training texts
text_vectorizer_keras = text_vectorization.createTextVectorizer(vocabulary_size, words_per_sentence, train_set['text'])
# Vocabulary held by the vectorizer
vocabulary_keras = text_vectorizer_keras.get_vocabulary()

# Embedding matrix of dimension `embedding_dim` for that vocabulary (described in the notebook text as pre-trained GloVe)
embedding_matrix_glove = embedding.buildEmbeddingMatrix(embedding_dim, vocabulary_keras)
# NOTE(review): the meaning of the second argument (None) is defined in embedding.py — confirm
embedding_layer_glove = embedding.createEmbeddingLayer(embedding_matrix_glove, None)
# Rename the layer through its private `_name` attribute; presumably this is the value
# reported under 'Embedding' in the k-fold results — verify against kfold_cv
embedding_layer_glove._name = 'GloVe'

In [57]:
# Vectorize the training texts with the text vectorizer; the result is the network input used for cross-validation and training
feature_train_glove = text_vectorization.textVectorization(train_set['text'], text_vectorizer_keras)

#### *Neural network architecture*

In [58]:
# Input: one fixed-length sequence of int64 values per article
input_layer = keras.Input(shape=(words_per_sentence,), dtype='int64')

# Embedding lookup through the GloVe layer
x = embedding_layer_glove(input_layer)

# Hidden part: 1D convolution -> global max pooling -> dropout
x = keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(rate=0.5)(x)

# Output: one softmax unit per category
x = keras.layers.Dense(number_of_categories, activation='softmax')(x)
output_layer = x

# Functional model tying input and output together
network = keras.Model(input_layer, output_layer, name='Conv1D')

#### K-Fold Cross Validation

In [59]:
# Hyperparameter combinations to evaluate with k-fold cross-validation.
# Each entry sets the Conv1D filters and kernel size, the dropout rate,
# the optimizer and the batch size.
hyperparams = [
    {'filters': 128, 'kernel_size': 5, 'rate': 0.5, 'optimizer': 'rmsprop', 'batch_size': 128},
    # Disabled candidate (same values, 'adam' optimizer):
    # {'filters': 128, 'kernel_size': 5, 'rate': 0.5, 'optimizer': 'adam', 'batch_size': 128},
]

# Number of epochs and number of folds handed to the cross-validation routine
epochs = 1
k_fold = 2

In [60]:
# Run the k-fold cross-validation of `network` on the training features/labels, presumably once per entry of `hyperparams`
# NOTE(review): kfold_cv is not shown here — confirm whether it trains `network` in place or works on a fresh copy per fold
kfold_results = kfold_cv.kfoldCrossValidation(k_fold, feature_train_glove, label_train, network, hyperparams, epochs)



In [61]:
# Show the cross-validation summary (a list of result dicts)
kfold_results

[{'Network': 'Conv1D',
  'Embedding': 'GloVe',
  'k_folds': 2,
  'filters': 128,
  'kernel_size': 5,
  'rate': 0.5,
  'optimizer': 'rmsprop',
  'batch_size': 128,
  'loss_kfold': 0.789,
  'accuracy_kfold': 0.742,
  'best_number_epochs': 1,
  'n_epochs': 1}]

In [62]:
# Select the hyperparameter combination with the lowest cross-validated loss.
# The sentinel starts at +infinity (instead of a magic 999) so that every finite
# loss, however large, can be selected. NaN losses never compare as smaller and
# are therefore skipped.
best_hyperparams = {'loss_kfold': float('inf')}

for result in kfold_results:
    # Strict comparison: on ties the first combination encountered is kept
    if result['loss_kfold'] < best_hyperparams['loss_kfold']:
        best_hyperparams = result

print(best_hyperparams)


{'Network': 'Conv1D', 'Embedding': 'GloVe', 'k_folds': 2, 'filters': 128, 'kernel_size': 5, 'rate': 0.5, 'optimizer': 'rmsprop', 'batch_size': 128, 'loss_kfold': 0.789, 'accuracy_kfold': 0.742, 'best_number_epochs': 1, 'n_epochs': 1}


In [63]:
# Neural Network architecture with best hyperparameters combination.
# The model is rebuilt rather than patched: a Keras layer creates its weights
# when it is first called, so assigning `filters` / `kernel_size` on the
# existing Conv1D layer would leave its kernel (and the Dense layer fed by it)
# at the original size, and the selected values would be silently ignored.
# Rebuilding also gives the convolutional and output layers fresh weights for
# the final training.
# NOTE(review): the embedding layer object is reused; if it is trainable and
# kfold_cv updated it in place, its weights carry over — confirm.
best_input = keras.Input(shape=(words_per_sentence,), dtype='int64')
best_features = embedding_layer_glove(best_input)
best_features = keras.layers.Conv1D(
    filters=best_hyperparams['filters'],
    kernel_size=best_hyperparams['kernel_size'],
    activation='relu',
)(best_features)
best_features = keras.layers.GlobalMaxPooling1D()(best_features)
best_features = keras.layers.Dropout(rate=best_hyperparams['rate'])(best_features)
best_output = keras.layers.Dense(number_of_categories, activation='softmax')(best_features)

# Same model name as before so downstream reporting stays unchanged
network = keras.Model(best_input, best_output, name='Conv1D')

# Compiling the network
network.compile(
    loss='categorical_crossentropy',
    optimizer=best_hyperparams['optimizer'],
    metrics=['accuracy'],
)

#### Training

In [64]:
# Fit the compiled network on the whole training set, using the batch size and
# the number of epochs selected by the cross-validation
training_history = network.fit(
    x=feature_train_glove,
    y=label_train,
    batch_size=best_hyperparams['batch_size'],
    epochs=best_hyperparams['best_number_epochs'],
)



#### Testing

In [65]:
# Vectorize the test articles with the vectorizer that was created from the training texts
feature_test_glove = text_vectorization.textVectorization(test_set['text'], text_vectorizer_keras)

# Evaluate on the test set; the result follows the compile order: [loss, accuracy]
score = network.evaluate(feature_test_glove, label_test, verbose=0)

# Performance metrics on the test set
test_loss, test_accuracy = score[0], score[1]

In [67]:
# Test accuracy rounded to three decimals
round(test_accuracy, 3)

0.747