In [None]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

import kfold_cv
#importlib.reload(kfold_cv)

We also compute the **number of unique categories**, which will be needed later in the project.

In [None]:
# Load, in order: the processed training set, its augmented variant and the
# processed test set.
train_set, train_set_augmented, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train-set-cat1-processed.csv',
        'data/train-set-cat1-augmented.csv',
        'data/test-set-cat1-processed.csv',
    )
)

In [None]:
# Count the distinct values found in the 'label' column of the training set.
# dropna=False keeps a missing label (if any) counted as one value, which is
# what len(Series.unique()) does.
number_of_categories = train_set['label'].nunique(dropna=False)

#### *Text vectorization and embedding*

Firstly, the following **parameters** are defined:
- **Size of the vocabulary** to create;
- **Number of words** considered for each text (article);
- **Dimension of the embedding**.

In [33]:
# Settings shared by both vectorization/embedding approaches below.
vocabulary_size = 50_000    # size of the vocabulary to create
words_per_sentence = 200    # number of words considered for each text (article)
embedding_dim = 100         # dimension of the embedding vectors

Then, we have opted for **two different approaches**:

- **Version 1: Keras vectorization and GloVe embedding**\
In this scenario, the vectorization (and thus the creation of the vocabulary) is carried out using the **Keras built-in function**, with the text vectorizer then adapted on the training set. For the embedding matrix, we use pre-trained **GloVe** vectors with 100 dimensions.

In [None]:
# Version 1 - Keras vectorizer + GloVe embedding.
train_texts = train_set['text']

# Text vectorizer created from the training texts, and the vocabulary it exposes.
text_vectorizer_keras = text_vectorization.createTextVectorizer(
    vocabulary_size,
    words_per_sentence,
    train_texts,
)
vocabulary_keras = text_vectorizer_keras.get_vocabulary()

# Embedding matrix built for that vocabulary, then the embedding layer created from it.
# NOTE(review): the meaning of the second argument (None) is defined in
# embedding.createEmbeddingLayer, which is not visible here - confirm.
embedding_matrix_glove = embedding.buildEmbeddingMatrix(
    embedding_dim,
    vocabulary_keras,
)
embedding_layer_glove = embedding.createEmbeddingLayer(
    embedding_matrix_glove,
    None,
)

# Training texts passed through the Keras vectorizer.
feature_train_glove = text_vectorization.textVectorization(
    train_texts,
    text_vectorizer_keras,
)

- **Version 2: Word2Vec vectorization and embedding**\
The second strategy builds the text vectorizer, the vocabulary and the embedding from a **Word2Vec model** trained directly on our training set.

In [None]:
# Version 2 - Word2Vec vectorizer + embedding, both derived from the training texts.
train_texts = train_set['text']

# The helper returns a mapping; the two entries read below are its
# 'text_vectorizer' and 'vocabulary_embedding' items.
word2vec = text_vectorization.createTextVectorizerWord2Vec(
    train_texts,
    vocabulary_size,
    embedding_dim,
)
text_vectorizer_word2vec = word2vec['text_vectorizer']
word2vec_vectors = word2vec['vocabulary_embedding']

# Vocabulary = the keys of the embedding's key_to_index mapping, in iteration order.
vocabulary_word2vec = list(word2vec_vectors.key_to_index)

# Embedding matrix for that vocabulary, then the embedding layer created from it.
# NOTE(review): the meaning of the second argument (None) is defined in
# embedding.createEmbeddingLayer, which is not visible here - confirm.
embedding_matrix_word2vec = embedding.buildingEmbeddingMatrixWord2Vec(
    embedding_dim,
    vocabulary_word2vec,
    word2vec_vectors,
)
embedding_layer_word2vec = embedding.createEmbeddingLayer(
    embedding_matrix_word2vec,
    None,
)

# Training texts passed through the Word2Vec vectorizer.
feature_train_word2vec = text_vectorization.textVectorizationWord2Vec(
    train_texts,
    text_vectorizer_word2vec,
    words_per_sentence,
)

#### *Neural network architecture*

In [None]:
# Input: one sequence of `words_per_sentence` integer token ids per sample.
input_layer = keras.Input(shape=(words_per_sentence,), dtype='int64')

# Layers applied, in order, on top of the GloVe embedding:
# convolution -> global max pooling -> dropout -> softmax classifier.
layer_stack = [
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(number_of_categories, activation='softmax'),
]

# Thread the tensor through the embedding and then through every layer of the stack.
x = embedding_layer_glove(input_layer)
for layer in layer_stack:
    x = layer(x)
output_layer = x

# Functional model mapping the token ids to the per-category probabilities.
network = keras.Model(input_layer, output_layer)

#### *K-Fold Cross Validation*

In [None]:
# Hyperparameter configurations to evaluate: identical settings, one entry per optimizer.
hyperparams = [
    {
        'filters': 128,
        'kernel_size': 5,
        'rate': 0.5,
        'optimizer': optimizer_name,
        'batch_size': 128,
    }
    for optimizer_name in ('rmsprop', 'adam')
]

In [None]:
# Run the project's k-fold cross-validation helper on `network` with every
# configuration in `hyperparams` (the leading 2 is presumably the number of
# folds - confirm against kfold_cv.kfoldCrossValidation).
# NOTE(review): `feature_train` and `label_train` are not defined in any cell
# visible above, so this cell raises NameError after Restart Kernel -> Run All.
# `network` is built on `embedding_layer_glove`, so the features are presumably
# `feature_train_glove`; the labels presumably derive from `train_set['label']`,
# but the encoding expected by kfold_cv is not visible here - define both
# explicitly in a preceding cell.
kfold_results = kfold_cv.kfoldCrossValidation(2, feature_train, label_train, network, hyperparams)