In [None]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pre_processing
#importlib.reload(pre_processing)

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

import kfold_cv
#importlib.reload(kfold_cv)

import data_augmentation
importlib.reload(data_augmentation)

#### *Reading initial dataset*

In [None]:
# Load the initial arXiv dataset from its CSV file
arxiv_data = pd.read_csv('data/arxiv-dataset-cat1.csv')

# Number of distinct values in the 'label' column (missing values, if any, count as one)
number_of_categories = arxiv_data['label'].nunique(dropna = False)

#### *Splitting in training and test set*

The initial dataset is partitioned into a training set and a test set, with the following strategy:
- **70%** training and **30%** test;
- **Stratified sampling** based on the value of the label (category).

In [None]:
# 70% / 30% split, stratified on the label; the fixed random_state makes it repeatable
train_set, test_set = train_test_split(
    arxiv_data,
    test_size = 0.3,
    stratify = arxiv_data['label'],
    random_state = 19
)

#### *Pre-processing on training set*

A pre-processing phase is applied to the textual observations of the training set, with the following operations:
- Converting in **lower case**;
- Removing **special characters and symbols**;
- Removing **stop words**;
- **Lemmatization**.

Then, the labels associated with the articles' categories are converted into a **one-hot representation**.

*The processed dataset has been saved in a CSV file, in such a way that it is not necessary to repeat the time-consuming procedure each time the code is executed.*

In [None]:
# The pre-processing is time-consuming, so its result is cached in a CSV file.
# The cache is built only when the file is missing; otherwise it is simply loaded.
# This keeps the notebook runnable from a fresh checkout without uncommenting code.
processed_train_path = 'data/train-set-cat1-processed.csv'

try:
    train_set = pd.read_csv(processed_train_path)
except FileNotFoundError:
    # Here 'train_set' is still the raw training split produced by train_test_split
    pre_processing.dataPreProcessing(train_set).to_csv(processed_train_path, index = False)
    # Read the file back so that both paths yield exactly the same DataFrame
    train_set = pd.read_csv(processed_train_path)

# One-hot encoding of the labels
label_train = to_categorical(train_set['label'], num_classes = number_of_categories, dtype = 'int64')

#### *Data augmentation*
We have decided to use some **data augmentation techniques** with the aim of balancing the training set, considering these issues:
- **Number of words per observation**

    About this problem, a **threshold** of at least **15 words per observation** has been set. In order to obtain this result, the **Random Insertion** technique has been used.\
    Our developed **algorithm** follows these steps:
    
    1. For each observation that has fewer words than the threshold, get a **random sample of 5 words** from the initial text;
    2. Then, get the **5 most similar words** for each of the previously retrieved tokens and from this set, get **n random words** (we set *n* equal to *15*);
    3. Insert the **n words** in random positions within the starting text and substitute the augmented text in the train set;
    4. Redo the **pre-processing phase**, because the random insertion could have included words we do not accept in our context (like stop words). In this way, we also apply the formatting;
    5. **Filter** the training set again to find the observations that are still under the threshold (pre-processing could have deleted some words);
    6. Repeat the algorithm until there are no more observations to augment.

In [None]:
# Augment the observations that are too short, working on a copy so that 'train_set' is left untouched.
train_set_augmented = train_set.copy()

# Count the number of whitespace-separated words of each observation
# NOTE(review): assumes every 'text' value is a string; a missing value would fail on .split() - confirm
train_set_augmented['words'] = train_set_augmented['text'].apply(lambda row: len(row.split()))

# Minimum number of words required per observation
min_number_words = 15

# Observations below the threshold (the ones that need data augmentation)
observations_to_augment = train_set_augmented[train_set_augmented['words'] < min_number_words]

# Repeat until no observation is below the threshold
# NOTE(review): the loop only terminates if every short text eventually reaches the threshold - confirm
while(len(observations_to_augment) != 0):

    # Build the augmented texts via random insertion (the threshold is passed as second argument)
    obs_augmented = observations_to_augment['text'].apply(lambda row: data_augmentation.randomInsert(row, min_number_words))

    # For each augmented observation, replace the text in the training set and redo the pre-processing
    for i in obs_augmented.index:

        train_set_augmented.loc[i,'text'] = obs_augmented[i]
        # The row is wrapped in a one-row DataFrame because dataPreProcessing is called on a DataFrame
        train_set_augmented.loc[i,'text'] = pre_processing.dataPreProcessing(pd.DataFrame(train_set_augmented.loc[i,:]).transpose()).loc[i,'text']
        # Refresh the word count, since the text has just changed
        train_set_augmented.loc[i,'words'] = len(train_set_augmented.loc[i,'text'].split())

    # Look again for observations that are still below the threshold
    observations_to_augment = train_set_augmented[train_set_augmented['words'] < min_number_words]

# Drop the helper column and write the CSV (the previous operations are computationally expensive)
train_set_augmented.drop('words', axis = 1, inplace = True)
train_set_augmented.to_csv('data/train-set-cat1-augmented-words-obs.csv', index = False)

In [142]:
# Load the training set written by the word-count augmentation step
train_set_augmented = pd.read_csv(
    'data/train-set-cat1-augmented-words-obs.csv'
)

- **Class imbalance management**

    ...

In [144]:
# Relative frequency of each label in the augmented training set.
# normalize = True divides the counts by the size of this same Series, instead of
# relying on 'train_set' having exactly as many rows as 'train_set_augmented'.
train_set_augmented['label'].value_counts(normalize = True).round(2)

4    0.33
0    0.26
3    0.21
5    0.06
6    0.06
7    0.04
2    0.03
1    0.02
Name: label, dtype: float64

#### *Text vectorization and embedding*

First of all, the following **parameters** are defined:
- **Size of the vocabulary** to create;
- **Number of words** considered for each text (article);
- **Dimension of the embedding**;

In [None]:
# Vocabulary size, number of words kept per text and embedding dimension
vocabulary_size, words_per_sentence, embedding_dim = 50_000, 200, 100

Then, we have opted for **two different approaches**:

- **Version 1: Keras vectorization and GloVe embedding**\
In this scenario, the vectorization (and so the creation of the vocabulary) is carried out using the **Keras built-in function**, with the final adaptation of the text vectorizer to the training set. For the embedding matrix, we have used a pre-trained solution, named **GloVe**, with 100 dimensions.

In [None]:
# Text vectorizer created from the training texts, and the vocabulary it exposes
text_vectorizer_keras = text_vectorization.createTextVectorizer(
    vocabulary_size,
    words_per_sentence,
    train_set['text']
)
vocabulary_keras = text_vectorizer_keras.get_vocabulary()

# Embedding matrix built for that vocabulary, then wrapped in an embedding layer
embedding_matrix_glove = embedding.buildEmbeddingMatrix(
    embedding_dim,
    vocabulary_keras
)
embedding_layer_glove = embedding.createEmbeddingLayer(
    embedding_matrix_glove,
    None
)

# Training texts passed through the vectorizer
feature_train_glove = text_vectorization.textVectorization(
    train_set['text'],
    text_vectorizer_keras
)

- **Version 2: Word2Vec vectorization and embedding**\
The second strategy plans to use a text vectorizer, a vocabulary and an embedding using a **Word2Vec model** directly trained and created on our training set.

In [None]:
# Word2Vec artifacts created from the training texts
word2vec = text_vectorization.createTextVectorizerWord2Vec(
    train_set['text'],
    vocabulary_size,
    embedding_dim
)
text_vectorizer_word2vec = word2vec['text_vectorizer']

# The vocabulary is the list of keys of the word-vector mapping
word_vectors = word2vec['vocabulary_embedding']
vocabulary_word2vec = list(word_vectors.key_to_index)

# Embedding matrix built from the word vectors, then wrapped in an embedding layer
embedding_matrix_word2vec = embedding.buildingEmbeddingMatrixWord2Vec(
    embedding_dim,
    vocabulary_word2vec,
    word_vectors
)
embedding_layer_word2vec = embedding.createEmbeddingLayer(
    embedding_matrix_word2vec,
    None
)

# Training texts passed through the Word2Vec vectorizer
feature_train_word2vec = text_vectorization.textVectorizationWord2Vec(
    train_set['text'],
    text_vectorizer_word2vec,
    words_per_sentence
)

#### *Neural network architecture*

In [None]:
# Input layer
input_layer = keras.Input(shape = (words_per_sentence,), dtype = 'int64')

# Embedding layer
x = embedding_layer_glove(input_layer)

# Hidden layers
x = keras.layers.Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(rate = 0.5)(x)

# Output layer
x = keras.layers.Dense(number_of_categories, activation = 'softmax')(x)
output_layer = x

# Neural network model
network = keras.Model(input_layer, output_layer)

#### *K-Fold Cross Validation*

In [None]:
# Hyperparameter configurations to compare: they differ only in the optimizer
hyperparams = [
    {
        'filters': 128,
        'kernel_size': 5,
        'rate': 0.5,
        'optimizer': optimizer_name,
        'batch_size': 128
    }
    for optimizer_name in ('rmsprop', 'adam')
]

In [None]:
# 'network' is built on the GloVe embedding layer, so it is validated on the features
# produced by the matching Keras vectorizer ('feature_train' was never defined).
# NOTE(review): the first argument (2) is presumably the number of folds - confirm in kfold_cv
kfold_results = kfold_cv.kfoldCrossValidation(2, feature_train_glove, label_train, network, hyperparams)