In [None]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pre_processing
#importlib.reload(pre_processing)

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

import kfold_cv
importlib.reload(kfold_cv)

#### *Reading initial dataset*

In [None]:
arxiv_data = pd.read_csv('data/arxiv-dataset-cat1.csv')

# Number of different categories
number_of_categories = len(arxiv_data['label'].unique())

#### *Splitting in training and test set*

The initial dataset is partitioned in training and test set, with the following strategy:
- **70%** training and **30%** test;
- **Stratified sampling** based on the value of the label (categeory).

In [None]:
train_set, test_set = train_test_split(arxiv_data, test_size = 0.3, stratify = arxiv_data['label'], random_state = 19)

In [32]:
import nltk
import random
import numpy as np
import pickle
from nltk.corpus import wordnet

with open('../glove-embedding.pkl', 'rb') as file:
    glove_embedding = pickle.load(file)

In [114]:
def getSimilarWords(tokenized_text):

    similar_words_total = []

    for word in tokenized_text:

        try:

            word_embedding = glove_embedding[word]
            distances = np.dot(list(glove_embedding.values()), word_embedding)
            most_similar_indices = np.argsort(-distances)[:5]
            similar_words = [ list(glove_embedding.keys())[i] for i in most_similar_indices ]

            similar_words_total = similar_words_total + similar_words
        
        except KeyError:

            similar_words_total = similar_words_total + []

    return similar_words_total

In [117]:
def randomInsert(text, n):

    index = 0
    tokenized_text = random.sample(nltk.word_tokenize(text), k = 5)
    augmented_tokenized_text = tokenized_text.copy()

    similar_words = random.sample(getSimilarWords(tokenized_text), k = n)

    while(index < n):

        new_word_index = random.randint(0, len(augmented_tokenized_text) - 1)
        augmented_tokenized_text.insert(new_word_index, similar_words[index])

        index = index + 1

    return " ".join(augmented_tokenized_text)

In [76]:
original_text = "The cat is sitting on the mat"

augmented_texts = randomInsert(original_text, n = 15)

print(augmented_texts)

The this cat in an mats of an be is that that has sitting i on dog cats sat the on mat


In [None]:
train_set_augmented

- Class imbalance
- Number of words per class

In [None]:
# NUMBER OF WORDS IMBALANCE
x = train_set['text'].apply(lambda row: len(row.split()))

In [None]:
# CLASS IMBALANCE
round(train_set['label'].value_counts() / sum(train_set['label'].value_counts()), 2)

In [None]:
train_set['words'] = x

In [None]:
y = train_set.groupby('label')['words'].sum().reset_index()

In [None]:
y['words'] / y['words'].sum()

In [None]:
train_set[train_set['words'] < 20]['words']

In [None]:
x

In [None]:
x = arxiv_data.iloc[x[x > 20].index, :]

#### *Pre-processing on training set*

A phase of pre-processing is applied to the textual observation of the training set, with the following operations:
- Converting in **lower case**;
- Removing **special characters and symbols**;
- Removing **stop words**;
- **Lemmatization**.

Therefore, the labels associated with articles' categories are converted in **one-hot representation**.

*The processed dataset has been saved in a CSV file, in such a way that it is not necessary to repeat the time-consuming procedure each time the code is executed.*

In [66]:
#train_set_processed = pre_processing.dataPreProcessing(train_set)
#train_set_processed.to_csv('data/train-set-cat1-processed.csv', index = False)

train_set = pd.read_csv('data/train-set-cat1-processed.csv')

# One-hot encoding of the labels
label_train = to_categorical(train_set['label'], num_classes = number_of_categories, dtype = 'int64') 

#### *Data augmentation*
We have decided to use some **data augmentation techniques** with the aim of balancing the training set, considering these issues:
- **Number of words per observation**\
About this problem, a **threshold** of at least **15 words per observation** has been setted. In order to obtain this result, the **Random Insertion** technique has been used.

In [151]:
train_set_augmented = train_set.copy()

# Count the number of words for each observation
train_set_augmented['words'] = train_set_augmented['text'].apply(lambda row: len(row.split()))

# Min number of words per observation
min_number_words = 15

# Set of observations with less than the threshold (needed data augmentation)
observations_to_augment = train_set_augmented[train_set_augmented['words'] < min_number_words]

#while(len(observations_to_augment) != 0):



In [130]:
len(observations_to_augment)

1671

In [118]:
x = observations_to_augment['text'].iloc[1:10].apply(lambda row: randomInsert(row, min_number_words))

In [156]:
x

8      be kuh n't i commutative sah hypersemigroups a...
42     domenico summary giuseppe contemporary mechani...
61     fax c = group group lopez f optical holographi...
77     einstein pictorial make 1-column pictorial phy...
91     mitul semigroups awards homotopy polytopes sem...
163    axis algebraic spheres linked suspected abelia...
167    research meaning kerner sexual mental cognitiv...
275    lengthwise crosswise shake slice you me crossw...
347    build shelah built shelah abelian formula_2 co...
Name: text, dtype: object

In [152]:
for f in x.index:

    observations_to_augment.loc[f,'text'] = x[f]

In [153]:
observations_to_augment.loc[:,'words'] = observations_to_augment['text'].apply(lambda row: len(row.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  observations_to_augment.loc[:,'words'] = observations_to_augment['text'].apply(lambda row: len(row.split()))


In [154]:
observations_to_augment[observations_to_augment['words'] < min_number_words]

Unnamed: 0,text,label,words
5,three analogue stern diatomic sequence sam nor...,3,13
420,veritas collaboration contribution st internat...,4,14
453,figure surgery stimes motoo tange paper withdr...,3,9
473,algebra infinite qubit system g sardanashvily ...,3,14
508,smooth homotopy sphere akio kawauchi every smo...,3,11
...,...,...,...
53903,mathematics liquid crystal john ball review gi...,4,14
53916,mathematical caricature large wave mikhail kov...,4,14
53933,supersymmetry dimensional system enrico deotto...,4,11
53962,inversion adjunction log canonicity masayuki k...,3,11


In [155]:
observations_to_augment.loc[8,:]

text     be kuh n't i commutative sah hypersemigroups a...
label                                                    3
words                                                   20
Name: 8, dtype: object

In [109]:
observations_to_augment.loc[8, 'text'] = x[8]

In [110]:
observations_to_augment.loc[8, 'text']

'our feudal semigroups insectivores semigroups hypersemigroups niovi serf i kehayopulu matroid paper je are lattices newspapers movie serf example show way pas it you that hah semigroups gammasemigroups hypersemigroups'

In [101]:
observations_to_augment.loc[8] = x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  observations_to_augment.loc[8, 0] = x


ValueError: Incompatible indexer with Series

In [93]:
x

8    use semigroups semigroups hypersemigroups niov...
Name: text, dtype: object

In [89]:
observations_to_augment['text'].iloc[1:2][8]

'semigroups semigroups hypersemigroups niovi kehayopulu paper serf example show way pas semigroups gammasemigroups hypersemigroups'

#### *Text vectorization and embedding*

Firstable the following **parameters** are defined:
- **Size of the vocabulary** to create;
- **Number of words** considered for each text (article);
- **Dimension of the embedding**;

In [None]:
vocabulary_size = 50000
words_per_sentence = 200
embedding_dim = 100

Then, we have opted for **two different approaches**:

- **Version 1: Keras vectorization and GloVe embedding**\
In this scenario, the vectorization (and so the creation of the vocabulary) is carried out using the **Keras built-in function**, with the final adaption of the text vectorizer on the training set. For the embedding matrix, we have used a pre-trained solution, named **GloVe**, with 100 dimensions.

In [None]:
text_vectorizer_keras = text_vectorization.createTextVectorizer(vocabulary_size, words_per_sentence, train_set['text'])
vocabulary_keras = text_vectorizer_keras.get_vocabulary()

embedding_matrix_glove = embedding.buildEmbeddingMatrix(embedding_dim, vocabulary_keras)
embedding_layer_glove = embedding.createEmbeddingLayer(embedding_matrix_glove, None)

feature_train_glove = text_vectorization.textVectorization(train_set['text'], text_vectorizer_keras)

- **Version 2: Word2Vec vectorization and embedding**\
The second strategy plans to use a text vectorizer, a vocabulary and an embedding using a **Word2Vec model** directly trained and created on our training set.

In [None]:
word2vec = text_vectorization.createTextVectorizerWord2Vec(train_set['text'], vocabulary_size, embedding_dim)
text_vectorizer_word2vec = word2vec['text_vectorizer']
vocabulary_word2vec = list(word2vec['vocabulary_embedding'].key_to_index)

embedding_matrix_word2vec = embedding.buildingEmbeddingMatrixWord2Vec(embedding_dim, vocabulary_word2vec, word2vec['vocabulary_embedding'])
embedding_layer_word2vec = embedding.createEmbeddingLayer(embedding_matrix_word2vec, None)

feature_train_word2vec = text_vectorization.textVectorizationWord2Vec(train_set['text'], text_vectorizer_word2vec, words_per_sentence)

#### *Neural network architecture*

In [None]:
# Input layer
input_layer = keras.Input(shape = (words_per_sentence,), dtype = 'int64')

# Embedding layer
x = embedding_layer_glove(input_layer)

# Hidden layers
x = keras.layers.Conv1D(filters = 128, kernel_size = 5, activation = 'relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(rate = 0.5)(x)

# Output layer
x = keras.layers.Dense(number_of_categories, activation = 'softmax')(x)
output_layer = x

# Neural network model
network = keras.Model(input_layer, output_layer)

#### *K-Fold Cross Validation*

In [None]:
hyperparams = [{

    'filters': 128,
    'kernel_size': 5,
    'rate': 0.5,
    'optimizer': 'rmsprop',
    'batch_size': 128

}, 

{

    'filters': 128,
    'kernel_size': 5,
    'rate': 0.5,
    'optimizer': 'adam',
    'batch_size': 128

}]

In [None]:
kfold_results = kfold_cv.kfoldCrossValidation(2, feature_train, label_train, network, hyperparams)