In [232]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pre_processing
#importlib.reload(pre_processing)

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

Reading initial dataset

In [233]:
# Load the raw arXiv dataset; later cells read its 'label' column.
arxiv_data = pd.read_csv('data/arxiv-dataset-cat1.csv')

In [234]:
# Number of distinct labels; used as the one-hot width and the size of the
# network's output layer. dropna = False keeps the count identical to
# len(unique()), which also counts a missing label as a value.
number_of_categories = arxiv_data['label'].nunique(dropna = False)

Splitting in training and test set

In [235]:
# 70/30 split, stratified on the label column so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
train_set, test_set = train_test_split(
    arxiv_data,
    test_size = 0.3,
    stratify = arxiv_data['label'],
    random_state = 19,
)

Pre-processing training set

In [236]:
# The pre-processed training set is cached on disk. Reuse the cached file when
# it exists; otherwise rebuild it from the raw training split, so the notebook
# still runs from a checkout that does not ship the cache.
processed_train_path = 'data/train-set-cat1-processed.csv'
try:
    train_set = pd.read_csv(processed_train_path)
except FileNotFoundError:
    pre_processing.dataPreProcessing(train_set).to_csv(processed_train_path, index = False)
    # Read the file back so both paths yield the same CSV-round-tripped frame.
    train_set = pd.read_csv(processed_train_path)

### Text vectorization and embedding

In [237]:
# Hyper-parameters shared by both vectorization / embedding variants below.
vocabulary_size = 50_000    # handed to the text-vectorizer builders
words_per_sentence = 200    # sequence length; also the network's input shape
embedding_dim = 100         # handed to the embedding-matrix builders

Version 1: Keras

In [238]:
# Create the text vectorizer from the training texts and read its vocabulary.
train_texts = train_set['text']
text_vectorizer = text_vectorization.createTextVectorizer(
    vocabulary_size, words_per_sentence, train_texts
)
vocabulary = text_vectorizer.get_vocabulary()

# Build the embedding matrix for that vocabulary and wrap it in a layer.
# NOTE(review): the meaning of the second argument (None) is defined in
# embedding.createEmbeddingLayer, which is not shown here.
embedding_matrix = embedding.buildEmbeddingMatrix(embedding_dim, vocabulary)
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

# Network inputs (vectorized texts) and targets (one-hot encoded labels).
feature_train = text_vectorization.textVectorization(train_texts, text_vectorizer)
label_train = to_categorical(
    train_set['label'],
    num_classes = number_of_categories,
)

In [239]:
# Input layer
input_layer = keras.Input(shape = (words_per_sentence,), dtype = 'int64')

# Embedding layer
x = embedding_layer(input_layer)

# Hidden layers
x = keras.layers.Conv1D(128, 5, activation = 'relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(0.5)(x)

# Output layer
x = keras.layers.Dense(number_of_categories, activation = 'softmax')(x)
output_layer = x

# Neural network model
network = keras.Model(input_layer, output_layer)

In [None]:
# Configure training: categorical cross-entropy matches the one-hot labels
# and the softmax output; accuracy is reported during fitting.
network.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'rmsprop',
    metrics = ['accuracy'],
)

In [None]:
# Train the network.
# `feature_val` / `label_val` are not defined anywhere above this cell, so the
# original `validation_data = (feature_val, label_val)` raised a NameError on
# a fresh kernel. Hold out part of the training data instead: Keras uses the
# last 20% of the samples (taken before shuffling) as the validation set.
# NOTE(review): `validation_split` needs array/tensor inputs - confirm that
# text_vectorization.textVectorization returns one, and swap in a dedicated
# validation set here if one is built later.
network.fit(
    x = feature_train, y = label_train,
    batch_size = 128, epochs = 20,
    validation_split = 0.2
)

In [242]:
# Print the layer-by-layer architecture and parameter counts of the model.
network.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding_24 (Embedding)    (None, 200, 100)          5000200   
                                                                 
 conv1d (Conv1D)             (None, 196, 128)          64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 8)                 1032      
                                                             

Version 2: Word2Vec

In [None]:
# Build the Word2Vec-based vectorizer from the training texts. The helper
# returns a mapping with (at least) 'text_vectorizer' and 'vocabulary_embedding'.
train_texts = train_set['text']
word2vec = text_vectorization.createTextVectorizerWord2Vec(
    train_texts, vocabulary_size, embedding_dim
)
text_vectorizer = word2vec['text_vectorizer']

# The vocabulary is the list of keys of the learned embedding.
# NOTE(review): `.key_to_index` suggests a gensim KeyedVectors object - confirm.
word_vectors = word2vec['vocabulary_embedding']
vocabulary = list(word_vectors.key_to_index)

# Embedding matrix from the learned vectors, wrapped in an embedding layer.
embedding_matrix = embedding.buildingEmbeddingMatrixWord2Vec(
    embedding_dim, vocabulary, word_vectors
)
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

# Network inputs (vectorized texts) and targets (one-hot encoded labels).
# These replace the values produced by the Keras variant above.
feature_train = text_vectorization.textVectorizationWord2Vec(
    train_texts, text_vectorizer, words_per_sentence
)
label_train = to_categorical(
    train_set['label'],
    num_classes = number_of_categories,
)