In [163]:
import pandas as pd
import numpy as np

import pickle
import importlib
import keras

import nltk
#nltk.data.path.append('C:\\Users\\della\\anaconda3\\nltk-data')

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pre_processing
#importlib.reload(pre_processing)

import text_vectorization
#importlib.reload(text_vectorization)

import embedding
#importlib.reload(embedding)

import kfold_cv
importlib.reload(kfold_cv)

<module 'kfold_cv' from 'c:\\Users\\della\\Desktop\\arXiv_classification\\kfold_cv.py'>

Reading initial dataset

In [164]:
# Load the raw arXiv dataset from its CSV file (path is relative to the working directory).
arxiv_data = pd.read_csv(filepath_or_buffer = 'data/arxiv-dataset-cat1.csv')

In [165]:
# Number of distinct values in the 'label' column.
# dropna = False keeps the count identical to len(Series.unique()), which also counts NaN.
number_of_categories = arxiv_data['label'].nunique(dropna = False)

Splitting in training and test set

In [166]:
# Hold out 30% of the rows as test set, stratified on 'label', with a fixed seed
# so the split is the same on every run.
train_set, test_set = train_test_split(
    arxiv_data,
    stratify = arxiv_data['label'],
    test_size = 0.3,
    random_state = 19,
)

Pre-processing training set

In [167]:
# The pre-processed training set is cached on disk (presumably because
# pre_processing.dataPreProcessing is expensive — TODO confirm).
# If the cache exists it is loaded as-is. Otherwise the training split is
# pre-processed, written to the cache and read back, so both paths produce a
# frame that went through the same CSV round-trip.
# NOTE(review): the cache is not invalidated when the split changes; delete the
# file to force a rebuild.
processed_train_path = 'data/train-set-cat1-processed.csv'
try:
    train_set = pd.read_csv(processed_train_path)
except FileNotFoundError:
    pre_processing.dataPreProcessing(train_set).to_csv(processed_train_path, index = False)
    train_set = pd.read_csv(processed_train_path)

### Text vectorization and embedding

In [168]:
# Settings shared by the vectorization / embedding cells below.
vocabulary_size = 50_000      # handed to the text-vectorizer builders
words_per_sentence = 200      # length of the integer sequence fed to the network input
embedding_dim = 100           # handed to the embedding-matrix builders

Version 1: Keras

In [169]:
# Create the text vectorizer from the training texts and read back its vocabulary.
text_vectorizer = text_vectorization.createTextVectorizer(
    vocabulary_size,
    words_per_sentence,
    train_set['text'],
)
vocabulary = text_vectorizer.get_vocabulary()

# Build the embedding matrix for that vocabulary and wrap it in an embedding layer.
# NOTE(review): the meaning of the second argument (None) is defined in embedding.py — confirm.
embedding_matrix = embedding.buildEmbeddingMatrix(embedding_dim, vocabulary)
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

# Model inputs (vectorized texts) and targets (one-hot encoded labels, stored as int64).
feature_train = text_vectorization.textVectorization(train_set['text'], text_vectorizer)
label_train = to_categorical(
    train_set['label'],
    num_classes = number_of_categories,
    dtype = 'int64',
)

In [170]:
# Input: one int64 sequence of length `words_per_sentence` per sample.
input_layer = keras.Input(dtype = 'int64', shape = (words_per_sentence,))

# Feature extractor: embedding -> 1D convolution -> global max pooling -> dropout.
embedded = embedding_layer(input_layer)
convolved = keras.layers.Conv1D(activation = 'relu', kernel_size = 5, filters = 128)(embedded)
pooled = keras.layers.GlobalMaxPooling1D()(convolved)
regularized = keras.layers.Dropout(rate = 0.5)(pooled)

# Classifier head: softmax with one unit per category.
output_layer = keras.layers.Dense(number_of_categories, activation = 'softmax')(regularized)

# Assemble the functional model from its input and output tensors.
network = keras.Model(inputs = input_layer, outputs = output_layer)

In [171]:
# List of hyper-parameter configurations handed to the cross-validation routine
# (currently a single configuration).
hyperparams = [
    dict(
        filters = 128,
        kernel_size = 5,
        rate = 0.5,
        optimizer = 'rmsprop',
        batch_size = 128,
    ),
]

In [172]:
# Run cross-validation on the training data for every configuration in `hyperparams`.
# NOTE(review): the first argument (2) is presumably the number of folds — confirm in kfold_cv.py.
# NOTE(review): a single `network` instance is passed in; verify that kfold_cv
# re-initialises the weights for each fold.
kfold_results = kfold_cv.kfoldCrossValidation(
    2,
    feature_train,
    label_train,
    network,
    hyperparams,
)



In [178]:
# Class distribution of the full dataset: number of rows per label, most frequent first.
arxiv_data['label'].value_counts()

4    25500
0    19708
3    16000
5     5000
6     4500
7     3000
2     2000
1     1500
Name: label, dtype: int64

In [209]:
# Number of whitespace-separated words in each text.
# The vectorized .str accessor replaces a per-row Python lambda; a missing text
# yields NaN instead of raising AttributeError.
x = arxiv_data['text'].str.split().str.len()

In [210]:
# Summary statistics of the per-text word counts held in `x`.
x.describe()

count    77208.000000
mean        73.942221
std         40.522713
min          7.000000
25%         41.000000
50%         67.000000
75%        100.000000
max        354.000000
Name: text, dtype: float64

In [205]:
# Keep only the rows whose text has more than 20 words.
# The mask is computed from `arxiv_data` directly, not from the previous value of `x`,
# so the cell is idempotent and can be re-run in any order.
# Label-based `.loc` with a boolean mask replaces `.iloc` fed with index labels,
# which is only correct while the frame has a default 0..n-1 index.
x = arxiv_data.loc[arxiv_data['text'].str.split().str.len() > 20, :]

In [208]:
# Class distribution of the filtered rows (`x` is the filtered DataFrame at this point).
x['label'].value_counts()

4    24847
0    19519
3    15145
5     4990
6     4498
7     2939
2     2000
1     1500
Name: label, dtype: int64

In [204]:
# Index labels of the texts with more than 20 words.
# Recomputed from `arxiv_data` rather than read from `x`: the filtering cell rebinds
# `x` to a DataFrame, so `x > 20` fails when the notebook is run top to bottom.
arxiv_data.loc[arxiv_data['text'].str.split().str.len() > 20].index

Int64Index([    2,     4,     6,     8,    11,    12,    13,    14,    15,
               19,
            ...
            77198, 77199, 77200, 77201, 77202, 77203, 77204, 77205, 77206,
            77207],
           dtype='int64', length=75438)

In [189]:
# Number of texts with fewer than 20 words.
# Recomputed from `arxiv_data` rather than read from `x`: the filtering cell rebinds
# `x` to a DataFrame, so `x < 20` fails when the notebook is run top to bottom.
# NOTE(review): texts with exactly 20 words are counted neither here nor by the `> 20` filter.
int((arxiv_data['text'].str.split().str.len() < 20).sum())

1372

Version 2: Word2Vec

In [None]:
# Build the Word2Vec-based vectorizer from the training texts.
# The helper returns a mapping holding the vectorizer and the word embeddings.
word2vec = text_vectorization.createTextVectorizerWord2Vec(
    train_set['text'],
    vocabulary_size,
    embedding_dim,
)
text_vectorizer = word2vec['text_vectorizer']
vocabulary_embedding = word2vec['vocabulary_embedding']
vocabulary = list(vocabulary_embedding.key_to_index)

# Build the embedding matrix from the Word2Vec vectors and wrap it in an embedding layer.
# NOTE(review): the meaning of the second argument (None) is defined in embedding.py — confirm.
embedding_matrix = embedding.buildingEmbeddingMatrixWord2Vec(
    embedding_dim,
    vocabulary,
    vocabulary_embedding,
)
embedding_layer = embedding.createEmbeddingLayer(embedding_matrix, None)

# Model inputs (vectorized texts) and targets (one-hot encoded labels).
# NOTE(review): the Keras variant passes dtype = 'int64' to to_categorical; this one
# uses the default dtype — confirm which is intended.
feature_train = text_vectorization.textVectorizationWord2Vec(
    train_set['text'],
    text_vectorizer,
    words_per_sentence,
)
label_train = to_categorical(train_set['label'], num_classes = number_of_categories)