In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
import json
import pickle
from matplotlib import pyplot as plt
data_folder = "data"

In [7]:
df = pd.read_csv(data_folder + "/arxiv-dataset.csv")

In [8]:
inputs = []
for abstract in df["abstract"]:
    inputs.append(abstract)

In [9]:
class_names = sorted(set(df["categoryGroup"]))

In [10]:
index = 0
labels = []
for name in class_names:
    for item in df["categoryGroup"]:
        if item == name:
            labels.append(index)
    index = index + 1

In [11]:
# Number of available messages
data_card = len(inputs)
data_card

77208

In [12]:
# Apply (the same) random shuffle to inputs and targets
rng = np.random.default_rng()
shuffler = rng.permutation(data_card)

In [13]:
inputs = [inputs[i] for i in shuffler]
labels = [labels[i] for i in shuffler]

In [14]:
# Specify the ratio of training-validation data and compute the corresponding number of elements
split_fraction = 0.8
N_train = int(split_fraction * data_card)
N_train

61766

In [15]:
x_train = inputs[0:N_train]
y_train = labels[0:N_train]

In [16]:
x_val = inputs[N_train:]
y_val = labels[N_train:]

In [17]:
len(x_val)

15442

In [18]:
# Convert targets to pure arrays (we will take care of x_train and x_val later)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [19]:
total_words = 20000
words_per_sentence = 200

In [20]:
# Instantiate a vectorizer
vectorizer = TextVectorization(max_tokens=total_words, output_sequence_length=words_per_sentence)

In [21]:
# Create the vocabulary in the vectorizer
vectorizer.adapt(x_train)
# Alternative for very large datasets: feed the corpus by batch
# text_ds = tf.data.Dataset.from_tensor_slices(x_train).batch(128)
# vectorizer.adapt(text_ds)

In [22]:
# Inspect vocabulary (e.g. the first 4 words)
vectorizer.get_vocabulary()[0:5]

['', '[UNK]', 'the', 'of', 'and']

In [23]:
# Test vectorizer (word-to-index vocabulary)
vectorizer([["the cat sat on the sofa"]])

<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
array([[   2, 7925, 6027,   13,    2,    1,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [30]:
# Apply vectorizer to all our data
x_train = vectorizer(x_train).numpy()
x_val = vectorizer(x_val).numpy()

glove.6B.100d.txt troppo grande, si trova su GDrive

In [32]:
embeddings_index = {}
with open(data_folder + '/glove.6B.100d.txt',  encoding="utf8") as f:
  # For each text line
  for line in f:
    # Separate the word-string from the 100-dimensional-vector-string
    word, coeffs = line.split(maxsplit=1)
    # Convert 100-dimensional vector string into a proper floating point vector
    coeffs = np.fromstring(coeffs, "f", sep=" ")
    # Create a new dictionary entry
    embeddings_index[word] = coeffs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [33]:
embedding_dim = 100

In [34]:
# Size of the matrix: 20002 rows x 100 dimensions
#   20000 words from our vocabulary + word separator + [unknown word]
#   100-dimensional representation from GloVe
embedding_matrix = np.zeros((total_words+2, embedding_dim))

# For each word in our vocabulary
for i, word in enumerate(vectorizer.get_vocabulary()):
    # Search corresponding embedding in GloVe,
    # and add it in the correct row of the embedding matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # else: words not found in embedding index will be all-zeros.

In [35]:
# Transfer the embedding matrix to a Keras Embedding layer
embedding_layer = Embedding(
    total_words+2,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

## Model

* Input layer (200 input words/tokens)
* Embedding layer mapping each word(token) to a 100-dimensional vector
* 1-dimensional convolution (mapping 128, filter size 5): perchè ciascun input è una riga di dati
* global max-pooling: andrà ad eliminare parte delle informazioni
* Dropout with 50% probability
* Dense layer mapping to the number of classes

In [36]:
# Define architecture
inputs = keras.Input(shape=(words_per_sentence,), dtype='int64')
x = embedding_layer(inputs)
x = keras.layers.Conv1D(128, 5, activation='relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(20, activation='softmax')(x)
outputs = x
net = keras.Model(inputs, outputs)

We are using 1-dimensional layers (Convolutional and Pooling) because each input is represented by a 1-dimensional vector.
The Dropout is used to fight overfitting. During training 50% of the inputs are dropped (set to 0), in order to create noise in the inputs to the subsequent layers. During test, it does nothing.

In [37]:
# Visualize the defined architecture
net.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 100)          2000200   
                                                                 
 conv1d (Conv1D)             (None, 196, 128)          64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 20)                2580      
                                                             

In [38]:
# Compile neural model
net.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [None]:
# Train
history = net.fit(x_train, y_train, batch_size=128, epochs=15, validation_data=(x_val, y_val))

salvo il modello allenato

In [None]:
net.save(data_folder+"firstModelTrained.h5")

salvo file history.json per grafici

In [None]:
json.dump(history.history, open(data_folder+"/firstModelTrainedHistory.json", 'w'))

leggo il file history.json

In [49]:
history = json.load(open(data_folder+"/firstModelTrainedHistory.json", 'r'))

carico il modello allenato\
(si trova su GDrive)

In [27]:
loaded_model = keras.models.load_model(data_folder+'/FirstModelTrained.h5', compile=False)

Visualize accuracy plots

In [None]:
plt.plot(history['acc'])
plt.plot(history['val_acc'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['train', 'valid'])

<matplotlib.legend.Legend at 0x22b83006f20>

# Inference usage

In [24]:
  # Input as string
input_sentence = 'The article reveals the main theoretical approaches to the analysis and study of the phenomenon of corruption. Special attention is paid to the consideration of the index approach to the analysis of corruption.'

In [25]:
# Input as vocabulary indices
input_vector = vectorizer(input_sentence)
input_vector

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([   2,  163, 1898,    2,  204,  223,  292,    7,    2,   59,    4,
         48,    3,    2,  945,    3, 5127,  363,  810,    9, 3296,    7,
          2, 1985,    3,    2,  520,   58,    7,    2,   59,    3, 5127,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [28]:
# Embedded matrix and output probability distribution
output_probs = loaded_model.predict(input_vector[None, ...])
output_probs



array([[0.22900328, 0.03445108, 0.0217946 , 0.08389115, 0.43906447,
        0.09128293, 0.04134462, 0.05916787]], dtype=float32)

In [29]:
np.argmax(output_probs[0])

4

In [30]:
# Selection of highest probabilty class
class_names[np.argmax(output_probs[0])]

'Physics'