## Feature-engineering: pre-processing

In [8]:
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
!pip install mosestokenizer
from mosestokenizer import *

# Punctuation characters to strip; the question mark and the exclamation mark
# are deliberately kept as tokens.
useless_punctuation = string.punctuation.replace("?", "").replace("!", "")
# English stop-word list used by the filter below
stop_words = stopwords.words('english')

# The stemmer is created once and shared by all calls instead of being
# re-instantiated for every document.
_STEMMER = PorterStemmer()


def myPreprocess(text):
    """Turn a raw text into a list of stemmed tokens.

    Steps: lowercasing, NLTK word tokenization, removal of stop words and of
    tokens made up only of characters from ``useless_punctuation``, then
    Porter stemming of what is left.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # Sets give O(1) membership tests. They are rebuilt from the module-level
    # values on each call so later changes to those values are still honoured.
    punctuation_chars = set(useless_punctuation)
    stop_set = set(stop_words)

    tokens = nltk.word_tokenize(text.lower())

    # A token is dropped as punctuation when *every* character is unwanted
    # punctuation. The previous substring test against the punctuation string
    # let multi-character tokens such as "...", "--", "``" and "''" through.
    return [
        _STEMMER.stem(token)
        for token in tokens
        if not all(char in punctuation_chars for char in token)
        and token not in stop_set
    ]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Create vocabulary

In [9]:
# extract the vocabulary from the given tokenized texts, returning the
# n_words most frequent words

def extract_vocabulary(word_vector_list, n_words=None):
    """Return the most frequent words found in a collection of token lists.

    Parameters
    ----------
    word_vector_list : iterable of iterable of hashable
        One token sequence per document.
    n_words : int or None, optional
        Maximum number of words to return. ``None`` returns every distinct
        word. If fewer distinct words exist, all of them are returned.

    Returns
    -------
    list
        Words sorted by decreasing frequency; words with the same frequency
        keep the order in which they were first encountered.

    Raises
    ------
    ValueError
        If ``n_words`` is negative (it used to silently drop the *least*
        frequent words from the end of the list instead).
    """
    from collections import Counter
    from itertools import chain

    if n_words is not None and n_words < 0:
        raise ValueError("n_words must be non-negative, got {}".format(n_words))

    word_counts = Counter(chain.from_iterable(word_vector_list))
    return [word for word, _ in word_counts.most_common(n_words)]

# Framework for variable vocabulary size

The vocabulary extractor is integrated in the vectorizer

In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder


import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras import backend as K
import tensorflow.keras as keras


Loading dataset, removing rows with NaN and splitting 60/20/20

In [23]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the shuffle and both splits are identical on every run
RANDOM_SEED = 42

df = pd.read_csv("NYdataset.csv", encoding="latin1")
# removing every row that contains at least one NaN
df = df.dropna()

# random shuffling (seeded)
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# the class we'll have to predict is "majortopic"
# (column vector, as expected by OneHotEncoder later on)
y = np.array(df["majortopic"]).reshape(-1, 1)

# merging title and summary into a single text, then pre-processing it;
# every other column is unused, so X is directly a Series of token lists
X = (df["title"] + " " + df["summary"]).rename("text").apply(myPreprocess)


# splitting 80/20
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_SEED)

# splitting the 80 in 60/20 -> 60/20/20
# (0.25 of the remaining 80% is 20% of the whole dataset)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    stratify=y_train,
    test_size=0.25,
    random_state=RANDOM_SEED)

# sanity check: the three splits partition the dataset
assert len(X_train) + len(X_val) + len(X_test) == len(X)


In [26]:
#######################################################
#
# basic NN with 2 hidden layers
#    - regularizer: factor for the l2 regularizer
#       if set to None, no regularization is set
#
#######################################################

def NN_3(input_shape, num_classes, regularizer=None):
    """Build a feed-forward classifier with two hidden layers of 200 units.

    Parameters
    ----------
    input_shape : int
        Number of input features.
    num_classes : int
        Number of output classes (size of the softmax layer).
    regularizer : float or None, optional
        L2 factor applied to the kernel, bias and activity of every layer.
        ``None`` disables regularization entirely.

    Returns
    -------
    keras.models.Sequential
        The model, not compiled.
    """

    def l2_or_none():
        # keras.regularizers.l2(None) does NOT switch regularization off
        # (depending on the Keras version it falls back to the default
        # factor), so None has to be handled explicitly here.
        if regularizer is None:
            return None
        return keras.regularizers.l2(regularizer)

    def regularizers():
        # a fresh regularizer object for each use, as in a hand-written layer
        return dict(kernel_regularizer=l2_or_none(),
                    bias_regularizer=l2_or_none(),
                    activity_regularizer=l2_or_none())

    model = keras.models.Sequential([
        Dense(200, activation='relu',
              input_shape=[input_shape],
              **regularizers()),
        Dense(200, activation='relu',
              **regularizers()),
        Dense(num_classes, activation='softmax',
              **regularizers()),
    ])

    return model

In [27]:
# identity function, passed to CountVectorizer to skip its own
# preprocessing and tokenization
def dummy(doc):
    """Return the document exactly as it was received."""
    return doc

# suppressing useless tensorflow warnings
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below in place of ``warnings.warn`` so that every call made
    through that attribute is silently discarded.
    """
    return None


# from now on, warnings.warn(...) is a no-op for the whole session
warnings.warn = warn



### TRAINING PARAMETERS ###

batch_size = 32
epochs = 200

vocabulary_sizes = [100, 1000, 10000, 100000]
# NB: 10e-2 == 0.1, 10e-4 == 0.001, ... (i.e. not 1e-2, 1e-4, ...)
l2_reg = [10e-2, 10e-4, 10e-6, 10e-8, 10e-10, 0]

### INITIALIZING MATRICES FOR ACCURACIES AND F1-SCORES ###
# rows: vocabulary sizes, columns: l2 factors
accuracies_val = np.zeros((len(vocabulary_sizes), len(l2_reg)))
f1_scores_val = np.zeros((len(vocabulary_sizes), len(l2_reg)))

accuracies = np.zeros((len(vocabulary_sizes), len(l2_reg)))
f1_scores = np.zeros((len(vocabulary_sizes), len(l2_reg)))


### LABEL ENCODING ###
# encoding classes with one hot encoding for softmax.
# The labels depend on neither hyper-parameter, so they are encoded once
# here instead of once per (vocabulary size, l2) combination.
# NOTE(review): recent scikit-learn releases rename `sparse` to
# `sparse_output`; adjust this call when upgrading.
one_hot_encoder = OneHotEncoder(sparse=False,
                                handle_unknown='ignore')
one_hot_encoder.fit(y_train)

y_train_enc = one_hot_encoder.transform(y_train)
y_val_enc = one_hot_encoder.transform(y_val)
y_test_enc = one_hot_encoder.transform(y_test)

NUM_CLASSES = y_train_enc.shape[1]


for index_voc, vocabulary_size in enumerate(vocabulary_sizes):
    print('#'*70)

    # The features depend only on the vocabulary size, so they are built
    # once per vocabulary size and shared by every l2 value.

    # extracting vocabulary of size *vocabulary_size*
    vocabulary = extract_vocabulary(X_train, vocabulary_size)

    # bag of words with sum pooling and restricted vocabulary
    # (the texts are already tokenized, hence the identity callables)
    vectorizer = CountVectorizer(token_pattern=None,
                                 tokenizer=dummy,
                                 preprocessor=dummy,
                                 vocabulary=vocabulary)
    vectorizer.fit(X_train)

    X_train_vec = vectorizer.transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    X_test_vec = vectorizer.transform(X_test)

    for index_reg, regularizer in enumerate(l2_reg):

        # releasing the global Keras state left behind by the previous
        # model, so that memory does not grow over the whole grid
        tf.keras.backend.clear_session()

        # model creation
        model = NN_3(X_train_vec.shape[1], NUM_CLASSES, regularizer)

        ### CALLBACKS ###
        # (a ModelCheckpoint used to be built here but was never passed to
        # fit(), so it has been removed)

        red_plateau = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2)

        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=2)

        # NOTE(review): both callbacks use patience=2, so the learning rate
        # is reduced on the very epoch at which training stops; and since
        # restore_best_weights is not set, the weights evaluated below are
        # those of the last epoch, not of the best one. Left unchanged to
        # keep the experiment as it was designed -- confirm this is intended.
        callbacks = [early_stop, red_plateau]

        ### END CALLBACKS ###

        ### METRICS ###
        # the order matters: score[1] is the accuracy, score[2] the F1
        acc_metric = tf.keras.metrics.CategoricalAccuracy()

        f1_metric = tfa.metrics.F1Score(num_classes=NUM_CLASSES,
                                        average='macro')

        metrics = [acc_metric, f1_metric]

        ### END METRICS ###

        # model training
        adam = tf.optimizers.Adam()

        model.compile(loss="categorical_crossentropy",
                      optimizer=adam,
                      metrics=metrics)

        model.fit(
              X_train_vec,
              y_train_enc,
              batch_size=batch_size,
              epochs=epochs,
              verbose=0,
              validation_data=(X_val_vec, y_val_enc),
              callbacks=callbacks)

        # storing results on validation (for selecting the l2 reg value)
        score = model.evaluate(X_val_vec, y_val_enc, verbose=0)
        accuracies_val[index_voc, index_reg] = score[1]
        f1_scores_val[index_voc, index_reg] = score[2]

        print("VAL =>  VOC: {}\t L2: {}\t LOSS: {}\t ACC: {}\t F1: {}".format(
                                                           vocabulary_size,
                                                           regularizer,
                                                           round(score[0],4),
                                                           round(score[1],4),
                                                           round(score[2],4)))

        # and related results on test set (so that I don't have to evaluate
        # them later)
        score = model.evaluate(X_test_vec, y_test_enc, verbose=0)
        accuracies[index_voc, index_reg] = score[1]
        f1_scores[index_voc, index_reg] = score[2]

        print("TEST =>  VOC: {}\t L2: {}\t LOSS: {} \t ACC: {}\t F1: {}".format(
                                                           vocabulary_size,
                                                           regularizer,
                                                           round(score[0],4),
                                                           round(score[1],4),
                                                           round(score[2],4)))

print("FINAL MATRICES")

print(accuracies_val)
print(f1_scores_val)

print(accuracies)
print(f1_scores)

######################################################################
VAL =>  VOC: 100	 L2: 0.1	 LOSS: 3.2096	 ACC: 0.2048	 F1: 0.0121
TEST =>  VOC: 100	 L2: 0.1	 LOSS: 3.2097 	 ACC: 0.2048	 F1: 0.0121
VAL =>  VOC: 100	 L2: 0.001	 LOSS: 2.1644	 ACC: 0.4349	 F1: 0.1837
TEST =>  VOC: 100	 L2: 0.001	 LOSS: 2.1686 	 ACC: 0.4345	 F1: 0.1857
VAL =>  VOC: 100	 L2: 1e-05	 LOSS: 1.9752	 ACC: 0.4349	 F1: 0.2234
TEST =>  VOC: 100	 L2: 1e-05	 LOSS: 1.9753 	 ACC: 0.4344	 F1: 0.2205
VAL =>  VOC: 100	 L2: 1e-07	 LOSS: 1.9635	 ACC: 0.4326	 F1: 0.2181
TEST =>  VOC: 100	 L2: 1e-07	 LOSS: 1.962 	 ACC: 0.4378	 F1: 0.2253
VAL =>  VOC: 100	 L2: 1e-09	 LOSS: 1.9632	 ACC: 0.4387	 F1: 0.2226
TEST =>  VOC: 100	 L2: 1e-09	 LOSS: 1.9711 	 ACC: 0.4442	 F1: 0.219
VAL =>  VOC: 100	 L2: 0	 LOSS: 1.9612	 ACC: 0.4444	 F1: 0.2203
TEST =>  VOC: 100	 L2: 0	 LOSS: 1.9611 	 ACC: 0.4476	 F1: 0.214
######################################################################
VAL =>  VOC: 1000	 L2: 0.1	 LOSS: 3.2096	 ACC: 0.2048	 F1

In [None]:
# Writing each result matrix to its own CSV file. The matrices are
# transposed first, so each row of a file is one l2 value and each column
# one vocabulary size.
for csv_name, result_matrix in (
        ('acc_val.csv', accuracies_val),
        ('f1score_val.csv', f1_scores_val),
        ('acc.csv', accuracies),
        ('f1score.csv', f1_scores)):
    np.savetxt(csv_name, result_matrix.T, delimiter=",")