# Text Classification of StackOverflow using BiGRU RNNs with deep self-attention

In [1]:
import sys
import os
from os import pardir, getcwd
from os.path import join, abspath
# Put the parent of the current working directory first on the import path;
# presumably this is where the project modules imported below live.
PARENT_DIRECTORY = abspath(join(getcwd(), pardir))
sys.path.insert(0, PARENT_DIRECTORY)

import warnings
import sklearn.exceptions
import talos as ta
# Silence scikit-learn's UndefinedMetricWarning and every FutureWarning.
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

from definitions import  TALOS_DIR
from app.preprocessing import (load_dataset, load_embeddings,
                               preprocess_data, save_embeddings_matrix)
from app.models import load_bi_gru_model, find_best_model_over_scan_logs
from app.metrics import *

# Testing run: uncomment the two lines below (and comment out the production
# pair further down) to use only a subset of the tags as the dataset.
# tags_categories = ['c', 'python', 'java']
# RUN_STATE = 'testing'

# Production run: use all the tags of the dataset.
# RUN_STATE also selects the epoch count in the scan configuration and gates
# the `if RUN_STATE == 'testing':` cells later in the notebook.
tags_categories = "__all__"
RUN_STATE = 'production'

Using TensorFlow backend.


## Preprocessing for the loaded Dataset
1. Format into *lowercase*
2. Remove some of the *punctuation* characters
3. Remove *Numbers*
4. Remove *stopwords*
5. Remove *links*

In [2]:
# Load the dataset for the configured tag categories.
data = load_dataset(
    tags_categories=tags_categories,
    load_from_pickle=True,
)

# Occurrences of each tag value, keeping only tags that actually occur:
# `where` masks non-positive counts to NaN and `dropna` removes them
# (which is why the counts come out as floats).
classes_counts = (
    data['tags']
    .value_counts()
    .where(lambda cls: cls > 0)
    .dropna()
)
Classes = classes_counts.index.tolist()
Nclasses = len(Classes)
print(classes_counts, Classes, Nclasses)

python    2000.0
c         2000.0
java      2000.0
Name: tags, dtype: float64 ['python', 'c', 'java'] 3


 ### Hyperparameter tuning for the model

In [3]:
# Load the embeddings as a (vocabulary, vectors) pair.
embeddings_voc, embeddings_vec = load_embeddings(load_from_pickle=True)

# Build the model inputs; presumably 'tags' is the label column and 'post'
# the text column -- verify against preprocess_data.
# NOTE(review): the original comment described a 70% train / 30% test split
# and a 70% / 30% train / train-dev split, but the call passes
# cv_split_dev=0.125 -- confirm the actual proportions in preprocess_data.
model_data = preprocess_data(
    data,
    'tags',
    'post',
    cv_split_dev=0.125,
)

# Persist the embeddings matrix for the word index built above and keep the
# returned path for the model configuration.
embeddings_matrix_path = save_embeddings_matrix(
    embeddings_voc,
    embeddings_vec,
    model_data['words_index'],
)


In [4]:
# Sanity check: display one training sample (index 4). The recorded output is
# an integer array padded with trailing zeros.
model_data['x_train'][4]

array([  100,     2,   342,  2517,   599,    10,   452,  3686,   240,
          87,   100,     2,   420,   593,   112,  5572,    67,   934,
          16,    69,    10,   113,     3,     2,    32, 13392,     2,
           3,    64,    16,     3,     2,   622,     2,     3,   386,
           3,     2,   708,   240,    77,   189,   141,    32,   240,
         268,    32,    77,   114,   268,     7,    32,    32,   126,
         268,   240,   268,   463,   240,   268,    40,   240,   268,
          16,   240,    77,   708,   240,   240,   141,   240,     5,
         141,   708,   240,   240,    40,   708,   240,    16,   651,
           2,     3,   162,     3,     2, 13393,     2,     3,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Unistacked RNN with BiGRU & MLP on top of it

In [6]:
TALOS_BiGRU_DEEP_LOG_FILENAME = "talos_bigru_deep_log"
talos_bigru_deep_log_pathname = os.path.join(
    TALOS_DIR, TALOS_BiGRU_DEEP_LOG_FILENAME
)


# Talos search space for the single-stack BiGRU: every value is a list of
# candidates, so single-element lists are fixed settings.
# Key order is kept as originally written.
# NOTE(review): both "with_early_stoping" (sic) and "early_stopping" are
# passed; which of them load_bi_gru_model reads is not visible here.
rnn_deep_gru_config = {
    # Architecture
    "model_type": ["keras_deep_BiGRU_model"],
    "embedding_dim": [embeddings_vec.shape[1]],
    "gru_size": [200],
    "dense": [300],
    "embeddings_matrix_path": [embeddings_matrix_path],
    # Run-time switches
    "visualize_process": [True],
    "with_early_stoping": [True],
    "multistack_run": [False],
    # Early stopping on the validation F1 score
    "early_stopping": [True],
    "early_stopping_config__monitor": ["val_f1"],
    "early_stopping_config__min_delta": [0],
    "early_stopping_config__patience": [5],
    "early_stopping_config__mode": ["max"],
    # Regularisation
    "embeddings_dropout": [0.2],
    "var_dropout": [0.2, 0.6],
    "mlp_dropout": [0.2],
    # Activations, optimiser and training schedule
    "mlp_activation": ["softmax"],
    "rnn_activation": ["relu", "tanh"],
    "optimizer": ["Nadam", "Adam"],
    "batch_size": [32, 64],
    # Short runs while testing, longer ones otherwise.
    "epochs": [10 if RUN_STATE != "testing" else 3],
}

# Run the scan on the training split, validating on the train-dev split.
history_model = ta.Scan(
    model_data["x_train"],
    model_data["y_train"],
    x_val=model_data["x_train_dev"],
    y_val=model_data["y_train_dev"],
    model=load_bi_gru_model,
    params=rnn_deep_gru_config,
    grid_downsample=0.1,
    print_params=True,
    last_epoch_value=True,
    seed=123,
    dataset_name=talos_bigru_deep_log_pathname,
)


W0714 18:32:58.048437 140322879141696 deprecation_wrapper.py:119] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 18:32:58.063734 140322879141696 deprecation_wrapper.py:119] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 18:32:58.067680 140322879141696 deprecation_wrapper.py:119] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 18:32:58.080005 140322879141696 deprecation_wrapper.py:119] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:17

{'model_type': 'keras_deep_BiGRU_model', 'embedding_dim': 300, 'gru_size': 200, 'dense': 300, 'embeddings_matrix': array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.0363, -0.0601, -0.079 , ...,  0.0515,  0.0123, -0.0028],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.1047,  0.0371, -0.0528, ...,  0.1238,  0.0719, -0.0474],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]]), 'visualize_process': True, 'with_early_stoping': True, 'multistack_run': False, 'early_stopping': True, 'early_stopping_config__monitor': 'val_f1', 'early_stopping_config__min_delta': 0, 'early_stopping_config__patience': 5, 'early_stopping_config__mode': 'max', 'embeddings_dropout': 0.2, 'var_dropout': 0.2, 'mlp_dropout': 0.2, 'mlp_activation': 'softmax', 'rnn_activation': 'tanh', 'optimizer': 'Nadam', 'batch_size': 64, 'epochs': 3}


W0714 18:32:58.456665 140322879141696 deprecation.py:323] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0714 18:32:58.834827 140322879141696 deprecation_wrapper.py:119] From /home/giannhs/miniconda3/envs/text_analytics/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 600)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 600, 300)          6000600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 600, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600, 400)          601200    
_________________________________________________________________
deep_attention_1 (DeepAttent [(None, 400), (None, 600) 160801    
_________________________________________________________________
dense_1 (Dense)              (None, 300)               120300    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 903       
Total para

HBox(children=(IntProgress(value=0, description='Training', max=3, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=3675, style=ProgressStyle(description_width='in…


Epoch 00001: val_accuracy improved from -inf to 0.94540, saving model to keras_deep_BiGRU_model

Epoch 00001: val_f1 improved from -inf to 0.91539, saving model to keras_deep_BiGRU_model


HBox(children=(IntProgress(value=0, description='Epoch 1', max=3675, style=ProgressStyle(description_width='in…


Epoch 00002: val_accuracy improved from 0.94540 to 0.95556, saving model to keras_deep_BiGRU_model

Epoch 00002: val_f1 improved from 0.91539 to 0.93208, saving model to keras_deep_BiGRU_model


HBox(children=(IntProgress(value=0, description='Epoch 2', max=3675, style=ProgressStyle(description_width='in…


Epoch 00003: val_accuracy improved from 0.95556 to 0.96063, saving model to keras_deep_BiGRU_model

Epoch 00003: val_f1 improved from 0.93208 to 0.93986, saving model to keras_deep_BiGRU_model


100%|██████████| 1/1 [11:21<00:00, 681.90s/it]


    Finds the best model configuration set for the BiGRU with deep self-attention, after the Talos Scanning.

In [None]:
# Wrap the finished single-stack scan in a Talos report.
report_talos = ta.Reporting(history_model)

# Row label of the run with the highest validation F1.
best_model_idx = report_talos.data["val_f1"].idxmax()

# That run's row (parameters and logged scores) as a plain dict.
best_model_params = report_talos.data.loc[best_model_idx].to_dict()

# Last expression of the cell: display the selected configuration.
best_model_params

    Train and return an RNN BiGRU model with the best configuration set.

In [None]:
if RUN_STATE == 'testing':
    # Retrain a single-stack BiGRU using the best configuration found by the
    # scan, forcing early stopping and training visualisation on.
    best_model_params['early_stopping'] = True
    # The scan configuration spells this switch 'with_early_stoping' (sic),
    # so set that key as well; the correctly spelled key is kept for
    # backward compatibility.
    # NOTE(review): confirm which spelling load_bi_gru_model actually reads.
    best_model_params['with_early_stopping'] = True
    best_model_params['with_early_stoping'] = True
    best_model_params['visualize_process'] = True
    # Train on the training split and validate on the train-dev split.
    model_history, model = load_bi_gru_model(model_data['x_train'],
                                              model_data['y_train'],
                                              model_data['x_train_dev'],
                                              model_data['y_train_dev'],
                                              best_model_params)

### Visualize Model History Scores

In [None]:
if RUN_STATE == 'testing':
    # Testing runs only: plot the training history of the retrained
    # single-stack model.
    from app.visualization import plot_history_metrics
    import matplotlib.pylab as plt

    # IPython magic: render matplotlib figures inline in the notebook.
    %matplotlib inline
    plot_history_metrics(history_obj=model_history)

### Evaluate model performance

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
if RUN_STATE == 'testing':
    # Evaluate the retrained single-stack model on the held-out test split.
    # The test split is read from `model_data`, the only preprocessed-data
    # dict this notebook defines.
    score = model.evaluate(model_data['x_test'],
                           model_data['y_test'],
                           batch_size=best_model_params['batch_size'],
                           verbose=1)

    # NOTE(review): assumes the model was compiled with metrics ordered as
    # [f1, categorical accuracy], so score[0] is the loss -- confirm in
    # load_bi_gru_model.
    print('\nTest f1: %.4f' % (score[1]))
    print('\nTest categorical accuracy: %.4f'% (score[2]))

### Visualize prediction performance of the model

In [None]:
if RUN_STATE == 'testing':
    import numpy as np
    from app.visualization import (plot_prediction_metrics,
                                   create_clf_report,
                                   plot_roc_curve,
                                   plot_precision_recall_curve,
                                   plot_confusion_matrix)
    import matplotlib.pylab as plt

    # Per-class scores predicted by the single-stack model for the test split.
    prediction_val = model.predict(
        model_data['x_test'],
        batch_size=best_model_params['batch_size'],
    )

    # Collapse each row to the index of its highest-scoring class, for both
    # the predictions and the ground-truth label rows.
    y_pred_processed = np.argmax(prediction_val, axis=1)
    y_true_processed = np.argmax(model_data['y_test'], axis=1)

    # Optional: uncomment the line below for the one-vs-all ROC curves of each class.
    # plot_roc_curve(model_data['y_test'], prediction_val, Classes, 1)

    # Optional: uncomment the line below for the one-vs-all precision-recall
    # curves of each class.
    # plot_precision_recall_curve(model_data['y_test'], prediction_val, Classes , 1)

    # Classification report (comment out the call below to skip it).
    # The per-class scores are thresholded at 0.5 for the first report input.
    thresholded_predictions = (prediction_val > 0.5).astype('int32')
    create_clf_report(model_data['y_test'], thresholded_predictions,
                      y_true_processed, y_pred_processed)

    # Confusion matrix (comment out the call below to skip it).
    plot_confusion_matrix(y_true_processed, y_pred_processed, Classes)

## Multistack RNN with BiGRU & MLP on top of it

In [11]:
TALOS_BiGRU_DEEP_MUTLI_LOG_FILENAME = "talos_bigru_deep_multi_log"
talos_bigru_deep_multi_log_pathname = os.path.join(
    TALOS_DIR, TALOS_BiGRU_DEEP_MUTLI_LOG_FILENAME
)


# Same search space as the single-stack scan, overriding only the model name
# and the multistack switch. Overridden keys keep their original position,
# so the key order matches the single-stack configuration.
rnn_deep_gru_multi_config = {
    **rnn_deep_gru_config,
    "model_type": ["keras_deep_BiGRU_multi_model"],
    "multistack_run": [True],
}

# Run the scan on the training split, validating on the train-dev split.
history_model_multi = ta.Scan(
    model_data["x_train"],
    model_data["y_train"],
    x_val=model_data["x_train_dev"],
    y_val=model_data["y_train_dev"],
    model=load_bi_gru_model,
    params=rnn_deep_gru_multi_config,
    grid_downsample=0.1,
    print_params=True,
    last_epoch_value=True,
    seed=123,
    dataset_name=talos_bigru_deep_multi_log_pathname,
)


  0%|          | 0/1 [00:00<?, ?it/s]

{'model_type': 'keras_deep_BiGRU_multi_model', 'embedding_dim': 300, 'gru_size': 200, 'dense': 300, 'embeddings_matrix': array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.0363, -0.0601, -0.079 , ...,  0.0515,  0.0123, -0.0028],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.1047,  0.0371, -0.0528, ...,  0.1238,  0.0719, -0.0474],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]]), 'visualize_process': True, 'with_early_stoping': True, 'multistack_run': True, 'early_stopping': True, 'early_stopping_config__monitor': 'val_f1', 'early_stopping_config__min_delta': 0, 'early_stopping_config__patience': 5, 'early_stopping_config__mode': 'max', 'embeddings_dropout': 0.2, 'var_dropout': 0.6, 'mlp_dropout': 0.2, 'mlp_activation': 'softmax', 'rnn_activation': 'tanh', 'optimizer': 'Nadam', 'batch_size': 64, 'epochs': 3}


W0714 18:48:09.236534 140322879141696 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0714 18:48:09.254832 140322879141696 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0714 18:48:09.279870 140322879141696 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0714 18:48:09.471505 140322879141696 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0714 18:48:09.492430 140322879141696 nn_ops.py:4224] Large dropout rate: 0.6 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 600)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 600, 300)          6000600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 600, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600, 400)          601200    
_________________________________________________________________
gru_2 (GRU)                  (None, 600, 200)          360600    
_________________________________________________________________
deep_attention_1 (DeepAttent [(None, 200), (None, 600) 40401     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               60300     
__________

HBox(children=(IntProgress(value=0, description='Training', max=3, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=3675, style=ProgressStyle(description_width='in…


Epoch 00001: val_accuracy improved from -inf to 0.94159, saving model to keras_deep_BiGRU_multi_model

Epoch 00001: val_f1 improved from -inf to 0.91080, saving model to keras_deep_BiGRU_multi_model


HBox(children=(IntProgress(value=0, description='Epoch 1', max=3675, style=ProgressStyle(description_width='in…


Epoch 00002: val_accuracy improved from 0.94159 to 0.96698, saving model to keras_deep_BiGRU_multi_model

Epoch 00002: val_f1 improved from 0.91080 to 0.94963, saving model to keras_deep_BiGRU_multi_model


HBox(children=(IntProgress(value=0, description='Epoch 2', max=3675, style=ProgressStyle(description_width='in…

100%|██████████| 1/1 [14:43<00:00, 883.68s/it]


Epoch 00003: val_accuracy did not improve from 0.96698

Epoch 00003: val_f1 did not improve from 0.94963





    Finds the best model configuration set for the Multistacked BiGRU with deep self-attention, after the Talos Scanning.

In [None]:
# Wrap the finished multistack scan in a Talos report.
report_talos_multi = ta.Reporting(history_model_multi)

# Row label of the multistack run with the highest validation F1.
best_model_idx = report_talos_multi.data["val_f1"].idxmax()

# That run's row (parameters and logged scores) as a plain dict.
best_model_params_multi = report_talos_multi.data.loc[best_model_idx].to_dict()

# Last expression of the cell: display the selected configuration.
best_model_params_multi

    Train and return a Multistacked RNN BiGRU model with the best configuration set.

In [None]:
if RUN_STATE == 'testing':
    # Retrain the multistack BiGRU with the best configuration found by the
    # multistack scan. The multistack parameter dict must be used here: the
    # single-stack dict carries the single-stack 'model_type' and
    # 'multistack_run' values and would train the wrong model.
    model_history_multi, model_multi = load_bi_gru_model(model_data['x_train'],
                                                         model_data['y_train'],
                                                         model_data['x_train_dev'],
                                                         model_data['y_train_dev'],
                                                         best_model_params_multi)

### Visualize Model History Scores

In [None]:
if RUN_STATE == 'testing':
    # Testing runs only: plot the training history of the retrained
    # multistack model.
    from app.visualization import plot_history_metrics
    import matplotlib.pylab as plt

    # IPython magic: render matplotlib figures inline in the notebook.
    %matplotlib inline
    plot_history_metrics(history_obj=model_history_multi)

### Evaluate model performance

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
if RUN_STATE == 'testing':
    # Evaluate the retrained multistack model on the held-out test split.
    score_multi = model_multi.evaluate(
        model_data['x_test'],
        model_data['y_test'],
        batch_size=best_model_params_multi['batch_size'],
        verbose=1,
    )

    # NOTE(review): assumes the compiled metrics are ordered
    # [f1, categorical accuracy] after the loss -- confirm in load_bi_gru_model.
    print('\nTest f1: %.4f' % score_multi[1])
    print('\nTest categorical accuracy: %.4f' % score_multi[2])

### Visualize prediction performance of the model

In [None]:
if RUN_STATE == 'testing':
    import numpy as np
    from app.visualization import (plot_prediction_metrics,
                                   create_clf_report,
                                   plot_roc_curve,
                                   plot_precision_recall_curve,
                                   plot_confusion_matrix)
    import matplotlib.pylab as plt

    # Predict with the multistack model and its own tuned batch size, so this
    # section reports the multistack results rather than repeating the
    # single-stack ones.
    prediction_val_multi = model_multi.predict(
        model_data['x_test'],
        batch_size=best_model_params_multi['batch_size'])

    # Collapse each row to the index of its highest-scoring class, for both
    # the predictions and the ground-truth label rows.
    y_pred_processed_multi = np.argmax(prediction_val_multi, axis=1)
    y_true_processed_multi = np.argmax(model_data['y_test'], axis=1)

    # Optional: uncomment the line below for the one-vs-all ROC curves of each class.
    # plot_roc_curve(model_data['y_test'], prediction_val_multi, Classes, 1)

    # Optional: uncomment the line below for the one-vs-all precision-recall
    # curves of each class.
    # plot_precision_recall_curve(model_data['y_test'], prediction_val_multi, Classes , 1)

    # Classification report (comment out the call below to skip it).
    # The per-class scores are thresholded at 0.5 for the first report input.
    create_clf_report(model_data['y_test'], (prediction_val_multi > 0.5).astype('int32'),
                      y_true_processed_multi, y_pred_processed_multi)

    # Confusion matrix (comment out the call below to skip it).
    plot_confusion_matrix(y_true_processed_multi, y_pred_processed_multi, Classes)