# Text Classification of StackOverflow using BiGRU RNNs with deep self-attention

In [1]:
import sys
import os
from os import pardir, getcwd
from os.path import join, abspath
PARENT_DIRECTORY = abspath(join(getcwd(), pardir))
sys.path.insert(0, PARENT_DIRECTORY)

import warnings
import sklearn.exceptions
import talos as ta
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

from definitions import  TALOS_DIR
from app.preprocessing import (load_dataset, load_embeddings,
                               preprocess_data, generate_embeddings_matrix)
from app.models import load_biGRU_model, find_best_model_over_scan_logs
from app.metrics import *


Using TensorFlow backend.


## Preprocessing for the loaded Dataset
1. Format into *lowercase*
2. Remove some of the *punctuation* characters
3. Remove *Numbers*
4. Remove *stopwords*
5. Remove *links*

In [2]:
# Load the StackOverflow posts restricted to the three tags of interest.
data = load_dataset(
    tags_categories=['c', 'python', 'java'],
    load_from_pickle=True,
)

# Per-tag post counts. Tags with a zero count are masked to NaN by `where`
# and then dropped, which is why the counts are displayed as floats.
classes_counts = (
    data['tags']
    .value_counts()
    .where(lambda cls: cls > 0)
    .dropna()
)

# Class labels (in count order) and how many of them there are.
Classes = list(classes_counts.index)
Nclasses = len(Classes)
print(classes_counts, Classes, Nclasses)

python    2000.0
c         2000.0
java      2000.0
Name: tags, dtype: float64 ['python', 'c', 'java'] 3


### Hyperparameter tuning for the model

In [3]:
# Split ratios as noted by the original author:
#   70% train / 30% test, then 70% train / 30% train-dev.
# NOTE(review): cv_split_dev=0.125 below does not obviously match the 30%
# train-dev figure -- confirm which one is intended.
embeddings_voc, embeddings_vec = load_embeddings(load_from_pickle=True)

# Build the model inputs from the 'post' text with 'tags' as the target.
model_data = preprocess_data(
    data,
    'tags',
    'post',
    input_ins='as_centroids',
    cv_split_dev=0.125,
)

# Embedding matrix built from the embeddings and the dataset's word index.
embeddings_matrix = generate_embeddings_matrix(
    embeddings_voc,
    embeddings_vec,
    model_data['words_index'],
)


In [4]:
# Sanity check: display one training sample (index 4).
# The displayed array holds integers followed by a run of zeros --
# presumably word indices with zero padding; confirm against preprocess_data.
model_data['x_train'][4]


array([  100,     2,   342,  2517,   599,    10,   452,  3686,   240,
          87,   100,     2,   420,   593,   112,  5572,    67,   934,
          16,    69,    10,   113,     3,     2,    32, 13392,     2,
           3,    64,    16,     3,     2,   622,     2,     3,   386,
           3,     2,   708,   240,    77,   189,   141,    32,   240,
         268,    32,    77,   114,   268,     7,    32,    32,   126,
         268,   240,   268,   463,   240,   268,    40,   240,   268,
          16,   240,    77,   708,   240,   240,   141,   240,     5,
         141,   708,   240,   240,    40,   708,   240,    16,   651,
           2,     3,   162,     3,     2, 13393,     2,     3,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Unistacked RNN with BiGRU & MLP on top of it

In [None]:
# Location of the Talos experiment log for the unistacked BiGRU scan.
TALOS_BiGRU_DEEP_LOG_FILENAME = 'talos_bigru_deep_log'
talos_bigru_deep_log_pathname = os.path.join(TALOS_DIR, TALOS_BiGRU_DEEP_LOG_FILENAME)


###### Production configuration
# Hyperparameter search space passed to Talos. Each list holds the candidate
# values for that parameter; the ones with more than one entry (var_dropout,
# rnn_activation, optimizer, batch_size) are the dimensions actually searched.
# NOTE(review): "model_type" is a bare string rather than a list like the other
# entries -- confirm Talos / load_biGRU_model handle it as intended.
# NOTE(review): if the early_stopping_config__* values map onto a Keras
# EarlyStopping callback, patience=5 can never trigger with epochs=3 -- confirm.
rnn_deep_gru_config = {
    "model_type": "keras_deep_BiGRU_model",
    "embedding_dim": [embeddings_vec.shape[1]],
    "gru_size": [200],
    "dense": [300],
    "embeddings_matrix": [embeddings_matrix],
    "visualize_process": [True],
    # The training cell further down sets 'with_early_stopping'; the key was
    # misspelled here, so the scan and the final training disagreed on its name.
    "with_early_stopping": [True],
    # Legacy misspelling kept in case anything still reads it.
    "with_early_stoping": [True],
    "multistack_run": [False],
    'early_stopping':[True],
    'early_stopping_config__monitor': ['val_f1'],
    'early_stopping_config__min_delta': [0],
    'early_stopping_config__patience': [5],
    'early_stopping_config__mode': ['max'],
    "embeddings_dropout": [0.2],
    "var_dropout": [0.2, 0.6],
    "mlp_dropout": [0.2],
    "mlp_activation": ["softmax"],
    "rnn_activation": ["sigmoid", "tanh"],
    "optimizer": ["Nadam", "Adam"],
    "batch_size": [32, 64],
    "epochs": [3]
}

# Run the scan: train on the train split, validate on the train-dev split.
history_model = ta.Scan(model_data['x_train'],
                        model_data['y_train'],
                        x_val=model_data['x_train_dev'],
                        y_val=model_data['y_train_dev'],
                        model=load_biGRU_model,
                        params=rnn_deep_gru_config,
                        grid_downsample=0.05,
                        print_params=True,
                        last_epoch_value=True,
                        seed=(123),
                        dataset_name=talos_bigru_deep_log_pathname
                        )


    Finds the best model configuration set for the BiGRU with deep self-attention, after the Talos Scanning.

In [None]:
# Wrap the scan results and pick the row with the highest validation F1.
report_talos = ta.Reporting(history_model)
scan_results = report_talos.data
best_model_idx = scan_results['val_f1'].idxmax()

# Winning row as a plain dict so it can be passed to the model builder.
best_model_params = scan_results.loc[best_model_idx].to_dict()
best_model_params

    Train and return an RNN BiGRU model with the best configuration set.

In [None]:
# Train and Load the best model of given the tuned featured model
best_model_params['embeddings_matrix'] = embeddings_matrix
best_model_params['early_stopping'] = True
best_model_params['with_early_stopping'] = True
best_model_params['visualize_process'] = True
model_history, model = load_biGRU_model(model_data['x_train'],
                                          model_data['y_train'],
                                          model_data['x_train_dev'],
                                          model_data['y_train_dev'],
                                          best_model_params)

### Visualize Model History Scores

In [None]:
# Plotting helper from the project's visualization module.
from app.visualization import plot_history_metrics
import matplotlib.pylab as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the training history returned by load_biGRU_model in the cell above.
plot_history_metrics(history_obj=model_history)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
# Evaluate the retrained best model on the held-out test split.
# The original cell referenced `model_data_ftc`, which is never defined in this
# notebook (NameError on a fresh kernel); the test split lives in `model_data`.
score = model.evaluate(model_data['x_test'],
                       model_data['y_test'],
                       batch_size=best_model_params['batch_size'],
                       verbose=1)

# Assumes the model reports [loss, f1, categorical accuracy] in that order --
# TODO confirm against the metrics configured in load_biGRU_model.
print('\nTest f1: %.4f' % (score[1]))
print('\nTest categorical accuracy: %.4f'% (score[2]))

### Visualize Prediction Performance of the model

In [None]:
import numpy as np
from app.visualization import (plot_prediction_metrics,
                               create_clf_report,
                               plot_roc_curve,
                               plot_precision_recall_curve,
                               plot_confusion_matrix)
import matplotlib.pylab as plt

# Model outputs for every test sample (one row per sample).
prediction_val = model.predict(model_data['x_test'], batch_size=best_model_params['batch_size'])

# Collapse each row to the index of its largest entry, for both the
# predictions and the ground-truth labels.
y_pred_processed = np.array([np.argmax(val) for val in prediction_val])
y_true_processed = np.array([np.argmax(val) for val in model_data['y_test']])

# To see the one-vs-all ROC curves of each class, uncomment the line below
# plot_roc_curve(model_data['y_test'], prediction_val, Classes, 1)

# To see the one-vs-all precision-recall curves of each class, uncomment the line below
# plot_precision_recall_curve(model_data['y_test'], prediction_val, Classes , 1)

# Classification report (comment out the call below to skip it).
# The original passed `model['y_test']`, i.e. it subscripted the Keras model
# instead of the data dictionary; the test labels live in `model_data`.
create_clf_report(model_data['y_test'], (prediction_val > 0.5).astype('int32'),
                  y_true_processed, y_pred_processed)

# Confusion matrix (comment out the call below to skip it).
plot_confusion_matrix(y_true_processed, y_pred_processed, Classes)

## Multistack RNN with BiGRU & MLP on top of it

In [None]:
# Location of the Talos experiment log for the multistacked BiGRU scan.
# The constant was originally spelled `..._MUTLI_...` while the next line read
# `..._MULTI_...`, which raised a NameError; both now use the same name.
TALOS_BiGRU_DEEP_MULTI_LOG_FILENAME = 'talos_bigru_deep_multi_log'
talos_bigru_deep_multi_log_pathname = os.path.join(TALOS_DIR, TALOS_BiGRU_DEEP_MULTI_LOG_FILENAME)


###### Production configuration
# Start from the unistacked search space (shallow copy) and override only the
# entries that differ for the multistacked run.
# NOTE(review): "keras__BiGRU_multi_model" contains a double underscore --
# confirm this is the identifier load_biGRU_model expects.
rnn_deep_gru_multi_config = rnn_deep_gru_config.copy()
rnn_deep_gru_multi_config.update({
    "model_type": "keras__BiGRU_multi_model",
    "multistack_run": [True],
})

# Run the scan with the multistack configuration. The original passed the
# undefined name `rnn_deep_gru_mult_config` here.
history_model_multi = ta.Scan(model_data['x_train'],
                        model_data['y_train'],
                        x_val=model_data['x_train_dev'],
                        y_val=model_data['y_train_dev'],
                        model=load_biGRU_model,
                        params=rnn_deep_gru_multi_config,
                        grid_downsample=0.05,
                        print_params=True,
                        last_epoch_value=True,
                        seed=(123),
                        dataset_name=talos_bigru_deep_multi_log_pathname
                        )


    Finds the best model configuration set for the Multistacked BiGRU with deep self-attention, after the Talos Scanning.

In [None]:
# Wrap the multistack scan results and pick the row with the highest validation F1.
report_talos_multi = ta.Reporting(history_model_multi)
scan_results_multi = report_talos_multi.data
best_model_idx = scan_results_multi['val_f1'].idxmax()

# Winning row as a plain dict so it can be passed to the model builder.
best_model_params_multi = scan_results_multi.loc[best_model_idx].to_dict()
best_model_params_multi

    Train and return a Multistacked RNN BiGRU model with the best configuration set.

In [None]:
# Train and Load the best model of given the tuned featured model
best_model_params_multi['embeddings_matrix'] = embeddings_matrix
model_history_multi, model_multi = load_lstm_model(model_data['x_train'],
                                                   model_data['y_train'],
                                                   model_data['x_train_dev'],
                                                   model_data['y_train_dev'],
                                                   best_model_params)

### Visualize Model History Scores

In [None]:
# Plotting helper from the project's visualization module.
from app.visualization import plot_history_metrics
import matplotlib.pylab as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the training history produced by the multistacked training cell above.
plot_history_metrics(history_obj=model_history_multi)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
# Evaluate the retrained multistacked model on the held-out test split.
score_multi = model_multi.evaluate(
    model_data['x_test'],
    model_data['y_test'],
    batch_size=best_model_params_multi['batch_size'],
    verbose=1,
)

# Assumes the model reports [loss, f1, categorical accuracy] in that order --
# TODO confirm against the metrics configured in the model builder.
print('\nTest f1: %.4f' % score_multi[1])
print('\nTest categorical accuracy: %.4f' % score_multi[2])

### Visualize Prediction Performance of the model

In [None]:
import numpy as np
from app.visualization import (plot_prediction_metrics,
                               create_clf_report,
                               plot_roc_curve,
                               plot_precision_recall_curve,
                               plot_confusion_matrix)
import matplotlib.pylab as plt

# Model outputs for every test sample, from the MULTISTACKED model.
# The original cell called `model.predict` with `best_model_params`, i.e. it
# re-scored the unistacked model and reported it under the multistack heading.
prediction_val_multi = model_multi.predict(model_data['x_test'],
                                           batch_size=best_model_params_multi['batch_size'])

# Collapse each row to the index of its largest entry, for both the
# predictions and the ground-truth labels.
y_pred_processed_multi = np.array([np.argmax(val) for val in prediction_val_multi])
y_true_processed_multi = np.array([np.argmax(val) for val in model_data['y_test']])

# To see the one-vs-all ROC curves of each class, uncomment the line below
# plot_roc_curve(model_data['y_test'], prediction_val_multi, Classes, 1)

# To see the one-vs-all precision-recall curves of each class, uncomment the line below
# plot_precision_recall_curve(model_data['y_test'], prediction_val_multi, Classes , 1)

# Classification report (comment out the call below to skip it).
# The original passed `model['y_test']`, i.e. it subscripted the Keras model
# instead of the data dictionary; the test labels live in `model_data`.
create_clf_report(model_data['y_test'], (prediction_val_multi > 0.5).astype('int32'),
                  y_true_processed_multi, y_pred_processed_multi)

# Confusion matrix (comment out the call below to skip it).
plot_confusion_matrix(y_true_processed_multi, y_pred_processed_multi, Classes)