<a href="https://colab.research.google.com/github/peterbaile/squid/blob/master/DL_Quintile_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import os
import re
import time
import math

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from tqdm import tqdm
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import sklearn

tqdm.pandas()

Last Updated 2 Dec

**Housekeeping**

1. Install `tensorflow_gpu` (enables much quicker training)
2. Install `eli5`
3. Install `scikit-learn==0.21.3` (this pinned version is needed for the highlighted-text visualization of the eli5 explanations; see https://github.com/TeamHG-Memex/eli5/issues/361)

**Workflow**

1. Preprocessing raw text data
2. Loading existing word embeddings to create embedding matrix
3. Train RNN model (GRU) to classify documents into quintiles
4. Evaluating Model (Confusion Matrix)
5. Explainable Model Insights (contribution of each word to prediction)

In [2]:
# Fetch the NLTK resources this notebook relies on: 'punkt' (tokenizer models
# used by word_tokenize) and 'stopwords' (source of stopwords.words('english')).
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# tf.test.is_gpu_available() is deprecated; ask TensorFlow for the visible
# physical GPU devices instead. Evaluates to True when at least one GPU exists.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [7]:
# Load the training table; its 'content' column is the text that gets tokenized below.
train = pd.read_csv('drive/MyDrive/CIS520 Project/train.csv')
# Plain Python list of the 'content' column (not referenced again in the cells shown here).
content = train['content'].tolist()

In [8]:
def preprocessing(content_list):
    """Tokenize and clean a collection of raw text documents.

    Each document is tokenized with NLTK's ``word_tokenize``, lower-cased,
    stripped of punctuation characters, reduced to purely alphabetic tokens,
    and finally filtered against the English stop-word list.

    Args:
        content_list: Iterable of strings, one per document.

    Returns:
        A list with one entry per input document, each entry being the list
        of cleaned word tokens for that document (possibly empty).
    """
    # Loop-invariant helpers: build them once instead of once per document.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    processed_list = []

    for line in tqdm(content_list):
        tokens = word_tokenize(line)
        # Convert to lower case
        tokens = [w.lower() for w in tokens]
        # Remove punctuation characters from every token
        stripped = [w.translate(punctuation_table) for w in tokens]
        # Keep only purely alphabetic tokens that are not stop words
        # (this also drops tokens left empty by the punctuation removal)
        words = [w for w in stripped if w.isalpha() and w not in stop_words]

        processed_list.append(words)

    return processed_list

In [9]:
# Tokenize and clean every document in the 'content' column; the new
# 'processed_content' column holds one list of word tokens per row.
train['processed_content'] = preprocessing(train['content'])

100%|██████████| 16772/16772 [01:13<00:00, 226.80it/s]


**Training Classification Model**


In [10]:
# Extract the embeddings from the stored file
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

EMBEDDING_DIM = 100

# Maps each word to its EMBEDDING_DIM-dimensional float32 vector.
embeddings_index = {}
embeddings_path = os.path.join('drive/MyDrive/CIS520 Project', 'word2vec_train2.txt')
# `with` guarantees the file is closed even if parsing fails part-way through.
with open(embeddings_path, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        # Only accept "<word> <EMBEDDING_DIM numbers>" lines. This skips blank
        # lines and any "<vocab_size> <dim>" header line (if the file has one),
        # which would otherwise be stored as a wrong-length vector.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        # Parse the coefficients as numbers rather than keeping them as strings.
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

In [11]:
# Turn the cleaned documents into a 2D integer tensor of word ids.

# Learn the word -> id vocabulary from the processed text
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(train['processed_content'])
word_index = tokenizer_obj.word_index

# Vocabulary size: one more than the number of distinct words
# (Keras word ids start at 1, so id 0 is left over for padding)
vocab_size = len(word_index) + 1

# Encode each document as its sequence of word ids
sequences = tokenizer_obj.texts_to_sequences(train['processed_content'])

# Length, in tokens, of the longest article - 5587
max_length = max(len(doc_tokens) for doc_tokens in train['processed_content'])

# Bring every sequence to that common length
review_pad = pad_sequences(sequences, maxlen = max_length)

In [12]:
num_words = len(word_index) + 1
words_not_found = []
# Create the embedding matrix: row i holds the word2vec vector of the word
# whose tokenizer id is i. Rows without a known vector stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an id equal to num_words must be
    # skipped as well (the previous `>` check would have let it through).
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)

    if embedding_vector is not None:
        # Assign the ith row of the embedding matrix to the embedding of that word
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

# Count rows that are entirely zero. Testing `any` per row avoids miscounting
# a real vector whose components merely happen to sum to zero.
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

number of null word embeddings: 43


In [13]:
# Sanity check: the matrix was allocated as (num_words, EMBEDDING_DIM).
embedding_matrix.shape

(111813, 100)

**Training DL Model**


In [14]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, LSTM, GRU, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import SGD

In [15]:
def RNN_Model(gru_units = 32, embedding_dropout = 0.25, gru_dropout = 0.2, num_classes = 5):
    """Build the (uncompiled) GRU document classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout over the
    embedding channels -> single GRU layer -> softmax output.

    Reads the notebook-level globals ``max_length``, ``num_words``,
    ``EMBEDDING_DIM`` and ``embedding_matrix``.

    Args:
        gru_units: Number of units in the GRU layer.
        embedding_dropout: Rate of the SpatialDropout1D applied to the embeddings.
        gru_dropout: ``dropout`` rate passed to the GRU layer.
        num_classes: Number of output classes (size of the softmax layer).

    Returns:
        A Keras ``Model`` mapping a padded sequence of word ids of length
        ``max_length`` to ``num_classes`` class probabilities. With the default
        arguments this is the same network as the original fixed definition.
    """
    text_sequence = Input(shape = (max_length,), name = 'text_sequence_input')

    # Pre-trained word vectors; trainable = False keeps them fixed during training
    rnn_layer = Embedding(num_words, EMBEDDING_DIM, weights = [embedding_matrix], trainable = False, name = 'embedding')(text_sequence)

    # Embedding Dropout
    rnn_layer = SpatialDropout1D(embedding_dropout, name='EMBEDDING_DROPOUT')(rnn_layer)
    rnn_layer = GRU(units = gru_units, dropout = gru_dropout)(rnn_layer)
    output = Dense(num_classes, activation = 'softmax', name = 'output')(rnn_layer)

    model = Model(inputs = text_sequence, outputs = output)

    return model

In [16]:
# Instantiate the network and print its layer-by-layer parameter summary.
model = RNN_Model()
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_sequence_input (InputLa [(None, 5587)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 5587, 100)         11181300  
_________________________________________________________________
EMBEDDING_DROPOUT (SpatialDr (None, 5587, 100)         0         
_________________________________________________________________
gru (GRU)                    (None, 32)                12864     
_________________________________________________________________
output (Dense)               (None, 5)                 165       
Total params: 11,194,329
Trainable params: 13,029
Non-trainable params: 11,181,300
_________________________________________________________________


In [17]:
# Getting the y-variable (Quintile classification)

# Bin 'percentile' into five equal-width classes labelled 1..5.
# include_lowest = True makes the first bin contain its left edge, so a
# percentile of exactly 0 lands in quintile 1 instead of becoming NaN
# (which would make the int conversion below fail).
train['quintile'] = pd.cut(train['percentile'], [0, 0.2, 0.4, 0.6, 0.8, 1], labels = [1,2,3,4,5], include_lowest = True)
train['quintile'] = train['quintile'].astype(int)

# Split into train and validation set (stratified so both keep the class balance)
VALIDATION_SPLIT = 0.2
dl_train, dl_val = train_test_split(train, test_size = VALIDATION_SPLIT, random_state = 42, stratify = train['quintile'])

train_indices = dl_train.index.tolist()
val_indices = dl_val.index.tolist()

# Get the training and validation data
# NOTE: the index labels are used here as row positions into review_pad, which
# relies on `train` still having its default 0..n-1 index.
X_train = review_pad[train_indices]
X_val = review_pad[val_indices]

# One Hot Encoding of y variable (one column per quintile, in ascending order)
y_train = pd.get_dummies(dl_train['quintile']).to_numpy()
y_val = pd.get_dummies(dl_val['quintile']).to_numpy()

print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_val: ', X_val.shape)
print('Shape of y_val: ', y_val.shape)

Shape of X_train:  (13417, 5587)
Shape of y_train:  (13417, 5)
Shape of X_val:  (3355, 5587)
Shape of y_val:  (3355, 5)


In [46]:
# Callbacks, both watching validation accuracy:
#  - stop after 4 epochs without improvement and roll back to the best weights
#  - write the best model seen so far to 'model.h5'
early_stopping = EarlyStopping(
    monitor = 'val_categorical_accuracy',
    patience = 4,
    restore_best_weights = True,
)
model_checkpoint = ModelCheckpoint(
    'model.h5',
    monitor = 'val_categorical_accuracy',
    verbose = 0,
    save_best_only = True,
)

# Configure the optimisation objective for the 5-way one-hot targets
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['categorical_accuracy'],
)

# Fit for at most 20 epochs, evaluating on the validation split after each one
model.fit(
    X_train,
    y_train,
    batch_size = 32,
    epochs = 20,
    validation_data = (X_val, y_val),
    verbose = 1,
    callbacks = [early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20

KeyboardInterrupt: ignored

In [43]:
# Save model
# Writes the current in-memory `model` to the project folder under drive/MyDrive.
model.save('drive/MyDrive/CIS520 Project/word2vec_gru_content1')

In [18]:
# Load model
# Replaces the in-memory `model` with the copy previously saved at this path,
# so the cells below can run without re-training.
model = keras.models.load_model('drive/MyDrive/CIS520 Project/word2vec_gru_content1')

**Evaluating the Model**

In [19]:
from sklearn.metrics import confusion_matrix

In [21]:
# Score the validation set: one row per document, one probability per quintile
# (a (3355, 5) matrix for the split above)
val_probs = model.predict(X_val)

# Most likely class per document, as a 0-based column index
# (column k corresponds to quintile k + 1)
val_preds = val_probs.argmax(axis = 1)

# Same 0-based encoding for the true labels, recovered from the one-hot matrix
y_val_actual = y_val.argmax(axis = 1)

In [30]:
# Confusion matrix
# Rows are the actual classes and columns the predicted classes
# (scikit-learn convention: entry [i, j] counts class-i samples predicted as j).
confusion_matrix(y_val_actual, val_preds)

array([[450, 125,  33,  33,  24],
       [232, 210,  86,  80,  62],
       [109, 193, 105, 145, 117],
       [ 43, 148, 100, 168, 213],
       [ 21,  70,  54, 147, 387]])

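The results show that while the accuracy is low (~40%: 1,320 of 3,355 validation documents correct, versus 20% for random guessing among five equally sized classes), most errors are off by only one quintile, and the number of errors generally declines as the gap between actual and predicted class grows, which is a good sign.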

**Model Interpretability using ELI5**

(The `eli5` package needs to be installed first; see Housekeeping above.)


In [23]:
import eli5
from eli5.lime import TextExplainer

In [24]:
# Define the custom predict function - input is list of strings (documents) and return matrix of shape (n_samples, n_classes) with probability values


# Assumes you already fitted the tokenizer on the training data
def predict_complex(documents_list):
    """Predict low/high probabilities for raw documents (ELI5/LIME predict function).

    The five quintile probabilities produced by the model are collapsed into a
    binary problem: "low" = bottom three quintiles, "high" = top two quintiles.

    Relies on the notebook-level ``tokenizer_obj`` (already fitted on the
    training data), ``max_length`` and ``model``.

    Args:
        documents_list: List of strings, one per document.

    Returns:
        Array of shape (n_samples, 2) whose columns are
        [P(low), P(high)] for each document.
    """
    # Generate the sequence of tokens
    sequences = tokenizer_obj.texts_to_sequences(documents_list)

    # Pad to the same length the training sequences were padded to, instead of
    # repeating that length as a hard-coded number
    X = pad_sequences(sequences, maxlen = max_length)

    # Predict the five quintile probabilities
    y_probs = model.predict(X, batch_size = 32, verbose = 0)

    # Collapse into a binary classification: bottom 3 quintiles vs top 2 quintiles
    y_low = y_probs[:, 0:3].sum(axis = 1)
    y_high = y_probs[:, 3:].sum(axis = 1)

    y_out = np.vstack((y_low, y_high)).T
    return y_out


In [29]:
# Explainer with a fixed seed so repeated runs use the same random state
te = TextExplainer(random_state = 42)

# Re-join the cleaned tokens of the second validation row (iloc[1]) into one string,
# since predict_complex expects documents as strings
doc = ' '.join(dl_val['processed_content'].iloc[1])
# Fit the explainer on this single document, using predict_complex as the
# black-box low/high probability function
te.fit(doc, predict_complex)
te.explain_prediction(target_names = ['low', 'high'])

Contribution?,Feature
0.91,<BIAS>
-0.824,Highlighted in text (sum)


In [28]:
# Show the per-feature weights of the fitted explainer, labelled with the same low/high class names
te.explain_weights(target_names = ['low', 'high'])

Weight?,Feature
+0.465,back
+0.228,penn
+0.213,arts sciences
+0.204,tweet
+0.199,said
+0.194,received
+0.189,emergency
+0.180,university
+0.173,pm
+0.171,operations
