# ToxBlocker Model: Interactive Notebook

### Import NLP libraries

In [23]:
import pandas as pd
import numpy as np
from nltk import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

### Read in training and test data

In [25]:
dfTrain_full = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

In [27]:
dfTrain.head(2)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4


In [28]:
dfTest.head(2)

Unnamed: 0,id,comment_text
0,7097320,[ Integrity means that you pay your debts.]\n\...
1,7097321,This is malfeasance by the Administrator and t...


### Pre-process text data before feeding to model

In [None]:
dfTrain['comment_text'] = dfTrain['comment_text'].str.replace(r'[^A-Za-z0-9]()?!@\s\'\~\*\"\_\n','')
dfTrain['comment_text'] = dfTrain['comment_text'].str.replace(r'@','at')
dfTrain['comment_text'] = dfTrain['comment_text'].str.lower()
dfTest['comment_text'] = dfTest['comment_text'].str.replace(r'[^A-Za-z0-9]()?!@\s\'\~\*\"\_\n','')
dfTest['comment_text'] = dfTest['comment_text'].str.replace(r'@','at')
dfTest['comment_text'] = dfTest['comment_text'].str.lower()

In [30]:
dfTrain['comment_text'].head()

0    this is so cool. it's like, 'would you want yo...
1    thank you!! this would make my life a lot less...
2    this is such an urgent design problem; kudos t...
3    is this something i'll be able to install on m...
4                 haha you guys are a bunch of losers.
Name: comment_text, dtype: object

In [None]:
tokenizer = RegexpTokenizer(r'\w+\'*[a-zA-Z]+')
dfTrain['toks'] = dfTrain['comment_text'].apply(tokenizer.tokenize)
dfTest['toks'] = dfTest['comment_text'].apply(tokenizer.tokenize)

In [None]:
sw = stopwords.words('english')
dfTrain['no stopwords'] = dfTrain['toks'].apply(lambda x: [item for item in x if item not in sw])
dfTest['no stopwords'] = dfTest['toks'].apply(lambda x: [item for item in x if item not in sw])

In [None]:
dfTrain['speech_tags'] = dfTrain['no stopwords'].apply(lambda x: pos_tag(x))

In [35]:
dfTest['speech_tags'] = dfTest['no stopwords'].apply(lambda x: pos_tag(x))

In [36]:
def get_wordnet_pos(treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

In [None]:
dfTrain['wordnet_pos'] = dfTrain['speech_tags'].apply(lambda x: [(word,get_wordnet_pos(pos_tag)) for (word,pos_tag) in x])

In [38]:
dfTest['wordnet_pos'] = dfTest['speech_tags'].apply(lambda x: [(word,get_wordnet_pos(pos_tag)) for (word,pos_tag) in x])

In [None]:
lem = WordNetLemmatizer()
dfTrain['lemmatized'] = dfTrain['wordnet_pos'].apply(lambda x: [lem.lemmatize(word,tag) for (word,tag) in x])
dfTest['lemmatized'] = dfTest['wordnet_pos'].apply(lambda x: [lem.lemmatize(word,tag) for (word,tag) in x])

In [90]:
words = [word for row in dfTrain['lemmatized'] for word in row]
vocab = list(set(words))
sentence_lengths = [len(sentence) for sentence in dfTrain['lemmatized']]
print('Number of words: ', len(words))
print('Number of Unique words: ', len(vocab))
print('Longest sentence: ', max(sentence_lengths), ' words')

Number of words:  27382853
Number of Unique words:  222000
Longest sentence:  305  words


### Perform word embedding

In [None]:
import gensim
from gensim.models import Word2Vec

word2vec_path = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" #"~/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [42]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged
def get_word2vec_embeddings(vectors, clean_questions, generate_missing=False):
    embeddings = clean_questions['toks'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [44]:
embeddings = get_word2vec_embeddings(word2vec, dfTrain)
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(embeddings, dfTrain['target'], 
                                                                                        test_size=0.2, random_state=40)

In [48]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 35
VOCAB_SIZE = len(vocab)

VALIDATION_SPLIT=.2
tokeniz = Tokenizer(num_words=VOCAB_SIZE)
tokeniz.fit_on_texts(dfTrain['lemmatized'].tolist())
sequences = tokeniz.texts_to_sequences(dfTrain['lemmatized'].tolist())

word_index = tokeniz.word_index

In [49]:
EMBEDDING_DIM = 300
embedding_weights = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word,index in word_index.items():
    embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)

### Train the model

In [None]:
!pip install tensorflow

In [51]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical
import keras

In [52]:
embedding_dim = 300
sequence_length = MAX_SEQUENCE_LENGTH
inp = Input(shape=(sequence_length,))
x = Embedding(len(word_index)+1, embedding_dim, weights=[embedding_weights], trainable=False)(inp)
x = Bidirectional(LSTM(64, recurrent_dropout=0.1))(x)
x = Dense(32)(x)
x = Dropout(0.25)(x)
y = Dense(1, activation='sigmoid')(x)
NN2 = Model(inp,y)

In [53]:
cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(dfTrain["target"])  # to_categorical(...)

In [54]:
num_validation_samples = 1000
x_train = cnn_data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = cnn_data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [None]:
NN2.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
history = NN2.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=128)

### Test model

In [61]:
test_seq = tokeniz.texts_to_sequences(dfTest['no stopwords'].tolist())
NN2.predict(pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH))

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

### Save model

In [None]:
NN2.save('saved_model1')

In [66]:
reconstructed_model = keras.models.load_model("saved_model1")

In [67]:
reconstructed_model.predict(pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH))

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

### Sample output from short training run:
Epoch 1/50
7805/7805 [==============================] - 968s 123ms/step - loss: 0.0000e+00 - acc: 0.7062 - val_loss: 0.0000e+00 - val_acc: 0.7400