In [2]:
import pandas as pd   #import pandas
import numpy as np   #import numpy
import matplotlib.pyplot as plt   #visualisation library
import seaborn as sns   #visualisation library

import re   # import regex library

import nltk.corpus 
from nltk.corpus import stopwords   # import library for stepword
from nltk.stem.porter import PorterStemmer  # import library for Stemming
from nltk.stem import WordNetLemmatizer   # import library for lemmattazing


from sklearn.model_selection import train_test_split


#deep learning library
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
import tensorflow as tf

### 3. Word Embedding

In this section of the notebook, we will try to use an other type of NLP model, by using word embeddings. We will be usig a pre-trained set of embeddings trained with Glove algorith. 

For the Glove model we will import the Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): [glove.6B.zip](https://nlp.stanford.edu/projects/glove/)

#### a. Import and Split dataset

In [3]:
df_train = pd.read_csv("/Users/stefano/UDACITY/Data Engeeniring/Capstone/Data/train.csv")

#Prepare the dataset to be splitted into Train and Test data
X = df_train.comment_text.values  #predicotors
y = df_train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].values # independent variable
# Split of the dataset. Test size 33% and random seed = 42
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)  

#### b. Preparing the data

In [4]:
embed_size = 300 # how big is each word vector
max_features = 10000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1000 # max number of words in a comment to use

In [5]:
# Tokenize the text by using Keras

tokenizer = Tokenizer(num_words=max_features)
    
#preparing vocabulary
tokenizer.fit_on_texts(list(X_train)+list(X_test))
    
# transform text into numbers
X_train_seq  = tokenizer.texts_to_sequences(X_train) 
X_test_seq = tokenizer.texts_to_sequences(X_test)

# create a sequence of same lenght
X_train_pad  = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

In [6]:
X_test_pad.shape

(52659, 1000)

In [7]:
index = tokenizer.word_index
len(index)

210337

#### c. Load the model

In [9]:
# load all pre-trained embeddings
embeddings = dict()
file = open('/Users/stefano/UDACITY/Data Engeeniring/Capstone/Model/glove/glove.6B.300d.txt')

# Read the txt file containing the embeddings 
for line in file:
    values = line.split()
    word = values[0]
    embeddings_coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = embeddings_coefs

file.close()
print('The embeddings loaded are {}.'.format(len(embeddings)))


The embeddings loaded are 400000.


In [10]:
print(values[0])

sandberger


In [11]:
print(values)

['sandberger', '0.429191', '-0.296897', '0.15011', '0.245201', '-0.00352027', '-0.0576971', '0.1409', '-0.222294', '0.221153', '0.767218', '-0.0772662', '-0.0710635', '0.0629486', '-0.220179', '-0.108197', '-0.301419', '0.232164', '0.168669', '-0.00452476', '0.168254', '-0.0579106', '-0.0362662', '-0.273464', '-0.162976', '0.239398', '-0.0119058', '0.044685', '0.105252', '0.102867', '-0.0232984', '-0.0114432', '-0.381673', '0.06122', '0.0170547', '0.415463', '-0.109101', '0.0959916', '0.19149', '-0.00752907', '-0.194603', '-0.0431976', '0.259788', '0.00527856', '-0.183626', '0.225188', '-0.0187726', '-0.158172', '-0.586937', '0.249259', '-0.130252', '-0.0537497', '0.0315535', '-0.18562', '0.0610198', '-0.0850566', '-0.0965162', '0.278621', '-0.247254', '-0.153895', '0.0418453', '0.0704212', '-0.062286', '-0.284913', '0.0152124', '0.144002', '0.335902', '-0.288315', '-0.00253548', '-0.0876423', '-0.0574409', '0.00670068', '-0.0753335', '-0.0677815', '-0.056624', '0.19296', '0.0250159', 

In [12]:
print(embeddings_coefs)

[ 0.429191   -0.296897    0.15011     0.245201   -0.00352027 -0.0576971
  0.1409     -0.222294    0.221153    0.767218   -0.0772662  -0.0710635
  0.0629486  -0.220179   -0.108197   -0.301419    0.232164    0.168669
 -0.00452476  0.168254   -0.0579106  -0.0362662  -0.273464   -0.162976
  0.239398   -0.0119058   0.044685    0.105252    0.102867   -0.0232984
 -0.0114432  -0.381673    0.06122     0.0170547   0.415463   -0.109101
  0.0959916   0.19149    -0.00752907 -0.194603   -0.0431976   0.259788
  0.00527856 -0.183626    0.225188   -0.0187726  -0.158172   -0.586937
  0.249259   -0.130252   -0.0537497   0.0315535  -0.18562     0.0610198
 -0.0850566  -0.0965162   0.278621   -0.247254   -0.153895    0.0418453
  0.0704212  -0.062286   -0.284913    0.0152124   0.144002    0.335902
 -0.288315   -0.00253548 -0.0876423  -0.0574409   0.00670068 -0.0753335
 -0.0677815  -0.056624    0.19296     0.0250159  -0.39188    -0.159278
  0.26123     0.10221     0.0877169   0.0433055  -0.179803   -0.189744


In [13]:
in_voc = 0
not_voc = 0


# create a weight matrix for words in training docs
embeddings_matrix = np.zeros((len(index)+1, embed_size))

for word, i in index.items():
    if i >= max_features: continue
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
        in_voc +=1
    else:
        not_voc +=1
print("Converted %d words (%d misses)" % (in_voc, not_voc))

Converted 9595 words (404 misses)


#### d. Create Model on Pretrained embeddings

In [14]:
embed_layer = Embedding(len(index) + 1, embed_size, input_length = maxlen, weights = [embeddings_matrix] )

In [15]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *

model_embeddings = Sequential()

#embedding layer

model_embeddings.add(embed_layer) 

#lstm layer
model_embeddings.add(Bidirectional(LSTM(50, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1)))

model_embeddings.add(GlobalMaxPooling1D())
model_embeddings.add(Dense(50, activation = 'relu'))
model_embeddings.add(Dropout(0.2))
model_embeddings.add(Dense(6, activation = 'sigmoid'))

model_embeddings.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 300)         63101400  
_________________________________________________________________
bidirectional (Bidirectional (None, 1000, 100)         140400    
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 306       
Total params: 63,247,156
Trainable params: 63,247,156
Non-trainable params: 0
____________________________________________

In [16]:
#Compile the model
model_embeddings.compile(loss='binary_crossentropy', optimizer = 'Adam', metrics = ['AUC'])

In [17]:
# Fit the model
history = model_embeddings.fit(X_train_pad, y_train, epochs = 2, batch_size = 128, validation_split = 0.1)

Epoch 1/2
Epoch 2/2


In [19]:
# Evaluate the model
result = model_embeddings.evaluate(X_test_pad,y_test)



In [20]:
history.history["val_auc"]

[0.9793121218681335, 0.9827395677566528]

### Let's try to apply the trained model to the test.csv dataset

In [21]:
# import data
df_test = pd.read_csv("/Users/stefano/UDACITY/Data Engeeniring/Capstone/Data/test.csv")

In [22]:
# drop "id" column
df_test.drop(["id"], axis = 1, inplace = True)

In [23]:
# add columns
df_test[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]= ""

In [24]:
df_test.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Yo bitch Ja Rule is more succesful then you'll...,,,,,,
1,== From RfC == \n\n The title is fine as it is...,,,,,,
2,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",,,,,,
3,":If you have a look back at the source, the in...",,,,,,
4,I don't anonymously edit articles at all.,,,,,,


In [25]:
# identify predictors and dependent variable

X_test_t = df_test.comment_text.values  #predicotors
y_test_t = df_train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].values # independent variable

In [26]:
# define features for tokenization

embed_size = 300 # how big is each word vector
max_features = 10000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1000 # max number of words in a comment to use

In [31]:
# Tokenize the text by using Keras

tokenizer = Tokenizer(num_words=max_features)
    
#preparing vocabulary
tokenizer.fit_on_texts(list(X_test_t))
    
# transform text into numbers
X_test_t_seq  = tokenizer.texts_to_sequences(X_test_t) 

# create a sequence of same lenght
X_test_t_pad  = pad_sequences(X_test_t_seq, maxlen=maxlen)

In [33]:
df_test.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Yo bitch Ja Rule is more succesful then you'll...,,,,,,
1,== From RfC == \n\n The title is fine as it is...,,,,,,
2,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",,,,,,
3,":If you have a look back at the source, the in...",,,,,,
4,I don't anonymously edit articles at all.,,,,,,


In [34]:
# predict by using our trained bidirectional LSTM model

prediction = model_embeddings.predict(X_test_t_pad)

In [35]:
# Attribute the predicted AUC-ROC probability to the right class

classes = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

df_test[classes] = prediction
df_test.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Yo bitch Ja Rule is more succesful then you'll...,0.035888,0.000319,0.002634,0.000665,0.006884,0.003766
1,== From RfC == \n\n The title is fine as it is...,0.969456,0.110548,0.87486,0.05546,0.709053,0.116462
2,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.005873,0.000378,0.002362,0.000734,0.001526,0.000402
3,":If you have a look back at the source, the in...",0.022357,0.00014,0.002681,0.000409,0.003757,0.000466
4,I don't anonymously edit articles at all.,0.001224,2.9e-05,0.000332,4.3e-05,0.000179,2.3e-05
