In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
train_data = '/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/train.csv'
glove_embedding = '/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/glove.6B.100d.txt'

In [0]:
train = pd.read_csv(train_data)
train["comment_text"].fillna("fillna")

x_train = train["comment_text"].str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

max_words = 100000
max_len = 150

embed_size = 100

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)

x_train

array([[    0,     0,     0, ...,  4583,  2273,   985],
       [    0,     0,     0, ...,   589,  8377,   182],
       [    0,     0,     0, ...,     1,   737,   468],
       ...,
       [    0,     0,     0, ...,  3509, 13675,  4528],
       [    0,     0,     0, ...,   151,    34,    11],
       [    0,     0,     0, ...,  1627,  2056,    88]], dtype=int32)

In [0]:
embeddings_index = {}
with open(glove_embedding, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        embed = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embed

word_index = tokenizer.word_index

#num_words = min(max_words, len(word_index) + 1)
num_words = len(word_index)

embedding_matrix = np.zeros((num_words, embed_size), dtype='float32')
for word, i in word_index.items():

    if i >= max_words:
        continue

    embedding_vector = embeddings_index.get(word)

    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix.shape

(210337, 100)

In [0]:
input = tf.keras.layers.Input(shape=(max_len,))
x = tf.keras.layers.Embedding(num_words, embed_size, weights=[embedding_matrix], trainable=False)(input)

In [0]:
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, dropout=0.1,recurrent_dropout=0.1))(x)
x = tf.keras.layers.Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)

x = tf.keras.layers.concatenate([avg_pool, max_pool])

preds = tf.keras.layers.Dense(6, activation="sigmoid")(x)

model = tf.keras.Model(input, preds)

model.summary()

model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=1e-3), metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 100)     21033700    input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None

In [0]:
batch_size = 128
 
cp_callback = tf.keras.callbacks.ModelCheckpoint('/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/cp.ckpt',
                                                 save_weights_only=True,
                                                 verbose=1)
 
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    cp_callback
]
 
model.fit(x_train, y_train, validation_split=0.2, batch_size=batch_size,
          epochs=1, callbacks=callbacks, verbose=1)

Train on 127656 samples, validate on 31915 samples
Epoch 00001: saving model to /content/drive/My Drive/research_project/nlp_privacy_policy_analyze/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7fe389e149e8>

In [0]:
latest = tf.train.latest_checkpoint('/content/drive/My Drive/research_project/nlp_privacy_policy_analyze')
 
model.load_weights(latest)
 
predictions = model.predict(np.expand_dims(x_train[41], 0))
 
print(tokenizer.sequences_to_texts([x_train[41]]))
print(y_train[41])
print(predictions)

["tfd i think we just eced i think we responded to each other without seeing each others responses i added something in response to yours but don't know if you saw mine t c wp chicago wp four"]
[0 0 0 0 0 0]
[[0.00413106 0.00024349 0.00064855 0.00055654 0.00079537 0.00024647]]
