In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import csv
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import gensim

In [2]:
df = pd.read_csv(r'../data/adversarial_swap_train_final.csv')
df.head(5)
df.dropna(how='any', inplace=True)

In [6]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(df['clean_text'], df['classification'], test_size=0.2)
print(X_train_raw.shape)
print(X_test_raw.shape)
print(y_train_raw.shape)
print(y_test_raw.shape)

(13077,)
(3270,)
(13077,)
(3270,)


In [17]:
print(X_train_raw)

4698                                 free ice cream though
9775     read lactose stuff hard eat cheese lactose jum...
5270     lost favorite thing love always story year key...
14465            hi selena made team support greece cyprus
350                      stupid bipolar weather ruined day
                               ...                        
9316     yes haha impaled crossed key love scottishtryi...
4884                                      hell awake early
7362            monday work leg still hurting little smile
6231        sometimes people never learn shut stop talking
3285          thx old computer slow kubuntu blender really
Name: clean_text, Length: 13077, dtype: object


In [18]:
t = Tokenizer()
t.fit_on_texts(X_train_raw)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train_raw)
max_length = 23
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [24]:
X_train = []
vocab = []
for x in X_train_raw:
    x = x.split(' ')
    for word in x:
        if word not in vocab:
            vocab.append(word)
    X_train.append(x)

In [20]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

In [33]:
model = load_glove_model("output.txt")
dim = 100

Loading Glove Model
15872 words loaded!


In [26]:
def create_embedding_dict(vocab):
    # create a map of words to vectors
    embedding = dict()
    for word in vocab:
        # key is string word, value is numpy array for vector
        if word in model:
            embedding[word] = np.asarray(model[word], dtype='float32')
    return embedding

def create_embedding_matrix(vocab_size, dim, vocab, embeddings_index):
    embedding_matrix = np.zeros((vocab_size, 100))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dict = create_embedding_dict(vocab)
embedding_matrix = create_embedding_matrix(vocab_size, 100, vocab, embedding_dict)

In [None]:
print(list(embedding_dict.items())[:5])

In [30]:
print(embedding_matrix[:100])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [31]:
def build_model(vocab_size, dim):
    custom_embedding_layer = layers.Embedding(vocab_size, dim, weights=[embedding_matrix], trainable=False, name="embeddings")
    model = keras.Sequential()
    model.add(custom_embedding_layer)
    model.add(layers.LSTM(100, dropout=0.3, name="Normal"))
    #model.add(layers.LSTM(2))
    #model.add(layers.LSTM(32, kernel_regularizer="l1"))
    model.add(layers.Dense(10))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
    

In [34]:
model = build_model(vocab_size, dim)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embeddings (Embedding)       (None, None, 100)         1573600   
_________________________________________________________________
Normal (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 10)                1010      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 1,655,021
Trainable params: 81,421
Non-trainable params: 1,573,600
_________________________________________________________________


In [38]:
history = model.fit(padded_docs, y_train_raw, epochs=20)
print("Done")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Done


In [None]:
t.fit_on_texts(X_train_raw)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train_raw)
max_length = 23
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

result = model.predict()
result = np.round(result).astype(int)

In [39]:
loss, accuracy = model.evaluate(padded_docs, y_train_raw, verbose=0)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss*100))

Accuracy: 61.030817
Loss: 63.465697


In [40]:
def build_sparse_model(vocab_size, dim):
    custom_embedding_layer = layers.Embedding(vocab_size, dim, weights=[embedding_matrix], trainable=False, name="embeddings")
    model = keras.Sequential()
    model.add(custom_embedding_layer)
    model.add(layers.LSTM(100, dropout=0.3, name="Normal", return_sequences=True))
    model.add(layers.LSTM(32, dropout=0.3, kernel_regularizer="l1", name="Regularized"))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [41]:
sparse_model = build_sparse_model(vocab_size, dim)
sparse_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embeddings (Embedding)       (None, None, 100)         1573600   
_________________________________________________________________
Normal (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
Regularized (LSTM)           (None, 32)                17024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,671,057
Trainable params: 97,457
Non-trainable params: 1,573,600
_________________________________________________________________


In [42]:
history = sparse_model.fit(padded_docs, y_train_raw, epochs=20)
print("Done")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Done


In [43]:
loss, accuracy = sparse_model.evaluate(padded_docs, y_train_raw, verbose=0)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss*100))

Accuracy: 61.306107
Loss: 65.015322
