In [60]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import csv
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import gensim

In [12]:
df = pd.read_csv(r'../data/adversarial_swap_train_final.csv')
df.head(5)
df.dropna()

Unnamed: 0.1,Unnamed: 0,text,sentiment,classification,clean_text
0,1,Sooo SAD I will miss you here in San Diego!!!,negative,0.0,sooo sad miss san diego
1,2,my boss is bullying me...,negative,0.0,bos bullying
2,3,what interview! leave me alone,negative,0.0,interview leave alone
3,4,"Sons of ****, why couldn`t they put them on t...",negative,0.0,son couldnt put release already bought
4,6,2am feedings for the baby are fun when he is a...,positive,1.0,feeding baby fun smile coo
...,...,...,...,...,...
16354,27474,enjoy ur night,positive,1.0,enjoy ur night
16355,27475,wish we could come see u on Denver husband l...,negative,0.0,wish could come see u denver husband lost job ...
16356,27476,I`ve wondered about rake to. The client has ...,negative,0.0,ive wondered rake client made clear net dont f...
16357,27477,Yay good for both of you. Enjoy the break - y...,positive,1.0,yay good enjoy break probably need hectic week...


In [13]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(df['clean_text'], df['classification'], test_size=0.2)
print(X_train_raw.shape)
print(X_test_raw.shape)
print(y_train_raw.shape)
print(y_test_raw.shape)

(13087,)
(3272,)
(13087,)
(3272,)


In [61]:
t = Tokenizer()
t.fit_on_texts(X_train_raw)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train_raw)
max_length = 23
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [34]:
X_train = []
vocab = []
for x in X_train_raw:
    x = x.split(' ')
    for word in x:
        if word not in vocab:
            vocab.append(word)
    X_train.append(x)

15796


In [50]:
model = gensim.models.Word2Vec(X_train, window=5, min_count=1, seed=1)
dim = 100

In [49]:
def create_embedding_dict(vocab):
    # create a map of words to vectors
    embedding = dict()
    for word in vocab:
        # key is string word, value is numpy array for vector
        embedding[word] = np.asarray(model.wv[word], dtype='float32')

    return embedding

def create_embedding_matrix(vocab_size, dim, vocab, embeddings_index):
    embedding_matrix = np.zeros((vocab_size, 100))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dict = create_embedding_dict(vocab)
embedding_matrix = create_embedding_matrix(vocab_size, 100, vocab, embedding_dict)

In [89]:
def build_model(vocab_size, dim):
    custom_embedding_layer = layers.Embedding(vocab_size, dim, weights=[embedding_matrix], trainable=False, name="embeddings")
    model = keras.Sequential()
    model.add(custom_embedding_layer)
    model.add(layers.LSTM(100, dropout=0.3, name="Normal"))
    #model.add(layers.LSTM(2))
    #model.add(layers.LSTM(32, kernel_regularizer="l1"))
    model.add(layers.Dense(10))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
    

In [90]:
model = build_model(vocab_size, dim)
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embeddings (Embedding)       (None, None, 100)         1579700   
_________________________________________________________________
Normal (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_11 (Dense)             (None, 10)                1010      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 1,661,121
Trainable params: 81,421
Non-trainable params: 1,579,700
_________________________________________________________________


In [86]:
history = model.fit(padded_docs, y_train_raw, epochs=20)
print("Done")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Done


In [None]:
t.fit_on_texts(X_train_raw)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train_raw)
max_length = 23
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

result = model.predict()
result = np.round(result).astype(int)

In [88]:
loss, accuracy = model.evaluate(padded_docs, y_train_raw, verbose=0)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss*100))

Accuracy: 70.459235
Loss: 55.650568


In [101]:
def build_sparse_model(vocab_size, dim):
    custom_embedding_layer = layers.Embedding(vocab_size, dim, weights=[embedding_matrix], trainable=False, name="embeddings")
    model = keras.Sequential()
    model.add(custom_embedding_layer)
    model.add(layers.LSTM(100, dropout=0.3, name="Normal", return_sequences=True))
    model.add(layers.LSTM(32, dropout=0.3, kernel_regularizer="l1", name="Regularized"))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [102]:
sparse_model = build_sparse_model(vocab_size, dim)
sparse_model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embeddings (Embedding)       (None, None, 100)         1579700   
_________________________________________________________________
Normal (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
Regularized (LSTM)           (None, 32)                17024     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total params: 1,677,157
Trainable params: 97,457
Non-trainable params: 1,579,700
_________________________________________________________________


In [103]:
history = sparse_model.fit(padded_docs, y_train_raw, epochs=20)
print("Done")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Done


In [104]:
loss, accuracy = sparse_model.evaluate(padded_docs, y_train_raw, verbose=0)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss*100))

Accuracy: 67.830670
Loss: 59.267986
