In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk
import re
import pandas as pd
import string

In [8]:
# Text-cleaning helpers for the tweets. Together they:
#   - remove links
#   - remove punctuation
#   - remove hashtags and @mentions
#   - remove English stop words

# Matches http(s) links: the scheme, one or more "//" (or backslashes), then any
# run of URL-like characters. Written as a raw string so the regex escapes are
# not interpreted (and warned about) as Python string escapes, and compiled once
# rather than on every call.
# Note: inside the character class, `\+-=` is a range from '+' to '=', so it
# also admits ',', '-', '.', '/', digits, ':', ';' and '<'.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)',
    re.DOTALL,
)


def strip_links(text):
    """Replace every http(s) link in ``text`` with ``', '``.

    Each match is substituted exactly where it occurs. The previous
    find-then-``str.replace`` approach could corrupt a longer link that started
    with an earlier, shorter one (e.g. ``http://a`` followed by ``http://ab``
    left a stray ``b`` behind).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Drop punctuation and any @mention / #hashtag tokens from ``text``.

    Every character of ``string.punctuation`` other than ``@`` and ``#`` is
    turned into a space, the result is split on whitespace, and tokens whose
    first character is ``@`` or ``#`` are discarded. The surviving tokens are
    returned joined by single spaces.
    """
    prefixes = ('@', '#')
    # Map each non-prefix punctuation character to a space in one pass.
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in prefixes}
    cleaned = text.translate(to_space)
    # str.split() never yields empty or whitespace-padded tokens.
    kept = [token for token in cleaned.split() if token[0] not in prefixes]
    return ' '.join(kept)

# Lazily-filled cache so the stop-word list is fetched once per kernel instead
# of calling stopwords.words('english') again for every token.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def preprocess(text):
    """Normalise one raw text into a cleaned, space-joined token string.

    Steps, in order: lower-case, replace links, strip punctuation and
    @mention / #hashtag tokens, tokenise with ``nltk.word_tokenize``, and drop
    English stop words.

    NOTE: relies on NLTK data (the tokenizer models and the ``stopwords``
    corpus) being available locally.

    Args:
        text: The raw input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = strip_all_entities(strip_links(text))
    # Set membership is O(1); the original re-fetched the list for each token.
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text)
    return " ".join(token for token in tokens if token not in stop_words)

In [26]:
# Load the training CSV, clean every text, and keep the labels as an (N, 1) column.
train_df = pd.read_csv("./datasets/tweets/train.csv")
train_texts = train_df['text'].apply(preprocess).to_numpy()
train_labels = train_df["target"].to_numpy().reshape(-1, 1)

# The frame is no longer needed once both arrays have been extracted.
del train_df

In [27]:
# Sanity check: display the label array (an (N, 1) integer column, shown truncated).
train_labels

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [16]:
# Fixed number of tokens per example: used as `maxlen` when padding the
# sequences and as `input_length` of the Embedding layer.
max_sentence_length = 200

In [28]:
# Build the vocabulary from the cleaned texts; "<OOV>" is the placeholder token
# for words that were not seen during fitting.
# NOTE(review): the tokenizer is fitted on all of train_texts, i.e. before the
# train/validation split performed further down, so the validation rows also
# contribute to the vocabulary — confirm this is acceptable.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

In [29]:
# Word -> integer index mapping learned by the tokenizer; its length is used
# further down to size the Embedding layer.
word_index = tokenizer.word_index

In [32]:
# Encode each text as a list of word indices, then pad at the end ("post") so
# every row has exactly max_sentence_length entries.
# NOTE(review): `truncating` is left at its default (presumably "pre"), so any
# text longer than max_sentence_length would lose its leading tokens — confirm.
sequences = tokenizer.texts_to_sequences(train_texts)
padded = pad_sequences(sequences, maxlen=max_sentence_length, padding="post")

In [49]:
def train_test_split(x, y, percent=0.3):
    """Split ``x`` and ``y`` into a leading train part and a trailing test part.

    The split preserves order (nothing is shuffled): the first ``1 - percent``
    share of the rows becomes the training part and the remainder the test part.

    Args:
        x: Sliceable sequence of samples (list, NumPy array, ...).
        y: Sliceable sequence of labels with the same length as ``x``.
        percent: Fraction of rows, between 0 and 1, assigned to the test part.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``percent`` is outside ``[0, 1]`` or ``x`` and ``y``
            have different lengths.
    """
    if not 0 <= percent <= 1:
        raise ValueError(f"percent must be between 0 and 1, got {percent!r}")
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    # Round away binary floating-point noise before truncating: 1 - 0.9 is
    # 0.09999999999999998, so 10 rows would otherwise give index 0 instead of 1.
    split_index = int(round(len(x) * (1 - percent), 9))
    return x[:split_index], x[split_index:], y[:split_index], y[split_index:]

In [50]:
# Hold out the trailing 30% (the default `percent`) of the padded sequences and
# labels; the split is positional, so it follows the row order of the CSV.
train_x, test_x, train_y, test_y = train_test_split(padded, train_labels)

In [51]:
# Wrap the training arrays in a tf.data pipeline yielding (sequence, label) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))

In [52]:
# Checkpoint callback: writes the whole model to disk after each epoch.
class SaveModelCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves the model being trained at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save ``self.model`` to ``./models/tf_base/model-<epoch>.h5``.

        ``epoch`` is the index Keras passes to this hook; ``logs`` is accepted
        but not used. No evaluation is performed here.
        """
        checkpoint_path = f"./models/tf_base/model-{epoch}.h5"
        self.model.save(checkpoint_path)

In [53]:
# Shuffle with a buffer that spans the whole training set. The data already
# lives in memory, and a buffer smaller than the dataset (previously 1000) only
# mixes rows within a sliding window, so batches would still follow the
# original row order.
train_dataset = train_dataset.shuffle(buffer_size=len(train_x))
# Validation data is handed to fit() as a plain (inputs, labels) tuple.
valid_dataset = (test_x, test_y)

In [54]:
# Group the (already shuffled) training examples into mini-batches of 32.
# Note: this cell rebinds train_dataset, so re-running it would batch twice.
train_dataset = train_dataset.batch(32)

In [68]:
# Stacked bidirectional-LSTM binary classifier over the padded word-index
# sequences, with dropout (rate 0.2) after every trainable stage.
# NOTE(review): the Embedding does not set mask_zero, so padded positions are
# fed to the LSTMs like real tokens — confirm this is intended.
model = keras.Sequential()
# One embedding row per vocabulary entry, plus one extra row for index 0.
model.add(layers.Embedding(len(word_index) + 1, 64, input_length=max_sentence_length))
model.add(layers.Dropout(0.2))
# The first recurrent stage keeps the full sequence so a second one can follow.
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Dropout(0.2))
# The second recurrent stage collapses the sequence into a single vector.
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dropout(0.2))
# A single sigmoid unit gives the probability of the positive class.
model.add(layers.Dense(1, activation="sigmoid"))

# NOTE(review): `decay` is an argument of the older Keras optimizers; newer
# releases may reject it — confirm against the installed TensorFlow version.
model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-4, decay=1e-6),
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 200, 64)           894848    
                                                                 
 dropout_34 (Dropout)        (None, 200, 64)           0         
                                                                 
 bidirectional_10 (Bidirecti  (None, 200, 128)         66048     
 onal)                                                           
                                                                 
 dropout_35 (Dropout)        (None, 200, 128)          0         
                                                                 
 bidirectional_11 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dropout_36 (Dropout)        (None, 64)               

In [69]:
# Train for up to 50 epochs on the batched dataset, saving a checkpoint after
# every epoch and reporting validation metrics on the (test_x, test_y) tuple.
# NOTE(review): there is no early-stopping callback; the run recorded below was
# interrupted by hand.
model.fit(
    train_dataset,
    epochs=50,
    callbacks=[SaveModelCallback()],
    validation_data=valid_dataset
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

KeyboardInterrupt: 