In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers

In [None]:
def standardize_text(input_text):
    """Normalise raw tweet text with ``tf.strings`` ops before tokenisation.

    This is the callable handed to ``TextVectorization(standardize=...)``.
    Steps, in order:

    1. lower-case the text;
    2. blank out URLs, hashtags and mentions;
    3. turn the HTML-escaped heart ``&lt;3`` into the word ``love``;
    4. resolve the remaining HTML entities;
    5. replace ASCII emoticons with a descriptive word;
    6. blank out any token containing non-ASCII characters;
    7. collapse repeated whitespace and strip the ends.

    Entities are resolved *before* the emoticon pass: otherwise the ``;``
    that ends ``&quot;``/``&amp;``/``&gt;`` followed by ``)`` would be read as
    a wink. Emoticon words are padded with spaces so they never fuse with a
    neighbouring word (``it:)`` -> ``it smile``); the final whitespace
    collapse removes any surplus padding.

    Args:
        input_text: String input accepted by ``tf.strings.lower`` (presumably
            a string tensor supplied by ``TextVectorization`` - confirm).

    Returns:
        The cleaned text, as returned by ``tf.strings.strip``.
    """
    # Emoticon regex -> replacement word. Raw strings keep the regex escapes
    # intact without relying on Python's deprecated invalid-escape handling.
    # Patterns are applied to lower-cased text, so ":D" is matched as ":d".
    emoticons = {
        r':-*\)': 'smile', r':-*]': 'smile', r':-*d': 'smile',
        r':-*\(': 'frown', r':-*\[': 'frown', r':-*/': 'unsure',
        r':-*o': 'astonish', r':-*0': 'astonish', r'xd': 'laugh',
        r';-*\)': 'wink', r":'\(": 'cry', r':3': 'smile',
    }

    # Convert to lower case
    input_text = tf.strings.lower(input_text)

    # Remove all URLs, hashtags, mentions
    for pattern in (r'(https|http)?:\/\/\S+', r'^#\w+|\s#\w+', r'^@\w+|\s@\w+'):
        input_text = tf.strings.regex_replace(input_text, pattern, ' ')

    # The HTML-escaped heart must be handled before "&lt;" itself is stripped.
    input_text = tf.strings.regex_replace(input_text, r'&lt;3', ' love ')

    # Convert HTML references to text
    input_text = tf.strings.regex_replace(input_text, r'&amp;', 'and ')
    for entity in (r'&quot;', r'&gt;', r'&lt;'):
        input_text = tf.strings.regex_replace(input_text, entity, '')

    # Convert all emoticons to their text counterparts
    for pattern, word in emoticons.items():
        input_text = tf.strings.regex_replace(input_text, pattern, ' ' + word + ' ')

    # Remove non-ASCII characters (together with the word they are part of)
    input_text = tf.strings.regex_replace(input_text, r'\w*[^\x00-\x7F]+\w*', ' ')

    # Remove additional spaces
    input_text = tf.strings.regex_replace(input_text, r'\s\s+', ' ')
    return tf.strings.strip(input_text)

In [None]:
def word_lengthening(input_text):
    """Lower-case ``input_text`` and cap every run of a repeated character at two.

    Example: ``"Helllooooo"`` becomes ``"helloo"``. The cap applies to every
    character, not only letters, so digits, punctuation and spaces are
    shortened the same way.

    Args:
        input_text: Text to normalise; it must support ``len()`` and ``.lower()``.

    Returns:
        The normalised string, or ``input_text`` itself when it is empty.
    """
    if len(input_text) == 0:
        return input_text

    kept_chars = []
    previous_char = None
    run_length = 0
    for char in input_text.lower():
        # Either extend the current run or start a new one.
        run_length = run_length + 1 if char == previous_char else 1
        previous_char = char
        # The third and later repeats of a character are dropped.
        if run_length <= 2:
            kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
def get_embeddings(
    vectorizer,
    glove_embedding_path="../input/glovetwitter27b100dtxt/glove.twitter.27B.200d.txt",
    embedding_dim=200,
):
    """Build an embedding matrix for the vectorizer's vocabulary from a GloVe text file.

    Each line of the file is expected to be ``<word> <v1> <v2> ... <vN>``.
    Lines that are blank, or whose vector does not have exactly
    ``embedding_dim`` entries, are skipped instead of aborting the load or
    failing later when the row is copied into the matrix.

    Args:
        vectorizer: Object exposing ``get_vocabulary()``; the position of a
            word in that list is used as its row index.
        glove_embedding_path: Path of the GloVe text file.
        embedding_dim: Number of coefficients per word vector.

    Returns:
        Tuple ``(embedding_matrix, embedding_dim, num_tokens)`` where
        ``embedding_matrix`` has shape ``(num_tokens, embedding_dim)`` and
        rows of words without a pre-trained vector are all zeros.
    """
    embeddings_index = {}
    skipped = 0
    # Explicit encoding: the default depends on the platform locale.
    # Assumes the file is UTF-8 - TODO confirm for other embedding sources.
    with open(glove_embedding_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                skipped += 1
                continue
            word, coefs = parts
            coefs = np.fromstring(coefs, dtype=float, sep=" ")
            if coefs.shape[0] != embedding_dim:
                # e.g. a line whose token was swallowed by the whitespace split
                skipped += 1
                continue
            embeddings_index[word] = coefs

    print("Total word vectors: ", len(embeddings_index))
    if skipped:
        print(f"Skipped {skipped} malformed lines.")

    voc = vectorizer.get_vocabulary()

    # Two more rows than the vocabulary has entries; rows that are never
    # assigned below simply stay zero.
    num_tokens = len(voc) + 2
    hits = 0
    misses = 0
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for i, word in enumerate(voc):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, embedding_dim, num_tokens

In [None]:
def build_model_cnn_biLSTM(embedding_matrix, embedding_dim, num_tokens, learning_rate=0.001):
    """Build and compile the two-branch recurrent + convolutional classifier.

    Token ids go through a frozen embedding layer initialised from
    ``embedding_matrix`` and a spatial dropout. The result feeds two parallel
    branches, one starting with a bidirectional GRU and one with a
    bidirectional LSTM. Each branch applies dropout, two 1-D convolutions with
    a max-pool in between, and a global average plus a global max pooling.
    The four pooled vectors are concatenated and passed through a small dense
    head that ends in a single sigmoid unit.

    The model summary is printed as a side effect.

    Args:
        embedding_matrix: Initial (non-trainable) embedding weights.
        embedding_dim: Size of each embedding vector.
        num_tokens: Number of rows of the embedding layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The ``keras.Model``, compiled with binary cross-entropy loss and the
        ``"acc"`` metric.
    """
    token_ids = keras.Input(shape=(None,), dtype="int64")
    embedding_layer = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
    embedded = embedding_layer(token_ids)
    embedded = layers.SpatialDropout1D(0.4)(embedded)

    def recurrent_conv_features(recurrent_cls):
        """Return [global-avg, global-max] features of one recurrent/conv branch."""
        seq = layers.Bidirectional(recurrent_cls(64, return_sequences=True))(embedded)
        seq = layers.Dropout(0.4)(seq)
        seq = layers.Conv1D(128, 5, activation="relu")(seq)
        seq = layers.MaxPooling1D(5)(seq)
        seq = layers.Conv1D(64, 5, activation="relu")(seq)
        avg_features = layers.GlobalAveragePooling1D()(seq)
        max_features = layers.GlobalMaxPool1D()(seq)
        return [avg_features, max_features]

    # GRU branch first, then LSTM branch: same order as the concatenation.
    pooled = recurrent_conv_features(layers.GRU) + recurrent_conv_features(layers.LSTM)
    head = keras.layers.concatenate(pooled)

    # Two identical BatchNorm -> Dense -> Dropout stages of decreasing width.
    for units in (64, 16):
        head = layers.BatchNormalization()(head)
        head = layers.Dense(units, activation='relu')(head)
        head = layers.Dropout(0.4)(head)
    preds = layers.Dense(1, activation='sigmoid')(head)

    model = keras.Model(token_ids, preds)
    model.summary()

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["acc"])
    return model

In [None]:
# Column names supplied explicitly when reading the CSV.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
tweets = (
    pd.read_csv('../training.1600000.processed.noemoticon.csv', encoding="latin-1", names=columns)
    # Fixed seed so the 60% subsample is the same on every run.
    .sample(frac=0.6, random_state=42)
    # Keep only 'target' and 'text'.
    .drop(columns=columns[1:5])
)
# Map label 4 to 1. The result is assigned back to the frame instead of using
# a chained `inplace=True`, which does not update the frame when pandas
# Copy-on-Write is active.
tweets['target'] = tweets['target'].replace(4, 1)

In [None]:
# Lower-case each tweet and cap repeated characters before vectorisation.
tweets['clean_text'] = tweets['text'].apply(word_lengthening)

# Vocabulary limited to 20000 tokens; output sequences have length 150.
vectorizer = TextVectorization(
    standardize=standardize_text,
    max_tokens=20000,
    output_sequence_length=150,
)
vectorizer.adapt(tweets['clean_text'])

In [None]:
# Token-id matrix for the cleaned tweets and a column vector of labels.
X_tweets = vectorizer(np.expand_dims(tweets['clean_text'], axis=-1)).numpy()
y_tweets = np.array(tweets['target']).reshape(-1, 1)

embeddings_matrix, embedding_dim, num_tokens = get_embeddings(vectorizer)

# Keyword arguments later unpacked into `model.fit`.
fit_parameters = dict(
    x=X_tweets,
    y=y_tweets,
    batch_size=64,
    epochs=20,
    validation_split=0.05,
)

In [None]:
# Build and compile the classifier with the default learning rate.
model_cnn_biLSTM = build_model_cnn_biLSTM(
    embedding_matrix=embeddings_matrix,
    embedding_dim=embedding_dim,
    num_tokens=num_tokens,
)

In [None]:
# NOTE(review): some Keras releases require the checkpoint path to end in
# ".keras" or ".h5" - confirm against the installed version.
best_model_file_path = './model_cnn_biLSTM/best_model'

# Keep only the checkpoint with the highest validation accuracy.
check_point = keras.callbacks.ModelCheckpoint(
    best_model_file_path,
    monitor="val_acc",
    mode="max",
    save_best_only=True,
)
# Stop once validation loss has not improved for 3 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
)
history_cnn_biLSTM = model_cnn_biLSTM.fit(
    callbacks=[check_point, early_stopping],
    **fit_parameters,
)