In [None]:
# Download the kazanova/sentiment140 dataset with the Kaggle CLI into ./sentiment140 and unzip it there.
! kaggle datasets download -p sentiment140 --unzip -d kazanova/sentiment140

In [None]:
import re
import pickle
import pandas as pd
import numpy as np

import html
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Fetch the NLTK stop-word corpus that preprocess() reads via stopwords.words("english").
nltk.download("stopwords")

###  Dataset

In [None]:
# Load the Sentiment140 CSV (column names are supplied explicitly via `names`),
# recode the label so that 0 = negative and 1 = positive (raw value 4),
# and keep only the tweet text and its label.
df = (
    pd.read_csv(
        "sentiment140/training.1600000.processed.noemoticon.csv",
        names=["sentiment", "id", "date", "query", "user_id", "text"],
        encoding="latin",
    )
    .assign(sentiment=lambda frame: frame["sentiment"].map({0: 0, 4: 1}))
    [["text", "sentiment"]]
)
df.head()

### Preprocess

In [None]:
# One raw tweet per kind of noise that the cleaning step has to deal with.
# Each entry is (how to select tweets showing that noise, which match to print).
noise_examples = [
    (lambda tweets: tweets.str.startswith("@"), 0),  # Mentions/tags, regex - @\S+
    (lambda tweets: tweets.str.contains("http"), -1),  # URL, regex - https?:\S+
    (lambda tweets: tweets.str.contains("&"), 0),  # HTML character like &quot;
    # anything other than letters and numbers, regex - [^A-Za-z0-9]+
    (lambda tweets: tweets.str.contains("!"), 0),
]
for select, position in noise_examples:
    print(df.text[select(df.text)].iloc[position])

In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def preprocess(text, stop_words=None):
    """Normalise a raw tweet into a space-separated string of kept tokens.

    Steps, in order: lower-case the text, decode HTML entities (e.g. ``&quot;``),
    replace @mentions, http(s) URLs and every run of non-alphanumeric characters
    with a single space, then drop stop words.

    Args:
        text: The raw tweet text.
        stop_words: Optional collection of tokens to drop. When omitted, NLTK's
            English stop-word list is used. It is loaded once and cached as a
            frozenset, so applying this function to many tweets neither
            re-reads the corpus nor scans a list for every token.

    Returns:
        The cleaned text with tokens joined by single spaces (may be empty).
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS

    text = html.unescape(text.lower())
    # Mentions and URLs are listed before the catch-all class so they are removed
    # whole instead of leaving fragments such as "user" or "http" behind.
    text = re.sub(r"@\S+|https?:\S+|[^A-Za-z0-9]+", " ", text)
    # split() already discards leading/trailing/repeated whitespace.
    return " ".join(token for token in text.split() if token not in stop_words)

In [None]:
# Spot-check the cleaner on one randomly drawn tweet: raw text first, cleaned text second.
text = df["text"].sample(n=1).iloc[0]
for view in (lambda raw: raw, preprocess):
    print(view(text))

### Distribution

In [None]:
# Number of tweets per label in the full dataset.
df["sentiment"].value_counts()

In [None]:
# Work on a 5% subsample of the data. The fixed seed makes the draw reproducible
# across kernel restarts; without it the seeded train/validation split below would
# still operate on a different set of tweets on every run.
data_df = df.sample(frac=0.05, random_state=42)
data_df.sentiment.value_counts()

In [None]:
# Clean every sampled tweet with preprocess() and store the result next to the raw text.
data_df["processed_text"] = data_df["text"].map(preprocess)

In [None]:
# Split the *cleaned* tweets (the processed_text column) 70/30 into train and
# validation sets. Splitting the raw `text` column instead would leave the
# preprocessing step without any effect on the features the model is trained on.
X_train, X_val, Y_train, Y_val = train_test_split(
    data_df["processed_text"].tolist(),
    data_df["sentiment"].tolist(),
    test_size=0.3,
    random_state=42,
)
# Label counts per split, to check that both classes are represented in each.
np.unique(Y_train, return_counts=True), np.unique(Y_val, return_counts=True)

### Vectorization (TF-IDF)

In [None]:
# Vocabulary size cap passed to the TF-IDF vectorizer; the same value fixes the feature
# width declared in get_dataset() and in the model's input layer.
NUM_FEATURES = 2048

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at NUM_FEATURES terms.
vectorizer = TfidfVectorizer(max_features=NUM_FEATURES)

In [None]:
# Fit the vectorizer on the training split only.
vectorizer.fit(X_train)

# Save the fitted vectorizer to use for inference. The context manager closes (and
# therefore flushes) the file deterministically instead of leaving the open handle
# to the garbage collector.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

### Data Preparation

In [None]:
def generator(features, labels):
    """Build a zero-argument generator function over (dense row, label) pairs.

    Args:
        features: Iterable whose items expose ``toarray()``.
        labels: Iterable of labels, paired with ``features`` position by position
            (iteration stops at the shorter of the two).

    Returns:
        A callable that, each time it is called, starts a fresh pass yielding
        ``(row.toarray(), label)`` tuples.
    """

    def _generator():
        yield from ((row.toarray(), target) for row, target in zip(features, labels))

    return _generator


def get_dataset(features, labels, batch_size=128, mode="val"):
    """Wrap vectorized features and labels in a batched, prefetched tf.data pipeline.

    Args:
        features: Feature rows, fed through ``generator`` one row at a time; each
            element is declared as a float32 tensor of shape ``(1, NUM_FEATURES)``.
        labels: Labels aligned with ``features``; each is declared as a scalar int32.
        batch_size: Number of examples per batch. Batching uses
            ``drop_remainder=True``.
        mode: ``"train"`` adds a shuffle step with a buffer of ``2 * batch_size``
            elements before batching; any other value leaves the order untouched.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    element_signature = (
        tf.TensorSpec(shape=(1, NUM_FEATURES), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    )
    pipeline = tf.data.Dataset.from_generator(
        generator(features, labels), output_signature=element_signature
    )

    is_training = mode == "train"
    if is_training:
        pipeline = pipeline.shuffle(2 * batch_size)

    return pipeline.batch(
        batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE
    ).prefetch(tf.data.AUTOTUNE)

In [None]:
# Transform both text splits with the vectorizer fitted on the training split.
X_train_vectors, X_val_vectors = (
    vectorizer.transform(split) for split in (X_train, X_val)
)

In [None]:
BATCH_SIZE = 512
train_dataset = get_dataset(X_train_vectors, Y_train, BATCH_SIZE, mode="train")
# Validation features must be paired with the validation labels (Y_val). Pairing them
# with Y_train scores the model against labels that belong to different tweets, which
# makes every validation metric meaningless.
val_dataset = get_dataset(X_val_vectors, Y_val, BATCH_SIZE, mode="val")

### Model

In [None]:
# Each example is one TF-IDF vector presented as a length-1 sequence, matching the
# (1, NUM_FEATURES) element shape declared in get_dataset().
input_layer = layers.Input(shape=(1, NUM_FEATURES))
# Bidirectional LSTM (64 units per direction) with dropout 0.2.
x = layers.Bidirectional(layers.LSTM(64, dropout=0.2))(input_layer)
# Two 512-unit ReLU layers with a dropout of 0.5 between them.
x = layers.Dense(512, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(512, activation="relu")(x)
# Single sigmoid unit for the binary sentiment label (0 = negative, 1 = positive).
outputs = layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(input_layer, outputs)
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

### Train

In [None]:
# Train for 25 epochs with val_dataset as validation data; the value returned by
# fit() is kept in `history`.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=25)

In [None]:
model.save("model.keras")  # write the trained model to model.keras