In [4]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_nlp
import string
import keras

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

In [5]:
import psutil

# Snapshot of the host's memory, reported in decimal gigabytes (bytes / 1e9).
mem = psutil.virtual_memory()
print(
    f"Total RAM: {mem.total / 1e9:.2f} GB",
    f"Available RAM: {mem.available / 1e9:.2f} GB",
    sep="\n",
)


Total RAM: 13.61 GB
Available RAM: 11.75 GB


In [6]:
# Read the three competition CSVs (training set, test set, submission template)
# in one pass; the order of the paths matches the order of the target names.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "sample_data/train.csv",
        "sample_data/test.csv",
        "sample_data/sample_submission.csv",
    )
)

In [7]:
# Download the NLTK corpora that the cleaning step reads from.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared helpers used by text_cleaning(): a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def text_cleaning(dataframe):
    """Add a ``text_cleaned`` column holding a normalised copy of ``text``.

    Each entry is lower-cased, stripped of URLs (tokens starting with
    ``http``), digits and ASCII punctuation, then split on whitespace. Tokens
    present in the module-level ``stop_words`` are dropped; the remaining
    tokens are passed through the module-level ``lemmatizer`` and re-joined
    with single spaces.

    Missing or non-string entries (e.g. NaN) are cleaned to an empty string
    instead of raising a ``TypeError`` inside ``re.sub``.

    Args:
        dataframe: pandas DataFrame containing a ``text`` column.

    Returns:
        The same DataFrame object, modified in place so that it carries the
        new ``text_cleaned`` column.
    """
    url_pattern = re.compile(r'http\S+')
    digit_pattern = re.compile(r'\d+')
    # Build the punctuation-deletion table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(raw_text):
        # Guard against NaN/None (or any non-string) values in the column.
        if not isinstance(raw_text, str):
            return ''
        cleaned = url_pattern.sub('', raw_text.lower())
        cleaned = digit_pattern.sub('', cleaned)
        cleaned = cleaned.translate(punctuation_table)
        # NOTE(review): punctuation is already removed at this point, so a
        # contraction such as "don't" is compared against stop_words as
        # "dont" -- confirm that this ordering is intended.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in cleaned.split()
            if word not in stop_words
        )

    # Single pass over the column rather than one pass per cleaning step.
    dataframe['text_cleaned'] = dataframe['text'].apply(_clean)

    return dataframe

# Run the same cleaning routine over both splits (train first, then test).
train, test = (text_cleaning(frame) for frame in (train, test))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Tokenizer / embedding settings shared with the model cell.
vocab_size, embedding_dim, max_length = 10000, 16, 100
padding_type, oov_tok = 'post', "<OOV>"

# Inputs: cleaned tweet text for both splits, labels only for the training split.
training_sentences, training_labels = train['text_cleaned'], train['target']
testing_sentences = test['text_cleaned']

# Tokenizing assigns a number to each word; the vocabulary is fitted on the
# training text only, and words that are not known are represented by the
# <OOV> (out-of-vocabulary) token.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Sequencing: turn each sentence into its list of word indices.
training_sequences, testing_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (training_sentences, testing_sentences)
)

# Padding: bring every sequence to max_length entries using the configured padding side.
training_padded, testing_padded = (
    pad_sequences(sequences, maxlen=max_length, padding=padding_type)
    for sequences in (training_sequences, testing_sequences)
)

In [None]:
# Baseline classifier: learned word embeddings -> two stacked bidirectional
# LSTMs -> dropout / batch-norm -> small dense head ending in one sigmoid unit.
model = tf.keras.Sequential([
    # `input_length` is deprecated on Embedding in Keras 3 and is not needed:
    # the sequence length is taken from the padded input at fit time.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, activation='tanh')),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False, activation='tanh')),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.AdamW(learning_rate=0.0005),
              # F1Score only accepts average in {None, 'micro', 'macro', 'weighted'};
              # 'binary' raises a ValueError. With a single output column,
              # 'micro' plus a 0.5 threshold yields the F1 of the positive class.
              metrics=[tf.keras.metrics.F1Score(average='micro', threshold=0.5)])

# F1Score requires 2-D targets shaped like the (batch, 1) predictions, so the
# 1-D label vector is reshaped into a float column for this model only.
lstm_training_labels = np.asarray(training_labels, dtype='float32').reshape(-1, 1)

# Train the model
num_epochs = 50
history = model.fit(training_padded, lstm_training_labels, epochs=num_epochs, verbose=2, batch_size=64)

# Threshold the sigmoid outputs at 0.5. A vectorised comparison avoids calling
# int() on one-element arrays, which NumPy >= 1.25 deprecates; `> 0.5` matches
# round(), which sends exactly 0.5 to 0.
predictions = model.predict(testing_padded)
prediction_rounded = (predictions.ravel() > 0.5).astype(int).tolist()
test['target'] = prediction_rounded
test[['id', 'target']].to_csv("nlp_submission_1.csv", index=False)

In [8]:
# DistilBERT preset shared by the preprocessor and the classifier.
preset = "distil_bert_base_en_uncased"

# Preprocessor for the preset, configured with a shorter sequence length (160).
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    name="preprocessor_4_tweets",
    sequence_length=160,
)

# Pretrained classifier with a two-class head, wired to the preprocessor above.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    num_classes=2,
    preprocessor=preprocessor,
)

classifier.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/config.json...


100%|██████████| 515/515 [00:00<00:00, 995kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/tokenizer.json...


100%|██████████| 580/580 [00:00<00:00, 788kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/assets/tokenizer/vocabulary.txt...


100%|██████████| 226k/226k [00:00<00:00, 316kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/model.weights.h5...


100%|██████████| 253M/253M [00:15<00:00, 16.6MB/s]


In [None]:
# Training configuration.
BATCH_SIZE = 32
EPOCHS = 50

# Training data: cleaned tweets as a plain list of strings, and integer labels
# as an int32 tensor (the form used with SparseCategoricalCrossentropy below).
training_sentences = train['text_cleaned'].tolist()
training_labels = tf.convert_to_tensor(
    train['target'].astype(int).tolist(),
    dtype=tf.int32,
)

# Load the DistilBERT preset again so this cell starts from freshly loaded weights.
preset = "distil_bert_base_en_uncased"

# Preprocessor configured with a shorter sequence length (160).
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    name="preprocessor_4_tweets",
    sequence_length=160,
)

# Pretrained classifier with a two-class head.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    num_classes=2,
    preprocessor=preprocessor,
)

classifier.summary()

# The loss is configured with from_logits=True, i.e. it is fed the classifier's
# raw outputs; accuracy is tracked during training.
classifier.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Fit on raw text: the attached preprocessor handles tokenization internally.
history = classifier.fit(
    x=training_sentences,
    y=training_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)


In [None]:
# Build the submission from the fine-tuned DistilBERT `classifier`, not from the
# earlier BiLSTM `model`. The classifier takes raw text (its preprocessor
# tokenizes internally) and returns one row of two class logits per tweet, so
# the predicted label is the argmax over the class axis rather than a rounded
# probability.
predictions = classifier.predict(test['text_cleaned'].tolist())
prediction_rounded = np.argmax(predictions, axis=1).tolist()
test['target'] = prediction_rounded
test[['id', 'target']].to_csv("nlp_3.csv", index=False)