In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string

In [None]:
# Read the three CSV inputs from the working directory, in a fixed order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
def text_cleaning(dataframe):
  """Lower-case the ``text`` column and add a punctuation-free ``text_cleaned`` column.

  The work is done on a copy, so the frame passed in is never modified and the
  cell that calls this function can be re-run safely.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing a ``text`` column of strings (missing values allowed).

  Returns
  -------
  pandas.DataFrame
      A copy of ``dataframe`` in which ``text`` is lower-cased and a new
      ``text_cleaned`` column holds the lower-cased text with every character
      from ``string.punctuation`` removed. Missing ``text`` values stay missing
      in ``text`` and become the empty string in ``text_cleaned``.

  Raises
  ------
  KeyError
      If ``dataframe`` has no ``text`` column.
  """
  cleaned = dataframe.copy()
  lowered = cleaned['text'].str.lower()
  # Translation table that deletes every ASCII punctuation character.
  translator = str.maketrans('', '', string.punctuation)

  cleaned['text'] = lowered
  # fillna first: a missing value has no .translate and would otherwise
  # propagate as NaN into the tokenizer input.
  cleaned['text_cleaned'] = lowered.fillna('').str.translate(translator)

  return cleaned

In [None]:
# Run the same cleaning step on both splits (train first, then test).
train, test = text_cleaning(train), text_cleaning(test)

In [None]:
# Text-encoding settings shared with the model definition.
vocab_size = 10000
embedding_dim = 16
max_length = 100
padding_type = 'post'
oov_tok = "<OOV>"

# Model inputs (cleaned text) and the training labels.
training_sentences, training_labels = train['text_cleaned'], train['target']
testing_sentences = test['text_cleaned']

# Tokenizing maps each word to an integer id. The vocabulary is fitted on the
# training text only; words that are not known are encoded as the <OOV> token.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Sequencing: turn every sentence into its list of word ids.
training_sequences, testing_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (training_sentences, testing_sentences)
)

# Padding: bring every sequence to max_length, adding zeros at the end.
training_padded, testing_padded = (
    pad_sequences(sequences, maxlen=max_length, padding=padding_type)
    for sequences in (training_sequences, testing_sequences)
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable after Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Embedding -> two stacked bidirectional LSTMs -> dense head with a single
# sigmoid unit that outputs the probability of the positive class.
# NOTE(review): activation='relu' inside the LSTMs replaces the default tanh;
# this prevents the fused cuDNN kernel from being used and can make training
# less stable - confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, activation='relu')),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False, activation='relu')),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.AdamW(learning_rate=0.0005),
              # threshold=0.5 is required for a single sigmoid output: with the
              # default threshold=None the metric binarises by argmax, which on
              # one column marks every sample as positive.
              metrics=[tf.keras.metrics.F1Score(average='macro', threshold=0.5)])

# F1Score works on 2-D (batch, n_outputs) tensors, so give the labels the same
# (n, 1) float layout as the model output instead of a 1-D integer Series.
training_labels_2d = np.asarray(training_labels, dtype='float32').reshape(-1, 1)

# Train the model
num_epochs = 50
history = model.fit(training_padded, training_labels_2d, epochs=num_epochs, verbose=2, batch_size=64)


In [None]:
# Predicted probabilities for the test set; expected shape is (n_samples, 1)
# because the model ends in a single sigmoid unit.
predictions = model.predict(testing_padded)

# Threshold in one vectorised step. This yields the same labels as rounding
# each probability (exactly 0.5 maps to 0 in both cases) but avoids calling
# int() on a 1-element array, which NumPy has deprecated.
prediction_rounded = (predictions.ravel() > 0.5).astype(int)

test['target'] = prediction_rounded
# Write only the id/target pair, without the pandas index column.
test[['id', 'target']].to_csv("nlp_submission.csv", index=False)