In [None]:
import gc
import re
import string
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from tensorflow.keras import layers, callbacks

# Load and explore the dataset

## Load the train and test dataset 

In [None]:
# Directory holding the competition CSVs. Change this single constant to run
# the notebook somewhere other than Kaggle.
DATA_DIR = "/kaggle/input/nlp-getting-started"

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

# deep=True makes pandas inspect object-dtype columns (the string columns
# produced by read_csv) instead of undercounting them, so the figure reported
# below reflects the real footprint of the text data.
BYTES_PER_MB = 1024**2
train_memory_mb = df_train.memory_usage(deep=True).sum() / BYTES_PER_MB
test_memory_mb = df_test.memory_usage(deep=True).sum() / BYTES_PER_MB

print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train_memory_mb))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test_memory_mb))

## Explore the dataset

Let's look at the first five rows of each dataset.

In [None]:
# Preview the first rows of the training frame (head() defaults to five rows).
df_train.head()

In [None]:
# Preview the first rows of the test frame (head() defaults to five rows).
df_test.head()

Both datasets contain:
- id
- keyword: A keyword from that tweet (although this may be blank)
- location: The location the tweet was sent from (may also be blank)
- text: The text of a tweet

While the train dataset has an extra column:
- target: 1 if the tweet is a real disaster or 0 if not

# Preprocess the data

Let's take a look at the total number of missing values in two columns `keyword` and `location`.

In [None]:
# Count the NaN entries of the two optional columns once per frame, then
# report each count on its own line.
train_na_counts = df_train[['keyword', 'location']].isna().sum()
test_na_counts = df_test[['keyword', 'location']].isna().sum()

print('Number of missing values of keyword column in train set {}'.format(train_na_counts['keyword']))
print('Number of missing values of location column in train set {}'.format(train_na_counts['location']))
print('Number of missing values of keyword column in test set {}'.format(test_na_counts['keyword']))
print('Number of missing values of location column in test set {}'.format(test_na_counts['location']))

Here is a plot for better visualization

In [None]:
missing_cols = ['keyword', 'location']

fig, axes = plt.subplots(ncols=2, figsize=(17, 4), dpi=100)

# One panel per dataset: left = training set, right = test set.
panels = [('Training Set', df_train), ('Test Set', df_test)]
for ax, (panel_title, frame) in zip(axes, panels):
    # Null count per column, computed once and reused for both bar axes.
    null_counts = frame[missing_cols].isnull().sum()
    sns.barplot(x=null_counts.index, y=null_counts.values, ax=ax)

    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15)
    ax.set_title(panel_title, fontsize=13)

# Only the left panel carries the y-axis label.
axes[0].set_ylabel('Missing Value Count', size=15, labelpad=20)

plt.show()

Fill the missing values with `no_keyword` and `no_location` respectively.

In [None]:
# Placeholder written into each column wherever the value is missing:
# 'no_keyword' for keyword and 'no_location' for location.
placeholders = {column: f'no_{column}' for column in ['keyword', 'location']}

for frame in (df_train, df_test):
    for column, placeholder in placeholders.items():
        frame[column] = frame[column].fillna(placeholder)

Since locations are user inputs, the `location` column can be very dirty and has a lot of unique values. Let's look at the number of unique values in both columns of the datasets.

In [None]:
# Distinct-value counts for both columns, computed once per frame.
train_unique_counts = df_train[['keyword', 'location']].nunique()
test_unique_counts = df_test[['keyword', 'location']].nunique()

print('Number of unique values of keyword column in train set {}'.format(train_unique_counts['keyword']))
print('Number of unique values of location column in train set {}'.format(train_unique_counts['location']))
print('Number of unique values of keyword column in test set {}'.format(test_unique_counts['keyword']))
print('Number of unique values of location column in test set {}'.format(test_unique_counts['location']))

Every single keyword in the training set also exists in the test set. If the training and test sets are drawn from the same sample, it is possible to use target encoding on `keyword`.

Check for duplicates and remove if found

In [None]:
# A full-row duplicated() also compares the `id` column. If ids are unique
# (presumably, since it is the row key -- verify), no row can ever be flagged,
# so repeated tweets are detected on every column except `id` instead.
train_content_cols = df_train.columns.difference(['id']).tolist()
test_content_cols = df_test.columns.difference(['id']).tolist()

# True for every repeat of an earlier row; the first occurrence stays False.
train_duplicates = df_train.duplicated(subset=train_content_cols)
test_duplicates = df_test.duplicated(subset=test_content_cols)

print('Total number of duplicates in train dataset: {}'.format(train_duplicates.sum()))
print('Rows with duplicates: \n{}'.format(df_train[train_duplicates]))
print('Total number of duplicates in test dataset: {}'.format(test_duplicates.sum()))
print('Rows with duplicates: \n{}'.format(df_test[test_duplicates]))

# Remove the repeated training rows so they are not counted twice during
# fitting. Test rows are only reported, never dropped, because every test
# tweet still needs a prediction.
if train_duplicates.any():
    df_train = df_train.loc[~train_duplicates].reset_index(drop=True)

Encode the `text` column

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training tweets and encode
# them in the same pass; the result stays sparse until after the split.
# NOTE(review): the vectorizer is fitted on all of df_train before the
# train/validation split, so validation tweets contribute to the vocabulary
# and IDF weights and validation scores may be slightly optimistic.
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(df_train['text'])

# Encode the test tweets with the already-fitted vectorizer, then densify.
test_features_sparse = vectorizer.transform(df_test["text"])
X_test = test_features_sparse.toarray()

Split the training dataset into a training set and a validation set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the encoded tweets for validation; the fixed random_state
# keeps the partition identical between runs.
labels = df_train['target']
X_train_sparse, X_val_sparse, y_train, y_val = train_test_split(
    text_features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Convert both sparse partitions to dense NumPy arrays.
X_train, X_val = X_train_sparse.toarray(), X_val_sparse.toarray()

# Number of TF-IDF features per tweet, i.e. the width of the model input.
input_shape = X_train.shape[1]

# Build and train the model

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]  # not referenced by the visible cells

EPOCHS = 30
AUTO = tf.data.experimental.AUTOTUNE  # not referenced by the visible cells

# Seed the Python, NumPy and backend random generators in one call so that
# weight initialisation, dropout masks and batch shuffling repeat between
# runs. The value matches the random_state used for the train/validation split.
# NOTE(review): some GPU kernels can still be non-deterministic -- confirm if
# bit-exact reproducibility is required.
SEED = 42
keras.utils.set_random_seed(SEED)

# Stop once validation loss has failed to improve by at least `min_delta` for
# `patience` consecutive epochs, then roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # the Keras default, spelled out for the reader
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

## Create a sequential model

In [None]:
# Feed-forward binary classifier over the TF-IDF features, assembled layer by
# layer: two Dense(256) -> BatchNorm -> Dropout(0.3) blocks and a sigmoid unit.
model = keras.Sequential()

# First hidden block; it also declares the input width.
model.add(layers.Dense(256, activation='relu', input_shape=(input_shape,)))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(rate=0.3))

# Second hidden block.
model.add(layers.Dense(256, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(rate=0.3))

# Single sigmoid output: a value in (0, 1) thresholded later at 0.5.
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure the optimiser, loss and tracked metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# Train with the held-out split as validation data; the early-stopping
# callback may end training before EPOCHS is reached.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

# One row per epoch, one column per logged quantity.
history_df = pd.DataFrame(history.history)

# Learning curves: loss first, then accuracy.
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Sigmoid outputs strictly above 0.5 become class 1, everything else class 0.
train_probabilities = model.predict(X_train)
y_pred_train = (train_probabilities > 0.5).astype(int)

train_report = classification_report(y_train, y_pred_train)
print("Classification report of training: \n", train_report)

In [None]:
# Same 0.5 threshold as for the training split, applied to the held-out rows.
val_probabilities = model.predict(X_val)
y_pred_val = (val_probabilities > 0.5).astype(int)

val_report = classification_report(y_val, y_pred_val)
print("Classification report of validation: \n", val_report)

# Generate the submission file 

For each tweet in the test set, we predict whether it is about a real disaster or not. If so, predict a 1. If not, predict a 0.

The `submission.csv` file uses the following format:
`id,target`

In [None]:
# The sample submission supplies the expected `id,target` layout; its target
# column is overwritten with the model's predictions further down.
sample_submission_path = "/kaggle/input/nlp-getting-started/sample_submission.csv"
submission = pd.read_csv(sample_submission_path)

submission.head()

In [None]:
# Predicted probabilities for the test tweets, thresholded at 0.5 into 0/1
# labels. The model ends in a single-unit Dense layer, so the array has one
# column.
y_test = model.predict(X_test)
y_test = np.where(y_test > 0.5, 1, 0)

# X_test was encoded from df_test, so the predictions follow df_test's row
# order. They are written into `submission` by position, so fail loudly if the
# two id columns disagree instead of silently mislabelling tweets.
if not np.array_equal(submission["id"].to_numpy(), df_test["id"].to_numpy()):
    raise ValueError(
        "sample_submission ids do not match df_test ids; "
        "predictions cannot be aligned by position"
    )

# Flatten the single-column prediction array to 1-D before storing it.
submission["target"] = y_test.ravel()
print(submission)

In [None]:
# Summary statistics of the submission frame; the mean of `target` is the
# share of test tweets predicted as 1.
submission.describe()

In [None]:
# Write the predictions to the working directory; index=False keeps the file
# to the data columns only, without the DataFrame index.
submission.to_csv("submission.csv", index=False)