# Project 4: Getting Started with NLP

## Step 1: Load in the data

In [1]:
import pandas as pd

# load data
train_df = pd.read_csv("nlp-getting-started/train.csv")
test_df = pd.read_csv("nlp-getting-started/test.csv")

print(train_df.head())
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Train shape: (7613, 5), Test shape: (3263, 4)


## Step 2: Clean the data

In [2]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  
    text = re.sub(r'\@w+|\#', '', text)  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'[^a-zA-Z!? ]', '', text)  
    text = " ".join([word for word in text.split() if word not in STOPWORDS]) 
    return text


#clean data
train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)

print(train_df[["text", "clean_text"]].head(5))


                                                text  \
0  Our Deeds are the Reason of this #earthquake M...   
1             Forest fire near La Ronge Sask. Canada   
2  All residents asked to 'shelter in place' are ...   
3  13,000 people receive #wildfires evacuation or...   
4  Just got sent this photo from Ruby #Alaska as ...   

                                          clean_text  
0       deeds reason earthquake may allah forgive us  
1              forest fire near la ronge sask canada  
2  residents asked shelter place notified officer...  
3  people receive wildfires evacuation orders cal...  
4  got sent photo ruby alaska smoke wildfires pou...  


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcvucovich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 3: Pre-process the data and create tokenizer

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


MAX_VOCAB = 10000  
MAX_LENGTH = 50  

#init tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["clean_text"])
word_index = tokenizer.word_index 

#text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df["clean_text"])
test_sequences = tokenizer.texts_to_sequences(test_df["clean_text"])

#pad sequences
X_train = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")
X_test = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding="post", truncating="post")

# add labels
y_train = train_df["target"].values  

print(f"Sample sequence: {train_sequences[0]}")
print(f"Padded shape: {X_train.shape}")


Sample sequence: [4231, 740, 147, 57, 1552, 4232, 13]
Padded shape: (7613, 50)


## Step 4: Create the model

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D, Conv1D, MaxPooling1D, LayerNormalization
from tensorflow.keras.optimizers import Adam

import numpy as np

def load_glove_embeddings(filepath, word_index, embedding_dim=50):
    embeddings_index = {}

    #read glove file
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0] 
            coeffs = np.asarray(values[1:], dtype="float32") 
            embeddings_index[word] = coeffs

    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)  
        if vector is not None:  
            embedding_matrix[i] = vector

    return embedding_matrix


# load glove embeddings
EMBEDDING_DIM = 50
EMBEDDING_PATH = "glove/glove.6B.50d.txt"
embedding_matrix = load_glove_embeddings(EMBEDDING_PATH, tokenizer.word_index, EMBEDDING_DIM)

# init model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, 
              output_dim=EMBEDDING_DIM, 
              input_length=MAX_LENGTH, 
              weights=[embedding_matrix], 
              trainable=False), 

    SpatialDropout1D(0.3),

    Conv1D(filters=64, kernel_size=3, activation="relu", padding="same"),  
    MaxPooling1D(pool_size=2),

    Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),

    LayerNormalization(),

    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")  
])


model.compile(
    loss="binary_crossentropy", 
    optimizer=Adam(learning_rate=1e-4),  
    metrics=["accuracy"]
)

model.build(input_shape=(None, MAX_LENGTH))
model.summary()




## Step 5: Train the model 

In [6]:
history = model.fit(
    X_train, y_train,
    epochs=20, 
    batch_size=32,
    validation_split=0.2,
    verbose=1
)


Epoch 1/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.5592 - loss: 0.7266 - val_accuracy: 0.7833 - val_loss: 0.4981
Epoch 2/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.7128 - loss: 0.5677 - val_accuracy: 0.7905 - val_loss: 0.4806
Epoch 3/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.7432 - loss: 0.5416 - val_accuracy: 0.7951 - val_loss: 0.4533
Epoch 4/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.7511 - loss: 0.5181 - val_accuracy: 0.7991 - val_loss: 0.4508
Epoch 5/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.7501 - loss: 0.5257 - val_accuracy: 0.7965 - val_loss: 0.4469
Epoch 6/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.7671 - loss: 0.5137 - val_accuracy: 0.7958 - val_loss: 0.4555
Epoch 7/20
[1m191/191

In [7]:
# eval
loss, acc = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {acc:.2%}")


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8162 - loss: 0.4127
Train Accuracy: 80.56%


## Step 6: Validate 

In [8]:
#predict on test data
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)  

# save
test_df["target"] = predictions
test_df[["id", "target"]].to_csv("submission.csv", index=False)
print("Predictions saved!")


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Predictions saved!
