In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
import string
import re
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
print("Reading Test data...")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
print("Reading Training data...")
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

Reading Test data...
Reading Training data...


In [3]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
test_df.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [5]:
text_raw = train_df["text"]

In [6]:
print("The training data has {} rows and {} columns".format(train_df.shape[0], train_df.shape[1]))
print("The test data has {} rows and {} columns".format(test_df.shape[0], test_df.shape[1]))

The training data has 7613 rows and 5 columns
The test data has 3263 rows and 4 columns


In [7]:
print("amount of empty cells in training data:")
print(train_df.isna().sum())
print("")
print("amount of empty cells in test data:")
print(test_df.isna().sum())

amount of empty cells in training data:
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

amount of empty cells in test data:
id             0
keyword       26
location    1105
text           0
dtype: int64


Normal Bag of Words
NLP Steps:
1. Lower words and remove punctuation
2. Expand contradictions
3. Lemmatizing
4. Remove Stopwords

In [8]:
def remove_punct(text: str):
    text = text.lower()
    text_nopunct = "".join([character for character in text if character not in string.punctuation])
    return text_nopunct

train_df["text"] = train_df["text"].apply(lambda x: remove_punct(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
5,8,,,rockyfire update california hwy 20 closed in ...,1
6,10,,,flood disaster heavy rain causes flash floodin...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,theres an emergency evacuation happening now i...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [9]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [10]:
contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))
def expand_contractions(s, contractions=contractions):
    def replace(match):
         return contractions[match.group(0)]
    return contractions_re.sub(replace, s)

train_df["text"] = train_df["text"].apply(lambda x: expand_contractions(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
5,8,,,rockyfire update california hwy 20 closed in ...,1
6,10,,,flood disaster heavy rain causes flash floodin...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,theres an emergency evacuation happening now i...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [11]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text: str):
    text = word_tokenize(text)
    
    lemmatize_text = " ".join([lemmatizer.lemmatize(word) for word in text])
    
    return lemmatize_text

train_df["text"] = train_df["text"].apply(lambda x: lemmatize(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed are the reason of this earthquake may...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all resident asked to shelter in place are bei...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,just got sent this photo from ruby alaska a sm...,1
5,8,,,rockyfire update california hwy 20 closed in b...,1
6,10,,,flood disaster heavy rain cause flash flooding...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,there an emergency evacuation happening now in...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [12]:
#remove Stopwords
stopwords = set(stopwords.words('english')) 
def remove_stopwords(text: str):
    text = word_tokenize(text)
    
    text_no_sw = " ".join([w for w in text if w not in stopwords])
    
    return text_no_sw

train_df["text"] = train_df["text"].apply(lambda x: remove_stopwords(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive u,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1
5,8,,,rockyfire update california hwy 20 closed dire...,1
6,10,,,flood disaster heavy rain cause flash flooding...,1
7,13,,,im top hill see fire wood,1
8,14,,,emergency evacuation happening building across...,1
9,15,,,im afraid tornado coming area,1


In [13]:
x_text = train_df["text"]
y = train_df["target"]

In [14]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_raw)

max_len = 150

x_raw = tokenizer.texts_to_sequences(text_raw)
x = tokenizer.texts_to_sequences(x_text)

x_raw = tf.keras.preprocessing.sequence.pad_sequences(x_raw, maxlen = max_len, padding = "post", truncating = "post", value = 0)
x = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen = max_len, padding = "post", truncating = "post", value = 0)

In [15]:
len_vocab = len(tokenizer.word_index) + 1
vocab = tokenizer.word_index

print(len_vocab)

21259


In [16]:
inputs = layers.Input((max_len,))

l = layers.Embedding(input_dim = len_vocab, output_dim = 300)(inputs)

convs = []
filter_sizes = [2,3,4,5,6,7]

for filter_size in filter_sizes:
    conv = layers.Conv1D(filters = 200, kernel_size = filter_size, activation = "relu")(l)
    pool = layers.GlobalMaxPool1D()(conv)
    
    convs.append(pool)

merge = layers.concatenate(convs)
l = layers.Dropout(0.1)(merge)
l = layers.Dense(512, activation = "relu")(l)
l = layers.Dropout(0.1)(l)
l = layers.Dense(256, activation = "relu")(l)
l = layers.Dropout(0.1)(l)
l = layers.Dense(128, activation = "relu")(l)
output = layers.Dense(1, activation = "sigmoid")(l)

model = keras.Model(inputs, output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 300)     6377700     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 149, 200)     120200      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 148, 200)     180200      embedding[0][0]                  
______________________________________________________________________________________________

In [17]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["acc"])
callback = tf.keras.callbacks.EarlyStopping(monitor = "loss", patience = 10, mode = "auto")

history = model.fit(x, y,
                   batch_size = 1000,
                   callbacks = [callback],
                   epochs = 100000,
                   validation_split = 0.1)

Train on 6851 samples, validate on 762 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000
Epoch 5/100000
Epoch 6/100000
Epoch 7/100000
Epoch 8/100000
Epoch 9/100000
Epoch 10/100000
Epoch 11/100000
Epoch 12/100000
Epoch 13/100000
Epoch 14/100000
Epoch 15/100000
Epoch 16/100000
Epoch 17/100000
Epoch 18/100000
Epoch 19/100000
Epoch 20/100000
Epoch 21/100000
Epoch 22/100000
Epoch 23/100000
Epoch 24/100000
Epoch 25/100000
Epoch 26/100000
Epoch 27/100000
Epoch 28/100000
Epoch 29/100000
Epoch 30/100000
Epoch 31/100000
Epoch 32/100000
Epoch 33/100000
Epoch 34/100000
Epoch 35/100000
Epoch 36/100000
Epoch 37/100000
Epoch 38/100000
Epoch 39/100000
Epoch 40/100000
Epoch 41/100000
Epoch 42/100000
Epoch 43/100000
Epoch 44/100000
Epoch 45/100000
Epoch 46/100000
Epoch 47/100000
Epoch 48/100000
Epoch 49/100000
Epoch 50/100000
Epoch 51/100000
Epoch 52/100000
Epoch 53/100000
Epoch 54/100000
Epoch 55/100000
Epoch 56/100000
Epoch 57/100000
Epoch 58/100000


In [18]:
#Predict the train data
pred_train = model.predict(x)

pred_train[pred_train < 0.5] = 0
pred_train[pred_train >= 0.5] = 1
print("The F1-Score for prediction the training data is {}".format(f1_score(y, pred_train)))

The F1-Score for prediction the training data is 0.9688454489920588


In [19]:
test_df["text"] = test_df["text"].apply(lambda x: remove_punct(x))
test_df["text"] = test_df["text"].apply(lambda x: expand_contractions(x))
test_df["text"] = test_df["text"].apply(lambda x: lemmatize(x))
test_df["text"] = test_df["text"].apply(lambda x: remove_stopwords(x))

x_test_text = test_df["text"]

x_test = tokenizer.texts_to_sequences(x_test_text)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen = max_len, padding = "post", truncating = "post", value = 0)

In [20]:
predict = model.predict(x_test)

predict[predict < 0.5] = 0
predict[predict >= 0.5] = 1
predict = predict.astype(int)
predict

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [21]:
submission = pd.concat([pd.DataFrame(test_df["id"]), pd.DataFrame(predict, columns = ["target"])], axis = 1)

submission["target"].value_counts()

0    2069
1    1194
Name: target, dtype: int64

In [22]:
submission.to_csv("submission.csv", index = False)