In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
import string
import re
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [2]:
print("Reading Test data...")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
print("Reading Training data...")
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

Reading Test data...
Reading Training data...


In [3]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
test_df.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [5]:
print("The training data has {} rows and {} columns".format(train_df.shape[0], train_df.shape[1]))
print("The test data has {} rows and {} columns".format(test_df.shape[0], test_df.shape[1]))

The training data has 7613 rows and 5 columns
The test data has 3263 rows and 4 columns


In [6]:
print("amount of empty cells in training data:")
print(train_df.isna().sum())
print("")
print("amount of empty cells in test data:")
print(test_df.isna().sum())

amount of empty cells in training data:
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

amount of empty cells in test data:
id             0
keyword       26
location    1105
text           0
dtype: int64


Normal Bag of Words
NLP Steps:
1. Lower words and remove punctuation
2. Expand contradictions
3. Lemmatizing
4. Remove Stopwords

In [7]:
def remove_punct(text: str):
    text = text.lower()
    text_nopunct = "".join([character for character in text if character not in string.punctuation])
    return text_nopunct

train_df["text"] = train_df["text"].apply(lambda x: remove_punct(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
5,8,,,rockyfire update california hwy 20 closed in ...,1
6,10,,,flood disaster heavy rain causes flash floodin...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,theres an emergency evacuation happening now i...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [8]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [9]:
contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))
def expand_contractions(s, contractions=contractions):
    def replace(match):
         return contractions[match.group(0)]
    return contractions_re.sub(replace, s)

train_df["text"] = train_df["text"].apply(lambda x: expand_contractions(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
5,8,,,rockyfire update california hwy 20 closed in ...,1
6,10,,,flood disaster heavy rain causes flash floodin...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,theres an emergency evacuation happening now i...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [10]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text: str):
    text = word_tokenize(text)
    
    lemmatize_text = " ".join([lemmatizer.lemmatize(word) for word in text])
    
    return lemmatize_text

train_df["text"] = train_df["text"].apply(lambda x: lemmatize(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed are the reason of this earthquake may...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all resident asked to shelter in place are bei...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,just got sent this photo from ruby alaska a sm...,1
5,8,,,rockyfire update california hwy 20 closed in b...,1
6,10,,,flood disaster heavy rain cause flash flooding...,1
7,13,,,im on top of the hill and i can see a fire in ...,1
8,14,,,there an emergency evacuation happening now in...,1
9,15,,,im afraid that the tornado is coming to our area,1


In [11]:
#remove Stopwords
stopwords = set(stopwords.words('english')) 
def remove_stopwords(text: str):
    text = word_tokenize(text)
    
    text_no_sw = " ".join([w for w in text if w not in stopwords])
    
    return text_no_sw

train_df["text"] = train_df["text"].apply(lambda x: remove_stopwords(x))
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive u,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1
5,8,,,rockyfire update california hwy 20 closed dire...,1
6,10,,,flood disaster heavy rain cause flash flooding...,1
7,13,,,im top hill see fire wood,1
8,14,,,emergency evacuation happening building across...,1
9,15,,,im afraid tornado coming area,1


In [12]:
x_text = train_df["text"]
y = train_df["target"]

count_vect = CountVectorizer()
x = count_vect.fit_transform(x_text)
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
fold = StratifiedKFold(n_splits = 20)
models = []
f1 = []
NB_clf = RandomForestClassifier(n_estimators = 100)

for train_index, test_index in fold.split(x, y):
    print("Train Model {}.".format(len(models)))
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    NB_clf.fit(x_train, y_train)
    models.append(NB_clf)
    
    pred = NB_clf.predict(x_test)
    f1_model = f1_score(y_test, pred)
    
    print("The F1-Score is {}.".format(f1_model))
    f1.append(f1_model)

Train Model 0.
The F1-Score is 0.6409266409266409.
Train Model 1.
The F1-Score is 0.4453781512605042.
Train Model 2.
The F1-Score is 0.5313653136531366.
Train Model 3.
The F1-Score is 0.3067484662576687.
Train Model 4.
The F1-Score is 0.22222222222222224.
Train Model 5.
The F1-Score is 0.4688427299703264.
Train Model 6.
The F1-Score is 0.3864229765013054.
Train Model 7.
The F1-Score is 0.22291021671826625.
Train Model 8.
The F1-Score is 0.3739376770538244.
Train Model 9.
The F1-Score is 0.36746987951807236.
Train Model 10.
The F1-Score is 0.4547945205479452.
Train Model 11.
The F1-Score is 0.4028776978417266.
Train Model 12.
The F1-Score is 0.3130990415335463.
Train Model 13.
The F1-Score is 0.37142857142857144.
Train Model 14.
The F1-Score is 0.5147058823529412.
Train Model 15.
The F1-Score is 0.24390243902439027.
Train Model 16.
The F1-Score is 0.5739130434782609.
Train Model 17.
The F1-Score is 0.5244956772334294.
Train Model 18.
The F1-Score is 0.5979381443298969.
Train Model 19.
T

In [14]:
def prediction(model, data):
    full_prediction = np.zeros((data.shape[0], 1))
    for mod in model:
        pred = mod.predict(data)
        full_prediction += pred.reshape((data.shape[0], 1))
    
    pred_final = full_prediction/len(model)
    pred_final[pred_final < 0.5] = 0
    pred_final[pred_final >= 0.5] = 1 
    
    pred_final = pred_final.astype(int)
    
    return pred_final

In [15]:
#Predict the train data
pred_train = prediction(models, x)
print("The F1-Score for prediction the training data is {}".format(f1_score(y, pred_train)))

The F1-Score for prediction the training data is 0.9763133920437291


In [16]:
test_df["text"] = test_df["text"].apply(lambda x: remove_punct(x))
test_df["text"] = test_df["text"].apply(lambda x: expand_contractions(x))
test_df["text"] = test_df["text"].apply(lambda x: lemmatize(x))
test_df["text"] = test_df["text"].apply(lambda x: remove_stopwords(x))

x_test_text = test_df["text"]
x_test = count_vect.transform(x_test_text)
x_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
predict = prediction(models, x_test)
predict

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [0]])

In [18]:
submission = pd.concat([pd.DataFrame(test_df["id"]), pd.DataFrame(predict, columns = ["target"])], axis = 1)

submission["target"].value_counts()

0    2209
1    1054
Name: target, dtype: int64

In [19]:
submission.to_csv("submission.csv", index = False)