# Analizing tweets to know if they are from a real dissaster

In [126]:
#imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

## Loading data

In [49]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [3]:
#Taking a look at the data
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Analisys

In [4]:
#We have a lot of nan values in location and in keyword
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
#The dataset is unbalanced
a = train[train['target'] == 1].count()
b = train[train['target'] == 0].count()

In [7]:
#Lets take the baseline prediction
baseline = b['target']/7613*100

## Text cleaning

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

In [9]:
train.drop_duplicates()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [10]:
train[train['keyword'].isna() == False]['keyword']

31       ablaze
32       ablaze
33       ablaze
34       ablaze
35       ablaze
         ...   
7578    wrecked
7579    wrecked
7580    wrecked
7581    wrecked
7582    wrecked
Name: keyword, Length: 7552, dtype: object

In [11]:
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [12]:
#Lets lemmatize the keywords
lista = []
i=0
ps = PorterStemmer()
for keyword in train['keyword']:
    lista.append(ps.stem(str(keyword).replace("%20"," ")))
    
# for key,text in zip(lista,train['text']):
#     if key == "nan":
#         for key_2 in lista:
#             if key_2 in text:
#                 key = key_2

In [13]:
lista

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'ablaz',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'accid',
 'aftershock',
 'aftershock',
 'aftershock

In [14]:
#We store de unique keywords to use them for the rows without keywords
unique_keywords = pd.unique(lista)

In [15]:
train['keyword'] = lista

In [16]:
train[train['keyword'] == "nan"]['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 61, dtype: object

In [17]:
train[['text','id']]

Unnamed: 0,text,id
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,4
2,All residents asked to 'shelter in place' are ...,5
3,"13,000 people receive #wildfires evacuation or...",6
4,Just got sent this photo from Ruby #Alaska as ...,7
...,...,...
7608,Two giant cranes holding a bridge collapse int...,10869
7609,@aria_ahrary @TheTawniest The out of control w...,10870
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,10871
7611,Police investigating after an e-bike collided ...,10872


In [42]:
i=0
words = []
for sentence in train[train['keyword'] == "nan"]['text']:
    for key in unique_keywords:
        if key in sentence:
            words.append(key)
    if len(words)+1<i:
        words.append("nan")
    i+=1  

In [43]:
len(lista)

7613

In [44]:
a = 0
i = 0
indexes = []
for word in words:
    if word[1] == a:
        indexes.append(i)
    a = word[1]
    i+=1

In [45]:
for index in sorted(indexes, reverse=True):
    del words[index]

In [47]:
words

['earthquak',
 'fire',
 'evacu',
 'fire',
 'smoke',
 'wildfir',
 'flood',
 'fire',
 'emerg',
 'evacu',
 'tornado',
 'heat wav',
 'flood',
 'crash',
 'nan',
 'bomb',
 'deton',
 'suicide bomb',
 'explod',
 'nan',
 'attack',
 'earthquak',
 'bomb',
 'deton',
 'explos',
 'suicide bomb',
 'loud bang',
 'explod',
 'scream',
 'storm',
 'trauma',
 'debri',
 'oil spil',
 'tornado',
 'quarantin',
 'evacu',
 'bomb',
 'evacu',
 'bomb',
 'storm',
 'bridge collaps',
 'collaps',
 'fire',
 'troubl',
 'wild fir',
 'collid',
 'injur',
 'threat',
 'fire']

In [24]:
i = 0
j = 0
for word in lista:
    if word == 'nan':
        lista[j] = words[i]
        i += 1
    j+=1

IndexError: list index out of range

In [25]:
len(words)

49

In [None]:
train['keyword'] = lista

In [None]:
words

In [None]:
train[train['keyword'] == "nan"]

In [None]:
train.head(50)

## Rallado

In [54]:
str(train['keyword'][0])

'nan'

In [74]:
lista = []
i = 0
for keyword,text in zip(train['keyword'],train['text']):
    if str(keyword) == 'nan':
        keyword = str(keyword).lower()
        for keyword in train['keyword']:
            keyword = str(keyword).lower()
            if keyword in text.lower():
                if len(lista) < i+1:
                    lista.append(keyword)
        if len(lista) < i+1:
                    lista.append('nan')
    else: 
        lista.append(keyword)
    i+=1

In [75]:
len(lista)

7613

In [76]:
train['keyword'] = lista

In [78]:
#Lets lemmatize the keywords
lista = []
i=0
ps = PorterStemmer()
for keyword in train['keyword']:
    lista.append(ps.stem(str(keyword).replace("%20"," ")))

In [79]:
train['keyword'] = lista

In [80]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,earthquak,,Our Deeds are the Reason of this #earthquake M...,1
1,4,fire,,Forest fire near La Ronge Sask. Canada,1
2,5,evacu,,All residents asked to 'shelter in place' are ...,1
3,6,evacu,,"13,000 people receive #wildfires evacuation or...",1
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,collaps,,Two giant cranes holding a bridge collapse int...,1
7609,10870,fire,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,volcano,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,collid,,Police investigating after an e-bike collided ...,1


In [100]:
text_list = []
dic = {"@":"", ":":"", "#":"",".":"","?":"","[":"","]":"","/":" ","(":"",")":"","_":"","-":"","!":"","¡":"","'":"","|":"","\n":" ","&":""}
for sentence in train['text']:
    new_sentence = sentence.lower()
    for i, j in dic.items():
        new_sentence = new_sentence.replace(i, j)
    text_list.append(new_sentence.strip())

In [101]:
text_list

['our deeds are the reason of this earthquake may allah forgive us all',
 'forest fire near la ronge sask canada',
 'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
 '13,000 people receive wildfires evacuation orders in california',
 'just got sent this photo from ruby alaska as smoke from wildfires pours into a school',
 'rockyfire update => california hwy 20 closed in both directions due to lake county fire  cafire wildfires',
 'flood disaster heavy rain causes flash flooding of streets in manitou, colorado springs areas',
 'im on top of the hill and i can see a fire in the woods',
 'theres an emergency evacuation happening now in the building across the street',
 'im afraid that the tornado is coming to our area',
 'three people died from the heat wave so far',
 'haha south tampa is getting flooded hah wait a second i live in south tampa what am i gonna do what am i gonna do fvck flooding',
 'rainin

In [102]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
lmtzr = WordNetLemmatizer()
def preprocessing(df):
    pattern = r'[0-9]'
    df['text'] = df.text.apply(lambda x: x.strip())
    df['text'] = df.text.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df['text'] = df.text.apply(lambda x: x.lower())
    df['text'] = df.text.apply(lambda x: re.sub(pattern, '', x))
    df['text'] = df.text.apply(lambda x:' '.join([lmtzr.lemmatize(w) for w in x.split()]))
    return df['text']

In [105]:
train['text'] = preprocessing(train)

In [109]:
test['text'] = preprocessing(test)

In [113]:
test.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [118]:
lista_test = []
i = 0
for keyword,text in zip(test['keyword'],test['text']):
    if str(keyword) == 'nan':
        keyword = str(keyword).lower()
        for keyword in test['keyword']:
            keyword = str(keyword).lower()
            if keyword in text.lower():
                if len(lista_test) < i+1:
                    lista_test.append(keyword)
        if len(lista_test) < i+1:
                    lista_test.append('nan')
    else: 
        lista_test.append(keyword)
    i+=1

In [119]:
test['keyword'] = lista_test

In [124]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,earthquak,,our deed are the reason of this earthquake may...,1
1,4,fire,,forest fire near la ronge sask canada,1
2,5,evacu,,all resident asked to shelter in place are bei...,1
3,6,evacu,,people receive wildfire evacuation order in ca...,1
4,7,fire,,just got sent this photo from ruby alaska a sm...,1
...,...,...,...,...,...
7608,10869,collaps,,two giant crane holding a bridge collapse into...,1
7609,10870,fire,,ariaahrary thetawniest the out of control wild...,1
7610,10871,volcano,,m utckm s of volcano hawaii httptcozdtoydebj,1
7611,10872,collid,,police investigating after an ebike collided w...,1


## Model

In [136]:
X_train = train['text']
X_test = test['text']
y_train = train['target']

In [137]:
X_train

0       our deed are the reason of this earthquake may...
1                   forest fire near la ronge sask canada
2       all resident asked to shelter in place are bei...
3       people receive wildfire evacuation order in ca...
4       just got sent this photo from ruby alaska a sm...
                              ...                        
7608    two giant crane holding a bridge collapse into...
7609    ariaahrary thetawniest the out of control wild...
7610         m utckm s of volcano hawaii httptcozdtoydebj
7611    police investigating after an ebike collided w...
7612    the latest more home razed by northern califor...
Name: text, Length: 7613, dtype: object

In [138]:
vectorizer = CountVectorizer(ngram_range = (2,2))
naivebayes = MultinomialNB()

X_bow = vectorizer.fit_transform(X_train)

cv_nb = cross_validate(
    naivebayes,
    X_bow,
    y_train,
    scoring = "accuracy"
)

round(cv_nb['test_score'].mean(),2)

0.64