# Analizing tweets to know if they are from a real dissaster

In [1]:
#imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk

## Loading data

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [3]:
#Taking a look at the data
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Analisys

In [4]:
#We have a lot of nan values in location and in keyword
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
#The dataset is unbalanced
a = train[train['target'] == 1].count()
b = train[train['target'] == 0].count()

In [7]:
#Lets take the baseline prediction
baseline = b['target']/7613*100

## Text cleaning

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

In [9]:
train.drop_duplicates()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [10]:
train[train['keyword'].isna() == False]['keyword']

31       ablaze
32       ablaze
33       ablaze
34       ablaze
35       ablaze
         ...   
7578    wrecked
7579    wrecked
7580    wrecked
7581    wrecked
7582    wrecked
Name: keyword, Length: 7552, dtype: object

In [11]:
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [12]:
#Lets lemmatize the keywords
lista = []
i=0
ps = PorterStemmer()
for keyword in train['keyword']:
    lista.append(ps.stem(str(keyword).replace("%20"," ")))

In [13]:
len(lista)

7613

In [14]:
#We store de unique keywords to use them for the rows without keywords
unique_keywords = pd.unique(lista)

In [15]:
train['keyword'] = lista

In [16]:
train[train['keyword'] == "nan"]['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 61, dtype: object

In [41]:
train[['text','id']]

Unnamed: 0,text,id
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,4
2,All residents asked to 'shelter in place' are ...,5
3,"13,000 people receive #wildfires evacuation or...",6
4,Just got sent this photo from Ruby #Alaska as ...,7
...,...,...
7608,Two giant cranes holding a bridge collapse int...,10869
7609,@aria_ahrary @TheTawniest The out of control w...,10870
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,10871
7611,Police investigating after an e-bike collided ...,10872


In [45]:
i=0
words = []
for sentence, ids in zip(train[train['keyword'] == "nan"]['text'],train[train['keyword'] == "nan"]['id']):
    for key in unique_keywords:
        if key in sentence:
            words.append([key, ids])
    if len(words)+1<i:
        words.append(["nan", ids])
    i+=1  

In [22]:
len(lista)

7613

In [None]:
i = 0
j = 0
for word in lista:
    if word == 'nan':
        lista[j] = words[i]
        i += 1
    j+=1

In [46]:
len(words)

67

In [None]:
train['keyword'] = lista

In [47]:
words 

[['earthquak', 1],
 ['fire', 4],
 ['evacu', 5],
 ['evacu', 6],
 ['fire', 6],
 ['wildfir', 6],
 ['fire', 7],
 ['smoke', 7],
 ['wildfir', 7],
 ['fire', 8],
 ['wildfir', 8],
 ['disast', 10],
 ['flood', 10],
 ['fire', 13],
 ['emerg', 14],
 ['evacu', 14],
 ['tornado', 15],
 ['heat wav', 16],
 ['flood', 17],
 ['flood', 18],
 ['crash', 20],
 ['nan', 34],
 ['nan', 36],
 ['nan', 37],
 ['nan', 38],
 ['nan', 39],
 ['nan', 40],
 ['nan', 41],
 ['nan', 44],
 ['bomb', 10835],
 ['deton', 10835],
 ['suicide bomb', 10835],
 ['explod', 10837],
 ['nan', 10841],
 ['attack', 10842],
 ['earthquak', 10843],
 ['nan', 10844],
 ['nan', 10846],
 ['bomb', 10847],
 ['deton', 10847],
 ['explos', 10847],
 ['suicide bomb', 10847],
 ['loud bang', 10848],
 ['explod', 10849],
 ['scream', 10849],
 ['storm', 10851],
 ['trauma', 10851],
 ['debri', 10852],
 ['oil spil', 10859],
 ['siren', 10860],
 ['tornado', 10860],
 ['quarantin', 10862],
 ['evacu', 10863],
 ['bomb', 10864],
 ['evacu', 10864],
 ['bomb', 10866],
 ['storm', 1

In [None]:
train[train['keyword'] == "nan"]

In [None]:
train.head(50)