# Desafio Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd
import zipfile
from nltk.corpus import stopwords

In [2]:
zf = zipfile.ZipFile('./data/nlp-getting-started.zip')
train = pd.read_csv(zf.open('train.csv'))

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test = pd.read_csv(zf.open('test.csv'))

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


- A coluna 'keyword' tem potencial, ja que no treino e no teste tem poucos dados faltantes
    - Preencher com label 'faltante' nos faltantes

In [7]:
train['text'][200]

'HAPPENING NOW - HATZOLAH EMS AMBULANCE RESPONDING WITH DUAL SIRENS AND\x89Û_ https://t.co/SeK6MQ6NJF'

In [8]:
train['text'][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize

word_tokenize(train['text'][1])
word_tokenize(train['text'][200])

['HAPPENING',
 'NOW',
 '-',
 'HATZOLAH',
 'EMS',
 'AMBULANCE',
 'RESPONDING',
 'WITH',
 'DUAL',
 'SIRENS',
 'AND\x89Û_',
 'https',
 ':',
 '//t.co/SeK6MQ6NJF']

In [10]:
stop_words_nltk = list(stopwords.words('english'))

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(train['text'].values)

In [13]:
count_train

<7613x21363 sparse matrix of type '<class 'numpy.int64'>'
	with 74103 stored elements in Compressed Sparse Row format>

In [14]:
from scipy.sparse import csr_matrix

mtr = csr_matrix(count_train).toarray()

In [15]:
mtr[0].sum()

5

In [16]:
train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [17]:
word_tokenize(train['text'][0])

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [18]:
count_vectorizer.get_feature_names()[-100]

'zss'

In [19]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

In [20]:
train['text'].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [21]:
lista_sent = [tweet_tokenizer.tokenize(sent) for sent in train['text'].values]

In [22]:
lista_tokens_no_stop = [[token.lower() for token in sent if token not in stopwords.words('english')] for sent in lista_sent]

In [23]:
# for sent in lista_sent:
#     for token in sent:
#         if token not in stopwords.words('english')

In [24]:
lista_tokens_no_stop

[['our', 'deeds', 'reason', '#earthquake', 'may', 'allah', 'forgive', 'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  "'",
  'shelter',
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#alaska',
  'smoke',
  '#wildfires',
  'pours',
  'school'],
 ['#rockyfire',
  'update',
  '=',
  '>',
  'california',
  'hwy',
  '.',
  '20',
  'closed',
  'directions',
  'due',
  'lake',
  'county',
  'fire',
  '-',
  '#cafire',
  '#wildfires'],
 ['#flood',
  '#disaster',
  'heavy',
  'rain',
  'causes',
  'flash',
  'flooding',
  'streets',
  'manitou',
  ',',
  'colorado',
  'springs',
  'areas'],
 ["i'm", 'top', 'hill', 'i', 'see', 'fire', 'woods', '...'],
 ["there's",
  'emergency',
  'evacuation',
  'happenin

In [25]:
lista_tokens_no_stop

[['our', 'deeds', 'reason', '#earthquake', 'may', 'allah', 'forgive', 'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  "'",
  'shelter',
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#alaska',
  'smoke',
  '#wildfires',
  'pours',
  'school'],
 ['#rockyfire',
  'update',
  '=',
  '>',
  'california',
  'hwy',
  '.',
  '20',
  'closed',
  'directions',
  'due',
  'lake',
  'county',
  'fire',
  '-',
  '#cafire',
  '#wildfires'],
 ['#flood',
  '#disaster',
  'heavy',
  'rain',
  'causes',
  'flash',
  'flooding',
  'streets',
  'manitou',
  ',',
  'colorado',
  'springs',
  'areas'],
 ["i'm", 'top', 'hill', 'i', 'see', 'fire', 'woods', '...'],
 ["there's",
  'emergency',
  'evacuation',
  'happenin

In [None]:
train