# Desafio Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd
import zipfile
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
zf = zipfile.ZipFile('./data/nlp-getting-started.zip')
train = pd.read_csv(zf.open('train.csv'))

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test = pd.read_csv(zf.open('test.csv'))

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


- A coluna 'keyword' tem potencial, ja que no treino e no teste tem poucos dados faltantes
    - Preencher com label 'faltante' nos faltantes

In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize

word_tokenize(train['text'][1])
word_tokenize(train['text'][200])

['HAPPENING',
 'NOW',
 '-',
 'HATZOLAH',
 'EMS',
 'AMBULANCE',
 'RESPONDING',
 'WITH',
 'DUAL',
 'SIRENS',
 'AND\x89Û_',
 'https',
 ':',
 '//t.co/SeK6MQ6NJF']

In [8]:
stop_words_nltk = list(stopwords.words('english'))

In [9]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(train['text'].values)

In [10]:
count_train

<7613x21363 sparse matrix of type '<class 'numpy.int64'>'
	with 74103 stored elements in Compressed Sparse Row format>

In [11]:
from scipy.sparse import csr_matrix

mtr = csr_matrix(count_train).toarray()

In [12]:
train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [13]:
word_tokenize(train['text'][0])

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

## Tweet tokenizer

In [14]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

In [15]:
train['text'].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [16]:
# Lista de sentencas.

lista_sent = [tweet_tokenizer.tokenize(sent) for sent in train['text'].values]

In [17]:
# Lista de sentencas sem stopwords

lista_sent_no_stop = [[token.lower() for token in sent if token not in stopwords.words('english')] for sent in lista_sent]

Aplicando o TF-IDF no no dataset de treino modificado. O dataset modificado
- Contem palavras somente em letra minuscula
- Nao tem stopwords
- Foi tokenizado com o TweetTokenizer

In [18]:
# Funcao auxiliar para bypass do tokenizador, uma vez que este passo ja foi feito.
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)    
tfidf_fit = tfidf.fit_transform(lista_sent_no_stop)



In [19]:
tfidf.get_feature_names()

['!',
 '#',
 '##book',
 '##fukushima',
 '##youtube',
 '#0215',
 '#034',
 '#039',
 '#06',
 '#09',
 '#1-1st',
 '#1008pla',
 '#1008planet',
 '#124',
 '#140',
 '#16',
 '#163',
 '#17',
 '#171',
 '#2015',
 '#20150613',
 '#22days',
 '#24',
 '#263chat',
 '#2a',
 '#2fast2furious',
 '#2minutemix',
 '#360wisenews',
 '#365disasters',
 '#37592',
 '#38745',
 '#3novices',
 '#452',
 '#4playthursdays',
 '#5sosfam',
 '#615',
 '#629',
 '#7newsadl',
 '#8217',
 '#8392',
 '#89x',
 '#911',
 '#9973',
 '#999day',
 '#9newsmornings',
 '#abandoned',
 '#abbott',
 '#abc',
 '#abc7eyewitness',
 '#abcnews',
 '#abha',
 '#ableg',
 '#abomb',
 '#abstorm',
 '#accident',
 '#accidentalprophecy',
 '#acenewsdesk',
 '#achedin',
 '#act',
 '#actionmoviestaughtus',
 '#adani',
 '#addtexastonext1dtour',
 '#adiossuperbacterias',
 '#adjust',
 '#adult',
 '#aeroplane',
 '#aerospace',
 '#afc',
 '#afghanistan',
 '#afp',
 '#africa',
 '#africanbaze',
 '#africansinsf',
 '#after',
 '#afterhaiyan',
 '#afterlife',
 '#aftershock',
 '#age',
 '#ai

In [20]:
tfidf_fit.toarray().shape

(7613, 22900)

In [21]:
len(tfidf.get_feature_names())

22900

Faz sentido, ja que o numero de colunas do ``tfidf_fit`` corresponde ao numero de tokens, e a contagem do ``tfidf.get_feature_names()`` tambem. 

In [22]:
tfidf_df = pd.DataFrame(tfidf_fit.toarray(), columns=tfidf.get_feature_names())

In [23]:
tfidf_df.iloc[0].values.sum()

2.2226131392341495

In [24]:
# Acrescentando o target ao dataframe

tfidf_df["target_column"] = train["target"]

In [25]:
tfidf_df

Unnamed: 0,!,#,##book,##fukushima,##youtube,#0215,#034,#039,#06,#09,...,ûò800000,ûòthe,ûòåêcnbc,ûó,ûóher,ûókody,ûónegligence,ûótech,ûówe,target_column
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [26]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [27]:
tfidf_df['reason']

0       0.384545
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
7608    0.000000
7609    0.000000
7610    0.000000
7611    0.000000
7612    0.000000
Name: reason, Length: 7613, dtype: float64

observamos acima que a palavra 'reason' tem score na primeira sentenca. Isso e coerente ja que ela aparece na primeira linha do dataframe de treino.