# Desafio Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd
import zipfile
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix

In [2]:
zf = zipfile.ZipFile('./data/nlp-getting-started.zip')
train = pd.read_csv(zf.open('train.csv'))
test = pd.read_csv(zf.open('test.csv'))

In [3]:
train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [4]:
test.head(3)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


- A coluna 'keyword' tem potencial, ja que no treino e no teste tem poucos dados faltantes
    - Preencher com label 'faltante' nos faltantes

In [7]:
word_tokenize(train['text'][200])

['HAPPENING',
 'NOW',
 '-',
 'HATZOLAH',
 'EMS',
 'AMBULANCE',
 'RESPONDING',
 'WITH',
 'DUAL',
 'SIRENS',
 'AND\x89Û_',
 'https',
 ':',
 '//t.co/SeK6MQ6NJF']

In [8]:
stop_words_nltk = list(stopwords.words('english'))
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(train['text'].values)

In [9]:
csr_matrix(count_train).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [11]:
# Checando a tokenizacao

word_tokenize(train['text'][0])

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

## Tweet tokenizer

In [12]:
from nltk.tokenize import TweetTokenizer

def tweet_tokenize_column(df, column):
    """ 
        This function gets the Dataframe and the name of a column (String) containing texts (Strings) and returns
        a list of lists containing the tokenized text. It also turns every token to it's lower form.
        
        Input: Pandas DataFrame, String
        Return: Nested List
    """
    
    tweet_tokenizer = TweetTokenizer()
    
    # List of sentences
    list_sent = [tweet_tokenizer.tokenize(sent) for sent in df[column].values]
    
    # List of sentences excluding stopword tokens
    list_sent_no_stop = [[token.lower() 
                           for token in sent 
                           if token not in stopwords.words('english')] 
                           for sent in list_sent]
    
    
    
    return list_sent_no_stop

In [13]:
tokenized_sent_train = tweet_tokenize_column(train,'text')
tokenized_sent_test = tweet_tokenize_column(test,'text')

In [14]:
tokenized_sent_train[:2]

[['our', 'deeds', 'reason', '#earthquake', 'may', 'allah', 'forgive', 'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada']]

In [15]:
tokenized_sent_test[:2]

[['just', 'happened', 'terrible', 'car', 'crash'],
 ['heard',
  '#earthquake',
  'different',
  'cities',
  ',',
  'stay',
  'safe',
  'everyone',
  '.']]

Aplicando o TF-IDF nos datasets. Esses tem como caracteristicas:
- Contem palavras somente em letra minuscula
- Nao tem stopwords
- Foi tokenizado com o TweetTokenizer

In [16]:
# Funcao auxiliar para bypass do tokenizador, uma vez que este passo ja foi feito.
def identity_tokenizer(text):
    return text

tfidf_train = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)    
tfidf_train_fit = tfidf_train.fit_transform(tokenized_sent_train)

tfidf_train.get_feature_names()[50:60]



['#abha',
 '#ableg',
 '#abomb',
 '#abstorm',
 '#accident',
 '#accidentalprophecy',
 '#acenewsdesk',
 '#achedin',
 '#act',
 '#actionmoviestaughtus']

In [17]:
tfidf_test = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)    
tfidf_test_fit = tfidf_test.fit_transform(tokenized_sent_test)

tfidf_test.get_feature_names()[50:60]

['#amtrak',
 '#amwriting',
 '#anchorage',
 '#ancient',
 '#animalrescue',
 '#anime',
 '#anonymous',
 '#anthrax',
 '#anti-terrorism',
 '#anticipate']

In [18]:
print("TF-IDF DataFrame dimensions: {}\n".format(tfidf_train_fit.toarray().shape))
print("TF-IDF Number or Features: {}\n".format(len(tfidf_train.get_feature_names())))

TF-IDF DataFrame dimensions: (7613, 22900)

TF-IDF Number or Features: 22900



Faz sentido, ja que o numero de colunas do ``tfidf_train_fit`` corresponde ao numero de tokens, e a contagem do ``tfidf_train.get_feature_names()`` tambem. 

In [19]:
# Criando dataframes de treino e teste pos TF-IDF

tfidf_train_df = pd.DataFrame(tfidf_train_fit.toarray(), columns=tfidf_train.get_feature_names())
tfidf_test_df = pd.DataFrame(tfidf_test_fit.toarray(), columns=tfidf_test.get_feature_names())

In [20]:
# Acrescentando o target ao dataframe

tfidf_train_df["target_column"] = train["target"]
tfidf_test_df["target_column"] = 0

In [21]:
tfidf_train_df.head(3)

Unnamed: 0,!,#,##book,##fukushima,##youtube,#0215,#034,#039,#06,#09,...,ûò800000,ûòthe,ûòåêcnbc,ûó,ûóher,ûókody,ûónegligence,ûótech,ûówe,target_column
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [22]:
tfidf_test_df.head(3)

Unnamed: 0,!,#,#039,#05,#0518,#12k,#16,#1oak,#21dayfix,#26,...,ûò,ûò7,ûòdon,ûó,ûócategorically,ûókaiserjaegers,ûókill,ûówe,ûówere,target_column
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [25]:
tfidf_train_df['reason']

0       0.384545
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
7608    0.000000
7609    0.000000
7610    0.000000
7611    0.000000
7612    0.000000
Name: reason, Length: 7613, dtype: float64

Observamos acima que a palavra 'reason' tem score na primeira sentenca. Isso e coerente ja que ela aparece na primeira linha do dataframe de treino.

## To-Do
- Selecionar variaveis mais importantes (Chi^2 | Informacao Mutua)
- Selecionar colunas contendo essas variaveis tanto no treino quanto no teste
    - As colunas devem estar na mesma ordem!