In [10]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Read in Data

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

## EDA

In [3]:
pd.DataFrame({'count': train.target.value_counts(), 
              'percentage': train.target.value_counts(normalize=True)})

Unnamed: 0,count,percentage
0,4342,0.57034
1,3271,0.42966


In [4]:
train["target"].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x127df3a20>

give keyords distinct numbers

In [5]:
def map_keywords(series):
    mapper = {}
    u_series = series.unique()
    for i in range(len(u_series)):
        mapper[u_series[i]] = i
        
    return mapper

train_keyword_map = map_keywords(train.keyword)

train['keyword_num'] = train['keyword'].map(train_keyword_map)
test['keyword_num'] = test['keyword'].map(train_keyword_map)

In [17]:
def text_preprocessing(data):
    #remove whitespace and lower all words
    data = data.apply(lambda x: x.strip().lower())
    #replace digits
    data = data.apply(lambda x: re.sub(r'\d+', '', x))
    #replace punctuation
    data = data.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    #tokenize
    data = data.apply(lambda x : word_tokenize(x))
    #filter out stopwords
    data = data.apply(lambda x: [word for word in x if word not in stop_words])
    #remove inflection and return base word
    lemmatizer = WordNetLemmatizer()
    data = data.apply(lambda x: [lemmatizer.lemmatize(word, pos ='v') for word in x])
    #parts of speech tagging
    #data = data.apply(lambda x: [pos_tag(x)])
    return data

In [18]:
train['pro_text'] = text_preprocessing(train.text)
test['pro_text'] = text_preprocessing(test.text)

In [19]:
train.head()

Unnamed: 0,id,keyword,location,text,target,keyword_num,pro_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,0,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,0,"[residents, ask, shelter, place, notify, offic..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0,"[get, send, photo, ruby, alaska, smoke, wildfi..."


## TFIDF Vectorizer
Matrix of token counts with TF-IDF transformation

In [20]:
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(["".join(i) for i in train["text"]])

In [22]:
train['pro_text']

0       [deeds, reason, earthquake, may, allah, forgiv...
1           [forest, fire, near, la, ronge, sask, canada]
2       [residents, ask, shelter, place, notify, offic...
3       [people, receive, wildfires, evacuation, order...
4       [get, send, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, crane, hold, bridge, collapse, ne...
7609    [ariaahrary, thetawniest, control, wild, fire,...
7610           [utckm, volcano, hawaii, httptcozdtoydebj]
7611    [police, investigate, ebike, collide, car, lit...
7612    [latest, home, raze, northern, california, wil...
Name: pro_text, Length: 7613, dtype: object

In [23]:
[" ".join(i) for i in train["pro_text"]]

['deeds reason earthquake may allah forgive us',
 'forest fire near la ronge sask canada',
 'residents ask shelter place notify officer evacuation shelter place order expect',
 'people receive wildfires evacuation order california',
 'get send photo ruby alaska smoke wildfires pour school',
 'rockyfire update california hwy close directions due lake county fire cafire wildfires',
 'flood disaster heavy rain cause flash flood streets manitou colorado spring areas',
 'im top hill see fire woods',
 'theres emergency evacuation happen build across street',
 'im afraid tornado come area',
 'three people die heat wave far',
 'haha south tampa get flood hah wait second live south tampa gon na gon na fvck flood',
 'rain flood florida tampabay tampa days ive lose count',
 'flood bago myanmar arrive bago',
 'damage school bus multi car crash break',
 'whats man',
 'love fruit',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love ski',
 'wonderful day',
 'looo

In [26]:
vector = vectorizer.fit_transform(["".join(i) for i in train["pro_text"]])

In [31]:
sum(vector.todense())

matrix([[1.        , 1.        , 1.        , ..., 0.70710678, 0.70710678,
         0.57735027]])

In [29]:
["".join(i) for i in train["pro_text"]]

['deedsreasonearthquakemayallahforgiveus',
 'forestfirenearlarongesaskcanada',
 'residentsaskshelterplacenotifyofficerevacuationshelterplaceorderexpect',
 'peoplereceivewildfiresevacuationordercalifornia',
 'getsendphotorubyalaskasmokewildfirespourschool',
 'rockyfireupdatecaliforniahwyclosedirectionsduelakecountyfirecafirewildfires',
 'flooddisasterheavyraincauseflashfloodstreetsmanitoucoloradospringareas',
 'imtophillseefirewoods',
 'theresemergencyevacuationhappenbuildacrossstreet',
 'imafraidtornadocomearea',
 'threepeopledieheatwavefar',
 'hahasouthtampagetfloodhahwaitsecondlivesouthtampagonnagonnafvckflood',
 'rainfloodfloridatampabaytampadaysivelosecount',
 'floodbagomyanmararrivebago',
 'damageschoolbusmulticarcrashbreak',
 'whatsman',
 'lovefruit',
 'summerlovely',
 'carfast',
 'goooooooaaaaaal',
 'ridiculous',
 'londoncool',
 'loveski',
 'wonderfulday',
 'looooool',
 'wayicanteatshit',
 'nyclastweek',
 'lovegirlfriend',
 'cooool',
 'likepasta',
 'end',
 'bbcmtdwholesalemarket