In [1]:
import pandas as pd
import env
import utilities as utils
import numpy as np

import nltk

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')

In [3]:
train.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train.shape

(7613, 4)

In [5]:
train.keyword.value_counts(dropna=False)

NaN                      61
fatalities               45
deluge                   42
armageddon               42
harm                     41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 222, dtype: int64

## Clean up keyword series

In [6]:
train.keyword = train.keyword.str.replace('%20', '_').str.lower().str.strip()

In [7]:
train.keyword.value_counts(dropna=False)

NaN                    61
fatalities             45
deluge                 42
armageddon             42
harm                   41
                       ..
forest_fire            19
epicentre              12
threat                 11
inundation             10
radiation_emergency     9
Name: keyword, Length: 222, dtype: int64

## Clean up location

In [8]:
train.location = train.location.str.lower().str.strip()

In [9]:
train.location.value_counts()

usa                     105
new york                 77
london                   50
united states            50
nigeria                  35
                       ... 
nanaimo, bc, canada       1
laguna beach, calif.      1
whitby, on                1
#bossnation!              1
brizzle city !            1
Name: location, Length: 3164, dtype: int64

### Explore a bit before doing this ^^

## Clean up Text

In [10]:
import prepare as prep

In [11]:
train['text_cleaned'] = pd.Series([prep.basic_clean(string) for string in train.text])

In [12]:
train.text_cleaned.head()

id
1                forest fire near la ronge sask canada
4    just got sent this photo from ruby alaska as s...
5    rockyfire update  california hwy 20 closed in ...
6    flood disaster heavy rain causes flash floodin...
7    i'm on top of the hill and i can see a fire in...
Name: text_cleaned, dtype: object

In [13]:
train[train.text_cleaned.isnull()]

Unnamed: 0_level_0,keyword,location,text,target,text_cleaned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7613,pandemonium,houston tx,Pandemonium In Aba As Woman Delivers Baby With...,1,
7614,pandemonium,,World Class Tgirl Ass 02 - Scene 4 - Pandemoni...,0,
7616,pandemonium,,Hey all take a look at my review of 'Pandemoni...,0,
7617,pandemonium,"durham, nc",Element of Freedom at Mirage Saturday! 21+ Lad...,0,
7619,pandemonium,,World Class Tgirl Ass 02 - Scene 4 - Pandemoni...,0,
...,...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1,
10870,,,@aria_ahrary @TheTawniest The out of control w...,1,
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,
10872,,,Police investigating after an e-bike collided ...,1,
