In [2]:
from pathlib import Path

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup


import libs.commons as commons

In [3]:
train_path = Path(commons.dataset_path) / "train.csv"

train_set = pd.read_csv(train_path)

In [4]:
train_set['text_len'] = train_set['text'].apply(len)

In [5]:
train_set.head()

Unnamed: 0,id,keyword,location,text,target,text_len
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88


In [6]:
## Frequency of Keyword values
# train_set.describe()
# train_set['keyword'].nunique()
key_group = train_set.groupby('keyword')
key_group.count()

Unnamed: 0_level_0,id,location,text,target,text_len
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ablaze,36,29,36,36,36
accident,35,28,35,35,35
aftershock,34,25,34,34,34
airplane%20accident,35,29,35,35,35
ambulance,38,26,38,38,38
...,...,...,...,...,...
wounded,37,27,37,37,37
wounds,33,27,33,33,33
wreck,37,27,37,37,37
wreckage,39,28,39,39,39


In [7]:
## Frequency of Keyword values
loc_group = train_set.groupby('location')
loc_group.count().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,id,keyword,text,target,text_len
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
USA,104,104,104,104,104
New York,71,71,71,71,71
United States,50,50,50,50,50
London,45,45,45,45,45
Canada,29,29,29,29,29
...,...,...,...,...,...
Hueco Mundo,1,1,1,1,1
"Hughes, AR",1,1,1,1,1
"Huntington, WV",1,1,1,1,1
"Huntley, IL",1,1,1,1,1


In [8]:
def clean_loc(x):
    x = str(x)
    if x:
        return x.lower()
    else:
        return None

train_proc = train_set.copy()
train_proc['location_clean'] = train_proc['location'].apply(clean_loc)
loc_group = train_proc.groupby('location_clean')
loc_group.count().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,id,keyword,location,text,target,text_len
location_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,2533,2472,0,2533,2533,2533
usa,104,104,104,104,104,104
new york,75,75,75,75,75,75
united states,50,50,50,50,50,50
london,49,49,49,49,49,49
...,...,...,...,...,...,...
"haiku, maui, hawaii",1,1,1,1,1,1
hailing from dayton,1,1,1,1,1,1
halfrica,1,1,1,1,1,1
"halifax, nouvelle-ìäcosse",1,1,1,1,1,1


In [9]:
for loc in loc_group.groups.keys():
    print(loc)

thern ireland
northern kentucky, usa
norwalk, ct
norway
norwich
not a u.s resident
not los angeles, not new york.
not of this world
not so cool ky
not steven yeun / amc.
not where i want to be, yet
nottingham
nottingham, england
nottingham, united kingdom
notts
nova scotia, canada
novi, mi
nowhere
nowhere islands/smash manor
nowhere. everywhere.
numa casa de old yellow bricks
numenor
nunya
nv
ny
ny capital district
ny || live easy? 
ny, ct & greece
ny, ny
nyc
nyc / international
nyc :) ex- #islamophobe
nyc area
nyc metro
nyc&nj
nyc, new york
nyc,us - cali, colombia
nyc-la-miami
nyhc
oakland
oakland, ca
oblivion?
ocean city, nj
odawara, japan
oes 4th point. sisstar & ti
official website
ogba, lagos, nigeria
ohio
ohio, usa
ojodu,lagos
ok
okanagan valley, bc
oklahoma
oklahoma city
oklahoma city, ok
oklahoma, usa
okuma town, fukushima
olathe, ks
old blighty
oldenburg // london
olympia, wa
oman muscat al seeb 
on
on a beach 
on a catwalk somewhere
on the court 
on the go
on the toilet havin

In [10]:
# Preprocess and clean text features
nltk.download("stopwords")
stopwords_set = set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/olavosamp/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
def process_text(text):
    text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9#]", " ", text) # Keep only alphanumeric characters and hashtags
    text = text.lower()
    words = set(text.split()) # Split string into words
    
    # Remove stopwords
    words = list(words - stopwords_set)
    # words = [w for w in words if w not in stopwords.words("english")]
    
    # Stem words with PorterStemmer
    stemmer = PorterStemmer()
    def stem_word(x):
        return stemmer.stem(x)
    words = list(map(stem_word, words))
    
    return words

In [21]:
# Apply text preprocessing and BoW codification to the dataset text features
def process_dataset(dataset, cached_path=Path(commons.dataset_path) / 'train_processed.csv'):
    cached_path = Path(cached_path)
    if cached_path.is_file():
        return pd.read_csv(cached_path)
    
    dataset['text_proc'] = dataset['text'].apply(process_text)
    if cached_path:
        dataset.to_csv(cached_path)
    return dataset

In [19]:
for entry in train_proc.loc[0:20, 'text']:
    print(process_text(entry))


['deed', 'forgiv', '#earthquak', 'allah', 'reason', 'us', 'may']
['rong', 'canada', 'fire', 'near', 'la', 'forest', 'sask']
['place', 'shelter', 'notifi', 'offic', 'evacu', 'resid', 'expect', 'ask', 'order']
['receiv', 'california', 'evacu', 'peopl', '13', '000', '#wildfir', 'order']
['smoke', 'school', 'rubi', 'got', 'photo', 'pour', 'sent', '#wildfir', '#alaska']
['california', 'fire', 'lake', 'counti', 'direct', 'updat', 'close', 'hwi', '#rockyfir', '#cafir', '#wildfir', 'due', '20']
['rain', 'area', 'spring', 'street', '#flood', 'manit', 'heavi', 'flash', 'colorado', 'caus', 'flood', '#disast']
['fire', 'see', 'top', 'hill', 'wood']
['evacu', 'happen', 'emerg', 'build', 'street', 'across']
['area', 'tornado', 'afraid', 'come']
['three', 'die', 'peopl', 'far', 'heat', 'wave']
['#flood', 'haha', 'fvck', 'gonna', 'second', 'south', 'hah', 'get', 'flood', 'live', 'tampa', 'wait']
['#flood', 'count', 'lost', 'day', '#tampabay', '#tampa', '#rain', '18', '#florida', '19']
['#we', '#flood'

In [22]:
train_proc = process_dataset(train_set)

In [23]:
for entry in train_proc.loc[0:20, 'text']:
    print(process_text(entry))

['deed', 'forgiv', '#earthquak', 'allah', 'reason', 'us', 'may']
['rong', 'canada', 'fire', 'near', 'la', 'forest', 'sask']
['place', 'shelter', 'notifi', 'offic', 'evacu', 'resid', 'expect', 'ask', 'order']
['receiv', 'california', 'evacu', 'peopl', '13', '000', '#wildfir', 'order']
['smoke', 'school', 'rubi', 'got', 'photo', 'pour', 'sent', '#wildfir', '#alaska']
['california', 'fire', 'lake', 'counti', 'direct', 'updat', 'close', 'hwi', '#rockyfir', '#cafir', '#wildfir', 'due', '20']
['rain', 'area', 'spring', 'street', '#flood', 'manit', 'heavi', 'flash', 'colorado', 'caus', 'flood', '#disast']
['fire', 'see', 'top', 'hill', 'wood']
['evacu', 'happen', 'emerg', 'build', 'street', 'across']
['area', 'tornado', 'afraid', 'come']
['three', 'die', 'peopl', 'far', 'heat', 'wave']
['#flood', 'haha', 'fvck', 'gonna', 'second', 'south', 'hah', 'get', 'flood', 'live', 'tampa', 'wait']
['#flood', 'count', 'lost', 'day', '#tampabay', '#tampa', '#rain', '18', '#florida', '19']
['#we', '#flood'

In [47]:
def process_location(text):
    # print(text)
    text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9#]", " ", text) # Keep only alphanumeric characters and hashtags
    text = text.lower()
    words = text.split() # Split string into words
    # words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    # words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [65]:
train_not_missing = train_proc.dropna(axis=0, how='any', subset=['location']).reset_index(drop=True)
for entry in train_not_missing.loc[0:20, 'location']:
    print(process_location(entry))


[&#39;birmingham&#39;]
[&#39;est&#39;, &#39;september&#39;, &#39;2012&#39;, &#39;bristol&#39;]
[&#39;africa&#39;]
[&#39;philadelphia&#39;, &#39;pa&#39;]
[&#39;london&#39;, &#39;uk&#39;]
[&#39;pretoria&#39;]
[&#39;world&#39;, &#39;wide&#39;]
[&#39;paranaque&#39;, &#39;city&#39;]
[&#39;live&#39;, &#39;on&#39;, &#39;webcam&#39;]
[&#39;milky&#39;, &#39;way&#39;]
[&#39;greensboro&#39;, &#39;north&#39;, &#39;carolina&#39;]
[&#39;live&#39;, &#39;on&#39;, &#39;webcam&#39;]
[&#39;england&#39;]
[&#39;sheffield&#39;, &#39;township&#39;, &#39;ohio&#39;]
[&#39;india&#39;]
[&#39;barbados&#39;]
[&#39;anaheim&#39;]
[&#39;abuja&#39;]
[&#39;usa&#39;]
[&#39;south&#39;, &#39;africa&#39;]
[&#39;sao&#39;, &#39;paulo&#39;, &#39;brazil&#39;]


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(train_proc['text'])