In [2]:
from pathlib import Path

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

import libs.commons as commons

In [3]:
train_path = Path(commons.dataset_path) / "train.csv"
test_path = Path(commons.dataset_path) / "test.csv"
test_set = pd.read_csv(test_path)
train_set = pd.read_csv(train_path)

In [4]:
train_set['text_len'] = train_set['text'].apply(len)

Taking a look at the dataset by examining the first few rows of train and test sets

In [5]:
test_set.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_set.head()

Unnamed: 0,id,keyword,location,text,target,text_len
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88


Let's examine the keyword feature by sorting the data by the most frequently ocurring keywords

In [7]:
## Frequency of Keyword values
# train_set.describe()
print(f"{train_set['keyword'].nunique()} unique keywords")
key_group = train_set.groupby('keyword')
key_group.count().sort_values(by='id', ascending=False)

221 unique keywords


Unnamed: 0_level_0,id,location,text,target,text_len
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fatalities,45,32,45,45,45
deluge,42,29,42,42,42
armageddon,42,32,42,42,42
sinking,41,23,41,41,41
damage,41,30,41,41,41
...,...,...,...,...,...
forest%20fire,19,12,19,19,19
epicentre,12,9,12,12,12
threat,11,10,11,11,11
inundation,10,5,10,10,10


The keywords feature seems reasonable. We can see a few Unicode space symbols (%20), but the rarest keyword still has 9 occurrences.
Let's do the same to the location feature.

In [8]:
## Frequency of location values
loc_group = train_set.groupby('location')
loc_group.count().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,id,keyword,text,target,text_len
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
USA,104,104,104,104,104
New York,71,71,71,71,71
United States,50,50,50,50,50
London,45,45,45,45,45
Canada,29,29,29,29,29
...,...,...,...,...,...
Hueco Mundo,1,1,1,1,1
"Hughes, AR",1,1,1,1,1
"Huntington, WV",1,1,1,1,1
"Huntley, IL",1,1,1,1,1


In [9]:
def clean_loc(x):
    x = str(x)
    if x:
        return x.lower()
    else:
        return None

train_proc = train_set.copy()
train_proc['location_clean'] = train_proc['location'].apply(clean_loc)
loc_group = train_proc.groupby('location_clean')
loc_group.count().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,id,keyword,location,text,target,text_len
location_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,2533,2472,0,2533,2533,2533
usa,104,104,104,104,104,104
new york,75,75,75,75,75,75
united states,50,50,50,50,50,50
london,49,49,49,49,49,49
...,...,...,...,...,...,...
"haiku, maui, hawaii",1,1,1,1,1,1
hailing from dayton,1,1,1,1,1,1
halfrica,1,1,1,1,1,1
"halifax, nouvelle-ìäcosse",1,1,1,1,1,1


In [10]:
for loc in loc_group.groups.keys():
    print(loc)

thern ireland
northern kentucky, usa
norwalk, ct
norway
norwich
not a u.s resident
not los angeles, not new york.
not of this world
not so cool ky
not steven yeun / amc.
not where i want to be, yet
nottingham
nottingham, england
nottingham, united kingdom
notts
nova scotia, canada
novi, mi
nowhere
nowhere islands/smash manor
nowhere. everywhere.
numa casa de old yellow bricks
numenor
nunya
nv
ny
ny capital district
ny || live easy? 
ny, ct & greece
ny, ny
nyc
nyc / international
nyc :) ex- #islamophobe
nyc area
nyc metro
nyc&nj
nyc, new york
nyc,us - cali, colombia
nyc-la-miami
nyhc
oakland
oakland, ca
oblivion?
ocean city, nj
odawara, japan
oes 4th point. sisstar & ti
official website
ogba, lagos, nigeria
ohio
ohio, usa
ojodu,lagos
ok
okanagan valley, bc
oklahoma
oklahoma city
oklahoma city, ok
oklahoma, usa
okuma town, fukushima
olathe, ks
old blighty
oldenburg // london
olympia, wa
oman muscat al seeb 
on
on a beach 
on a catwalk somewhere
on the court 
on the go
on the toilet havin

It appears that a lot of Twitter users write locations that are badly formatted, jokes or simply non-informative. Even those who put a real location don't follow any standard: some write only country, while others write city or US state.

We could try to clean up this feature and use it, but it would be non-trivial and with no guaranteed results. We'll leave it be and use only text data for now.

# Text Preprocessing
To preprocess the text feature, we will remove HTML tags with beautiful soup and remove all characters except alphanumeric and hashtags.
Then, we'll split the text into words and keep only their radicals, using nltk PorterStemmer

In [11]:
# Preprocess and clean text features
nltk.download("stopwords")
stopwords_set = set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olavo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def process_text(text):
    text = str(text)
    text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9#]", " ", text) # Keep only alphanumeric characters and hashtags
    text = text.lower()
    words = set(text.split()) # Split string into words
    
    # Remove stopwords
    words = list(words - stopwords_set)
    
    # Stem words with PorterStemmer
    stemmer = PorterStemmer()
    def stem_word(x):
        return stemmer.stem(x)
    words = list(map(stem_word, words))
    
    return words

 
# def process_dataset(dataset, cached_path=Path(commons.dataset_path) / 'train_processed.csv'):
#     '''Apply text preprocessing and BoW codification to the entire dataset'''
#     cached_path = Path(cached_path)
#     if cached_path.is_file():
#         return pd.read_csv(cached_path)
    
#     dataset['text_proc'] = dataset['text'].apply(process_text)
#     if cached_path:
#         dataset.to_csv(cached_path)
#     return dataset

We can see how each text is split into word radicals below

In [13]:
for entry in train_proc.loc[0:20, 'text']:
    print(process_text(entry))

['allah', 'reason', 'may', 'forgiv', '#earthquak', 'us', 'deed']
['forest', 'la', 'near', 'rong', 'fire', 'canada', 'sask']
['ask', 'place', 'resid', 'expect', 'shelter', 'evacu', 'notifi', 'order', 'offic']
['california', '000', 'peopl', '13', 'receiv', 'evacu', '#wildfir', 'order']
['smoke', 'sent', 'photo', '#alaska', 'rubi', 'got', '#wildfir', 'school', 'pour']
['close', 'due', 'fire', 'california', 'hwi', 'updat', 'counti', '#rockyfir', '20', '#wildfir', '#cafir', 'direct', 'lake']
['caus', '#disast', '#flood', 'flash', 'street', 'colorado', 'heavi', 'manit', 'rain', 'spring', 'flood', 'area']
['hill', 'see', 'fire', 'wood', 'top']
['emerg', 'build', 'across', 'evacu', 'happen', 'street']
['come', 'afraid', 'area', 'tornado']
['three', 'die', 'far', 'wave', 'peopl', 'heat']
['tampa', 'second', 'flood', 'live', 'hah', 'gonna', 'south', 'get', '#flood', 'wait', 'fvck', 'haha']
['#tampabay', '#rain', 'day', '#florida', '#flood', '19', '18', '#tampa', 'lost', 'count']
['bago', '#flood

Then, we'll process the entire dataset using the previous function and sklearn CountVectorizer to convert the processed texts into a Bag of Words representation

In [14]:
from libs.dataset import create_dataset

seed                 = 10
train_path           = Path(commons.dataset_path) / "train.csv"
test_path            = Path(commons.dataset_path) / "test.csv"

# Create train and validation datasets and save to file
train_x, val_x, train_y, val_y = create_dataset(train_path, test_path, seed=seed)


Cleaning text...

Assembling Bag of Words matrix...

Splitting dataset...

Saving dataset to file...


In [20]:
train_x

Unnamed: 0,#,#1,#360wisenew,#7,#9,#abstorm,#accid,#africa,#afterlif,#airplan,...,yr,z,z10,zak,zayn,zero,zombi,zone,zouma,zujwuiomb3
2572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2767,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6248,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4623,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7293,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
from libs.dataset import TextDataset

val_processed_path   = Path(commons.dataset_path) / "val_processed.csv"
val_dataloader = TextDataset(val_processed_path, target_column=commons.target_column_name,
        normalize=True, balance=False)

In [28]:
val_dataloader.dataset

Unnamed: 0,#,#1,#360wisenew,#7,#9,#abstorm,#accid,#africa,#afterlif,#airplan,...,yr,z,z10,zak,zayn,zero,zombi,zone,zouma,zujwuiomb3
0,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,39.012818,0.0,-0.077101,0.0,0.0
1,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
2,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
3,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
4,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
1519,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
1520,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0
1521,-0.085294,-0.044426,-0.036262,0.0,-0.036262,-0.036262,0.0,-0.025633,-0.025633,0.0,...,-0.025633,0.0,0.0,0.0,0.0,-0.025633,0.0,-0.077101,0.0,0.0


In [35]:
val_dataloader.dataset.std(axis=0) == 0

#              False
#1             False
#360wisenew    False
#7              True
#9             False
               ...  
zero           False
zombi           True
zone           False
zouma           True
zujwuiomb3      True
Length: 5000, dtype: bool

In [37]:
print(val_dataloader.dataset.columns[val_dataloader.dataset.std(axis=0) == 0])
# val_dataloader.dataset.drop(columns=val_dataloader.dataset.std(axis=0) == 0)

Index(['#7', '#accid', '#airplan', '#animalrescu', '#antioch', '#artistsunit',
       '#arwx', '#atlanta', '#avalanch', '#bb17',
       ...
       'yell', 'yemen', 'yorker', 'z', 'z10', 'zak', 'zayn', 'zombi', 'zouma',
       'zujwuiomb3'],
      dtype='object', length=1702)
