# Disaster Tweets Classifier

In [1]:
# installing all dependencies required for the notebook
%pip install -r requirements.txt

Collecting blis==0.4.1
  Using cached blis-0.4.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (4.0 MB)
Collecting catalogue==1.0.0
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting certifi==2020.11.8
  Using cached certifi-2020.11.8-py2.py3-none-any.whl (155 kB)
Collecting cymem==2.0.4
  Using cached cymem-2.0.4-cp37-cp37m-macosx_10_9_x86_64.whl (31 kB)
[31mERROR: Could not find a version that satisfies the requirement en-core-web-sm==2.3.1 (from -r requirements.txt (line 9)) (from versions: none)[0m
[31mERROR: No matching distribution found for en-core-web-sm==2.3.1 (from -r requirements.txt (line 9))[0m
Note: you may need to restart the kernel to use updated packages.


# Data Exploration & Pre-processing

## Data exploration

In [29]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/train.csv', index_col='id')
df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [30]:
df.shape # 7613 rows, with 4 columns

(7613, 4)

In [31]:
# target 1 refers to disaster tweet, 0 is not a disaster tweet
df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [32]:
# checking for completeness of data
print(f"{np.sum(df['keyword'].isna())} rows have no keywords")
print(f"{np.sum(df['location'].isna())} rows have no location")
print(f"{np.sum(df['text'].isna())} rows have no text")
print(f"{np.sum(df['text'].isna())} rows have no target")

61 rows have no keywords
2533 rows have no location
0 rows have no text
0 rows have no target


In [33]:
# note that some keywords are phrases, with '%20' as a space
df['keyword'].value_counts() 

fatalities               45
armageddon               42
deluge                   42
sinking                  41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [34]:
# note that there are some non-location locations, like 'World Wide!!' and 'a feminist, modernist hag.'
df['location'].value_counts() 

USA                  104
New York              71
United States         50
London                45
Canada                29
                    ... 
a box                  1
Las Vegas, NV          1
MD                     1
Bangalore, INDIA       1
Highland Park, CA      1
Name: location, Length: 3341, dtype: int64

## Preprocessing

In [8]:
# download spaCy model for American English
!python3 -m spacy download en_core_web_sm

You should consider upgrading via the '/Users/weiting/Code/disaster-tweets-classifier/venv/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [37]:
import spacy 
import en_core_web_md
nlp = en_core_web_md.load()

## Modifying spaCy's tokenizer

In [38]:
# Let's see what spaCy does with numbers, contractions, #hashtags, @mentions and URLs
s = "2020 can't get any worse #ihate2020 @bestfriend https://t.co"
doc = nlp(s)

# Let's look at the lemmas and is stopword of each token
print(f"Token\t\tLemma\t\tStopword")
print("="*40)
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

Token		Lemma		Stopword
2020		2020		False
ca		can		True
n't		not		True
get		get		True
any		any		True
worse		bad		False
#		#		False
ihate2020		ihate2020		False
@bestfriend		@bestfriend		False
https://t.co		https://t.co		False


In [39]:
# Contractions are split into lemmas
# Numbers are their own features
# @mentions are maintained as a token
# We want to also keep #hashtags as a token, so we will modify the spaCy model's token_match

import re 

# Retrieve the default token-matching regex pattern
re_token_match = spacy.tokenizer._get_regex_pattern(nlp.Defaults.token_match)

# Add #hashtag pattern
re_token_match = f"({re_token_match}|#\w+)"
nlp.tokenizer.token_match = re.compile(re_token_match).match

# Now let's try again
s = "2020 can't get any worse #ihate2020 @bestfriend https://t.co"
doc = nlp(s)

# Let's look at the lemmas and is stopword of each token
print(f"Token\t\tLemma\t\tStopword")
print("="*40)
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

Token		Lemma		Stopword
2020		2020		False
ca		can		True
n't		not		True
get		get		True
any		any		True
worse		bad		False
#ihate2020		#ihate2020		False
@bestfriend		@bestfriend		False
https://t.co		https://t.co		False


## Pre-processing a single tweet

In [12]:
# Features is a set of all lemmas (words) encountered thus far, add hashtags, mentions and URLs to track the number of each respectively
features = set({'#','@','URL'})

# Now let's process an original tweet with our modified spaCy model
s = df.loc[1,'text']
print(f"Original tweet: {s}")

# Modifying the tweet to include mentions, hashtags and a URL
s += ' @mention #hashtag http://t.co/test'

# To lowercase
s = s.lower()

# Creating a doc with spaCy
doc = nlp(s)

# Let's look at the lemmas and is stopword of each token
print(f"Token\t\tLemma\t\tStopword")
print("="*40)

lemmas = []
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")
    lemmas.append(token.lemma_)

# Union between lemmas and our features set
features |= set(lemmas)

# Constructing a bag of words for the tweet
freq = {'#':0,'@':0,'URL':0}
for word in lemmas:
    freq[str(word)] = 0
for token in doc: 
    if '#' in str(token): freq['#'] += 1 # Count number of hashtags, regardless of hashtag
    if '@' in str(token): freq['@'] += 1 # Count number of mentions, regardless of mention
    if 'http://' in str(token): freq['URL'] += 1 # Count number of URLs, regardless of URL
    freq[str(token.lemma_)] += 1
print(type(str(token)))
print(f"Bag of words for the tweet: {freq}")

Original tweet: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Token		Lemma		Stopword
our		-PRON-		True
deeds		deed		False
are		be		True
the		the		True
reason		reason		False
of		of		True
this		this		True
#earthquake		#earthquake		False
may		may		True
allah		allah		False
forgive		forgive		False
us		-PRON-		True
all		all		True
@mention		@mention		False
#hashtag		#hashtag		False
http://t.co/test		http://t.co/test		False
<class 'str'>
Bag of words for the tweet: {'#': 2, '@': 1, 'URL': 1, '-PRON-': 2, 'deed': 1, 'be': 1, 'the': 1, 'reason': 1, 'of': 1, 'this': 1, '#earthquake': 1, 'may': 1, 'allah': 1, 'forgive': 1, 'all': 1, '@mention': 1, '#hashtag': 1, 'http://t.co/test': 1}


## Preprocessing all data

In [40]:
# Now that we've preprocessed a single tweet, we can create a pre-process function for each tweet
def preprocess(s, nlp, features):
    """
    Given string s, spaCy model nlp, and set features (lemmas encountered),
    pre-process s and return updated features and bag-of-words representation dict freq
    - changes s to lower-case
    - tokenize s using nlp to create a doc
    - update features with lemmas encountered in s
    - create bag-of-words representation in dict type freq, including counts for hashtags, mentions and URLs
    """

    # To lowercase
    s = s.lower()

    # Creating a doc with spaCy
    doc = nlp(s)

    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)

    # Union between lemmas and our features set
    features |= set(lemmas)

    # Constructing a bag of words for the tweet
    freq = {'#':0,'@':0,'URL':0}
    for word in lemmas:
        freq[str(word)] = 0
    for token in doc: 
        if '#' in str(token): freq['#'] += 1 # Count number of hashtags, regardless of hashtag
        if '@' in str(token): freq['@'] += 1 # Count number of mentions, regardless of mention
        if 'http://' in str(token): freq['URL'] += 1 # Count number of URLs, regardless of URL
        freq[str(token.lemma_)] += 1
        
    return features, freq

In [43]:
preprocess_df = df # Duplicate for preprocessing
features = set({'#','@','URL'}) # Using set feature to contain all words (lemmas) seen

In [66]:
bleh = []
for i in range(len(foo)):
    features, freq = preprocess(foo.iloc[i]['text'], nlp, features)
    bleh.append(freq)
bleh

[{'#': 0,
  '@': 0,
  'URL': 0,
  '-PRON-': 2,
  'well': 2,
  'than': 2,
  'sean': 3,
  'bro': 1,
  '.': 2,
  'i': 1,
  'can': 1,
  'admit': 1,
  'that': 1,
  'be': 1,
  'flame': 1,
  'now': 1,
  'but': 1,
  'https://t.co/aomq1rykmj': 1},
 {'#': 0,
  '@': 0,
  'URL': 1,
  'eu': 1,
  'states': 1,
  'squabble': 1,
  'over': 1,
  'immigration': 1,
  '.': 2,
  'uk': 1,
  '-': 1,
  'france': 1,
  'eurotunnel': 1,
  'deluge': 1,
  'with': 1,
  'migrant': 1,
  'one': 1,
  'dead': 1,
  'as': 1,
  "'": 2,
  'thousand': 1,
  'storm': 1,
  'tunnel': 1,
  'http://t.co/vf6cklmcsx': 1},
 {'#': 0,
  '@': 0,
  'URL': 1,
  'photoset': 1,
  ':': 4,
  'littlebitofbass': 1,
  'silinski': 1,
  'ed': 1,
  'sheeran': 1,
  "onåê'the": 1,
  'hobbit': 1,
  'the': 1,
  'desolation': 1,
  'of': 1,
  'smaug': 1,
  "'": 1,
  'german': 1,
  'premiere': 1,
  '...': 1,
  'http://t.co/iosthxlcyv': 1},
 {'#': 0, '@': 0, 'URL': 0, 'armageddon': 1, 'https://t.co/ucsudk3q1d': 1},
 {'#': 0,
  '@': 0,
  'URL': 1,
  'correcti

In [42]:
bow_array = [] # Array bow_array of bow representations for each tweet; bow_array[i] is the bow representation for tweet id (i+1)
for i in range(len(preprocess_df)):
    features, freq = preprocess(preprocess_df.iloc[i]['text'],nlp,features)
    bow_array.append(freq)
len(bow_array)

KeyboardInterrupt: 

In [None]:
pre_process_df.head(n=2)

In [None]:
# Create dataframe for bag of words representation for each tweet
bow = pd.DataFrame('0', columns=features,index=range(1,len(preprocess_df)+1))
len(bow)

In [None]:
# Update bow[i] with bag-of-words freq of the tweet id (i+1)
for i in range(len(preprocess_df)):
    freq = bow_array[i]
    for f in freq:
        bow.loc[i+1,f]=freq[f]

# Join bag-of-words representation to train dataframe
# Append _data suffix to 'keyword','location','text','target' for features that are not lemma tokens
preprocess_df = preprocess_df.join(bow,lsuffix='_data')

# Saving bag-of-words representation for collaborators
preprocess_df.to_csv("data/train_preprocessed.csv",index=True,index_label='id')

In [None]:
preprocess_df.head(5)

## Splitting into training and validation data

In [19]:
from sklearn.model_selection import train_test_split

# stratify=y creates a balanced validation set
y = preprocess_df['target_data']

df_train, df_val = train_test_split(preprocess_df, test_size=0.10, random_state=101, stratify=y)

# Saving csv files for collaborators
df_train.to_csv("data/train_preprocessed_split.csv",index=True)
df_val.to_csv("data/val_preprocessed_split.csv",index=True)

print(df_train.shape, df_val.shape)

(6851, 21330) (762, 21330)


In [20]:
# Checking balance
print(f"""
Ratio of target=1 to target=0 tweets in:\n 
Original data set = {np.sum(preprocess_df['target_data']==1)/np.sum(preprocess_df['target_data']==0)},\n
Training data set = {np.sum(df_train['target_data']==1)/np.sum(df_train['target_data']==0)},\n
Validation data set = {np.sum(df_val['target_data']==1)/np.sum(df_val['target_data']==0)}""")


Ratio of target=1 to target=0 tweets in:
 
Original data set = 0.7533394748963611,

Training data set = 0.7535193242897363,

Validation data set = 0.7517241379310344
