In [1]:
import pandas as pd
import numpy as np
import re

data = pd.read_csv('../data/train.csv')

display(data)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## First step: tokenized data

Convert data to lowercase and remove all URLs

In [2]:
pd.options.mode.chained_assignment = None # silence warning related to overwriting original frame

tokenized_data = data.copy()
tokenized_data['text'] = tokenized_data['text'].str.lower()
tokenized_data['text'] = tokenized_data['text'].replace(to_replace = r'https?:\/\/.*[\r\n]*|[^\w\s]', value = " ", regex=True)

for i in range(len(tokenized_data['text'])):
    tokenized_data['text'][i] = [data for data in tokenized_data['text'][i].split(" ") if data]

display(tokenized_data)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[13, 000, people, receive, wildfires, evacuati...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1
...,...,...,...,...,...
7608,10869,,,"[two, giant, cranes, holding, a, bridge, colla...",1
7609,10870,,,"[aria_ahrary, thetawniest, the, out, of, contr...",1
7610,10871,,,"[m1, 94, 01, 04, utc, 5km, s, of, volcano, haw...",1
7611,10872,,,"[police, investigating, after, an, e, bike, co...",1


## Second step: ranked data

Transform tokens into rank indicators. Low frequency words = high rank

1. Create a frequencies dictionary mapping each token to its frequency

In [3]:
frequencies = {} # token : frequency

for tweet in tokenized_data['text']:
    for token in tweet:
        token = token.strip()    # strip necessary to remove whitespaces (newlines)
        if token:
            frequencies[token] = frequencies.get(token,0) + 1

2. Map each frequency to a rank

In [4]:
frequency_set = sorted(set(list(frequencies.values())))[::1]
num_ranks = len(frequency_set)
frequency_to_rank = {} # frequency : rank

for i, freq in enumerate(frequency_set):
    frequency_to_rank[freq] = i+1

3. Create ranked data

In [5]:
ranked_data = tokenized_data.copy()
for i in range(len(ranked_data['text'])):
    ranked_data['text'][i] = [frequency_to_rank[frequencies[data.strip()]] for data in ranked_data['text'][i] if data.strip()]

display(ranked_data)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[94, 2, 169, 190, 19, 186, 171, 41, 84, 9, 2, ...",1
1,4,,,"[63, 146, 55, 22, 1, 1, 10]",1
2,5,,,"[152, 8, 9, 187, 7, 188, 24, 169, 86, 1, 173, ...",1
3,6,,,"[21, 4, 133, 2, 9, 48, 11, 188, 109]",1
4,7,,,"[161, 105, 12, 171, 41, 170, 1, 8, 165, 48, 17...",1
...,...,...,...,...,...
7608,10869,,,"[99, 20, 12, 16, 189, 40, 74, 126, 27, 60]",1
7609,10870,,,"[2, 2, 190, 153, 186, 23, 49, 97, 188, 109, 72...",1
7610,10871,,,"[5, 6, 26, 14, 10, 10, 181, 186, 27, 10]",1
7611,10872,,,"[119, 15, 150, 148, 27, 6, 42, 175, 189, 88, 1...",1


## Third step: padded data

Pad data with zeroes to ensure all vectors are of the same length

In [6]:
VECTOR_SIZE = 50
max_tweet_len = ranked_data['text'].str.len().max()
assert(VECTOR_SIZE >= max_tweet_len)

padded_data = ranked_data.copy()

for i in range(len(padded_data['text'])):
    tweet_len = len(padded_data['text'][i])
    num_zeros = VECTOR_SIZE - tweet_len
    assert(num_zeros > 0)
    padded_data['text'][i] = [0] * num_zeros + padded_data['text'][i]
    assert(len(padded_data['text'][i]) == VECTOR_SIZE)
display(padded_data)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,4,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
2,5,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,6,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
4,7,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
...,...,...,...,...,...
7608,10869,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
7609,10870,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
7610,10871,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
7611,10872,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
