In [24]:
import pandas as pd
import numpy as np
import re

data = pd.read_csv('../data/train.csv')

Convert data to lowercase and remove all URLs

In [25]:
cleanedData = data['text'].str.lower()
cleanedData = cleanedData.replace(to_replace = r'https?:\/\/.*[\r\n]*|[^\w\s]', value = " ", regex=True)

Tokenize each string

In [26]:
tokens = []

for i in range(len(cleanedData)):
    tokens.append([data for data in cleanedData[i].split(" ") if data])

Find frequency of each token

In [35]:
frequencies = {} # token : frequency
for row in tokens:
    for word in row:
        frequencies[word] = frequencies.get(word, 0) + 1

Find rank of each token. Low rank (ie. 1) means word is more frequent

In [39]:
frequency_set = sorted(set(list(frequencies.values())))[::-1]
num_ranks = len(frequency_set)

frequency_to_rank = {}

for i, freq in enumerate(frequency_set):
    frequency_to_rank[freq] = i+1

ranks = {} # token : rank

for key, value in frequencies.items():
    ranks[key] = frequency_to_rank[value]

In [40]:
test_case_index = 33

print("1. Example entry:\n\t", data['text'][test_case_index])
print("\n2. Cleaned entry:\n\t",cleanedData[test_case_index])
print("\n3. Tokenize:\n\t", tokens[test_case_index])

print("\n4. Token Frequencies and Ranks:\n\t[token] : [frequency] | [rank]")
for token in tokens[test_case_index]:
    print(f"\t{token} : {frequencies[token]} | {ranks[token]}")

1. Example entry:
	 #AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi

2. Cleaned entry:
	  africanbaze  breaking news nigeria flag set ablaze in aba   

3. Tokenize:
	 ['africanbaze', 'breaking', 'news', 'nigeria', 'flag', 'set', 'ablaze', 'in', 'aba']

4. Token Frequencies and Ranks:
	[token] : [frequency] | [rank]
	africanbaze : 1 | 187
	breaking : 40 | 148
	news : 154 | 67
	nigeria : 4 | 184
	flag : 21 | 167
	set : 47 | 141
	ablaze : 28 | 160
	in : 1952 | 3
	aba : 14 | 174
