In [1]:
import fasttext
import pandas as pd

In [2]:
# Load the raw URL/category dataset. The file has no header row, so column
# names are supplied explicitly and the first column is used as the index.
df = pd.read_csv(
    'data/raw/URL Classification.csv',
    header=None,
    names=['index', 'url', 'category'],
    index_col=0,
)

In [3]:
# Discard rows that have a missing value in any column.
df = df.dropna()

In [4]:
# Number of URLs per category, displayed in ascending order of count.
df.groupby('category').count().sort_values(by='url')

Unnamed: 0_level_0,url
category,Unnamed: 1_level_1
News,8989
Home,28269
Adult,35325
Kids,46182
Games,56477
Reference,58247
Health,60097
Shopping,95270
Sports,101328
Recreation,106586


In [5]:
# Shuffle the rows (frac=1 keeps every row); random_state fixes the resulting order.
df = df.sample(frac=1, random_state=0)

In [6]:
# Normalise category labels to lowercase with the vectorised string accessor
# instead of a per-element lambda; it also propagates a missing value as NaN
# rather than raising.
df['category'] = df['category'].str.lower()

In [7]:
# For each category keep the first 8500 URLs (in the current row order) as a
# plain Python list; the result is a Series indexed by category.
s = df.groupby('category')['url'].apply(lambda urls: urls.iloc[:8500].tolist())

In [8]:
import random
def train_test_split(s, train_size, shuffle=False):
  """Split per-category samples into aligned train and test sets.

  Args:
    s: pandas Series indexed by category label whose values are sequences
      of samples (e.g. lists of URLs).
    train_size: fraction (0..1) of each category assigned to the train set;
      the per-category train count is ``int(len(samples) * train_size)``.
    shuffle: when True, shuffle the (sample, label) pairs of each split with
      a fixed seed so the result is reproducible.

  Returns:
    ``(x_train, x_test, y_train, y_test)`` as four lists, where ``y_*`` holds
    the category label of the sample at the same position in ``x_*``.
  """
  # Cut every category on its own length so labels always stay aligned with
  # samples, even when categories differ in size.
  splits = []
  for label, samples in s.items():  # Series.iteritems() no longer exists in pandas 2.x
    samples = list(samples)
    cut = int(len(samples) * train_size)
    splits.append((label, samples[:cut], samples[cut:]))

  def _describe(counts):
    # A single number when all categories agree, otherwise a "min-max" range.
    if not counts:
      return '0'
    low, high = min(counts), max(counts)
    return str(low) if low == high else f'{low}-{high}'

  print('Each category will be splitted into:')
  print(f'\t+ train_data={_describe([len(part) for _, part, _ in splits])}')
  print(f'\t+ test_data={_describe([len(part) for _, _, part in splits])}')

  x_train, x_test, y_train, y_test = [], [], [], []
  for label, train_part, test_part in splits:
    x_train.extend(train_part)
    x_test.extend(test_part)
    y_train.extend([label] * len(train_part))
    y_test.extend([label] * len(test_part))

  print('Done splitting:')
  print(f'\t+ total_train_data={len(x_train)}')
  print(f'\t+ total_test_data={len(x_test)}')

  if shuffle:
    # A private generator seeded with 0 produces the same permutations as
    # random.seed(0) + random.shuffle, without resetting the global RNG state.
    rng = random.Random(0)
    train = list(zip(x_train, y_train))
    test = list(zip(x_test, y_test))
    rng.shuffle(train)
    rng.shuffle(test)
    # Unzip with comprehensions: works for empty splits and keeps the return
    # type a list regardless of `shuffle`.
    x_train = [sample for sample, _ in train]
    y_train = [label for _, label in train]
    x_test = [sample for sample, _ in test]
    y_test = [label for _, label in test]
    print('Data shuffled.')

  return x_train, x_test, y_train, y_test

In [9]:
# Keep 90% of every category for training, hold out the rest for testing,
# and shuffle both splits.
x_train, x_test, y_train, y_test = train_test_split(s, 0.9, shuffle=True)

Each category will be splitted into:
	+ train_data=7650
	+ test_data=850
Done splitting:
	+ total_train_data=114750
	+ total_test_data=12750
Data shuffled.


In [10]:
# Sanity check: the test samples and test labels must have the same length.
len(x_test) == len(y_test)

True

In [43]:
from pathlib import Path
import os

def save_data(data:list, dir_name:str, base_filename:str, filename_suffix:str):
    """Write ``data`` to ``<dir_name>/<base_filename>.<filename_suffix>``, one item per line.

    The target directory is created (including parents) when it does not
    exist. Falsy items such as empty strings are skipped. The file is always
    written as UTF-8 so that non-ASCII URLs do not depend on the platform's
    default encoding.

    Args:
        data: iterable of strings to write.
        dir_name: output directory.
        base_filename: file name without the suffix.
        filename_suffix: text appended after a ``.`` separator.
    """
    Path(dir_name).mkdir(parents=True, exist_ok=True)
    file_path = os.path.join(dir_name, base_filename + "." + filename_suffix)
    with open(file_path, 'w', encoding='utf-8') as f_out:
        f_out.writelines(line + '\n' for line in data if line)

    print('Done!')

In [40]:
# Persist the raw URLs and their labels as separate one-item-per-line text
# files for the train and test splits.
save_data(x_train, './data/raw/train', 'x_train', 'txt')
save_data(y_train, './data/raw/train', 'y_train', 'txt')
save_data(x_test, './data/raw/test', 'x_test', 'txt')
save_data(y_test, './data/raw/test', 'y_test', 'txt')

Done!
Done!
Done!
Done!


In [11]:
import re
from urllib.parse import urlparse

# Splits on every non-alphanumeric character; the capture group keeps each
# separator as its own element of the result.
splitter = re.compile(r'([^a-zA-Z0-9])')

def tokenize_url(url:str)->str:
  """Split a URL into space-separated tokens.

  A leading ``<scheme>://`` is emitted as two tokens (the lower-cased scheme
  and ``://``). The remainder is split on every non-alphanumeric character,
  and each such character is kept as a token of its own.

  Args:
    url: the URL to tokenize; a falsy value yields ``''``.

  Returns:
    The tokens joined by single spaces.
  """
  if not url:
    return ''

  try:
    scheme = urlparse(url).scheme
  except ValueError:
    # urlparse rejects some malformed URLs (e.g. an unbalanced '[' in the
    # host); tokenize those without special scheme handling.
    scheme = ''

  tokens = []
  prefix_len = len(scheme) + len('://')
  # Remove exactly the "<scheme>://" prefix. str.lstrip() strips a *set of
  # characters* rather than a prefix, and urlparse lower-cases the scheme, so
  # the comparison is done case-insensitively on a slice instead.
  if scheme and url[:prefix_len].lower() == scheme + '://':
    tokens.append(scheme)
    tokens.append('://')
    url = url[prefix_len:]

  tokens.extend(splitter.split(url))

  # re.split yields empty strings between adjacent separators; drop them.
  return ' '.join(filter(None, tokens))

In [12]:
from tokenizers import ByteLevelBPETokenizer

# Train a lowercasing byte-level BPE tokenizer on the raw training URLs
# written to ./data/raw/train/x_train.txt, reserving '<pad>' as a special token.
tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(
    './data/raw/train/x_train.txt',
    vocab_size=120000,
    min_frequency=5,
    show_progress=True,
    special_tokens=['<pad>']
)






In [14]:
from transformers import GPT2Tokenizer
# NOTE(review): this rebinds `tokenizer`, replacing the ByteLevelBPETokenizer
# trained in the previous cell with the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [22]:
def bpe_tokenizer(url:str)-> str:
    """Tokenize ``url`` with the module-level ``tokenizer`` and join the pieces with single spaces."""
    pieces = tokenizer.tokenize(url)
    return ' '.join(pieces)

In [16]:
# Example: regex-based tokenization of a sample URL.
tokenize_url('http://www.vanderbilt.edu/kappaalphaorder/')

'http :// www . vanderbilt . edu / kappaalphaorder /'

In [23]:
# Same URL through the BPE tokenizer, for comparison with the regex output.
bpe_tokenizer('http://www.vanderbilt.edu/kappaalphaorder/')

'http :// www . v ander bilt . edu / k appa alpha order /'

In [24]:
# Second example: regex-based tokenization.
tokenize_url('http://www.fullcardreports.com/')

'http :// www . fullcardreports . com /'

In [25]:
# Second example: BPE tokenization of the same URL.
bpe_tokenizer('http://www.fullcardreports.com/')

'http :// www . full card reports . com /'

In [26]:
# Prefix placed in front of each label on a training line.
FASTTEXT_LABEL = '__label__'

def prepare_data_for_fasttext(data, tokenizer=lambda x:x):
    """Format ``(text, label)`` pairs as labelled training lines.

    Args:
        data: iterable of pairs whose first item is the raw text and whose
            second item is the label string.
        tokenizer: callable applied to the raw text; identity by default.

    Returns:
        A list of strings of the form ``__label__<label> <tokenized text>``.
    """
    def _to_line(pair):
        tokens = tokenizer(pair[0])
        return ''.join((FASTTEXT_LABEL, pair[1], ' ', tokens))

    return [_to_line(pair) for pair in data]

In [27]:
# Build labelled training/test lines using the regex-based URL tokenizer.
train = prepare_data_for_fasttext(zip(x_train, y_train), tokenizer=tokenize_url)
test = prepare_data_for_fasttext(zip(x_test, y_test), tokenizer=tokenize_url)

In [28]:
# Show the number of prepared lines in each split, one per output line.
print(len(train), len(test), sep='\n')

114750
12750


In [58]:
# Write the regex-tokenized splits to ./data/preprocessed/urls.regex.{train,test}.
save_data(train, './data/preprocessed', 'urls', 'regex.train')
save_data(test, './data/preprocessed', 'urls', 'regex.test')

Done!
Done!


In [45]:
# Second copy of the regex-tokenized splits.
# NOTE(review): hard-coded absolute, user-specific directory; this will not
# exist on other machines.
save_data(train, '/home/smida/test', 'urls', 'regex.train')
save_data(test, '/home/smida/test', 'urls', 'regex.test')

Done!
Done!


In [29]:
# Build labelled training/test lines using the BPE tokenizer wrapper.
bpe_train = prepare_data_for_fasttext(zip(x_train, y_train), tokenizer=bpe_tokenizer)
bpe_test = prepare_data_for_fasttext(zip(x_test, y_test), tokenizer=bpe_tokenizer)

In [19]:
# Write the BPE-tokenized splits to ./data/preprocessed/urls.bpe.{train,test}.
save_data(bpe_train, './data/preprocessed', 'urls', 'bpe.train')
save_data(bpe_test, './data/preprocessed', 'urls', 'bpe.test')

Done!
Done!


In [44]:
# Second copy of the BPE-tokenized splits.
# NOTE(review): hard-coded absolute, user-specific directory; this will not
# exist on other machines.
save_data(bpe_train, '/home/smida/test', 'urls', 'bpe.train')
save_data(bpe_test, '/home/smida/test', 'urls', 'bpe.test')

Done!
Done!


In [46]:
# Minimal parameter sets: only the training file is specified.
# NOTE(review): both names are assigned again in the following cell, and the
# paths are absolute and user-specific.
fasttext_params_re = dict(input='/home/smida/test/urls.regex.train')

fasttext_params_bpe = dict(input='/home/smida/test/urls.bpe.train')

In [215]:
# Keyword arguments for training on the regex-tokenized URLs.
fasttext_params_re = dict(
    input="./data/preprocessed/urls.regex.train",
    lr=0.1,
    lrUpdateRate=1000,
    thread=8,
    epoch=10,
    wordNgrams=3,
    dim=100,
    loss='ova',
)

# Keyword arguments for training on the BPE-tokenized URLs; differs from the
# regex set in the input file, the epoch count and the vector dimension.
fasttext_params_bpe = dict(
    input="./data/preprocessed/urls.bpe.train",
    lr=0.1,
    lrUpdateRate=1000,
    thread=8,
    epoch=25,
    wordNgrams=3,
    dim=32,
    loss='ova',
)


In [47]:
# Train a supervised model with the settings in `fasttext_params_re`.
re_model = fasttext.train_supervised(**fasttext_params_re)

Read 1M words
Number of words:  128139
Number of labels: 15
Progress: 100.0% words/sec/thread:  148216 lr:  0.000000 avg.loss:  2.179256 ETA:   0h 0m 0s


In [48]:
# Evaluate the regex-token model on the held-out regex-tokenized test file.
# NOTE(review): absolute, user-specific path.
re_model.test("/home/smida/test/urls.regex.test")

(12750, 0.34541176470588236, 0.34541176470588236)

In [50]:
# Train a supervised model with the settings in `fasttext_params_bpe`.
bpe_model = fasttext.train_supervised(**fasttext_params_bpe)

Read 1M words
Number of words:  9995
Number of labels: 15
Progress: 100.0% words/sec/thread:  176381 lr:  0.000000 avg.loss:  1.902293 ETA:   0h 0m 0s


In [51]:
# Evaluate the BPE-token model on the held-out BPE-tokenized test file.
# NOTE(review): absolute, user-specific path.
bpe_model.test("/home/smida/test/urls.bpe.test")

(12750, 0.46572549019607845, 0.46572549019607845)

In [218]:
# Retrain on the BPE data, letting fastText autotune against a validation file.
# This rebinds `bpe_model`.
# NOTE(review): no visible cell writes urls.bpe.test.valid — presumably the
# test file was split outside this notebook; confirm.
bpe_model = fasttext.train_supervised(input="./data/preprocessed/urls.bpe.train", autotuneValidationFile='./data/preprocessed/urls.bpe.test.valid')

Progress: 100.0% Trials:   15 Best score:  0.531137 ETA:   0h 0m 0s
Training again with best arguments
Read 1M words
Number of words:  43851
Number of labels: 15
Progress: 100.0% words/sec/thread:  197040 lr:  0.000000 avg.loss:  1.483397 ETA:   0h 0m 0s


In [221]:
# Evaluate the autotuned model.
# NOTE(review): no visible cell writes urls.bpe.test.test — presumably the
# other half of an externally split test file; confirm.
bpe_model.test("./data/preprocessed/urls.bpe.test.test")

(6375, 0.5221960784313725, 0.5221960784313725)

In [163]:
# Encode a sample URL with `tokenizer` and show the resulting object.
# NOTE(review): assumes `tokenizer` is the trained ByteLevelBPETokenizer, not
# the GPT-2 tokenizer it is rebound to in another cell — confirm.
encoded_text = tokenizer.encode('http://www.fullcardreports.com/')
print(encoded_text)

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [187]:
# Token strings of the encoded sample, joined with single spaces.
' '.join(encoded_text.tokens)

'http :// www . full card reports . com /'

In [222]:
# Top-2 predictions for an already tokenized URL.
bpe_model.predict('http :// www . full card reports . com /', k=2)

(('__label__news', '__label__business'), array([0.32726616, 0.16468467]))

In [None]:
# `prepare_data_for_fasttext` calls its `tokenizer` argument on every URL, so
# it must be a callable. Passing `encoded_text.tokens` (the token list of one
# already encoded sample) raises TypeError on the first row.
# NOTE(review): assumes `tokenizer.encode()` returns an object exposing
# `.tokens`, as in the cell that builds `encoded_text` — confirm which
# tokenizer is bound when this runs.
def _encode_tokens(url):
    return ' '.join(tokenizer.encode(url).tokens)

train = prepare_data_for_fasttext(zip(x_train, y_train), tokenizer=_encode_tokens)
test = prepare_data_for_fasttext(zip(x_test, y_test), tokenizer=_encode_tokens)

In [None]:
# NOTE(review): the variable is named `model_bpe`, but the input is the
# regex-tokenized training file — confirm which data set was intended.
model_bpe = fasttext.train_supervised(input="./data/preprocessed/urls.regex.train", lr=0.5 , dim=300, loss='ova', epoch=20, wordNgrams=6)