In [59]:
import pandas as pd
from tokenizers import Tokenizer
import numpy as np
from tqdm import tqdm

In [3]:
# Read the raw CSV without a header row; column names are supplied here and
# the first column ('index') is used as the frame's index.
df = pd.read_csv(
    'data/raw/URL Classification.csv',
    names=['index', 'url', 'category'],
    header=None,
    index_col=0,
)

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
# Row count per category, ordered from the smallest category to the largest.
(
    df.groupby('category')
    .count()
    .sort_values(by='url')
)

Unnamed: 0_level_0,url
category,Unnamed: 1_level_1
News,8989
Home,28269
Adult,35325
Kids,46182
Games,56477
Reference,58247
Health,60097
Shopping,95270
Sports,101328
Recreation,106586


In [6]:
# Shuffle the rows: sample 100% of the frame with a fixed seed so the order
# is reproducible.
df = df.sample(frac=1.0, random_state=0)

In [7]:
# Normalise category names to lower case with the vectorised string accessor
# instead of a per-element Python lambda.
df['category'] = df['category'].str.lower()

In [8]:
# For each category, collect its URLs into a plain list and keep at most the
# first 8500 of them (order follows the shuffled frame).
s = df.groupby('category')['url'].apply(lambda urls: urls.iloc[:8500].tolist())

In [9]:
import random
def train_test_split(s, train_size, shuffle=False):
  """Split per-category sample lists into train and test sets.

  The cut position is computed once, from the length of the first category,
  and applied to every category: the first `int(len * train_size)` samples go
  to the train set and the remainder to the test set.

  Args:
    s: pandas Series whose index holds the labels and whose values are
      sliceable sequences (e.g. lists) of samples.
    train_size: fraction of the first category's length used as the cut
      position.
    shuffle: when True, the train and test sets are each shuffled with a
      fixed seed (0), keeping every sample paired with its label.

  Returns:
    (x_train, x_test, y_train, y_test). These are lists when `shuffle` is
    False and tuples when `shuffle` is True.
  """
  category_len = len(s.iloc[0])
  n_train = int(category_len * train_size)
  n_test = category_len - n_train
  print('Each category will be splitted into:')
  print(f'\t+ train_data={n_train}')
  print(f'\t+ test_data={n_test}')

  x_train, x_test, y_train, y_test = [], [], [], []
  # Series.items() replaces iteritems(), which was removed in pandas 2.0.
  for label, samples in s.items():
    head, tail = samples[:n_train], samples[n_train:]
    x_train.extend(head)
    x_test.extend(tail)
    # Emit exactly one label per sample actually taken, so x and y stay
    # aligned even when a category's length differs from the first one.
    y_train.extend([label] * len(head))
    y_test.extend([label] * len(tail))

  print('Done splitting:')
  print(f'\t+ total_train_data={len(x_train)}')
  print(f'\t+ total_test_data={len(x_test)}')

  if shuffle:
    # A private generator seeded with 0 yields the same permutation as
    # seeding the module-level generator, without resetting global state.
    rng = random.Random(0)
    train = list(zip(x_train, y_train))
    test = list(zip(x_test, y_test))
    rng.shuffle(train)
    rng.shuffle(test)
    # zip(*[]) yields nothing to unpack, so an empty split is handled
    # explicitly.
    x_train, y_train = zip(*train) if train else ((), ())
    x_test, y_test = zip(*test) if test else ((), ())
    print('Data shuffled.')

  return x_train, x_test, y_train, y_test

In [27]:
# Split with 90% of each category going to the train set, then shuffle both sets.
x_train, x_test, y_train, y_test = train_test_split(s, train_size=0.9, shuffle=True)

Each category will be splitted into:
	+ train_data=7650
	+ test_data=850
Done splitting:
	+ total_train_data=114750
	+ total_test_data=12750
Data shuffled.


In [28]:
# Number of training labels produced by the split.
len(y_train)

114750

In [29]:
# Wrap each label sequence in a single-column frame named 'label'.
y_train_df = pd.DataFrame({'label': list(y_train)})
y_test_df = pd.DataFrame({'label': list(y_test)})

In [30]:
# Display the training label frame.
y_train_df

Unnamed: 0,label
0,computers
1,reference
2,society
3,recreation
4,shopping
...,...
114745,adult
114746,kids
114747,shopping
114748,home


In [31]:
# Display the test label frame.
y_test_df

Unnamed: 0,label
0,news
1,business
2,home
3,recreation
4,games
...,...
12745,computers
12746,sports
12747,news
12748,recreation


In [32]:
# One-hot encode both splits against one shared, sorted label set so that a
# given column index refers to the same class in train and test, even if a
# class happens to be absent from one split. dtype=int keeps 0/1 integers on
# pandas versions where get_dummies returns booleans by default.
label_dtype = pd.CategoricalDtype(sorted(set(y_train_df['label']) | set(y_test_df['label'])))
y_train = pd.get_dummies(y_train_df['label'].astype(label_dtype), dtype=int).values.tolist()
y_test = pd.get_dummies(y_test_df['label'].astype(label_dtype), dtype=int).values.tolist()

In [40]:
# Preview the first rows only; displaying the full list would print all
# 114750 one-hot vectors.
y_train[:20]

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [41]:
# Preview the first URLs only; displaying the full sequence would print all
# 114750 training URLs.
x_train[:24]

('http://www.podcastdirectory.com/',
 'http://www.vanderbilt.edu/kappaalphaorder/',
 'http://www.communityumc.org/',
 'http://www.boxersinneed.org/',
 'http://simplethoughts.net/',
 'http://www2.evansville.edu/ecoleweb/glossary/linus.html',
 'http://www.nytimes.com/2001/09/20/international/20capi.html',
 'http://www.fraktalstudio.de',
 'http://xpd.se/',
 'http://us.imdb.com/name/nm0703849/',
 'http://www.characterlink.net/',
 'http://www.falconsrule.com/',
 'http://www.xmission.com/~emailbox/tips.htm',
 'http://www.elecplay.com/reviews/view/?article=1671',
 'http://www.faqs.org/faqs/dogs-faq/breeds/frenchbulldogs/',
 'http://www.toltecincorporated.com',
 'http://www.blakepublishing.homestead.com/',
 'http://www.hazelhenderson.com/',
 'http://seventhsea.itgo.com/',
 'http://www.pageantrymagazine.com/promtime.html',
 'http://www.geocities.com/zeligactor/',
 'http://curry.edschool.virginia.edu/go/clic/nrrc/hist_r45.html',
 'http://www.pulseox.info/',
 'http://www.hotelj2.jp/',
 'http://ww

In [43]:
# Load a tokenizer from its serialized JSON file.
# NOTE(review): the path suggests this is a BPE tokenizer trained elsewhere — confirm.
tokenizer = Tokenizer.from_file("tokenizers/bpe_tokenizer/bpe_tokenizer.json")

In [46]:
# Configure fixed-length output: pad on the right with '<pad>' (id 0) up to
# 256 tokens, and cut anything longer than 256 tokens.
tokenizer.enable_padding(
    length=256,
    direction="right",
    pad_token='<pad>',
    pad_id=0,
)
tokenizer.enable_truncation(max_length=256)

In [47]:
# Report the vocabulary size of the loaded tokenizer.
tokenizer.get_vocab_size()

18407

In [60]:
def encode(tokenizer, data):
    """Encode every item of `data` and stack the token ids into an array.

    Args:
        tokenizer: object whose `encode(item)` returns a result exposing `.ids`.
        data: iterable of items to encode; progress is reported through tqdm.

    Returns:
        `np.array` built from the list of per-item id sequences.
    """
    encoded_ids = [tokenizer.encode(item).ids for item in tqdm(data)]
    return np.array(encoded_ids)

In [65]:
# Replace the raw URL sequences with their token-id arrays (train first, then test).
x_train, x_test = encode(tokenizer, x_train), encode(tokenizer, x_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 114750/114750 [00:05<00:00, 22126.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12750/12750 [00:00<00:00, 22952.74it/s]


In [71]:
def save_np_array(path, d):
    """Write array `d` to `path` in NumPy's binary format.

    The file is opened explicitly in binary write mode and handed to
    `np.save`, so the data is written to exactly `path`.
    """
    with open(path, 'wb') as out_file:
        np.save(file=out_file, arr=d)

In [70]:
def load_np_array(path):
    """Read and return the array stored at `path` via `np.load`.

    The file is opened in binary read mode and closed before returning.
    """
    with open(path, 'rb') as in_file:
        return np.load(in_file)

In [78]:
# Collapse each one-hot row to the position of its largest entry, turning the
# one-hot lists into 1-D arrays of class indices.
y_train = np.asarray(y_train).argmax(axis=1)
y_test = np.asarray(y_test).argmax(axis=1)

In [80]:
# Sanity check: the encoded training inputs and training labels have the same length.
len(x_train) == len(y_train)

True

In [81]:
# Sanity check: the encoded test inputs and test labels have the same length.
len(x_test) == len(y_test)

True

In [82]:
# Persist the four arrays, one file each. x_train was previously written
# twice to the same path; the redundant write is removed.
save_np_array('data/preprocessed_for_cnn_train_data/x_train.npy', x_train)
save_np_array('data/preprocessed_for_cnn_train_data/x_test.npy', x_test)
save_np_array('data/preprocessed_for_cnn_train_data/y_train.npy', y_train)
save_np_array('data/preprocessed_for_cnn_train_data/y_test.npy', y_test)

In [75]:
# Read the saved training inputs back from disk to check the round trip.
a = load_np_array('data/preprocessed_for_cnn_train_data/x_train.npy')

In [77]:
# Length of the reloaded array.
len(a)

114750