In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, TFDistilBertModel

In [2]:
import torch
import transformers as ppb

In [3]:
# DistilBERT model class, its tokenizer class, and the checkpoint name both are loaded from.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [4]:
# Instantiate the model and its matching tokenizer from the same named checkpoint.
model = model_class.from_pretrained(pretrained_weights)
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the story dump from 6yn.csv, using the first CSV column as the row index.
story6y = pd.read_csv(
    "6yn.csv",
    index_col=0,
)

  mask |= (ar1 == a)


In [7]:
story6y.head()

Unnamed: 0,title,text,timestamp,url,score,dead,deleted
0,How to Not Mess Up Your Tech Startup,,2013-06-01 00:01:56+00:00,http://kristinabjoran.com/how-to-not-mess-up-y...,2.0,,
1,,,2013-06-01 00:03:32+00:00,,,True,True
2,Someday you may ditch your two-factor authenti...,,2013-06-01 00:04:34+00:00,http://arstechnica.com/security/2013/05/someda...,1.0,,
3,,,2013-06-01 00:05:35+00:00,,,True,True
4,Will State.com become the world's opinion forum?,,2013-06-01 00:05:54+00:00,http://pandodaily.com/2013/05/31/making-sense-...,2.0,,


Next step: extract the domain from each URL, but first clean the text fields.

In [8]:
# Normalise every missing value to NaN (in place on the loaded frame).
story6y.fillna(value=np.nan, inplace=True)
# Keep only the stories whose `deleted` flag is missing, i.e. drop deleted stories.
story6y_f = story6y.loc[story6y["deleted"].isna()]

In [9]:
# One text string per story: title, a space, then the body text.
# Missing titles/bodies are replaced by a single blank before joining.
stories = (
    story6y_f["title"].fillna(" ").astype(str)
    + " "
    + story6y_f["text"].fillna(" ").astype(str)
)

In [10]:
stories

0                     How to Not Mess Up Your Tech Startup  
2          Someday you may ditch your two-factor authenti...
4           Will State.com become the world's opinion for...
5          LightUp Teaches Kids Electronics With Augmente...
6          Getting Things Done: Why GTD for Sales is the ...
                                 ...                        
2344431      Italy Follows France in Levying a Digital Tax  
2344432    Amazon is looking into tech that can identify ...
2344433    Giant Concentric Circles in Granite Springs Va...
2344434                          Wyze Data leak 12-26-2019  
2344435    4M cards,4K drawers–coalition of book lovers r...
Length: 2243679, dtype: object

In [12]:
# Prepare the URL column for domain extraction: missing URLs become a blank string.
import tldextract
domains = story6y_f["url"].fillna(" ")

In [15]:
domains

0          http://kristinabjoran.com/how-to-not-mess-up-y...
2          http://arstechnica.com/security/2013/05/someda...
4          http://pandodaily.com/2013/05/31/making-sense-...
5          http://techcrunch.com/2013/05/31/lightup-helps...
6          http://blog.voltagecrm.com/getting-things-done...
                                 ...                        
2344431    https://www.wsj.com/articles/italy-follows-fra...
2344432    https://www.usatoday.com/story/tech/2019/12/27...
2344433    https://www.blackrockdesert.org/wiki/index.php...
2344434    https://forums.wyzecam.com/t/updated-12-29-19-...
2344435    https://www.washingtonpost.com/education/2019/...
Name: url, Length: 2243679, dtype: object

In [6]:
# domain extraction is slow, so run it in parallel across multiple processes
from multiprocessing import  Pool
from functools import partial

In [16]:
# build domain extraction function
def doxtract(df):
    """Map every URL in ``df`` to the ``domain`` field of ``tldextract.extract``.

    Parameters
    ----------
    df : pandas.Series
        Series of URL strings (despite the parameter name, not a DataFrame).

    Returns
    -------
    pandas.Series
        Same index as ``df``; each value is ``tldextract.extract(url).domain``.
    """
    def _domain_of(url):
        # Only the `domain` attribute of the extraction result is kept.
        return tldextract.extract(url).domain

    return df.apply(_domain_of)
# build multiprocess wrapper: split the data, map over a process pool, concatenate
def mdoxtract(df,efunc,nj=23):
    """Apply ``efunc`` to ``df`` in parallel and return the concatenated result.

    ``df`` is cut into ``nj`` order-preserving chunks with ``np.array_split``;
    each chunk is passed to ``efunc`` in a worker process and the per-chunk
    results are joined back together with ``pd.concat``.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to process (the notebook passes a Series of URLs).
    efunc : callable
        Picklable single-argument function mapping one chunk to a pandas object.
    nj : int, default 23
        Number of chunks and of worker processes.

    Returns
    -------
    pandas object
        ``pd.concat`` of the per-chunk results, in the original chunk order.

    Raises
    ------
    Exception
        Whatever ``efunc`` raises in a worker is re-raised here by ``pool.map``.
    """
    text_split = np.array_split(df, nj)
    # The context manager tears the pool down even when a worker raises; the
    # previous close()/join() pair was skipped on errors, leaking the workers.
    with Pool(nj) as pool:
        pieces = pool.map(efunc, text_split)
    return pd.concat(pieces)

In [18]:
# Run doxtract over all URLs in parallel, using mdoxtract's default worker count.
mdomain = mdoxtract(domains, doxtract)

In [21]:
# Append each story's extracted domain to its text, separated by a space.
stories_aug = stories + ' ' + mdomain

In [7]:
import pickle
# Reload the cached augmented stories instead of recomputing them.
# To refresh the cache:
#   with open("stories_aug.p", "wb") as fh: pickle.dump(stories_aug, fh)
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this notebook.
# The with-block closes the file handle, which the bare open() call never did.
with open("stories_aug.p", "rb") as stories_cache:
    stories_aug = pickle.load(stories_cache)

In [8]:
# Flatten every story onto one line: carriage returns and newlines each become a space.
stories_aug = stories_aug.apply(
    lambda text: text.replace('\r', ' ').replace('\n', ' ')
)

In [13]:
stories_aug.shape

(2243679,)

In [10]:
### Tokenize using the BERT tokenizer
# tokenization is also run in parallel across multiple processes
def tokenizerf(df,tokenizer,max_length=200):
    """Encode every text in ``df`` into a list of token ids.

    Parameters
    ----------
    df : pandas.Series
        Series of text strings (despite the parameter name, not a DataFrame).
    tokenizer : object
        Tokenizer exposing ``encode(text, add_special_tokens=..., truncation=...,
        max_length=...)``.
    max_length : int, default 200
        Upper bound on tokens per text, forwarded to ``tokenizer.encode`` together
        with ``truncation=True``. Previously hard-coded to 200.

    Returns
    -------
    pandas.Series
        Same index as ``df``; each value is what ``tokenizer.encode`` returned.
    """
    def _encode(text):
        return tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
        )

    return df.apply(_encode)
def mpt(text,func,nj=23):
    """Apply ``func`` to ``text`` in parallel and return the concatenated result.

    ``text`` is cut into ``nj`` order-preserving chunks with ``np.array_split``;
    each chunk is passed to ``func`` in a worker process and the per-chunk
    results are joined back together with ``pd.concat``.

    Parameters
    ----------
    text : pandas.Series
        Texts to process.
    func : callable
        Picklable single-argument function mapping one chunk to a pandas object.
    nj : int, default 23
        Number of chunks and of worker processes.

    Returns
    -------
    pandas object
        ``pd.concat`` of the per-chunk results, in the original chunk order.

    Raises
    ------
    Exception
        Whatever ``func`` raises in a worker is re-raised here by ``pool.map``.
    """
    text_split = np.array_split(text, nj)
    # The context manager tears the pool down even when a worker raises; the
    # previous close()/join() pair was skipped on errors, leaking the workers.
    with Pool(nj) as pool:
        pieces = pool.map(func, text_split)
    return pd.concat(pieces)
# Bind the tokenizer up front so the callable takes only a data chunk,
# which is the single argument pool.map passes to it.
tf2 = partial(tokenizerf,tokenizer=tokenizer)

In [11]:
tokenized0 = mpt(stories_aug,tf2)

In [13]:
# Cache the tokenized stories. The with-block closes (and thereby flushes) the
# file once the dump finishes; the bare open() call never closed it explicitly.
with open("stories_token_bert.p", "wb") as token_cache:
    pickle.dump(tokenized0, token_cache)

In [14]:
# Padding: right-pad every token list with 0 up to the longest sequence length
# so the ragged lists stack into one rectangular array.
max_len = max((len(ids) for ids in tokenized0.values), default=0)
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized0.values])

In [15]:
# Convert the padded token-id matrix into a torch tensor for batching.
input_text = torch.tensor(padded) 

In [16]:
# build minibatches
from torch.utils.data import Dataset, DataLoader
# Iterate over the token matrix in batches of 300 rows; shuffling is off, so
# batches come out in the same order as the rows of input_text.
textiter = DataLoader(
    input_text,
    shuffle=False,
    batch_size=300,
)

In [17]:
# Prefer the GPU, but fall back to the CPU so the cell does not crash on a
# machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Inference only: switch the model to evaluation mode.
model.eval()
# Placeholder first row that the per-batch embeddings are appended to; it is
# stripped again once all batches are processed. 768 is the embedding width.
emb0 = np.zeros([1,768])

In [18]:
with torch.no_grad():
    # Collect per-batch results and concatenate once at the end: np.append copies
    # the whole accumulated array on every call, which is quadratic in the number
    # of batches.
    emb_chunks = [emb0]
    for batch in textiter:
        a1 = batch.to(device)
        # Padding uses token id 0, so every non-zero position is a real token.
        attention_mask = (a1 != 0).long()
        emo = model(a1, attention_mask=attention_mask)
        # Keep only the vector at sequence position 0 of the model's first output.
        emb_chunks.append(emo[0][:,0,:].cpu().numpy())
    emb0 = np.concatenate(emb_chunks, axis=0)

In [9]:
# Reload the cached embeddings instead of re-running the model.
# To refresh the cache:
#   with open("emb_HN.p", "wb") as fh: pickle.dump(emb0, fh)
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this notebook.
# The with-block closes the file handle, which the bare open() call never did.
with open('emb_HN.p', 'rb') as emb_cache:
    emb0 = pickle.load(emb_cache)

In [18]:
# Drop the placeholder in row 0 that seeded the accumulator; the remaining
# rows are the embeddings themselves.
emb = emb0[1:, :]
emb.shape

(2243679, 768)

In [None]:
# build an autoencoder model for topic categorization
# the inputs are already equal-length vectors, so there is no need for an LSTM; dense layers with non-linear activations will be used

In [None]:
# autoencoder for topic 