In [None]:
# System-level setup: refresh the apt package index, then reinstall the
# distribution's python*-decorator package(s).
! apt-get update
! apt-get install --reinstall python*-decorator

In [2]:
# Python 3 dependencies for this notebook, installed through the shell.
# NOTE(review): no versions are pinned, so a fresh environment may resolve to
# releases newer than the ones this notebook was written against.
! pip3 install --quiet pymongo
! pip3 install --quiet --upgrade html5lib
! pip3 install --quiet --upgrade beautifulsoup4
! pip3 install --quiet tqdm
# Disabled: upgrades of the scientific stack.
# ! pip3 install --quiet --upgrade numpy
# ! pip3 install --quiet --upgrade scipy
# ! pip3 install --quiet --upgrade sklearn
# ! pip3 install --quiet --upgrade pandas
! pip3 install --quiet spacy
# Disabled: pre-release spaCy build.
# ! pip3 install spacy-nightly

In [99]:
# gensim provides the `Dictionary` class imported further down in this notebook.
! pip3 install --quiet gensim

In [None]:
# Installs through `pip` instead of `pip3`; pymongo and tqdm are also installed
# by the pip3 cell of this notebook.
# NOTE(review): this installs spacy-nightly although the pip3 cell installs the
# regular spacy release — confirm which one the notebook is meant to use.
! pip install --quiet pymongo
! pip install --quiet tqdm
! pip install spacy-nightly

In [None]:
# Download the `en_core_web_md` spaCy model using the `spacy` command-line entry point.
! spacy download en_core_web_md

In [None]:
# Download the `en_core_web_md` spaCy model by running spaCy as a module under python3.
! python3 -m spacy download en_core_web_md

In [None]:
# Download the spaCy model registered under the shortcut name `en`.
! python3 -m spacy download en

In [9]:

import pandas as pd
import mxnet as mx
import modelling.fetch as fetch
from modelling.utils import get_articles
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block, nn, rnn, Trainer
from mxnet.gluon.parameter import Parameter
import numpy as np
from tqdm import tqdm
from sklearn.metrics import *
# Seed MXNet's random number generator with a fixed value.
mx.random.seed(1)

In [204]:
class ChildSumGRU(Block):
    """Recursive GRU over a tree that feeds each node the sum of its children's outputs.

    Each node is run through a single GRU step. The initial hidden state of
    that step is the element-wise sum of the outputs computed for the node's
    children, or the GRU's zero begin-state for a leaf.

    Parameters
    ----------
    num_hidden : int
        Hidden size passed to ``rnn.GRU``.
    dictionary : optional
        Token dictionary; when given (and non-empty) an ``nn.Embedding`` with
        ``len(dictionary.keys())`` rows is created and nodes are embedded
        through ``tree.dict_id``. When omitted, ``tree.vector`` is used as the
        node input instead.
    embed_dim : int, optional
        Embedding width; only used when ``dictionary`` is given.
    dropout : float
        Dropout value passed to ``rnn.GRU``.
    """

    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ChildSumGRU, self).__init__()
        self.dictionary = dictionary
        with self.name_scope():
            if dictionary:
                vocab_size = len(dictionary.keys())
                self.embed = nn.Embedding(vocab_size, embed_dim)
            else:
                # Always define the attribute: forward() checks it, and leaving
                # it unset raised AttributeError in the pretrained-vector mode.
                self.embed = None
            self.net = rnn.GRU(num_hidden, dropout = dropout)

    def forward(self, F, tree):
        """Return the GRU output for ``tree``'s root, recursing into its children.

        ``F`` is the operator namespace used for ``add_n`` (the notebook passes
        ``mx.nd``); ``tree`` must expose ``children`` plus ``dict_id`` (embedding
        mode) or ``vector`` (pretrained-vector mode).
        """
        if self.embed is not None:
            vec = self.embed(tree.dict_id)
        else:
            vec = tree.vector

        # Children are processed first; their outputs become this node's
        # initial hidden state.
        child_states = [self.forward(F, child) for child in tree.children]
        if child_states:
            hidden_previous = [F.add_n(*child_states)]
        else:
            # Leaf: start from the GRU's begin-state, moved to the input's device.
            hidden_previous = [s.as_in_context(vec.context) for s in
                               self.net.begin_state(batch_size = 1)]
        output, _ = self.net(vec, hidden_previous)
        return output

In [5]:
class ClassifierTreeRNN(Block):
    """Tree classifier: a ChildSumGRU encoder followed by a one-unit sigmoid Dense layer."""

    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ClassifierTreeRNN, self).__init__()
        with self.name_scope():
            # Encoder over the tree; all constructor arguments are forwarded as-is.
            self.gru = ChildSumGRU(num_hidden, dictionary, embed_dim, dropout)
            # Single sigmoid unit reading the encoder's num_hidden features.
            self.decoder = nn.Dense(1, activation='sigmoid', in_units=num_hidden)

    def forward(self, F, tree):
        """Encode ``tree`` with the GRU and return the decoder's sigmoid output."""
        encoded = self.gru(F, tree)
        return self.decoder(encoded)

In [162]:
def get_head(doc):
    """Return the first token in ``doc`` that is its own head (the parse root).

    Tokens are compared with ``==`` rather than ``is``: ``token.head`` is not
    guaranteed to hand back the very same Python object as the token yielded
    by iterating the document, so an identity check can miss the root.

    Raises
    ------
    IndexError
        If no token in ``doc`` is its own head (including an empty ``doc``).
    """
    for token in doc:
        if token.head == token:
            return token
    raise IndexError('no root token found in document')

def data_split(df, test = .2, label_key = 'label',
               data_key = 'body', pos_label = 'accepted'):
    """Split ``df`` into train/test parts by recency and binarise the labels.

    Rows are sorted by the ``'added'`` column, newest first; the first
    ``round(len(df) * test)`` rows form the test set and the rest the
    training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'added'``, ``label_key`` and ``data_key``.
    test : float
        Fraction of rows assigned to the test set.
    label_key : str
        Column holding the raw labels.
    data_key : str
        Column returned as the model input.
    pos_label : object
        Label value mapped to 1; every other value maps to 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts are pandas
        Series and the y parts are integer NumPy arrays.
    """
    # int() keeps the slice bound an integer even where round() returns a float.
    num_test = int(round(df.shape[0] * test))
    sorted_df = df.sort_values('added', ascending=False)
    test_df = sorted_df[:num_test]
    train_df = sorted_df[num_test:]

    def labelify(frame):
        # `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
        return (frame[label_key] == pos_label).astype(int).values

    return (train_df[data_key], test_df[data_key],
            labelify(train_df), labelify(test_df))


In [193]:
class Tree(object):
    """Recursive copy of a token tree with its arrays allocated on ``ctx``.

    Every node keeps the token text, the token vector wrapped as
    ``nd.array([[vector]])``, its dictionary id wrapped as ``nd.array([[id]])``
    and a list of child ``Tree`` nodes built the same way.
    """
    def __init__(self, ctx, text, vector, children, dictionary):
        self.text = text
        self.vector = nd.array([[vector]], ctx = ctx)

        # Rebuild the subtree below this node on the same context.
        subtrees = []
        for child in children:
            subtrees.append(Tree(ctx, child.text, child.vector, child.children, dictionary))
        self.children = subtrees

        # NOTE(review): `.get` yields None for a text missing from the
        # dictionary — confirm every token is guaranteed to be present.
        token_id = dictionary.token2id.get(text)
        self.dict_id = nd.array([[token_id]], ctx = ctx)

def to_gpu_tree(dictionary, c, ctx):
    """Build a ``Tree`` on ``ctx`` from node ``c`` (its text, vector and children)."""
    return Tree(ctx=ctx,
                text=c.text,
                vector=c.vector,
                children=c.children,
                dictionary=dictionary)

In [181]:
from random import shuffle

def split(data, num):
    """Split ``data`` into ``num`` consecutive chunks of near-equal size.

    Returns a NumPy array holding the chunks: a regular array when all chunks
    have the same length, otherwise a 1-D object array with one chunk per
    element. If ``np.array_split`` raises ``AttributeError`` for the input
    type, the data is converted to an array first and a list of lists is
    returned.
    """
    try:
        chunks = np.array_split(data, num)
        try:
            return np.array(chunks)
        except ValueError:
            # Unequal chunk lengths: recent NumPy refuses to build a ragged
            # array implicitly, so fill an object array one chunk at a time.
            ragged = np.empty(len(chunks), dtype=object)
            for idx, chunk in enumerate(chunks):
                ragged[idx] = chunk
            return ragged
    except AttributeError:
        return list(map(list, np.array_split(np.array(data), num)))

def map_with_split_context(fn, ctx, data):
    """Apply ``fn(item, context)`` to every item, spreading the items over ``ctx``.

    ``data`` is cut into ``len(ctx)`` consecutive chunks; chunk *i* is mapped
    with ``ctx[i]``. Results are returned as one flat list in input order.
    """
    mapped = []
    for device, chunk in zip(ctx, split(data, len(ctx))):
        for item in chunk:
            mapped.append(fn(item, device))
    # Open question: shuffle `mapped` so that batches are spread across executors?
    return mapped

def batchify(data, batch_size):
    """Group ``data`` into ``len(data) // batch_size`` consecutive batches.

    At least one batch is always produced, so inputs shorter than
    ``batch_size`` come back as a single batch instead of raising. When the
    length is not a multiple of ``batch_size`` the leftover items are spread
    over the leading batches, and the result is a 1-D object array of
    unequal-length batches.
    """
    # Integer section count; never zero, which np.array_split rejects.
    num_batches = max(1, len(data) // batch_size)
    batches = np.array_split(np.array(data), num_batches)
    try:
        return np.array(batches)
    except ValueError:
        # Unequal batch lengths: build the object array explicitly, since
        # recent NumPy will not create a ragged array implicitly.
        ragged = np.empty(len(batches), dtype=object)
        for idx, batch in enumerate(batches):
            ragged[idx] = batch
        return ragged

In [163]:
from modelling.clustering import get_unique_items
from modelling.utils import preprocessor

def prepare_df(df, preprocessor, nlp, out_key = 'body'):
    """Deduplicate articles per source, parse their text and keep each parse root.

    For every entry in ``lookup`` the rows whose ``_id`` matches the pattern
    are passed to ``get_unique_items`` and the named column is copied to
    ``text``. The combined frame is then preprocessed, parsed with ``nlp``,
    filtered to documents longer than two tokens, and the root token of each
    parse (see ``get_head``) is stored under ``out_key``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string ``_id`` column plus the ``title``/``body`` columns
        named in ``lookup``.
    preprocessor : callable
        Applied to every text before parsing.
    nlp : callable
        Turns a text into a parsed document supporting ``len()`` and iteration.
    out_key : str
        Column that receives the root tokens.

    Returns
    -------
    pandas.DataFrame
        The deduplicated rows with ``text`` and ``out_key`` columns.
    """
    # (pattern matched against _id, second argument for get_unique_items,
    #  column used as text).
    # NOTE(review): the numeric value is presumably a similarity threshold —
    # confirm against modelling.clustering.get_unique_items.
    lookup = [
        ('ge', 0.1, 'title'),
        ('tw', 0.5, 'body'),
        ('fa', 0.2, 'title')
    ]

    uniques = [get_unique_items(df[df._id.str.contains(p)], i, k) for p,i,k in lookup]
    unique = pd.concat([frame.assign(text = frame[key])
                        for frame, (_, _, key) in zip(uniques, lookup)])

    unique['text'] = unique.text.map(preprocessor)
    unique['nlp'] = unique.text.map(nlp) # TODO: pick the model per language
    unique = unique[unique.nlp.map(len) > 2]
    tokens = unique.nlp.map(get_head)
    # `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
    unique = unique.drop(['nlp'], axis=1)
    unique[out_key] = tokens
    return unique

In [120]:
import spacy
from pymongo import MongoClient
from modelling.utils import get_articles
from modelling.fetch import create_df

import os

# Load the `en_core_web_md` spaCy pipeline used for parsing.
nlp = spacy.load('en_core_web_md')

# The MongoDB address can be overridden through NEWSFILTER_MONGO_HOST so the
# notebook is not tied to a single deployment; the default is unchanged.
mongo_host = os.environ.get('NEWSFILTER_MONGO_HOST', "209.177.92.45:80")
collection = MongoClient(mongo_host)['newsfilter'].news

# Fetch the articles (called with label=True) and build the working DataFrame.
df = create_df(get_articles(collection, label=True))

In [194]:
from mxnet import gpu, cpu
# One MXNet context per GPU.
# NOTE(review): assumes 8 GPUs are visible on the machine — confirm.
ctx = [gpu(i) for i in range(8)]

# NOTE(review): `unique` is only produced by the disabled call below, and
# `dictionary` is not defined anywhere earlier in file order, so this cell
# depends on kernel state left by cells executed out of order.
# unique = prepare_df(df, preprocessor, nlp, out_key='tokens')
X_train, X_test, y_train, y_test = data_split(unique, data_key='tokens')
# Convert every root token into a Tree on one of the contexts; the data is cut
# into len(ctx) consecutive chunks, one chunk per context.
fn = lambda c,ctx: to_gpu_tree(dictionary, c, ctx)
X_train = map_with_split_context(fn, ctx, X_train)
X_test = map_with_split_context(fn, ctx, X_test)

# Open question: should the labels be loaded onto a context as well?

# Pair every tree with its label for per-sample iteration.
single_train = list(zip(X_train, y_train))
single_test = list(zip(X_test, y_test))

In [166]:
from gensim.corpora.dictionary import Dictionary
from spacy.en import English

def make_dict(arr, tokenizer):
    """Tokenize every text in ``arr`` and build a gensim ``Dictionary`` from the token texts."""
    token_lists = []
    for text in arr:
        token_lists.append([tok.text for tok in tokenizer(text)])
    return Dictionary(token_lists)

# Tokenizer-only callable created from the English defaults.
# NOTE(review): relies on `spacy.en`, which is the spaCy 1.x module layout — confirm
# against the installed spaCy version.
tokenizer = English().Defaults.create_tokenizer(nlp)
# Vocabulary over the `text` column of `unique`.
# NOTE(review): `unique` must already exist in the kernel when this cell runs.
dictionary = make_dict(unique.text, tokenizer)

In [232]:
# Tree classifier with 200 hidden units and 50-dimensional learned embeddings.
model = ClassifierTreeRNN(200, dictionary=dictionary, embed_dim = 50, dropout=0.5)

# Xavier-initialise all parameters on every context in `ctx`.
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

# Binary cross-entropy on the sigmoid output.
# NOTE(review): there is no clipping, so yhat of exactly 0 or 1 gives log(0).
loss = lambda yhat,y: - (1-y)*nd.log(1 - yhat) - y*nd.log(yhat) 

# Plain SGD over all model parameters.
trainer = Trainer(model.collect_params(), 'sgd',
                  {'learning_rate': 0.05 }, )

In [None]:
def calc_loss(preds, y_test):
    """Threshold ``preds`` at 0.5 and return ``(precision, recall, F-beta with beta=1.5)``.

    Despite the name, the returned values are classification metrics computed
    against ``y_test``, not a loss.
    """
    hard_labels = (np.array(preds) >= .5).astype(int)
    precision = precision_score(y_test, hard_labels)
    recall = recall_score(y_test, hard_labels)
    f_score = fbeta_score(y_test, hard_labels, beta = 1.5)
    return (precision, recall, f_score)

In [None]:
batch_size = 5

# Trees are processed one at a time and the optimizer only steps every
# `batch_size` samples. With the default grad_req='write' each backward()
# overwrites the previous gradient, so only the last sample of a batch would
# contribute; 'add' makes the per-sample gradients accumulate instead.
model.collect_params().setattr('grad_req', 'add')

for epoch in range(10):
    preds = []
    # With grad_req='add' the gradient buffers must be cleared explicitly.
    model.collect_params().zero_grad()
    pending = 0  # samples accumulated since the last optimizer step
    # Wrapping the list (not an enumerate object) lets tqdm show the total.
    for d, l in tqdm(single_train):
        with autograd.record():
            z = model(mx.nd, d)
            lo = loss(z[0], l)
        lo.backward()
        preds.append(z[0].asscalar())
        pending += 1
        if pending == batch_size:
            trainer.step(pending, ignore_stale_grad=True)
            model.collect_params().zero_grad()
            pending = 0
    if pending:
        # Apply the trailing partial batch, normalised by its actual size.
        trainer.step(pending, ignore_stale_grad=True)
        model.collect_params().zero_grad()
    # calc_loss returns (precision, recall, F-beta) rather than a loss value.
    print('training loss from epoch {}: '.format(epoch), calc_loss(preds, y_train))
    test_preds = [model(mx.nd, d)[0].asscalar() for d,l in single_test]
    print('test loss from epoch {}'.format(epoch), calc_loss(test_preds, y_test))


0it [00:00, ?it/s][A
1it [00:00,  5.55it/s][A
2it [00:00,  5.56it/s][A
3it [00:00,  6.37it/s][A
5it [00:00,  7.28it/s][A
7it [00:01,  6.70it/s][A
8it [00:01,  7.29it/s][A
9it [00:01,  7.11it/s][A
11it [00:01,  8.22it/s][A
13it [00:01,  8.83it/s][A
15it [00:01,  9.83it/s][A
17it [00:01, 10.91it/s][A
19it [00:02,  8.85it/s][A
21it [00:02,  7.96it/s][A
22it [00:02,  7.77it/s][A
24it [00:02,  8.45it/s][A
26it [00:03,  9.02it/s][A
27it [00:03,  7.90it/s][A
28it [00:03,  7.55it/s][A
30it [00:03,  9.03it/s][A
32it [00:03, 10.45it/s][A
34it [00:03, 10.54it/s][A
36it [00:03, 10.74it/s][A
38it [00:04,  9.73it/s][A
40it [00:04,  8.50it/s][A
42it [00:04,  8.94it/s][A
43it [00:04,  8.60it/s][A
45it [00:05,  9.14it/s][A
46it [00:05,  7.84it/s][A
47it [00:05,  7.50it/s][A
50it [00:05,  8.48it/s][A
51it [00:05,  8.69it/s][A
54it [00:05, 10.79it/s][A
57it [00:05, 12.40it/s][A
59it [00:06, 10.99it/s][A
61it [00:06, 11.96it/s][A
63it [00:06, 10.63it/s][A
65it [00:06,