In [None]:
! apt-get update
! apt-get install --reinstall python*-decorator

In [None]:
! pip3 install --quiet pymongo
! pip3 install --quiet --upgrade html5lib
! pip3 install --quiet --upgrade beautifulsoup4
! pip3 install --quiet tqdm
# ! pip3 install --quiet --upgrade numpy
# ! pip3 install --quiet --upgrade scipy
# ! pip3 install --quiet --upgrade sklearn
# ! pip3 install --quiet --upgrade pandas
! pip3 install --quiet spacy
# ! pip3 install spacy-nightly

In [None]:
! pip install --quiet pymongo
! pip install --quiet tqdm
! pip install spacy-nightly

In [12]:
! spacy download en_core_web_md

/bin/sh: 1: spacy: not found


In [None]:
! python3 -m spacy download en_core_web_md

In [None]:
! python3 -m spacy download en

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import mxnet as mx
import fetch.fetch as fetch
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block, nn, rnn, Trainer
from mxnet.gluon.parameter import Parameter
import numpy as np
from tqdm import tqdm
from sklearn.metrics import *
mx.random.seed(1)

In [105]:
def calc_loss(preds, y_test):
    preds = np.array([p.asscalar() for p in preds])
    predictions = (preds >= .5).astype(int)
    return (precision_score(y_test, predictions), recall_score(y_test, predictions), fbeta_score(y_test, predictions, beta = 1.5))

In [100]:
class ChildSumGRU(Block):
    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ChildSumGRU, self).__init__()
        with self.name_scope():
            if dictionary: 
                self.dictionary = dictionary
                vocab_size = len(dictionary.keys())
                self.embed = nn.Embedding(vocab_size, embed_dim)
            self.net = rnn.GRU(num_hidden, dropout = dropout)
            
    def forward(self, F, tree):
        # set computation ctx (tree context? )
        # hidden state is sum of childrens hidden states, which are
        # simply obtained through recursion
        try:
            vec = self.embed(nd.array([self.dictionary.token2id.get(tree.text)]))
        except AttributeError:
            vec = tree.vector

        child_states = [self.forward(F, child) for child in tree.children]
        if child_states:
            hidden_previous = [F.add_n(*child_states)]
        else: 
            hidden_previous = [s.as_in_context(vec.context) for s in 
                               self.net.begin_state(batch_size = 1) ]
        output, _ = self.net(vec, hidden_previous)
        return output

In [101]:
def get_head(doc):
    return [token for token in doc if token.head is token][0]

class ClassifierTreeRNN(Block):
    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ClassifierTreeRNN, self).__init__()
        with self.name_scope():
            self.gru = ChildSumGRU(num_hidden, dictionary, embed_dim, dropout)
            self.decoder = nn.Dense(1, activation = 'sigmoid', in_units = num_hidden)
    def forward(self, F, tree):
        output = self.gru(F, tree)
        # print('output: ', output)
        # print('hidden: ', hidden)
        return self.decoder(output) # reshape??? 

In [166]:
from mxnet import gpu, cpu

ctx = [gpu(4), gpu(5), gpu(6), gpu(7)]

model = ClassifierTreeRNN(500, dropout=0.5)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

loss = lambda yhat,y: - (1-y)*nd.log(1 - yhat) - y*nd.log(yhat) 

trainer = Trainer(model.collect_params(), 'sgd',
                  {'learning_rate': 0.1 }, )

In [170]:
batch_size = 10

for epoch in range(10):
    preds = []
    for i,e in tqdm(enumerate(single_train)):
        d,l = e
        with autograd.record():
            z = model(mx.nd, d)
            preds.append(z[0])
            lo = loss(z[0], l)
            lo.backward()
        if (i != 0) and i % batch_size == 0: 
            trainer.step(batch_size, ignore_stale_grad=True)
    print('training loss from epoch {}: '.format(epoch), calc_loss(preds, y_train))
    test_preds = [model(mx.nd, d)[0] for d,l in single_test]
    print('test loss from epoch {}'.format(epoch), calc_loss(test_preds, y_test))

test loss from epoch 9 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)
training loss from epoch 9:  (0.70329670329670335, 0.45714285714285713, 0.51231527093596052)



8it [00:00, 69.26it/s]]]

test loss from epoch 8 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 8:  (0.69230769230769229, 0.45000000000000001, 0.50431034482758619)



8it [00:00, 67.34it/s]]]

test loss from epoch 7 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 7:  (0.68478260869565222, 0.45000000000000001, 0.50307125307125311)



8it [00:00, 69.36it/s]]]

test loss from epoch 6 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 6:  (0.68131868131868134, 0.44285714285714284, 0.4963054187192118)



8it [00:00, 69.54it/s]]]

test loss from epoch 5 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 5:  (0.67391304347826086, 0.44285714285714284, 0.49508599508599499)



8it [00:00, 69.36it/s]]]

test loss from epoch 4 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 4:  (0.68478260869565222, 0.45000000000000001, 0.50307125307125311)



8it [00:00, 68.86it/s]]]

test loss from epoch 3 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 3:  (0.68888888888888888, 0.44285714285714284, 0.49753086419753084)



8it [00:00, 68.57it/s]]]

test loss from epoch 2 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 2:  (0.68235294117647061, 0.41428571428571431, 0.47125000000000006)



8it [00:00, 68.67it/s]]]

test loss from epoch 1 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 1:  (0.66279069767441856, 0.40714285714285714, 0.46197007481296759)



8it [00:00, 70.19it/s]]]

test loss from epoch 0 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 0:  (0.6470588235294118, 0.39285714285714285, 0.44687500000000002)



0it [00:00, ?it/s]t/s]]]

In [None]:
import spacy
nlp = spacy.load('en_core_web_md')

import fetch.fetch as fetch

df = fetch.create_df(fetch.get_labelled_articles("209.177.92.45:80"))

In [None]:
from modelling.clustering import get_unique_items

ge_unique = get_unique_items(df[df._id.str.contains('ge')], .1)
tw_unique = get_unique_items(df[df._id.str.contains('tw')], 0.35)
unique = pd.concat([ge_unique, tw_unique])[:604]

In [167]:
from modelling.utils import clean_html, preprocessor

unique.body = unique.body.map(preprocessor)

unique = unique[unique.body.str.len() > 5]
bodies = unique.body.as_matrix().tolist()

d = [get_head(doc) for doc in map(nlp, bodies)]

In [154]:
def batchify(data, batch_size):
    return np.array(np.split(np.array(data), 
                             len(data)/batch_size))

In [33]:
class Tree(object):
    def __init__(self, ctx, text, vector, children):
        self.text = text
        self.vector = nd.array([[vector]], ctx = ctx)
        self.children = [Tree(ctx, c.text, c.vector, c.children) for c in children]

def to_gpu_tree(c, ctx):
    return Tree(ctx, 
                c.text, 
                c.vector, 
                c.children) 

In [153]:
from random import shuffle

def split(data, num):
    try:
        return np.array(np.split(data, num))
    except AttributeError:
        return list(map(list, np.split(np.array(data), num)))

def map_with_split_context(fn, ctx, data):
    splitted = split(data, len(ctx))
    li =  [fn(c, ctx[i]) for i,d in enumerate(splitted) for c in d]
    # shuffle(li)
    return li

In [168]:
from sklearn.model_selection import train_test_split

labels = (unique.label == 'accepted').astype(int)
X_train, X_test, y_train, y_test = train_test_split(d, labels, test_size = .20)

X_train = map_with_split_context(to_gpu_tree, ctx, X_train)
X_test = map_with_split_context(to_gpu_tree, ctx, X_test)

# load y on ctx??? 

single_train = list(zip(X_train, y_train))
single_test = list(zip(X_test, y_test))

In [None]:
from gensim.corpora.dictionary import Dictionary

from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)

lis = unique.body.map(tokenizer)
docs = [[w.text for w in doc] for doc in lis.tolist()]
dictionary = Dictionary(docs)