In [None]:
! pip install --quiet --upgrade mxnet
! pip install --quiet pymongo 

In [7]:
! pip install --quiet gensim

In [12]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import mxnet as mx
import fetch.fetch as fetch
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block, nn, rnn, Trainer
from mxnet.gluon.parameter import Parameter
import numpy as np
from tqdm import tqdm
from sklearn.metrics import *
mx.random.seed(1)

In [425]:
class ChildSumGRU(Block):
    """GRU cell applied recursively over a tree, bottom-up.

    For every node the previous hidden state fed to the GRU is the sum of the
    hidden states of the node's children (the cell's begin state for leaves).
    The node's input is either a learned embedding of ``tree.text`` (when a
    non-empty ``dictionary`` was supplied) or the node's own ``tree.vector``.

    Parameters
    ----------
    num_hidden : int
        Size of the GRU hidden state.
    input_size : int
        Size of the vector fed to the GRU for each node.
    dictionary : optional
        Object exposing ``keys()`` and a ``token2id`` mapping. When given (and
        non-empty) an embedding layer is created and used for node inputs.
    embed_dim : int, optional
        Output size of the embedding layer; required when ``dictionary`` is
        given.
    dropout : float
        When greater than 0 a ``DropoutCell`` is stacked after the GRU cell.
    """

    def __init__(self, num_hidden, input_size, dictionary=None, embed_dim=None, dropout=0.5):
        super(ChildSumGRU, self).__init__()
        with self.name_scope():
            self.dictionary = dictionary
            if dictionary:
                if embed_dim is None:
                    raise ValueError('embed_dim is required when a dictionary is given')
                vocab_size = len(dictionary.keys())
                self.embed = nn.Embedding(vocab_size, embed_dim)
            self.net = rnn.SequentialRNNCell()
            with self.net.name_scope():
                self.net.add(rnn.GRUCell(num_hidden, input_size=input_size))
                if dropout > 0.:
                    self.net.add(rnn.DropoutCell(dropout))

    def _node_input(self, F, tree):
        """Return the (1, input_size) input for ``tree``'s own node."""
        # The embedding layer only exists when a non-empty dictionary was
        # passed to __init__; otherwise fall back to the node's own vector.
        embed = getattr(self, 'embed', None)
        if embed is None:
            return F.array([tree.vector])
        token_id = self.dictionary.token2id.get(tree.text)
        if token_id is None:
            # A missing token would otherwise become a NaN index into the
            # embedding matrix; fail loudly instead.
            raise KeyError('token {!r} is not in the dictionary'.format(tree.text))
        return embed(F.array([token_id]))

    def forward(self, F, tree):
        """Run the cell on ``tree`` and, recursively, on all of its descendants.

        ``F`` is the array namespace used to build arrays (the notebook passes
        ``mx.nd``). Returns the ``(output, hidden)`` pair produced by the cell
        for the node ``tree`` itself.
        """
        # TODO: set computation ctx (tree context?)
        # The previous hidden state is the sum of the children's hidden
        # states, which are obtained through recursion.
        child_states = [self.forward(F, child)[1] for child in tree.children]
        if child_states:
            hidden_previous = [F.add_n(*[state[0] for state in child_states])]
        else:
            hidden_previous = self.net.begin_state(batch_size=1)
        vec = self._node_input(F, tree)
        output, hidden = self.net(vec, hidden_previous)
        return output, hidden

In [434]:
def get_head(doc):
    """Return the first token in ``doc`` that is its own head (the root).

    Raises
    ------
    IndexError
        If no token is its own head (for example when ``doc`` is empty).
    """
    for token in doc:
        # Compare by equality rather than identity: `token.head` need not be
        # the very same Python object as `token` even when it denotes the same
        # token.
        if token.head == token:
            return token
    raise IndexError('no root token (a token that is its own head) found in doc')

class ClassifierTreeRNN(Block):
    """Binary classifier: a ``ChildSumGRU`` over a tree plus a sigmoid unit.

    The constructor arguments are forwarded unchanged to ``ChildSumGRU``; the
    decoder is a single-unit dense layer with sigmoid activation that reads
    the ``num_hidden``-sized output produced for the root node.
    """

    def __init__(self, num_hidden, input_size, dictionary=None, embed_dim=None, dropout=0.5):
        super(ClassifierTreeRNN, self).__init__()
        with self.name_scope():
            self.gru = ChildSumGRU(
                num_hidden,
                input_size,
                dictionary,
                embed_dim,
                dropout,
            )
            self.decoder = nn.Dense(1, in_units=num_hidden, activation='sigmoid')

    def forward(self, F, tree):
        """Return the decoder's sigmoid output for the tree rooted at ``tree``."""
        # Only the root's output is decoded; its hidden state is not needed.
        root_output, _ = self.gru(F, tree)
        return self.decoder(root_output)

In [435]:
# Build the classifier (hidden size 300, node input size 128, no dropout)
# and initialise every parameter with Xavier initialisation.
model = ClassifierTreeRNN(num_hidden=300, input_size=128, dropout=0)
model.collect_params().initialize(init=mx.init.Xavier())

def loss(yhat, y, eps=1e-7):
    """Binary cross-entropy between a predicted probability and a 0/1 label.

    ``yhat`` is clipped to ``[eps, 1 - eps]`` before taking logarithms so that
    a saturated sigmoid output (exactly 0 or 1) cannot produce ``inf``/``NaN``
    losses and gradients.
    """
    yhat = nd.clip(yhat, eps, 1 - eps)
    return - (1 - y) * nd.log(1 - yhat) - y * nd.log(yhat)

In [436]:
# Display the model's registered parameters (names, shapes and dtypes).
model.collect_params()

classifiertreernn90_ (
  Parameter classifiertreernn90_childsumgru0_sequentialrnncell0_gru0_i2h_weight (shape=(900, 128), dtype=<class 'numpy.float32'>)
  Parameter classifiertreernn90_childsumgru0_sequentialrnncell0_gru0_h2h_weight (shape=(900, 300), dtype=<class 'numpy.float32'>)
  Parameter classifiertreernn90_childsumgru0_sequentialrnncell0_gru0_i2h_bias (shape=(900,), dtype=<class 'numpy.float32'>)
  Parameter classifiertreernn90_childsumgru0_sequentialrnncell0_gru0_h2h_bias (shape=(900,), dtype=<class 'numpy.float32'>)
  Parameter classifiertreernn90_dense0_weight (shape=(1, 300), dtype=<class 'numpy.float32'>)
  Parameter classifiertreernn90_dense0_bias (shape=(1,), dtype=<class 'numpy.float32'>)
)

In [250]:
# Plain SGD over all of the model's parameters.
# NOTE: the trainer is bound to the parameters of the `model` object that
# exists when this cell runs; re-run this cell whenever `model` is rebuilt.
trainer = Trainer(
    model.collect_params(),
    'sgd',
    {'learning_rate': 0.05},
)

In [239]:
def calc_loss(preds, y_test):
    """Score predicted probabilities against the true labels.

    Each element of ``preds`` must expose ``asscalar()``; values at or above
    0.5 are counted as the positive class. Despite the name, this returns the
    tuple ``(precision, recall, F-beta with beta=1.5)`` rather than a loss.
    """
    scores = np.array([pred.asscalar() for pred in preds])
    hard_labels = (scores >= .5).astype(int)
    precision = precision_score(y_test, hard_labels)
    recall = recall_score(y_test, hard_labels)
    f_beta = fbeta_score(y_test, hard_labels, beta = 1.5)
    return (precision, recall, f_beta)

In [433]:
batch_size = 5

# NOTE: `single_train`, `single_test`, `y_train` and `y_test` come from the
# train/test split cell further down in this notebook, which must have been
# run first. `trainer` must wrap the parameters of the current `model`.
for epoch in range(10):
    preds = []
    # Mini-batch training. Every tree builds its own graph, so the per-sample
    # losses of a batch are summed and back-propagated once. Calling
    # backward() per sample would overwrite the gradient each time (the
    # default grad_req is 'write'), leaving only one sample per step.
    for start in tqdm(range(0, len(single_train), batch_size)):
        batch = single_train[start:start + batch_size]
        with autograd.record():
            outputs = [model(mx.nd, tree)[0] for tree, _ in batch]
            batch_loss = nd.add_n(*[loss(yhat, label)
                                    for yhat, (_, label) in zip(outputs, batch)])
        batch_loss.backward()
        # Normalise by the real batch length; the last batch may be shorter.
        trainer.step(len(batch))
        preds.extend(outputs)
    print('training loss from epoch {}: '.format(epoch), calc_loss(preds, y_train))
    test_preds = [model(mx.nd, tree)[0] for tree, _ in single_test]
    print('test loss from epoch {}'.format(epoch), calc_loss(test_preds, y_test))

UserWarning: Gradient of Parameter `classifiertreernn37_dense0_weight` on context cpu(0) has not been updated by backward since last `step`. This could mean a bug in your model that maked it only use a subset of the Parameters (Blocks) for this iteration. If you are intentionally only using a subset, call step with ignore_stale_grad=True to suppress this warning and skip updating of Parameters with stale gradient




0it [00:00, ?it/s]

In [None]:
# Debugging aid: dump the current gradient of every model parameter.
param_dict = model.collect_params()

for name, param in param_dict.items():
    print(name, param.grad())

In [17]:
import spacy
import fetch.fetch as fetch

# English spaCy pipeline, used below to parse article bodies.
nlp = spacy.load('en')

# Address handed to the project's fetch helper.
# NOTE(review): hard-coded host:port — consider moving it to configuration.
ARTICLES_HOST = "209.177.92.45:80"

labelled_articles = fetch.get_labelled_articles(ARTICLES_HOST)
df = fetch.create_df(labelled_articles)

In [165]:
from modelling.clustering import get_unique_items

# Split the articles by the substring found in their `_id` ('ge' / 'tw') and
# de-duplicate each group with its own setting.
# NOTE(review): the second argument of get_unique_items is presumably a
# similarity threshold — confirm against modelling.clustering.
is_ge = df._id.str.contains('ge')
is_tw = df._id.str.contains('tw')

ge_unique = get_unique_items(df[is_ge], .1)
tw_unique = get_unique_items(df[is_tw], 0.35)

# Stack both groups and keep the first 604 rows.
unique = pd.concat([ge_unique, tw_unique])[:604]

In [166]:
from modelling.utils import clean_html, preprocessor

# Clean every article body, producing a new frame instead of mutating the
# existing one in place.
unique = unique.assign(body=unique.body.map(preprocessor))

# Drop articles whose cleaned body is 5 characters or shorter.
unique = unique[unique.body.str.len() > 5]

# `Series.tolist()` replaces `as_matrix().tolist()`; `as_matrix` no longer
# exists in current pandas.
bodies = unique.body.tolist()

# Parse each body with spaCy and keep the root token of its dependency tree.
d = [get_head(doc) for doc in map(nlp, bodies)]

In [167]:
def batchify(data, batch_size):
    """Group ``data`` into consecutive batches of exactly ``batch_size`` items.

    Parameters
    ----------
    data : sequence or array-like
        Items to batch; converted with ``np.array``.
    batch_size : int
        Number of items per batch; must be positive and divide ``len(data)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data) // batch_size, batch_size, ...)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or does not evenly divide the data.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))
    arr = np.array(data)
    # Integer arithmetic: a float section count (len / batch_size) can pass
    # numpy's divisibility check and then be truncated, silently giving
    # batches of the wrong size.
    n_batches, leftover = divmod(len(arr), batch_size)
    if leftover:
        raise ValueError(
            'cannot split {} items into batches of {}'.format(len(arr), batch_size))
    return arr.reshape((n_batches, batch_size) + arr.shape[1:])

In [168]:
from sklearn.model_selection import train_test_split

# Binary target: 1 for articles labelled 'accepted', 0 otherwise.
labels = (unique.label == 'accepted').astype(int)

# Hold out 20% for testing. A fixed random_state keeps the split (and so the
# reported scores) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    d, labels, test_size = .20, random_state = 1)

# (tree, label) pairs consumed one at a time by the training loop.
single_train = list(zip(X_train, y_train))
single_test = list(zip(X_test, y_test))

In [56]:
from gensim.corpora.dictionary import Dictionary
from spacy.lang.en import English

# Tokenizer-only pipeline: no parsing is needed to build the vocabulary.
tokenizer = English().Defaults.create_tokenizer(nlp)

# Tokenize every article body, then reduce each document to its token texts.
lis = unique.body.map(tokenizer)
docs = [[token.text for token in tokens] for tokens in lis.tolist()]

# Token <-> id mapping over all article bodies.
dictionary = Dictionary(docs)

In [73]:
# Vocabulary size: number of distinct tokens in the dictionary.
len(dictionary)

3182

In [None]:
from sklearn.utils import resample

# Draw 100 rows (with replacement, the default) from the accepted articles.
# The sample count must be passed as `n_samples`: positional arguments of
# `resample` are all treated as arrays to resample together.
# `.values` replaces `as_matrix()`, which no longer exists in current pandas.
resample(unique[unique.label == 'accepted'].values, n_samples=100)

In [None]:
def balance_my_frame(df, label):
    