In [None]:
! apt-get update
! apt-get install --reinstall python*-decorator

In [None]:
! pip3 install --quiet pymongo
! pip3 install --quiet --upgrade html5lib
! pip3 install --quiet --upgrade beautifulsoup4
! pip3 install --quiet tqdm
# ! pip3 install --quiet --upgrade numpy
# ! pip3 install --quiet --upgrade scipy
# ! pip3 install --quiet --upgrade sklearn
# ! pip3 install --quiet --upgrade pandas
! pip3 install --quiet spacy
# ! pip3 install spacy-nightly

In [None]:
! python3 -m spacy download en_core_web_md

In [None]:
! python3 -m spacy download en

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import mxnet as mx
import fetch.fetch as fetch
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block, nn, rnn, Trainer
from mxnet.gluon.parameter import Parameter
import numpy as np
from tqdm import tqdm
from sklearn.metrics import *
mx.random.seed(1)

In [105]:
def calc_loss(preds, y_test):
    preds = np.array([p.asscalar() for p in preds])
    predictions = (preds >= .5).astype(int)
    return (precision_score(y_test, predictions), recall_score(y_test, predictions), fbeta_score(y_test, predictions, beta = 1.5))

In [100]:
class ChildSumGRU(Block):
    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ChildSumGRU, self).__init__()
        with self.name_scope():
            if dictionary: 
                self.dictionary = dictionary
                vocab_size = len(dictionary.keys())
                self.embed = nn.Embedding(vocab_size, embed_dim)
            self.net = rnn.GRU(num_hidden, dropout = dropout)
            
    def forward(self, F, tree):
        # set computation ctx (tree context? )
        # hidden state is sum of childrens hidden states, which are
        # simply obtained through recursion
        try:
            vec = self.embed(nd.array([self.dictionary.token2id.get(tree.text)]))
        except AttributeError:
            vec = tree.vector

        child_states = [self.forward(F, child) for child in tree.children]
        if child_states:
            hidden_previous = [F.add_n(*child_states)]
        else: 
            hidden_previous = [s.as_in_context(vec.context) for s in 
                               self.net.begin_state(batch_size = 1) ]
        output, _ = self.net(vec, hidden_previous)
        return output

In [101]:
def get_head(doc):
    return [token for token in doc if token.head is token][0]

class ClassifierTreeRNN(Block):
    def __init__(self, num_hidden, dictionary=None, embed_dim=None, dropout=0.5):
        super(ClassifierTreeRNN, self).__init__()
        with self.name_scope():
            self.gru = ChildSumGRU(num_hidden, dictionary, embed_dim, dropout)
            self.decoder = nn.Dense(1, activation = 'sigmoid', in_units = num_hidden)
    def forward(self, F, tree):
        output = self.gru(F, tree)
        # print('output: ', output)
        # print('hidden: ', hidden)
        return self.decoder(output) # reshape??? 

In [166]:
from mxnet import gpu, cpu

ctx = [gpu(4), gpu(5), gpu(6), gpu(7)]

model = ClassifierTreeRNN(500, dropout=0.5)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

loss = lambda yhat,y: - (1-y)*nd.log(1 - yhat) - y*nd.log(yhat) 

trainer = Trainer(model.collect_params(), 'sgd',
                  {'learning_rate': 0.1 }, )

In [170]:
batch_size = 10

for epoch in range(10):
    preds = []
    for i,e in tqdm(enumerate(single_train)):
        d,l = e
        with autograd.record():
            z = model(mx.nd, d)
            preds.append(z[0])
            lo = loss(z[0], l)
            lo.backward()
        if (i != 0) and i % batch_size == 0: 
            trainer.step(batch_size, ignore_stale_grad=True)
    print('training loss from epoch {}: '.format(epoch), calc_loss(preds, y_train))
    test_preds = [model(mx.nd, d)[0] for d,l in single_test]
    print('test loss from epoch {}'.format(epoch), calc_loss(test_preds, y_test))

test loss from epoch 9 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


training loss from epoch 9:  (0.70329670329670335, 0.45714285714285713, 0.51231527093596052)





480it [00:06, 77.52it/s]

476it [00:06, 77.30it/s]

467it [00:06, 79.13it/s]

458it [00:05, 85.64it/s]

447it [00:05, 80.90it/s]

438it [00:05, 80.76it/s]

429it [00:05, 87.51it/s]

419it [00:05, 84.58it/s]

409it [00:05, 82.51it/s]

400it [00:05, 81.32it/s]

390it [00:05, 77.54it/s]

381it [00:04, 80.50it/s]

372it [00:04, 83.64it/s]

362it [00:04, 79.68it/s]

352it [00:04, 74.13it/s]

344it [00:04, 77.93it/s]

334it [00:04, 74.73it/s]

326it [00:04, 73.11it/s]

317it [00:04, 79.67it/s]

308it [00:04, 80.17it/s]

299it [00:03, 84.47it/s]

289it [00:03, 79.82it/s]

280it [00:03, 80.50it/s]

270it [00:03, 75.00it/s]

261it [00:03, 73.76it/s]

253it [00:03, 76.86it/s]

244it [00:03, 73.96it/s]

236it [00:03, 76.47it/s]

228it [00:03, 78.92it/s]

219it [00:02, 75.58it/s]

210it [00:02, 74.65it/s]

202it [00:02, 77.35it/s]

194it [00:02, 77.75it/s]

184it [00:02, 74.03it/s]

175it [00:02, 79.13it/s]

166it [00:02, 79.18it/s]

157it [00:02, 81.47it/s]

148it [00:01, 86.53it/s]

138it [00:01, 84.44it/s]

128it [00:01, 80.45it/s]

118it [00:01, 75.84it/s]

108it [00:01, 68.87it/s]

100it [00:01, 71.37it/s]

91it [00:01, 66.40it/s]

84it [00:01, 66.83it/s]

77it [00:01, 66.07it/s]

70it [00:00, 67.50it/s]

63it [00:00, 66.55it/s]

56it [00:00, 65.81it/s]

49it [00:00, 70.79it/s]

40it [00:00, 65.39it/s]

34it [00:00, 70.76it/s]

27it [00:00, 74.80it/s]

16it [00:00, 70.34it/s]

8it [00:00, 69.26it/s]

test loss from epoch 8 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 8:  (0.69230769230769229, 0.45000000000000001, 0.50431034482758619)





480it [00:06, 74.90it/s]

477it [00:06, 72.54it/s]

469it [00:06, 76.37it/s]

460it [00:06, 82.28it/s]

451it [00:05, 82.25it/s]

441it [00:05, 77.72it/s]

432it [00:05, 85.25it/s]

421it [00:05, 82.04it/s]

411it [00:05, 78.81it/s]

402it [00:05, 77.05it/s]

392it [00:05, 74.71it/s]

383it [00:05, 79.63it/s]

374it [00:05, 80.52it/s]

364it [00:04, 77.84it/s]

355it [00:04, 74.28it/s]

347it [00:04, 75.33it/s]

338it [00:04, 73.30it/s]

330it [00:04, 72.95it/s]

322it [00:04, 72.35it/s]

314it [00:04, 77.00it/s]

306it [00:04, 78.50it/s]

297it [00:04, 80.02it/s]

288it [00:03, 78.71it/s]

279it [00:03, 76.27it/s]

270it [00:03, 71.89it/s]

262it [00:03, 73.91it/s]

254it [00:03, 75.19it/s]

245it [00:03, 71.70it/s]

237it [00:03, 74.00it/s]

229it [00:03, 74.81it/s]

221it [00:03, 73.47it/s]

213it [00:02, 73.01it/s]

205it [00:02, 70.89it/s]

197it [00:02, 73.51it/s]

189it [00:02, 76.71it/s]

180it [00:02, 72.58it/s]

172it [00:02, 77.59it/s]

163it [00:02, 79.48it/s]

154it [00:02, 81.91it/s]

145it [00:02, 81.82it/s]

135it [00:01, 80.77it/s]

125it [00:01, 75.69it/s]

116it [00:01, 70.91it/s]

107it [00:01, 65.77it/s]

100it [00:01, 68.11it/s]

91it [00:01, 63.73it/s]

84it [00:01, 64.87it/s]

77it [00:01, 63.69it/s]

70it [00:01, 64.80it/s]

63it [00:00, 64.24it/s]

56it [00:00, 63.56it/s]

49it [00:00, 68.45it/s]

40it [00:00, 63.05it/s]

34it [00:00, 68.89it/s]

27it [00:00, 73.05it/s]

16it [00:00, 68.85it/s]

8it [00:00, 67.34it/s]

test loss from epoch 7 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 7:  (0.68478260869565222, 0.45000000000000001, 0.50307125307125311)





480it [00:06, 76.77it/s]

477it [00:06, 73.89it/s]

469it [00:06, 77.85it/s]

460it [00:05, 83.32it/s]

451it [00:05, 83.47it/s]

441it [00:05, 78.55it/s]

432it [00:05, 86.81it/s]

421it [00:05, 83.86it/s]

411it [00:05, 79.74it/s]

402it [00:05, 78.15it/s]

392it [00:05, 75.80it/s]

383it [00:05, 81.27it/s]

374it [00:04, 82.53it/s]

365it [00:04, 79.95it/s]

355it [00:04, 75.32it/s]

347it [00:04, 76.02it/s]

338it [00:04, 73.95it/s]

330it [00:04, 73.57it/s]

322it [00:04, 72.72it/s]

314it [00:04, 77.37it/s]

306it [00:04, 79.35it/s]

297it [00:03, 81.17it/s]

288it [00:03, 80.00it/s]

279it [00:03, 77.66it/s]

270it [00:03, 73.39it/s]

261it [00:03, 72.57it/s]

253it [00:03, 75.06it/s]

244it [00:03, 72.06it/s]

236it [00:03, 74.88it/s]

228it [00:03, 77.31it/s]

219it [00:02, 73.83it/s]

210it [00:02, 72.55it/s]

202it [00:02, 75.78it/s]

194it [00:02, 76.38it/s]

185it [00:02, 74.73it/s]

177it [00:02, 77.70it/s]

168it [00:02, 79.42it/s]

159it [00:02, 79.88it/s]

150it [00:01, 88.03it/s]

140it [00:01, 85.68it/s]

130it [00:01, 82.09it/s]

120it [00:01, 76.28it/s]

110it [00:01, 70.26it/s]

102it [00:01, 71.32it/s]

93it [00:01, 67.19it/s]

85it [00:01, 66.37it/s]

78it [00:01, 67.93it/s]

71it [00:00, 69.60it/s]

63it [00:00, 67.72it/s]

56it [00:00, 66.83it/s]

49it [00:00, 71.64it/s]

40it [00:00, 66.54it/s]

34it [00:00, 71.78it/s]

27it [00:00, 75.65it/s]

16it [00:00, 71.02it/s]

8it [00:00, 69.36it/s]

test loss from epoch 6 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 6:  (0.68131868131868134, 0.44285714285714284, 0.4963054187192118)





480it [00:06, 76.15it/s]

475it [00:06, 75.75it/s]

467it [00:06, 77.51it/s]

458it [00:05, 83.65it/s]

447it [00:05, 78.72it/s]

438it [00:05, 78.59it/s]

429it [00:05, 85.71it/s]

419it [00:05, 82.70it/s]

409it [00:05, 80.80it/s]

400it [00:05, 79.45it/s]

391it [00:05, 76.82it/s]

382it [00:05, 78.59it/s]

373it [00:04, 82.71it/s]

363it [00:04, 77.70it/s]

353it [00:04, 73.65it/s]

345it [00:04, 75.93it/s]

336it [00:04, 72.87it/s]

328it [00:04, 71.99it/s]

320it [00:04, 76.41it/s]

311it [00:04, 76.36it/s]

303it [00:04, 78.83it/s]

294it [00:03, 79.37it/s]

285it [00:03, 79.37it/s]

276it [00:03, 76.73it/s]

266it [00:03, 73.13it/s]

258it [00:03, 74.60it/s]

250it [00:03, 74.67it/s]

242it [00:03, 74.45it/s]

234it [00:03, 75.48it/s]

226it [00:03, 74.70it/s]

218it [00:02, 74.52it/s]

209it [00:02, 70.24it/s]

201it [00:02, 73.87it/s]

192it [00:02, 78.72it/s]

182it [00:02, 72.74it/s]

174it [00:02, 76.30it/s]

166it [00:02, 77.56it/s]

157it [00:02, 80.16it/s]

148it [00:01, 85.41it/s]

138it [00:01, 83.37it/s]

128it [00:01, 79.40it/s]

118it [00:01, 74.35it/s]

108it [00:01, 68.08it/s]

100it [00:01, 70.71it/s]

91it [00:01, 65.87it/s]

84it [00:01, 66.85it/s]

77it [00:01, 65.78it/s]

70it [00:00, 67.00it/s]

63it [00:00, 66.66it/s]

56it [00:00, 66.39it/s]

49it [00:00, 71.15it/s]

40it [00:00, 65.95it/s]

34it [00:00, 71.59it/s]

27it [00:00, 75.36it/s]

16it [00:00, 71.00it/s]

8it [00:00, 69.54it/s]

test loss from epoch 5 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 5:  (0.67391304347826086, 0.44285714285714284, 0.49508599508599499)





480it [00:06, 76.41it/s]

477it [00:06, 73.77it/s]

469it [00:06, 77.14it/s]

460it [00:05, 82.85it/s]

451it [00:05, 82.99it/s]

441it [00:05, 78.48it/s]

432it [00:05, 86.98it/s]

421it [00:05, 83.24it/s]

411it [00:05, 79.80it/s]

402it [00:05, 78.06it/s]

392it [00:05, 75.55it/s]

383it [00:05, 80.98it/s]

374it [00:04, 81.84it/s]

363it [00:04, 78.20it/s]

353it [00:04, 74.09it/s]

345it [00:04, 76.66it/s]

336it [00:04, 73.72it/s]

328it [00:04, 72.10it/s]

320it [00:04, 76.31it/s]

311it [00:04, 75.64it/s]

303it [00:04, 78.71it/s]

294it [00:03, 79.99it/s]

285it [00:03, 80.06it/s]

276it [00:03, 77.75it/s]

266it [00:03, 74.14it/s]

258it [00:03, 75.46it/s]

250it [00:03, 74.88it/s]

242it [00:03, 75.34it/s]

234it [00:03, 75.93it/s]

226it [00:03, 75.16it/s]

218it [00:02, 74.84it/s]

209it [00:02, 71.13it/s]

201it [00:02, 74.44it/s]

192it [00:02, 79.54it/s]

182it [00:02, 73.34it/s]

174it [00:02, 77.06it/s]

165it [00:02, 77.36it/s]

156it [00:02, 80.26it/s]

147it [00:01, 85.89it/s]

137it [00:01, 83.78it/s]

127it [00:01, 78.50it/s]

118it [00:01, 74.93it/s]

108it [00:01, 68.45it/s]

101it [00:01, 69.85it/s]

93it [00:01, 67.06it/s]

85it [00:01, 65.69it/s]

78it [00:01, 66.95it/s]

71it [00:01, 68.40it/s]

63it [00:00, 66.53it/s]

56it [00:00, 65.85it/s]

49it [00:00, 70.40it/s]

40it [00:00, 65.58it/s]

34it [00:00, 71.34it/s]

27it [00:00, 75.28it/s]

16it [00:00, 70.93it/s]

8it [00:00, 69.36it/s]

test loss from epoch 4 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 4:  (0.68478260869565222, 0.45000000000000001, 0.50307125307125311)





480it [00:06, 73.61it/s]

472it [00:06, 73.84it/s]

463it [00:06, 78.56it/s]

454it [00:05, 82.40it/s]

443it [00:05, 77.82it/s]

434it [00:05, 82.42it/s]

425it [00:05, 85.69it/s]

415it [00:05, 82.01it/s]

405it [00:05, 77.50it/s]

397it [00:05, 77.40it/s]

389it [00:05, 78.66it/s]

380it [00:05, 78.63it/s]

371it [00:04, 81.29it/s]

362it [00:04, 78.81it/s]

352it [00:04, 73.84it/s]

343it [00:04, 78.46it/s]

333it [00:04, 74.15it/s]

325it [00:04, 71.93it/s]

316it [00:04, 79.91it/s]

307it [00:04, 78.46it/s]

298it [00:03, 81.06it/s]

289it [00:03, 78.90it/s]

280it [00:03, 79.96it/s]

270it [00:03, 74.16it/s]

261it [00:03, 73.49it/s]

253it [00:03, 75.58it/s]

244it [00:03, 72.94it/s]

236it [00:03, 75.36it/s]

228it [00:03, 77.39it/s]

219it [00:02, 74.04it/s]

210it [00:02, 72.85it/s]

202it [00:02, 76.18it/s]

194it [00:02, 77.00it/s]

185it [00:02, 73.96it/s]

177it [00:02, 77.58it/s]

169it [00:02, 77.41it/s]

160it [00:02, 78.06it/s]

151it [00:02, 86.47it/s]

141it [00:01, 84.89it/s]

131it [00:01, 81.41it/s]

120it [00:01, 75.61it/s]

109it [00:01, 68.33it/s]

101it [00:01, 70.14it/s]

93it [00:01, 67.16it/s]

85it [00:01, 66.12it/s]

77it [00:01, 65.83it/s]

70it [00:01, 66.88it/s]

63it [00:00, 66.04it/s]

56it [00:00, 65.13it/s]

49it [00:00, 70.28it/s]

40it [00:00, 64.98it/s]

34it [00:00, 71.28it/s]

27it [00:00, 74.91it/s]

16it [00:00, 70.18it/s]

8it [00:00, 68.86it/s]

test loss from epoch 3 (0.63636363636363635, 0.36842105263157893, 0.42325581395348838)


0it [00:00, ?it/s]

training loss from epoch 3:  (0.68888888888888888, 0.44285714285714284, 0.49753086419753084)





480it [00:06, 75.65it/s]

472it [00:06, 76.00it/s]

463it [00:05, 82.24it/s]

453it [00:05, 87.29it/s]

442it [00:05, 80.20it/s]

432it [00:05, 89.10it/s]

421it [00:05, 85.22it/s]

411it [00:05, 81.63it/s]

402it [00:05, 79.40it/s]

392it [00:05, 76.46it/s]

383it [00:05, 81.39it/s]

374it [00:04, 82.99it/s]

363it [00:04, 79.12it/s]

353it [00:04, 75.34it/s]

345it [00:04, 78.68it/s]

336it [00:04, 75.14it/s]

328it [00:04, 73.87it/s]

320it [00:04, 78.67it/s]

311it [00:04, 77.59it/s]

302it [00:04, 81.27it/s]

293it [00:03, 79.59it/s]

284it [00:03, 79.66it/s]

275it [00:03, 77.61it/s]

266it [00:03, 73.61it/s]

258it [00:03, 75.56it/s]

250it [00:03, 75.29it/s]

242it [00:03, 75.21it/s]

234it [00:03, 76.20it/s]

226it [00:03, 75.60it/s]

218it [00:02, 75.97it/s]

209it [00:02, 71.72it/s]

201it [00:02, 74.97it/s]

192it [00:02, 79.99it/s]

182it [00:02, 74.43it/s]

174it [00:02, 77.85it/s]

165it [00:02, 77.86it/s]

156it [00:02, 80.86it/s]

147it [00:01, 86.23it/s]

137it [00:01, 83.84it/s]

127it [00:01, 79.06it/s]

118it [00:01, 75.19it/s]

108it [00:01, 68.33it/s]

101it [00:01, 69.79it/s]

93it [00:01, 66.81it/s]

85it [00:01, 65.39it/s]

77it [00:01, 64.94it/s]

70it [00:01, 66.35it/s]

63it [00:00, 65.47it/s]

56it [00:00, 64.75it/s]

49it [00:00, 70.19it/s]

40it [00:00, 64.99it/s]

34it [00:00, 70.34it/s]

27it [00:00, 74.30it/s]

16it [00:00, 70.09it/s]

8it [00:00, 68.57it/s]

test loss from epoch 2 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 2:  (0.68235294117647061, 0.41428571428571431, 0.47125000000000006)





480it [00:06, 74.37it/s]

472it [00:06, 74.48it/s]

463it [00:06, 79.70it/s]

454it [00:05, 83.17it/s]

443it [00:05, 78.45it/s]

434it [00:05, 83.80it/s]

425it [00:05, 87.02it/s]

415it [00:05, 82.45it/s]

405it [00:05, 77.51it/s]

397it [00:05, 78.07it/s]

389it [00:05, 78.68it/s]

380it [00:04, 78.23it/s]

371it [00:04, 80.49it/s]

361it [00:04, 77.05it/s]

351it [00:04, 74.18it/s]

343it [00:04, 78.56it/s]

333it [00:04, 74.40it/s]

324it [00:04, 73.15it/s]

315it [00:04, 80.23it/s]

306it [00:04, 80.32it/s]

297it [00:03, 82.14it/s]

288it [00:03, 80.70it/s]

279it [00:03, 78.71it/s]

270it [00:03, 74.86it/s]

261it [00:03, 73.87it/s]

253it [00:03, 76.35it/s]

244it [00:03, 73.05it/s]

236it [00:03, 75.30it/s]

228it [00:03, 77.96it/s]

219it [00:02, 75.09it/s]

210it [00:02, 74.29it/s]

202it [00:02, 77.00it/s]

194it [00:02, 77.62it/s]

185it [00:02, 74.56it/s]

177it [00:02, 77.53it/s]

169it [00:02, 77.74it/s]

160it [00:02, 78.73it/s]

151it [00:02, 86.47it/s]

141it [00:01, 84.83it/s]

131it [00:01, 80.83it/s]

120it [00:01, 75.32it/s]

109it [00:01, 68.42it/s]

101it [00:01, 70.00it/s]

93it [00:01, 67.14it/s]

85it [00:01, 65.83it/s]

77it [00:01, 65.29it/s]

70it [00:00, 67.12it/s]

63it [00:00, 66.70it/s]

56it [00:00, 66.11it/s]

49it [00:00, 70.92it/s]

40it [00:00, 65.32it/s]

34it [00:00, 71.14it/s]

27it [00:00, 75.31it/s]

16it [00:00, 70.63it/s]

8it [00:00, 68.67it/s]

test loss from epoch 1 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 1:  (0.66279069767441856, 0.40714285714285714, 0.46197007481296759)





480it [00:06, 78.04it/s]

478it [00:06, 74.55it/s]

469it [00:05, 78.57it/s]

460it [00:05, 84.11it/s]

451it [00:05, 84.52it/s]

441it [00:05, 80.22it/s]

432it [00:05, 88.82it/s]

421it [00:05, 85.71it/s]

411it [00:05, 82.28it/s]

402it [00:05, 80.25it/s]

392it [00:05, 77.39it/s]

383it [00:04, 82.30it/s]

374it [00:04, 83.74it/s]

364it [00:04, 81.06it/s]

354it [00:04, 75.22it/s]

345it [00:04, 78.42it/s]

336it [00:04, 74.99it/s]

327it [00:04, 72.12it/s]

318it [00:04, 79.74it/s]

309it [00:04, 79.22it/s]

300it [00:03, 82.97it/s]

291it [00:03, 80.41it/s]

282it [00:03, 80.21it/s]

271it [00:03, 73.45it/s]

263it [00:03, 77.32it/s]

255it [00:03, 76.72it/s]

245it [00:03, 73.51it/s]

237it [00:03, 76.92it/s]

228it [00:02, 79.97it/s]

219it [00:02, 76.44it/s]

211it [00:02, 76.18it/s]

203it [00:02, 77.05it/s]

195it [00:02, 76.92it/s]

187it [00:02, 77.62it/s]

179it [00:02, 77.38it/s]

171it [00:02, 78.18it/s]

162it [00:02, 82.50it/s]

153it [00:02, 84.58it/s]

143it [00:01, 87.78it/s]

132it [00:01, 80.93it/s]

122it [00:01, 76.17it/s]

112it [00:01, 71.82it/s]

104it [00:01, 69.14it/s]

96it [00:01, 72.56it/s]

87it [00:01, 67.44it/s]

80it [00:01, 67.51it/s]

73it [00:01, 69.55it/s]

65it [00:00, 69.23it/s]

57it [00:00, 68.27it/s]

50it [00:00, 71.11it/s]

40it [00:00, 66.84it/s]

34it [00:00, 72.35it/s]

27it [00:00, 76.58it/s]

16it [00:00, 71.94it/s]

8it [00:00, 70.19it/s]

test loss from epoch 0 (0.66666666666666663, 0.36842105263157893, 0.42723004694835676)


0it [00:00, ?it/s]

training loss from epoch 0:  (0.6470588235294118, 0.39285714285714285, 0.44687500000000002)





480it [00:06, 76.14it/s]

474it [00:06, 75.20it/s]

466it [00:06, 77.02it/s]

457it [00:05, 83.78it/s]

446it [00:05, 78.13it/s]

437it [00:05, 80.07it/s]

428it [00:05, 85.05it/s]

419it [00:05, 83.24it/s]

409it [00:05, 80.80it/s]

400it [00:05, 79.06it/s]

391it [00:05, 77.04it/s]

382it [00:05, 78.81it/s]

373it [00:04, 82.77it/s]

363it [00:04, 77.89it/s]

353it [00:04, 73.88it/s]

345it [00:04, 76.37it/s]

336it [00:04, 73.54it/s]

328it [00:04, 71.83it/s]

320it [00:04, 76.86it/s]

311it [00:04, 76.45it/s]

302it [00:04, 80.51it/s]

293it [00:03, 79.24it/s]

284it [00:03, 79.69it/s]

275it [00:03, 77.71it/s]

266it [00:03, 73.70it/s]

258it [00:03, 75.16it/s]

250it [00:03, 74.64it/s]

242it [00:03, 74.90it/s]

234it [00:03, 75.74it/s]

226it [00:03, 75.02it/s]

218it [00:02, 74.83it/s]

209it [00:02, 70.77it/s]

201it [00:02, 74.26it/s]

192it [00:02, 79.50it/s]

182it [00:02, 73.43it/s]

174it [00:02, 76.60it/s]

166it [00:02, 77.82it/s]

157it [00:02, 79.85it/s]

148it [00:01, 85.58it/s]

138it [00:01, 83.70it/s]

128it [00:01, 79.47it/s]

118it [00:01, 74.48it/s]

108it [00:01, 67.78it/s]

100it [00:01, 70.28it/s]

91it [00:01, 65.60it/s]

84it [00:01, 66.94it/s]

77it [00:01, 65.73it/s]

70it [00:01, 67.05it/s]

63it [00:00, 66.43it/s]

56it [00:00, 65.58it/s]

49it [00:00, 70.44it/s]

40it [00:00, 65.20it/s]

34it [00:00, 70.24it/s]

27it [00:00, 73.92it/s]

16it [00:00, 69.90it/s]

8it [00:00, 68.76it/s]

0it [00:00, ?it/s]

In [None]:
import spacy
nlp = spacy.load('en_core_web_md')

import fetch.fetch as fetch

df = fetch.create_df(fetch.get_labelled_articles("209.177.92.45:80"))

In [None]:
from modelling.clustering import get_unique_items

ge_unique = get_unique_items(df[df._id.str.contains('ge')], .1)
tw_unique = get_unique_items(df[df._id.str.contains('tw')], 0.35)
unique = pd.concat([ge_unique, tw_unique])[:604]

In [167]:
from modelling.utils import clean_html, preprocessor

unique.body = unique.body.map(preprocessor)

unique = unique[unique.body.str.len() > 5]
bodies = unique.body.as_matrix().tolist()

d = [get_head(doc) for doc in map(nlp, bodies)]

In [154]:
def batchify(data, batch_size):
    return np.array(np.split(np.array(data), 
                             len(data)/batch_size))

In [33]:
class Tree(object):
    def __init__(self, ctx, text, vector, children):
        self.text = text
        self.vector = nd.array([[vector]], ctx = ctx)
        self.children = [Tree(ctx, c.text, c.vector, c.children) for c in children]

def to_gpu_tree(c, ctx):
    return Tree(ctx, 
                c.text, 
                c.vector, 
                c.children) 

In [153]:
from random import shuffle

def split(data, num):
    try:
        return np.array(np.split(data, num))
    except AttributeError:
        return list(map(list, np.split(np.array(data), num)))

def map_with_split_context(fn, ctx, data):
    splitted = split(data, len(ctx))
    li =  [fn(c, ctx[i]) for i,d in enumerate(splitted) for c in d]
    # shuffle(li)
    return li

In [168]:
from sklearn.model_selection import train_test_split

labels = (unique.label == 'accepted').astype(int)
X_train, X_test, y_train, y_test = train_test_split(d, labels, test_size = .20)

X_train = map_with_split_context(to_gpu_tree, ctx, X_train)
X_test = map_with_split_context(to_gpu_tree, ctx, X_test)

# load y on ctx??? 

single_train = list(zip(X_train, y_train))
single_test = list(zip(X_test, y_test))

In [None]:
from gensim.corpora.dictionary import Dictionary

from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)

lis = unique.body.map(tokenizer)
docs = [[w.text for w in doc] for doc in lis.tolist()]
dictionary = Dictionary(docs)