In [13]:
import pickle
import pandas as pd
import numpy as np
from collections import defaultdict
df_folda = '/Users/sango.m.ab/Desktop/research/script/anaphora/src/lstm/dataframe'
domain_dict = {'OC':'Yahoo!知恵袋','OW':'白書','OY':'Yahoo!ブログ',
    'PB':'書籍','PM':'雑誌','PN':'新聞'}

In [2]:
def load_dataset(df_folda):
    dataset_dict = {}
    for domain in domain_dict:
        with open(f'{df_folda}/dataframe_list_{domain}.pickle', 'rb') as f:
            df_list = pickle.load(f)
        dataset = []
        for df in df_list:
            y_ga = np.array(df['ga_case'], dtype=np.int32)
            y_o = np.array(df['o_case'], dtype=np.int32)
            y_ni = np.array(df['ni_case'], dtype=np.int32)
            df = df.drop('ga_case', axis=1).drop('o_case', axis=1).drop('ni_case', axis=1).drop('ga_dep_tag', axis=1).drop('o_dep_tag', axis=1).drop('ni_dep_tag', axis=1)
            x = np.array(df, dtype=np.float32)
#             y_ga_index = np.array([y_ga.argmax()], dtype=np.int32)
            dataset.append((x, y_ga))
        dataset_dict[domain] = dataset
    return dataset_dict


In [4]:
with open('./dataframe/dataframe_list_OC.pickle', 'rb') as f:
    df_list = pickle.load(f)

In [93]:
dataset = []
for df in df_list:
    y_ga = np.array(df['ga_case'], dtype=np.int32)
    y_o = np.array(df['o_case'], dtype=np.int32)
    y_ni = np.array(df['ni_case'], dtype=np.int32)
    df = df.drop('ga_case', axis=1).drop('o_case', axis=1).drop('ni_case', axis=1).drop('ga_dep_tag', axis=1).drop('o_dep_tag', axis=1).drop('ni_dep_tag', axis=1)
    x = np.array(df, dtype=np.float32)
#     y_ga_index = np.array([y_ga.argmax()], dtype=np.int32)
    dataset.append((x, y_ga))

In [22]:
import chainer
from chainer import training
from chainer.training import extensions

In [23]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import reporter

In [82]:
n_class = 91
batchsize = 100

In [83]:
train_iter = chainer.iterators.SerialIterator(dataset[:11000], batchsize)
test_iter = chainer.iterators.SerialIterator(dataset[11000:], batchsize,
                                             repeat=False, shuffle=False)

In [84]:
class RNNEncoder(chainer.Chain):

    """A LSTM-RNN Encoder with Word Embedding.

    This model encodes a sentence sequentially using LSTM.

    Args:
        n_layers (int): The number of LSTM layers.
        n_vocab (int): The size of vocabulary.
        n_units (int): The number of units of a LSTM layer and word embedding.
        dropout (float): The dropout ratio.

    """

    def __init__(self, n_layers, n_units, dropout=0.1):
        super(RNNEncoder, self).__init__()
        with self.init_scope():
            self.encoder = L.NStepLSTM(n_layers, n_units, n_units, dropout)

        self.n_layers = n_layers
        self.out_units = n_units
        self.dropout = dropout

    def __call__(self, xs):
        last_h, last_c, ys = self.encoder(None, None, xs)
        assert(last_h.shape == (self.n_layers, len(xs), self.out_units))
        concat_outputs = last_h[-1]
        return concat_outputs

In [85]:
encoder = RNNEncoder(n_layers=1, n_units=234, dropout=0.5)

In [86]:
class TextClassifier(chainer.Chain):

    """A classifier using a given encoder.

     This chain encodes a sentence and classifies it into classes.

     Args:
         encoder (Link): A callable encoder, which extracts a feature.
             Input is a list of variables whose shapes are
             "(sentence_length, )".
             Output is a variable whose shape is "(batchsize, n_units)".
         n_class (int): The number of classes to be predicted.

     """

    def __init__(self, encoder, n_class, dropout=0.1):
        super(TextClassifier, self).__init__()
        with self.init_scope():
            self.encoder = encoder
            self.output = L.Linear(encoder.out_units, n_class)
        self.dropout = dropout

    def __call__(self, xs, ys):
        concat_outputs = self.predict(xs)
        concat_truths = F.concat(ys, axis=0)

        loss = F.softmax_cross_entropy(concat_outputs, concat_truths)
        accuracy = F.accuracy(concat_outputs, concat_truths)
        reporter.report({'loss': loss.data}, self)
        reporter.report({'accuracy': accuracy.data}, self)
        return loss

    def predict(self, xs, softmax=False, argmax=False):
        concat_encodings = F.dropout(self.encoder(xs), ratio=self.dropout)
        concat_outputs = self.output(concat_encodings)
        if softmax:
            return F.softmax(concat_outputs).data
        elif argmax:
            return self.xp.argmax(concat_outputs.data, axis=1)
        else:
            return concat_outputs

In [87]:
model = TextClassifier(encoder, n_class)

In [88]:
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

In [89]:
updater = training.StandardUpdater(
    train_iter, optimizer,  converter=convert_seq, device=-1)
trainer = training.Trainer(updater, (10, 'epoch'), out='result')

# Evaluate the model with the test dataset for each epoch
trainer.extend(extensions.Evaluator(
    test_iter, model,  converter=convert_seq, device=-1))

# Take a best snapshot
record_trigger = training.triggers.MaxValueTrigger(
    'validation/main/accuracy', (1, 'epoch'))
trainer.extend(extensions.snapshot_object(
    model, 'best_model.npz'),
    trigger=record_trigger)

# Write a log of evaluation statistics for each epoch
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
    ['epoch', 'main/loss', 'validation/main/loss',
     'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

# Print a progress bar to stdout
trainer.extend(extensions.ProgressBar())

In [90]:
trainer.run()

epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy  elapsed_time
[J     total [####..............................................]  9.09%
this epoch [#############################################.....] 90.91%
       100 iter, 0 epoch / 10 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
[4A[J1           2.79333     2.66317               0.318545       0.329309                  86.1012       
[J     total [#########.........................................] 18.18%
this epoch [########################################..........] 81.82%
       200 iter, 1 epoch / 10 epochs
    1.3784 iters/sec. Estimated time to finish: 0:10:52.933378.
[4A[J2           2.51246     2.51589               0.341364       0.34023                   164.085       
[J     total [#############.....................................] 27.27%
this epoch [####################################..............] 72.73%
       300 iter, 2 epoch / 10 epochs
    1.3931 iter

In [231]:
from chainer import Chain
from chainer import reporter
import chainer.links as L
import chainer.functions as F

class BiLSTMBase(Chain):
    def __init__(self, input_size, n_labels, dropout=0.5):
        super(BiLSTMBase, self).__init__()
        with self.init_scope():
      # self.f_lstm = L.LSTM(None, feature_size, dropout)
      # self.b_lstm = L.LSTM(None, feature_size, dropout)
            self.nstep_bilstm = L.NStepBiLSTM(n_layers=1, in_size=input_size, out_size=input_size, dropout=dropout)
            self.l1 = L.Linear(input_size*2, n_labels)
        self.dropout = dropout

    def __call__(self, xs, ys):
        pred_ys = self.traverse(xs)
        
        loss = .0
        for pred_y, y in zip(pred_ys, ys):
            _loss = F.softmax_cross_entropy(pred_y, y)
            loss += _loss
        reporter.report({'loss': loss.data}, self)
        
        accuracy = .0
        pred_ys = [F.softmax(pred_y) for pred_y in pred_ys]
        pred_ys = [pred_y.data.argmax(axis=0)[1] for pred_y in pred_ys]
        ys = [y.argmax(axis=0) for y in ys]
        for pred_y, y in zip(pred_ys, ys):
            if y == pred_y:
                accuracy += 1/len(ys)
        reporter.report({'accuracy': accuracy}, self)
        return loss

    def traverse(self, xs):
        hx, cx = None, None
        hx, cx, ys = self.nstep_bilstm(xs=xs, hx=hx, cx=cx)
        return [self.l1(y) for y in ys]

In [232]:
model = BiLSTMBase(input_size=234, n_labels=2)

In [233]:
from nlp_utils import convert_seq
train_iter = chainer.iterators.SerialIterator(dataset[:11000], batchsize)
test_iter = chainer.iterators.SerialIterator(dataset[11000:], batchsize,
                                             repeat=False, shuffle=False)

# Setup an optimizer
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

# Set up a trainer
updater = training.StandardUpdater(
    train_iter, optimizer, converter=convert_seq, device=-1)
trainer = training.Trainer(updater, (10, 'epoch'), out='result')

# Evaluate the model with the test dataset for each epoch
trainer.extend(extensions.Evaluator(
    test_iter, model, converter=convert_seq, device=-1))

# Take a best snapshot
record_trigger = training.triggers.MaxValueTrigger(
    'validation/main/accuracy', (1, 'epoch'))
trainer.extend(extensions.snapshot_object(
    model, 'best_model.npz'),
    trigger=record_trigger)

# Write a log of evaluation statistics for each epoch
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
    ['epoch', 'main/loss', 'validation/main/loss',
     'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

# Print a progress bar to stdout
trainer.extend(extensions.ProgressBar())


In [234]:
trainer.run()

epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy  elapsed_time
[J     total [####..............................................]  9.09%
this epoch [#############################################.....] 90.91%
       100 iter, 0 epoch / 10 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
[4A[J1           12.2332     9.90821               0.370364       0.470169                  158.395       
[J     total [#########.........................................] 18.18%
this epoch [########################################..........] 81.82%
       200 iter, 1 epoch / 10 epochs
   0.76206 iters/sec. Estimated time to finish: 0:19:41.006930.
[4A[J2           9.11983     9.26177               0.502818       0.50607                   301.96        
[J     total [#############.....................................] 27.27%
this epoch [####################################..............] 72.73%
       300 iter, 2 epoch / 10 epochs
   0.75302 iter

In [129]:
xs, ys = dataset[0]

In [161]:
pred_ys = model.traverse([xs])

In [122]:
model.nstep_bilstm(xs=xs, hx=None, cx=None)

AssertionError: 

In [138]:
pred_ys = [F.softmax(pred_y) for pred_y in pred_ys]

In [157]:
pred_ys

[variable([[-9.23650414e-02,  1.37938619e-01],
           [-1.59259275e-01,  1.12848647e-01],
           [-1.26221865e-01,  1.58787176e-01],
           [ 3.71674359e-01,  6.14576861e-02],
           [ 1.99825794e-01, -3.85012478e-01],
           [ 2.04078197e-01, -2.53929496e-01],
           [ 3.04586053e-01,  3.18914711e-01],
           [ 3.23670924e-01,  3.59954953e-01],
           [ 2.28468806e-01,  5.96089736e-02],
           [ 7.04916120e-02,  1.95891261e-01],
           [ 1.34538442e-01,  2.49079287e-01],
           [ 3.89353603e-01,  2.32209042e-01],
           [-6.84104264e-02, -2.30646729e-02],
           [ 4.19745982e-01, -4.42816466e-02],
           [ 3.21061641e-01, -8.94789845e-02],
           [ 3.37609887e-01,  2.99052000e-01],
           [ 9.12030786e-02,  5.33087194e-01],
           [ 3.08924198e-01,  2.80614078e-01],
           [ 1.37659743e-01, -1.45233870e-02],
           [ 2.95186639e-01,  1.68519124e-01],
           [ 3.30723003e-02, -1.08975716e-01],
           [-

In [162]:
pred_ys = [F.softmax(pred_y) for pred_y in pred_ys]

In [177]:
pred_ys[0].data.argmax(axis=0)[1]

16

In [175]:
ys.argmax(axis=0)

1

In [239]:
dataset[0][0].shape[1]

234