In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchtext.data as data

In [2]:
class RNN_Text(nn.Module):
    """Bidirectional LSTM text classifier.

    Token ids are embedded, passed through a single-layer bidirectional LSTM,
    and the LSTM output at the last time step is projected to class logits.

    Args:
        embed_num: Number of rows in the embedding table (vocabulary size).
        class_num: Number of output classes.
        embed_dim: Size of each embedding vector. Defaults to 100.
        hidden_size: LSTM hidden size per direction. Defaults to 256.
    """

    def __init__(self, embed_num, class_num, embed_dim=100, hidden_size=256):
        super(RNN_Text, self).__init__()
        # Initial LSTM states; populated by inithidden() (or lazily by forward()).
        self.h = None
        self.c = None

        self.embed = nn.Embedding(embed_num, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_size, bidirectional=True)
        self.out = nn.Linear(hidden_size * 2, class_num)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids laid out sequence-first, i.e.
                (seq_len, batch). The LSTM is built without ``batch_first``,
                so dimension 0 is treated as time and dimension 1 as batch.

        Returns:
            Tensor of shape (batch, class_num) holding unnormalised logits.
        """
        batch_size = x.size(1)
        h = getattr(self, 'h', None)
        # (Re)create the initial state when it is missing or does not fit this
        # batch, instead of failing inside the LSTM call.
        if h is None or h.size(1) != batch_size or h.device != x.device:
            self.inithidden(batch_size)

        x = self.embed(x)  # (W, N, D)
        x, _ = self.rnn(x, (self.h, self.c))  # (W, N, 2 * H)

        # Classify from the LSTM output at the final time step.
        return self.out(x[-1])

    def inithidden(self, b):
        """Draw fresh random initial hidden and cell states for batch size ``b``.

        The states are stored on ``self.h`` / ``self.c`` and are created on the
        same device and with the same dtype as the model parameters.

        NOTE(review): the initial state is sampled with ``randn``, so repeated
        forward passes on the same input differ unless the RNG is seeded.
        """
        weight = self.embed.weight
        # Leading 2 = one state per direction of the single-layer bi-LSTM.
        shape = (2, b, self.rnn.hidden_size)
        self.h = torch.randn(shape, device=weight.device, dtype=weight.dtype)
        self.c = torch.randn(shape, device=weight.device, dtype=weight.dtype)

In [3]:
class mydataset(data.Dataset):
    """torchtext dataset built from a comma-separated sentiment file.

    Every data row produces one example with a ``text`` field (column index 5,
    split on single spaces) and a ``label`` field (column index 2).
    """

    @staticmethod
    def sort_key(ex):
        """Return the number of tokens in the example's text."""
        return len(ex.text)

    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create the dataset from a file or from pre-built examples.

        Args:
            text_field: Field used for the ``text`` column.
            label_field: Field used for the ``label`` column.
            path: Path of the comma-separated file to read. Only used when
                ``examples`` is None.
            examples: Optional ready-made list of examples; when given, no
                file is read.
            **kwargs: Forwarded unchanged to ``data.Dataset.__init__``.
        """
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            # NOTE(review): `dirname` is not defined in this class; confirm it
            # exists before relying on path=None.
            path = self.dirname if path is None else path
            examples = []
            # `with` guarantees the file handle is closed even if a row fails.
            with open(path, 'r', encoding='utf-8') as source:
                next(source, None)  # first line is the header row
                for raw_line in source:
                    row = raw_line.strip()
                    if not row:
                        # Blank lines (e.g. a trailing newline) carry no example.
                        continue
                    # Plain split: quoted fields containing commas are not handled.
                    columns = row.split(',')
                    tokens = columns[5].split(' ')
                    examples.append(
                        data.Example.fromlist([tokens, columns[2]], fields)
                    )
        super(mydataset, self).__init__(examples, fields, **kwargs)

In [4]:
# Fields: token sequences with fix_length=20, and a single non-sequential label
# per example (no <unk> entry in the label vocabulary).
text_field = data.Field(fix_length=20)
label_field = data.Field(
    sequential=False,
    batch_first=True,
    unk_token=None,
)

# Train / test splits read from their CSV files.
train_data = mydataset(text_field, label_field, path='data_sentiment_train.csv')
test_data = mydataset(text_field, label_field, path='data_sentiment_test.csv')

# Both vocabularies are built from the training split only.
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

# Batches of 100 for training, single examples for testing.
train_iter, test_iter = data.Iterator.splits(
    (train_data, test_data),
    batch_sizes=(100, 1),
    repeat=False,
)
print(len(text_field.vocab))

795


In [5]:
# Two-class classifier sized to the text vocabulary, optimised with Adam defaults.
rnn = RNN_Text(embed_num=len(text_field.vocab), class_num=2)
optimizer = torch.optim.Adam(rnn.parameters())

# Switch to training mode, then show the module structure.
rnn.train()
print(rnn)

RNN_Text(
  (embed): Embedding(795, 100)
  (rnn): LSTM(100, 256, bidirectional=True)
  (out): Linear(in_features=512, out_features=2, bias=True)
)


In [6]:
%%time
# Train for 15 epochs, printing the summed batch loss after each epoch, then
# save the whole model object to disk.
rnn.train()  # keeps this cell correct when re-run after the evaluation cell's rnn.eval()
for epoch in range(15):
    totalloss = 0
    for batch in train_iter:
        optimizer.zero_grad()

        txt = batch.text
        label = batch.label
        # Dimension 1 of the text tensor is used as the batch size.
        rnn.inithidden(txt.size(1))

        pred = rnn(txt)
        loss = F.cross_entropy(pred, label)
        # detach(): accumulate the value only, without keeping the autograd graph.
        totalloss += loss.detach()

        loss.backward()
        optimizer.step()

    print(epoch, 'epoch')
    print(totalloss)

# torch.save does not create missing directories.
os.makedirs('model', exist_ok=True)
# NOTE: this pickles the entire module object, not just its state_dict.
torch.save(rnn, 'model/rnn_model.pt')

0 epoch
tensor(19.9264)
1 epoch
tensor(8.4664)
2 epoch
tensor(0.2618)
3 epoch
tensor(0.0379)
4 epoch
tensor(0.3437)
5 epoch
tensor(0.0282)
6 epoch
tensor(0.0140)
7 epoch
tensor(0.0093)
8 epoch
tensor(0.0067)
9 epoch
tensor(0.0052)
10 epoch
tensor(0.0042)
11 epoch
tensor(0.0034)
12 epoch
tensor(0.0030)
13 epoch
tensor(0.0194)
14 epoch
tensor(0.0026)
CPU times: user 15min 49s, sys: 10.8 s, total: 16min
Wall time: 3min 50s


  "type " + obj.__name__ + ". It won't be checked "


In [7]:
%%time
# Count correct / incorrect predictions over the test iterator.
correct = 0
incorrect = 0
rnn.eval()
# Inference only: no_grad() avoids building autograd graphs.
with torch.no_grad():
    for batch in test_iter:
        txt = batch.text
        label = batch.label

        # Dimension 1 of the text tensor is used as the batch size.
        rnn.inithidden(txt.size(1))

        pred = rnn(txt)
        _, ans = torch.max(pred, dim=1)

        # Count over the whole batch so the tally stays right for any batch
        # size, not only when each batch holds a single example.
        hits = int((ans == label.view(-1)).sum())
        correct += hits
        incorrect += label.numel() - hits

print('correct : ', correct)
print('incorrect : ', incorrect)
print()

correct :  140
incorrect :  0

CPU times: user 2.42 s, sys: 0 ns, total: 2.42 s
Wall time: 609 ms
