# DSAI-LSTM Subtractor

## Import packages

In [1]:
import numpy as np
from six.moves import range
import sys

## Parameters Config

In [2]:
class colors:
    """ANSI escape sequences: green (ok), red (fail) and reset (close)."""

    ok, fail, close = '\033[92m', '\033[91m', '\033[0m'

In [3]:
# Total number of unique questions to generate (later split into train/validation/test).
TRAINING_SIZE = 80000
# Maximum number of decimal digits in each operand.
DIGITS = 3
# NOTE(review): not referenced anywhere in the visible code — confirm before relying on it.
REVERSE = False
# Question length in characters: first operand + one operator + second operand.
MAXLEN = DIGITS + 1 + DIGITS
# Alphabet used for the one-hot encoding: the ten digits plus the two operators.
chars = '0123456789-+'
#RNN = layers.LSTM
# Hidden-state width of the recurrent layer and the mini-batch size.
HIDDEN_SIZE = 128
BATCH_SIZE = 128
# NOTE(review): the visible model hard-codes num_layers=2, so this value appears unused — confirm.
LAYERS = 1

## Data Generation

In [4]:
%%time
# Parallel lists: data[i] is a question string, label[i] its zero-padded answer.
data = []
label = []
# Questions already emitted, used to keep the dataset free of duplicates.
seen = set()


def _random_operand():
    """Return a random integer built from 1 to DIGITS random decimal digits."""
    n_digits = np.random.randint(1, DIGITS + 1)
    return int(''.join(np.random.choice(list('0123456789')) for _ in range(n_digits)))


print('Generating data...')
while len(data) < TRAINING_SIZE:
    a, b = _random_operand(), _random_operand()

    # Keep a >= b so that a subtraction never produces a negative answer.
    if a < b:
        a, b = b, a
    operator = np.random.choice(list('+-'))

    # Zero-pad BOTH operands to DIGITS characters (the first one used a
    # hard-coded 3), so every question is exactly MAXLEN characters long and
    # needs no further padding.
    query = str(a).zfill(DIGITS) + operator + str(b).zfill(DIGITS)
    if query in seen:
        continue
    seen.add(query)
    data.append(query)

    # The sum of two DIGITS-digit numbers can need DIGITS + 1 characters.
    result = a + b if operator == '+' else a - b
    label.append(str(result).zfill(DIGITS + 1))

print(data[:9])
print(label[:9])

Generating data...
['560-026', '946+013', '009+004', '075-007', '820-008', '938-572', '676+007', '158+084', '151-067']
['0534', '0959', '0013', '0068', '0812', '0366', '0683', '0242', '0084']
Wall time: 15.6 s


## Processing

In [5]:
class CharacterTable(object):
    """Two-way mapping between characters and one-hot rows.

    The alphabet is the sorted set of distinct characters given to the
    constructor; a character's position in that ordering is its index.
    """

    def __init__(self, chars):
        alphabet = sorted(set(chars))
        self.chars = alphabet
        self.char_indices = {symbol: index for index, symbol in enumerate(alphabet)}
        self.indices_char = {index: symbol for index, symbol in enumerate(alphabet)}

    def encode(self, C, num_rows):
        """One-hot encode the characters of ``C``.

        Returns a ``(num_rows, len(self.chars))`` array of zeros in which row
        ``k`` has a single 1 at the index of the k-th character of ``C``.
        Rows beyond the length of ``C`` stay all-zero.
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        for row, symbol in enumerate(C):
            column = self.char_indices[symbol]
            one_hot[row, column] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Turn one-hot/probability rows (or raw indices) back into a string.

        With ``calc_argmax`` true, ``x`` is reduced with ``argmax`` over its
        last axis first; otherwise ``x`` is taken to already hold indices.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return "".join(self.indices_char[index] for index in indices)

In [6]:
ctable = CharacterTable(chars)

print('Vectorization...')
# One one-hot matrix of MAXLEN rows per question.
x = np.zeros((len(data), MAXLEN, len(chars)))
for row, question in enumerate(data):
    x[row] = ctable.encode(question, MAXLEN)

# One one-hot matrix of DIGITS + 1 rows per answer.
answer_len = DIGITS + 1
y = np.zeros((len(label), answer_len, len(chars)))
for row, answer in enumerate(label):
    y[row] = ctable.encode(answer, answer_len)

# Show the first encoded question and check it decodes back to text.
first_x = x[0]
print("x.shape", x.shape)
print("x[0]", first_x)
print("ctable.decode(x[0]):", ctable.decode(first_x))

# Same check for the first encoded answer.
first_y = y[0]
print("y.shape", y.shape)
print("y[0]", first_y)
print("ctable.decode(y[0]):", ctable.decode(first_y))

Vectorization...
x.shape (80000, 7, 12)
x[0] [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]
ctable.decode(x[0]): 560-026
y.shape (80000, 4, 12)
y[0] [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
ctable.decode(y[0]): 0534


In [7]:
# Shuffle questions and answers with the same permutation so pairs stay aligned.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# train_test_split: first half for training/validation, second half for testing.
# The boundary is derived from the dataset size instead of a literal 40000, so
# changing TRAINING_SIZE cannot silently leave the test set empty or lopsided.
n_train = len(x) // 2
train_x, test_x = x[:n_train], x[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

# Hold out the last 10% of the training half for validation.
split_at = len(train_x) - len(train_x) // 10
print(len(train_x))

(x_train, x_val) = train_x[:split_at], train_x[split_at:]
(y_train, y_val) = train_y[:split_at], train_y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)

print('Testing Data:')
print(test_x.shape)
print(test_y.shape)

40000
Training Data:
(36000, 7, 12)
(36000, 4, 12)
Validation Data:
(4000, 7, 12)
(4000, 4, 12)
Testing Data:
(40000, 7, 12)
(40000, 4, 12)


In [8]:
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.utils.data as Data
import torch.nn.functional as F

In [9]:
# Wrap each NumPy split in a torch tensor (torch.Tensor uses the default tensor type).
x_train_torch, y_train_torch = torch.Tensor(x_train), torch.Tensor(y_train)
x_val_torch, y_val_torch = torch.Tensor(x_val), torch.Tensor(y_val)
test_x_torch, test_y_torch = torch.Tensor(test_x), torch.Tensor(test_y)

print('torch data')

# Report the input/target sizes of every split.
for split_title, split_inputs, split_targets in (
    ('Training Data:', x_train_torch, y_train_torch),
    ('Validation Data:', x_val_torch, y_val_torch),
    ('Testing Data:', test_x_torch, test_y_torch),
):
    print(split_title)
    print(split_inputs.size())
    print(split_targets.size())



torch data
Training Data:
torch.Size([36000, 7, 12])
torch.Size([36000, 4, 12])
Validation Data:
torch.Size([4000, 7, 12])
torch.Size([4000, 4, 12])
Testing Data:
torch.Size([40000, 7, 12])
torch.Size([40000, 4, 12])


In [10]:
# Training loader: mini-batches of BATCH_SIZE, reshuffled on every pass.
train_dataset = Data.TensorDataset(x_train_torch, y_train_torch)
train_loader = Data.DataLoader(
    train_dataset,              # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # shuffle the samples
)

# Test loader: same batch size, also shuffled.
test_dataset = Data.TensorDataset(test_x_torch, test_y_torch)
test_loader = Data.DataLoader(
    test_dataset,               # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # shuffle the samples
)

# Peek at the first mini-batch only, to confirm the tensor shapes.
for step, (batch_x, batch_y) in enumerate(train_loader):
    print(step, batch_x.size(), batch_y.size(), sep='\n')
    break

0
torch.Size([128, 7, 12])
torch.Size([128, 4, 12])


## Build PyTorch Model

### Bidirectional LSTM

In [11]:
# Run on the GPU whenever PyTorch reports that CUDA is available.
USE_CUDA = torch.cuda.is_available()
#USE_CUDA = False  (uncomment to force CPU execution)
print(USE_CUDA)

True


In [12]:

# torch.manual_seed(1)    # reproducible

# Hyper Parameters
# HIDDEN_SIZE and train_loader are reused unchanged from the earlier cells; the
# former no-op self-assignments (`HIDDEN_SIZE = HIDDEN_SIZE`,
# `train_loader = train_loader`) have been dropped.
EPOCH = 20                      # number of full passes over the training data
BATCH_SIZE = 128                # mini-batch size (same value as the config cell)
TIME_STEP = MAXLEN              # sequence length: one LSTM step per question character
INPUT_SIZE = len(chars)         # features per time step: width of the one-hot alphabet
LR = 0.01                       # learning rate
DOWNLOAD_MNIST = False          # NOTE(review): leftover from an MNIST example; unused in the visible code

class RNN(nn.Module):
    """Two-layer bidirectional LSTM mapping a one-hot question to its answer.

    Input:  ``(batch, time_step, INPUT_SIZE)`` one-hot encoded question.
    Output: ``(batch, DIGITS + 1, len(chars))`` per-character probabilities,
    read from the last ``DIGITS + 1`` time steps of the LSTM output.

    The constructor reads the module-level ``INPUT_SIZE``, ``HIDDEN_SIZE`` and
    ``chars``; ``forward`` reads ``DIGITS``.
    """

    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         # if use nn.RNN(), it hardly learns
            input_size=INPUT_SIZE,
            hidden_size=HIDDEN_SIZE,    # rnn hidden unit
            num_layers=2,               # number of stacked rnn layers
            batch_first=True,           # tensors are (batch, time_step, feature)
            bidirectional=True,
        )

        # Forward and backward hidden states are concatenated, hence 2 * HIDDEN_SIZE.
        self.out = nn.Linear(HIDDEN_SIZE * 2, len(chars))

    def forward(self, x):
        """Return softmax probabilities for each answer character.

        x:     (batch, time_step, input_size)
        r_out: (batch, time_step, 2 * hidden_size)
        """
        # Passing None lets the LSTM start from a zero initial state.
        r_out, _ = self.rnn(x, None)

        # The answer is read from the last DIGITS + 1 time steps (previously a
        # hard-coded 4). nn.Linear acts on the last dimension, so a single
        # call on the slice replaces the per-step Python loop.
        answer_len = DIGITS + 1
        logits = self.out(r_out[:, -answer_len:, :])
        return F.softmax(logits, dim=2)



### Training

In [13]:
print("USE_CUDA:", USE_CUDA)

# One device handle replaces the duplicated CUDA / CPU code paths.
device = torch.device('cuda' if USE_CUDA else 'cpu')

rnn = RNN()
rnn.to(device)
print(rnn)

optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all model parameters
# NOTE: the model emits softmax probabilities and is fitted with MSE against
# the one-hot targets.
loss_func = nn.MSELoss()

for epoch in range(EPOCH):
    print('*'*30, "epoch:",epoch, '*'*30)
    for step, (batch_x, batch_y) in enumerate(train_loader):  # one mini-batch per step
        prediction = rnn(batch_x.to(device))   # rnn output
        loss = loss_func(prediction, batch_y.to(device))

        optimizer.zero_grad()                   # clear gradients for this training step
        # A fresh graph is built every step, so retain_graph is not needed and
        # would only keep the old graph's memory alive.
        loss.backward()                         # backpropagation, compute gradients
        optimizer.step()                        # apply gradients

        if step % 100 == 0:
            print("loss:", loss.item())

            # Decode every sample of the batch back to text for an exact-match comparison.
            show_pred = np.array([ctable.decode(row) for row in prediction.detach().cpu().numpy()])
            show_y = np.array([ctable.decode(row) for row in batch_y.numpy()])

            # Mean over the real batch length: the final batch of an epoch can
            # be smaller than BATCH_SIZE.
            acc = np.mean(show_pred == show_y)
            print("accuracy:", acc)

            # Show up to ten samples (fewer if the batch is short).
            for i in range(min(10, len(show_y))):
                question = ctable.decode(batch_x[i].numpy())
                print("question:", question ,end='  ')
                print("pred:", show_pred[i] ,end=' ')
                if show_pred[i] == show_y[i]:
                    print(colors.ok + '☑' + colors.close, end=' ')
                else:
                    print(colors.fail + '☒' + colors.close, end=' ')
                print("target:", show_y[i])

            print('-'*50)

USE_CUDA: True
RNN(
  (rnn): LSTM(12, 128, num_layers=2, batch_first=True, bidirectional=True)
  (out): Linear(in_features=256, out_features=12, bias=True)
)
****************************** epoch: 0 ******************************
loss: 0.0765417218208313
accuracy: 0.0
question: 550-003  pred: 4444 [91m☒[0m target: 0547
question: 540-127  pred: 4444 [91m☒[0m target: 0413
question: 563+052  pred: 4444 [91m☒[0m target: 0615
question: 263-092  pred: 4444 [91m☒[0m target: 0171
question: 848-252  pred: 4444 [91m☒[0m target: 0596
question: 684+034  pred: 4444 [91m☒[0m target: 0718
question: 643-039  pred: 4444 [91m☒[0m target: 0604
question: 984+295  pred: 4444 [91m☒[0m target: 1279
question: 410+014  pred: 4444 [91m☒[0m target: 0424
question: 085-068  pred: 4444 [91m☒[0m target: 0017
--------------------------------------------------
loss: 0.0535803884267807
accuracy: 0.0
question: 319+000  pred: 0599 [91m☒[0m target: 0319
question: 926+663  pred: 1997 [91m☒[0m target:

loss: 0.0016267775790765882
accuracy: 0.96875
question: 551+009  pred: 0560 [92m☑[0m target: 0560
question: 866-003  pred: 0863 [92m☑[0m target: 0863
question: 502-007  pred: 0495 [92m☑[0m target: 0495
question: 651+048  pred: 0699 [92m☑[0m target: 0699
question: 535+037  pred: 0572 [92m☑[0m target: 0572
question: 017-001  pred: 0016 [92m☑[0m target: 0016
question: 079-009  pred: 0070 [92m☑[0m target: 0070
question: 942+075  pred: 1017 [92m☑[0m target: 1017
question: 890+060  pred: 0950 [92m☑[0m target: 0950
question: 736-259  pred: 0477 [92m☑[0m target: 0477
--------------------------------------------------
loss: 0.0010746417101472616
accuracy: 0.9921875
question: 314+081  pred: 0395 [92m☑[0m target: 0395
question: 622-015  pred: 0607 [92m☑[0m target: 0607
question: 052-031  pred: 0021 [92m☑[0m target: 0021
question: 198-061  pred: 0137 [92m☑[0m target: 0137
question: 812-007  pred: 0805 [92m☑[0m target: 0805
question: 114+078  pred: 0192 [92m☑[0m targ

loss: 6.009166827425361e-05
accuracy: 1.0
question: 384-006  pred: 0378 [92m☑[0m target: 0378
question: 061-061  pred: 0000 [92m☑[0m target: 0000
question: 504+009  pred: 0513 [92m☑[0m target: 0513
question: 166-066  pred: 0100 [92m☑[0m target: 0100
question: 279-004  pred: 0275 [92m☑[0m target: 0275
question: 551-067  pred: 0484 [92m☑[0m target: 0484
question: 629+078  pred: 0707 [92m☑[0m target: 0707
question: 842+528  pred: 1370 [92m☑[0m target: 1370
question: 094-041  pred: 0053 [92m☑[0m target: 0053
question: 588+051  pred: 0639 [92m☑[0m target: 0639
--------------------------------------------------
****************************** epoch: 9 ******************************
loss: 7.71996274124831e-05
accuracy: 1.0
question: 941-015  pred: 0926 [92m☑[0m target: 0926
question: 535-000  pred: 0535 [92m☑[0m target: 0535
question: 449-046  pred: 0403 [92m☑[0m target: 0403
question: 075-006  pred: 0069 [92m☑[0m target: 0069
question: 989-052  pred: 0937 [92m☑[0

****************************** epoch: 13 ******************************
loss: 0.00017289390962105244
accuracy: 1.0
question: 619+014  pred: 0633 [92m☑[0m target: 0633
question: 283+047  pred: 0330 [92m☑[0m target: 0330
question: 567+048  pred: 0615 [92m☑[0m target: 0615
question: 864-227  pred: 0637 [92m☑[0m target: 0637
question: 414+077  pred: 0491 [92m☑[0m target: 0491
question: 044-020  pred: 0024 [92m☑[0m target: 0024
question: 380-281  pred: 0099 [92m☑[0m target: 0099
question: 845-004  pred: 0841 [92m☑[0m target: 0841
question: 718-049  pred: 0669 [92m☑[0m target: 0669
question: 165-009  pred: 0156 [92m☑[0m target: 0156
--------------------------------------------------
loss: 5.77321152377408e-05
accuracy: 1.0
question: 786-019  pred: 0767 [92m☑[0m target: 0767
question: 738+007  pred: 0745 [92m☑[0m target: 0745
question: 613-392  pred: 0221 [92m☑[0m target: 0221
question: 093+020  pred: 0113 [92m☑[0m target: 0113
question: 651-054  pred: 0597 [92m☑

loss: 0.0011491944314911962
accuracy: 0.96875
question: 575-325  pred: 0250 [92m☑[0m target: 0250
question: 084+030  pred: 0114 [92m☑[0m target: 0114
question: 152-008  pred: 0144 [92m☑[0m target: 0144
question: 124+071  pred: 0195 [92m☑[0m target: 0195
question: 772+001  pred: 0773 [92m☑[0m target: 0773
question: 729-386  pred: 0343 [92m☑[0m target: 0343
question: 309-082  pred: 0227 [92m☑[0m target: 0227
question: 882+031  pred: 0913 [92m☑[0m target: 0913
question: 082-027  pred: 0055 [92m☑[0m target: 0055
question: 024-006  pred: 0018 [92m☑[0m target: 0018
--------------------------------------------------
loss: 0.0002307343966094777
accuracy: 0.9921875
question: 532+119  pred: 0651 [92m☑[0m target: 0651
question: 770-022  pred: 0748 [92m☑[0m target: 0748
question: 921+324  pred: 1245 [92m☑[0m target: 1245
question: 529-275  pred: 0254 [92m☑[0m target: 0254
question: 849-522  pred: 0327 [92m☑[0m target: 0327
question: 184-143  pred: 0041 [92m☑[0m targ

### Save and restore the model

In [14]:
# File names for the two checkpoint flavours written below.
model_name = 'subtracter_torch_rnn.pkl'
model_params_name = 'subtracter_torch_rnn_params.pkl'

torch.save(rnn, model_name)  # save the entire network object
torch.save(rnn.state_dict(), model_params_name)   # save only the network parameters (faster, smaller file)

def restore_net():
    """Load and return the whole network saved under ``model_name``.

    The tensors are remapped to the CPU when ``USE_CUDA`` is false, so a
    checkpoint written on a GPU machine still loads in a CPU-only session.

    NOTE: ``torch.load`` unpickles the file; only load checkpoints you trust.
    """
    map_location = None if USE_CUDA else 'cpu'
    return torch.load(model_name, map_location=map_location)
    
def restore_params():
    """Build a fresh ``RNN`` and load the parameters saved under ``model_params_name``.

    The tensors are remapped to the CPU when ``USE_CUDA`` is false, so a
    checkpoint written on a GPU machine still loads in a CPU-only session.

    NOTE: ``torch.load`` unpickles the file; only load checkpoints you trust.
    """
    map_location = None if USE_CUDA else 'cpu'
    rnn = RNN()  # the RNN class defined above
    # Copy the saved parameters into the new network.
    rnn.load_state_dict(torch.load(model_params_name, map_location=map_location))
    return rnn
    
# Replace the in-memory model with the copy restored from disk, on the GPU when one is in use.
rnn = restore_net()
if USE_CUDA:
    rnn = rnn.cuda()
print(rnn)

RNN(
  (rnn): LSTM(12, 128, num_layers=2, batch_first=True, bidirectional=True)
  (out): Linear(in_features=256, out_features=12, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


## Testing

In [15]:
# Report how many held-out questions the evaluation below will cover.
print("test size:", test_x_torch.size())

test size: torch.Size([40000, 7, 12])


In [16]:
def test():
    """Evaluate ``rnn`` on ``test_loader`` and print the exact-match accuracy.

    A prediction counts as correct only when the whole decoded answer string
    equals the decoded target. Prints the accuracy and the raw
    ``correct / total`` counts; returns nothing.
    """
    total_correct = 0
    # Inference only: disable autograd so no graph is built for each batch.
    with torch.no_grad():
        for batch_x, batch_y in test_loader:  # one mini-batch per iteration
            if USE_CUDA:
                batch_x = batch_x.cuda()
            prediction = rnn(batch_x)   # rnn output

            # .cpu() leaves CPU tensors unchanged, so one path serves both devices.
            pred = np.array([ctable.decode(row) for row in prediction.cpu().numpy()])
            y = np.array([ctable.decode(row) for row in batch_y.cpu().numpy()])

            total_correct += int(np.sum(pred == y))

    acc = total_correct / test_x_torch.size(0)
    print("accuracy:", acc)
    print(total_correct, '/', test_x_torch.size(0) )

# Run the evaluation over the held-out test split.
test()

accuracy: 0.99405
39762 / 40000
