In [1]:
import numpy as np

ModuleNotFoundError: No module named 'numpy'

In [9]:
class Tensor(object):
    def __init__(self, data, creator=None, create_op=None):
        self.data = np.array(data)
        self.creator = creator
        self.create_op = create_op
        self.grad = None
        
    def __add__(self, other):
        return Tensor(self.data+other.data, (self, other), 'add')
    
    def backward(self, grad):
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
    
        if self.create_op == 'add':
            self.creator[0].backward(self.grad)
            self.creator[1].backward(self.grad)
            
    def __str__(self):
        return str(self.data.__str__())
    
    def __repr__(self):
        return str(self.data.__repr__())

In [10]:
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])
g = Tensor([1,2,3,4,5])
h = Tensor([1,2,3,4,5])
b = g + h
d = a + b
e = b + c
f = d + e
f.backward(Tensor(np.array([1,1,1,1,1])))
print('a grad:', a.grad.data)
print('b grad:', b.grad.data)
print('c grad:', c.grad.data)
print('d grad:', d.grad.data)
print('e grad:', e.grad.data)
print('g grad:', g.grad.data)  # g & h 's grad is error, so here we have to check grad for b
print('h grad:', h.grad.data)  # b could backward only until all of b's previous nodes have be finished,
                               # Let's check next version

a grad: [1 1 1 1 1]
b grad: [2 2 2 2 2]
c grad: [1 1 1 1 1]
d grad: [1 1 1 1 1]
e grad: [1 1 1 1 1]
g grad: [3 3 3 3 3]
h grad: [3 3 3 3 3]


In [11]:
id(h.grad)

4678998728

In [8]:
import numpy as np
class Tensor(object):
    def __init__(self, data, autograd=False, creator=None, create_op=None, id=None):
        self.data = np.array(data)
        self.shape = self.data.shape
        self.creator = creator
        self.create_op = create_op
        self.autograd = autograd
        self.grad = None
        self.children = {}
        if id is None:
            id = np.random.choice(10000)
        self.id = id
        
        if creator is not None:
            for c in creator:
                if self.id not in c.children:
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data+other.data, autograd=True, creator=(self, other), create_op='add')
        return Tensor(self.data+other.data)
        
    def __neg__(self):
        if self.autograd:
            return Tensor(self.data*-1, autograd=True, creator=(self,), create_op='neg')
        return Tensor(self.data*-1)
    
    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data-other.data, autograd=True, creator=(self, other), create_op='sub')
        return Tensor(self.data-other.data)
        
    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data*other.data, autograd=True, creator=(self, other), create_op='mul')
        return Tensor(self.data*other.data)
        
    def sum(self, dim):
        assert self.data.ndim>dim, 'axis %d is out of bounds for array of dimension %d' % (dim, self.data.ndim)
        if self.autograd:
            return Tensor(self.data.sum(dim), autograd=True, creator=(self,), create_op='sum_'+str(dim))
        return Tensor(self.data.sum(dim))
        
    def expand(self, dim, copies):
        assert self.data.ndim>=dim, 'axis %d is out of bounds for array of dimension %d' % (dim, self.data.ndim)
        if self.autograd:
            return Tensor(np.expand_dims(self.data, axis=dim).repeat(copies, axis=dim), autograd=True, creator=(self,), create_op='expand_'+str(dim))
        return Tensor(np.expand_dims(self.data, axis=dim).repeat(copies, axis=dim))
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creator=(self,), create_op='transpose')
        return Tensor(self.data.transpose())
    
    def mm(self, other):
        if self.autograd:
            return Tensor(self.data.dot(other.data), autograd=True, creator=(self, other), create_op='mm')
        return Tensor(self.data.dot(other.data))
    
    def check_creator_grad_count(self):
        for c in self.children:
            if self.children[c] != 0:
                return False
        return True
    
    def backward(self, grad=None, child_grad_node=None):
        if not self.autograd:
            return
        
        if child_grad_node is not None:
            if self.children[child_grad_node.id] == 0:
                assert self.children[child_grad_node.id] != 0, \
                'creator %d'' children %d has grad count == 0, backprop can has one pass' % (self.id, child_grad_node.id)
            else:
                self.children[child_grad_node.id] -= 1

        if grad is None:
            grad = Tensor(np.ones_like(self.data, dtype=np.float, shape=self.shape))
            
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
   
        if self.creator is not None and self.check_creator_grad_count():
            if self.create_op == 'add':
                self.creator[0].backward(self.grad, self)
                self.creator[1].backward(self.grad, self)
            elif self.create_op == 'neg':
                self.creator[0].backward(self.grad.__neg__(), self)
            elif self.create_op == 'sub':
                self.creator[0].backward(self.grad, self)
                self.creator[1].backward(self.grad.__neg__(), self)
            elif self.create_op == 'mul':
                self.creator[0].backward(self.grad*self.creator[1], self)
                self.creator[1].backward(self.grad*self.creator[0], self)
            elif self.create_op == 'transpose':
                self.creator[0].backward(self.grad.transpose(), self)
            elif self.create_op == 'mm':
                c0 = Tensor(self.creator[0].data, autograd=False)  # no auto grad
                c1 = Tensor(self.creator[1].data, autograd=False)  # no auto grad
                self.creator[0].backward(self.grad.mm(c1.transpose()), self)
                self.creator[1].backward(c0.transpose().mm(self.grad), self)
            elif self.create_op[:4] == 'sum_':
                dim = int(self.create_op[4:])
                new = self.grad.expand(dim, self.creator[0].data.shape[dim])
                self.creator[0].backward(new, self)
            elif self.create_op[:7] == 'expand_':
                dim = int(self.create_op[7:])
                new = self.grad.sum(dim)
                self.creator[0].backward(new, self)
            
    def zero_grad(self):
        self.grad = None
       
    def step(self, alpha):
        if self.grad is None:
            return
        self.data -= self.grad.data*alpha
    
    def __str__(self):
        return str(self.data.__str__())
    
    def __repr__(self):
        return str(self.data.__repr__())
    
    def __getitem__(self, ind):
        if isinstance(ind, int) or isinstance(ind, slice): # n是索引 or 切片
            return self.data[ind]
        elif isinstance(ind, tuple):
            a_ind, b_ind = ind
            return self.data[a_ind, b_ind]
            

# slice index operation for Tensor

In [9]:
a = Tensor(np.random.rand(3,3), autograd=False)
print('a:\n', a)
print('single index 0 for a:\n', a[0])
print('single index 2 for a:\n', a[2])
print('slice index 0:2 for a:\n', a[0:2])
print('tuple slice index 0:2, 0:2 for a:\n', a[0:2,0:2])

a:
 [[0.84426575 0.85794562 0.84725174]
 [0.6235637  0.38438171 0.29753461]
 [0.05671298 0.27265629 0.47766512]]
single index 0 for a:
 [0.84426575 0.85794562 0.84725174]
single index 2 for a:
 [0.05671298 0.27265629 0.47766512]
slice index 0:2 for a:
 [[0.84426575 0.85794562 0.84725174]
 [0.6235637  0.38438171 0.29753461]]
tuple slice index 0:2, 0:2 for a:
 [[0.84426575 0.85794562]
 [0.6235637  0.38438171]]


## common op on Tensor, note the following example that b is same factor for d & e

In [10]:
a = Tensor([1,2,3,4,5], True)
b = Tensor([2,2,2,2,2], True)
c = Tensor([5,4,3,2,1], True)
g = Tensor([6,2,3,4,5], True)
h = Tensor([7,2,3,4,5], True)
b = g + h
d = a + b
e = b + c
f = d + e
f.backward(Tensor(np.array([1,1,1,1,1])))
print('a grad:', a.grad.data)
print('b grad:', b.grad.data)
print('c grad:', c.grad.data)
print('d grad:', d.grad.data)
print('e grad:', e.grad.data)
print('g grad:', g.grad.data)  # Now, everything is OK
print('h grad:', h.grad.data) 

a grad: [1 1 1 1 1]
b grad: [2 2 2 2 2]
c grad: [1 1 1 1 1]
d grad: [1 1 1 1 1]
e grad: [1 1 1 1 1]
g grad: [2 2 2 2 2]
h grad: [2 2 2 2 2]


# negative operation for Tensor

In [11]:
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

d = a + (-b)
e = (-b) + c
f = d + e

f.backward(Tensor(np.array([1,1,1,1,1])))
print(b.grad.data == np.array([-2,-2,-2,-2,-2]))
print('b.grad:', b.grad)

[ True  True  True  True  True]
b.grad: [-2 -2 -2 -2 -2]


# op on expand & sum

In [12]:
x = Tensor(np.array([[1,2,3],
                     [4,5,6]]))

In [13]:
b = x.sum(0)
print(b)
print(b.data == np.array([5, 7, 9]))

b = x.sum(1)
print(b)
print(b.data == np.array([6, 15]))

[5 7 9]
[ True  True  True]
[ 6 15]
[ True  True]


In [14]:
b = x.expand(dim=2, copies=4)
print(b)
print(b.data == np.array([[[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]],

       [[4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6]]]))

[[[1 1 1 1]
  [2 2 2 2]
  [3 3 3 3]]

 [[4 4 4 4]
  [5 5 5 5]
  [6 6 6 6]]]
[[[ True  True  True  True]
  [ True  True  True  True]
  [ True  True  True  True]]

 [[ True  True  True  True]
  [ True  True  True  True]
  [ True  True  True  True]]]


# the grad for matrix

In [15]:
a = Tensor([[1,2],[3,4]], autograd=True)
b = Tensor([[2,1],[4,3]], autograd=True)
c = a + b
d = c.mm(c)
print('c:\n',c)
print('d:\n',d)

c:
 [[3 3]
 [7 7]]
d:
 [[30 30]
 [70 70]]


In [16]:
d.backward(Tensor([[1,1],[1,1]]))
print('c.grad:\n', c.grad)
print('a.grad:\n', a.grad)
print('b.grad:\n', b.grad)

c.grad:
 [[16 24]
 [16 24]]
a.grad:
 [[16 24]
 [16 24]]
b.grad:
 [[16 24]
 [16 24]]


# train a nerual network, to simplify the network code

In [17]:
# the previous version
import numpy
np.random.seed(0)

alpha = 0.1
data = np.array([[0,0],[0,1],[1,0],[1,1]])
target = np.array([[0],[1],[0],[1]])

orgw_0_1 = np.random.rand(2,3)
orgw_1_2 = np.random.rand(3,1)

weights_0_1 = orgw_0_1.copy()
weights_1_2 = orgw_1_2.copy()

for i in range(10):
    # Predict
    layer_1 = data.dot(weights_0_1)
    layer_2 = layer_1.dot(weights_1_2)
    
    # Compare
    diff = (layer_2 - target)
    sqdiff = (diff * diff)
    loss = sqdiff.sum(0) # mean squared error loss

    # Learn: this is the backpropagation piece
    layer_1_grad = 2*diff.dot(weights_1_2.transpose())
    weight_1_2_update = layer_1.transpose().dot(2*diff)
    weight_0_1_update = data.transpose().dot(layer_1_grad)
    
    weights_1_2 -= weight_1_2_update * alpha
    weights_0_1 -= weight_0_1_update * alpha
    print(loss[0])

5.066439994622396
1.725208044893435
0.970729785737745
0.4484578158939851
0.19705058205505
0.11889682222130549
0.07853709477623544
0.050724621963417184
0.03190534467093544
0.019585091267885605


In [18]:
print(orgw_1_2)

[[0.43758721]
 [0.891773  ]
 [0.96366276]]


In [19]:
print(weights_1_2)

[[0.28729166]
 [0.49330624]
 [0.77311751]]


In [20]:
print(weight_1_2_update)

[[-0.00115038]
 [-0.01600303]
 [-0.10218782]]


In [21]:
print(diff)
print(layer_2)
print(target)

[[ 0.        ]
 [-0.0855257 ]
 [ 0.10838725]
 [ 0.02286155]]
[[0.        ]
 [0.9144743 ]
 [0.10838725]
 [1.02286155]]
[[0]
 [1]
 [0]
 [1]]


In [22]:
# use the previous tensor framework, we can save the codes
import numpy
np.random.seed(0)

alpha = 0.1
data = Tensor([[0,0],[0,1],[1,0],[1,1]], autograd=True)
target = Tensor([[0],[1],[0],[1]], autograd=True)

weights_0_1 = Tensor(orgw_0_1.copy(), autograd=True)
weights_1_2 = Tensor(orgw_1_2.copy(), autograd=True)
print('weights_1_2:\n', weights_1_2)

for i in range(10):
    # Empyt grad
    weights_0_1.zero_grad()
    weights_1_2.zero_grad()

    # Predict
    layer_1 = data.mm(weights_0_1)
    layer_2 = layer_1.mm(weights_1_2)
    
    # Compare
    diff = layer_2 - target
    sqdiff = diff * diff
    loss = sqdiff.sum(0) # mean squared error loss

    # Learn: this is the backpropagation piece
    loss.backward()
    
    # Update weights
    weights_1_2.data -= weights_1_2.grad.data * alpha
    weights_0_1.data -= weights_0_1.grad.data * alpha
    print(loss[0])

weights_1_2:
 [[0.43758721]
 [0.891773  ]
 [0.96366276]]
5.066439994622396
1.725208044893435
0.970729785737745
0.4484578158939851
0.19705058205505
0.11889682222130549
0.07853709477623544
0.050724621963417184
0.03190534467093544
0.019585091267885605


In [23]:
print(weights_1_2.data)

[[0.28729166]
 [0.49330624]
 [0.77311751]]


In [24]:
print(weights_1_2.grad)
print(weight_1_2_update)

[[-0.00115038]
 [-0.01600303]
 [-0.10218782]]
[[-0.00115038]
 [-0.01600303]
 [-0.10218782]]


In [25]:
print(diff)
print(layer_2)
print(target)

[[ 0.        ]
 [-0.0855257 ]
 [ 0.10838725]
 [ 0.02286155]]
[[0.        ]
 [0.9144743 ]
 [0.10838725]
 [1.02286155]]
[[0]
 [1]
 [0]
 [1]]


In [26]:
# add optimizer class
class SGD(object):
    def __init__(self, parameters, alpha):
        self.parameters = parameters
        self.alpha = alpha
    
    def zero_grad(self):
        for one_param in self.parameters:
            one_param.zero_grad()
    
    def step(self):
        for one_param in self.parameters:
            one_param.step(self.alpha)

In [27]:
# another vesion for previous network
# use the previous tensor framework, we can save the codes
import numpy
np.random.seed(0)

data = Tensor([[0,0],[0,1],[1,0],[1,1]], autograd=True)
target = Tensor([[0],[1],[0],[1]], autograd=True)

weights_0_1 = Tensor(orgw_0_1.copy(), autograd=True)
weights_1_2 = Tensor(orgw_1_2.copy(), autograd=True)

sgd = SGD(parameters=[weights_0_1, weights_1_2], alpha=0.1)

for i in range(10):
    # Empyt grad
    sgd.zero_grad()

    # Predict
    layer_2 = data.mm(weights_0_1).mm(weights_1_2)
    
    # Compare
    diff = layer_2 - target
    sqdiff = diff * diff
    loss = sqdiff.sum(0) # mean squared error loss

    # Learn: this is the backpropagation piece
    loss.backward()
    
    # Update weights
    sgd.step()
    
    print(loss[0])

5.066439994622396
1.725208044893435
0.970729785737745
0.4484578158939851
0.19705058205505
0.11889682222130549
0.07853709477623544
0.050724621963417184
0.03190534467093544
0.019585091267885605


In [66]:
# add layer support
class ItemIdGen(object):
    def __init__(self):
        self.gen_id = {}
        
        self.default_items = ['Linear_Weights_', 'Linear_Bias_', 'Sequential_', 'RnnCell_', 'Embedding_Weights_']
        for one_item in self.default_items:
            self.gen_id[one_item] = 0
    
    def add_item(self, item):
        self.default_items.append(item)
        self.gen_id[item] = 0
    
    def get_next_id(self, item):
        assert item in self.gen_id, 'The item %s is not in Id Gen base' % item
        
        cur_id = self.gen_id[item]
        self.gen_id[item] += 1
        
        return cur_id

item_id_gen = ItemIdGen()

class Parameter(object):
    def __init__(self, name, value):
        self.name = name
        self.value = value
        
    def get_name(self):
        return self.name
    
    def get_value(self):
        return self.value

    def backward(self, grad):
        self.value.backward(grad)
    
    def step(self, alpha):
        self.value.step(alpha)
        
    def zero_grad(self):
        self.value.zero_grad()

    def __repr__(self):
        return self.name+': '+str(self.value.__repr__())

class Layer(object):
    def __init__(self):
        self.parameters = []
    
    def get_parameters(self):
        return self.parameters

    def __repr__(self):
        layer_repr = ''
        for one in self.get_parameters():
            layer_repr += one.__repr__()+'\n'
        return layer_repr

class LinearLayer(Layer):
    def __init__(self, inns, outs, bias=True):
        super(LinearLayer, self).__init__()
        
        self.weights = Tensor(np.random.rand(inns, outs)*np.sqrt(2.0/inns), autograd=True)
        self.parameters.append(Parameter(self.get_name('Linear_Weights_'), self.weights))
        
        if bias:
            self.bias = Tensor(np.zeros(outs), autograd=True)
            self.parameters.append(Parameter(self.get_name('Linear_Bias_'), self.bias))
        else:
            self.bias = None

    def get_name(self, prefix):
        suf_id = item_id_gen.get_next_id(prefix)
        return prefix+str(suf_id)
        
    def forward(self, x):
        y = x.mm(self.weights)
        if self.bias is not None:
            y += self.bias.expand(0, x.shape[0])
        
        return y

In [67]:
# add layers supportion(layer container)
class  Sequential(Layer):
    def __init__(self, layers=list()):
        super(Sequential, self).__init__()
        self.layers = layers
        
    def get_parameters(self):
        layers_params = []
        for one in self.layers:
            layers_params += one.get_parameters()
        return layers_params
    
    def add(self, layer):
        self.layers.append(layer)
    
    def forward(self, x):
        p = x
        for one in self.layers:
            p = one.forward(p)
            
        return p

In [68]:
# new version of previous neural network
import numpy
np.random.seed(0)

data = Tensor([[0,0],[0,1],[1,0],[1,1]], autograd=True)
target = Tensor([[0],[1],[0],[1]], autograd=True)

L1 = LinearLayer(2,3,bias=False)
L2 = LinearLayer(3,1,bias=False)
L1.weights.data = orgw_0_1.copy()     # comment this line to see different result
L2.weights.data = orgw_1_2.copy()     # comment this line to see different result

model = Sequential([L1, L2])
sgd = SGD(parameters=model.get_parameters(), alpha=0.1)

for i in range(10):
    # Empyt grad
    sgd.zero_grad()

    # Predict
    layer_2 = model.forward(data)
    
    # Compare
    diff = layer_2 - target
    sqdiff = diff * diff
    loss = sqdiff.sum(0) # mean squared error loss

    # Learn: this is the backpropagation piece
    loss.backward()
    
    # Update weights
    sgd.step()
    
    print(loss[0])

5.066439994622396
1.725208044893435
0.970729785737745
0.4484578158939851
0.19705058205505
0.11889682222130549
0.07853709477623544
0.050724621963417184
0.03190534467093544
0.019585091267885605


# add loss function

In [69]:
class Loss(object):
    pass

class MSELoss(object):
    def __init__(self):
        super(MSELoss, self).__init__()
    
    def __call__(self, pred, gt):
        diff = pred - gt
        sqdiff = diff * diff
        loss = sqdiff.sum(0)
        return loss

In [70]:
# new version of previous neural network
import numpy
np.random.seed(0)

data = Tensor([[0,0],[0,1],[1,0],[1,1]], autograd=True)
target = Tensor([[0],[1],[0],[1]], autograd=True)

L1 = LinearLayer(2,3,bias=False)
L2 = LinearLayer(3,1,bias=False)
L1.weights.data = orgw_0_1.copy()     # comment this line to see different result
L2.weights.data = orgw_1_2.copy()     # comment this line to see different result

model = Sequential([L1, L2])
sgd = SGD(parameters=model.get_parameters(), alpha=0.1)
criterion = MSELoss()

for i in range(10):
    # Empty grad
    sgd.zero_grad()

    # Predict
    pred = model.forward(data)
    
    # Compare, mean squared error loss
    loss = criterion(pred, target)

    # Learn: this is the backpropagation piece
    loss.backward()
    
    # Update weights
    sgd.step()
    
    print(loss[0])

5.066439994622396
1.725208044893435
0.970729785737745
0.4484578158939851
0.19705058205505
0.11889682222130549
0.07853709477623544
0.050724621963417184
0.03190534467093544
0.019585091267885605


# Now, we want to add 3 common operation in nerual network forward: non-linear(relu, sigmoid, tanh), index-select, cross-entropy support for class Tensor

In [71]:
import numpy as np
class Tensor(object):
    def __init__(self, data, autograd=False, creator=None, create_op=None, id=None):
        self.data = np.array(data)
        self.shape = self.data.shape
        self.creator = creator
        self.create_op = create_op
        self.autograd = autograd
        self.grad = None
        self.children = {}
        self.restore_children = {}
        if id is None:
            id = np.random.choice(10000)
        self.id = id
        
        if creator is not None:
            for c in creator:
                if self.id not in c.children:
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
                # save the counter sync
                if self.id not in c.restore_children:
                    c.restore_children[self.id] = 1
                else:
                    c.restore_children[self.id] += 1

                # for rnn
                c.restore_graph() # just flow back it if needed,                    

    def restore_graph(self):
        if self.creator is not None and self.grad is not None: # it is a middle variable, and has ever been grad flowed
            for c in self.creator:
                assert self.id in c.children, 'we are in restore graph, so its id should be in creator children'
                c.children[self.id] = c.restore_children[self.id]
                c.restore_graph() # just flow back it if needed
                
            
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data+other.data, autograd=True, creator=(self, other), create_op='add')
        return Tensor(self.data+other.data)
        
    def __neg__(self):
        if self.autograd:
            return Tensor(self.data*-1, autograd=True, creator=(self,), create_op='neg')
        return Tensor(self.data*-1)
    
    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data-other.data, autograd=True, creator=(self, other), create_op='sub')
        return Tensor(self.data-other.data)
        
    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data*other.data, autograd=True, creator=(self, other), create_op='mul')
        return Tensor(self.data*other.data)
        
    def sum(self, dim):
        assert self.data.ndim>dim, 'axis %d is out of bounds for array of dimension %d' % (dim, self.data.ndim)
        if self.autograd:
            return Tensor(self.data.sum(dim), autograd=True, creator=(self,), create_op='sum_'+str(dim))
        return Tensor(self.data.sum(dim))
    
    def mean(self, dim):
        assert self.data.ndim>dim, 'axis %d is out of bounds for array of dimension %d' % (dim, self.data.ndim)
        if self.autograd:
            return Tensor(self.data.mean(dim), autograd=True, creator=(self,), create_op='mean_'+str(dim))
        return Tensor(self.data.mean(dim))
    
    def expand(self, dim, copies):
        assert self.data.ndim>=dim, 'axis %d is out of bounds for array of dimension %d' % (dim, self.data.ndim)
        if self.autograd:
            return Tensor(np.expand_dims(self.data, axis=dim).repeat(copies, axis=dim), autograd=True, creator=(self,), create_op='expand_'+str(dim))
        return Tensor(np.expand_dims(self.data, axis=dim).repeat(copies, axis=dim))
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creator=(self,), create_op='transpose')
        return Tensor(self.data.transpose())
    
    def mm(self, other):
        if self.autograd:
            return Tensor(self.data.dot(other.data), autograd=True, creator=(self, other), create_op='mm')
        return Tensor(self.data.dot(other.data))
    
    def relu(self):
        if self.autograd:
            return Tensor(np.where(self.data>0, self.data, 0), autograd=True, creator=(self,), create_op='relu')
        return Tensor(np.where(self.data>0, self.data, 0))
    
    def sigmoid(self):
        new = 1/(1+np.exp(-self.data))
        if self.autograd:
            return Tensor(new, autograd=True, creator=(self,), create_op='sigmoid')
        return Tensor(new)
    
    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creator=(self,), create_op='tanh')
        return Tensor(np.tanh(self.data))
    
    def index_select(self, inds):
        if self.autograd:
            new = Tensor(self.data[inds.data], autograd=True, creator=(self,), create_op='index_select')
            new.ind_sel = inds
            return new
        return Tensor(self.data[inds.data])
    
    def cross_entropy(self, gt):
        '''
        gt is assumed to have the shape (n,1) or (n,), each element is in [0,n_classes-1] (n is the number of samples)
        self.data is assumed to have shape (n, n_classes)
        '''
        assert self.data.ndim <= 2, 'the data''s dims should be smaller than 2'
        assert gt.data.ndim <= 2, 'the target inds array''s ndim should be equal to 1'
        
        softmax_output = np.exp(self.data)/np.exp(self.data).sum(axis=self.data.ndim-1, keepdims=True)
        gt_inds = gt.data.flatten()
        softmax_output = softmax_output.reshape(len(gt_inds), -1)
        loss = -np.log(softmax_output)[np.arange(len(gt_inds)), gt_inds]
        loss = loss.mean()     

        if self.autograd:
            new = Tensor([loss], autograd=True, creator=(self,), create_op='cross_entropy')
            new.gt = gt_inds
            new.softmax_output = softmax_output
            return new
        return Tensor([loss])
        
    def check_creator_grad_count(self):
        for c in self.children:
            if self.children[c] != 0:
                return False
        return True
    
    def backward(self, grad=None, child_grad_node=None):
        if not self.autograd:
            return
        
        if child_grad_node is not None:
            if self.children[child_grad_node.id] == 0:
                assert self.children[child_grad_node.id] != 0, \
                'creator %d'' children %d has grad count == 0, backprop can has one pass' % (self.id, child_grad_node.id)
            else:
                self.children[child_grad_node.id] -= 1

        if grad is None:
            grad = Tensor(np.ones_like(self.data, dtype=np.float, shape=self.shape))
            
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
   
        if self.creator is not None and self.check_creator_grad_count():
            if self.create_op == 'add':
                self.creator[0].backward(self.grad, self)
                self.creator[1].backward(self.grad, self)
            elif self.create_op == 'neg':
                self.creator[0].backward(self.grad.__neg__(), self)
            elif self.create_op == 'sub':
                self.creator[0].backward(self.grad, self)
                self.creator[1].backward(self.grad.__neg__(), self)
            elif self.create_op == 'mul':
                self.creator[0].backward(self.grad*self.creator[1], self)
                self.creator[1].backward(self.grad*self.creator[0], self)
            elif self.create_op == 'transpose':
                self.creator[0].backward(self.grad.transpose(), self)
            elif self.create_op == 'mm':
                c0 = Tensor(self.creator[0].data, autograd=False)  # no auto grad
                c1 = Tensor(self.creator[1].data, autograd=False)  # no auto grad
                self.creator[0].backward(self.grad.mm(c1.transpose()), self)
                self.creator[1].backward(c0.transpose().mm(self.grad), self)
            elif self.create_op[:4] == 'sum_':
                dim = int(self.create_op[4:])
                new = self.grad.expand(dim, self.creator[0].data.shape[dim])
                self.creator[0].backward(new, self)
            elif self.create_op[:7] == 'expand_':
                dim = int(self.create_op[7:])
                new = self.grad.sum(dim)
                self.creator[0].backward(new, self)
            elif self.create_op == 'relu':
                factor = np.where(self.data>0, 1, 0)
                self.creator[0].backward(self.grad*factor, self)
            elif self.create_op == 'sigmoid':
                new = Tensor(self.data*(1-self.data))*self.grad
                self.creator[0].backward(new, self)
            elif self.create_op == 'tanh':
                new = Tensor(1-self.data*self.data)*self.grad
                self.creator[0].backward(new, self)
            elif self.create_op == 'index_select':
                new_grad = np.zeros_like(self.creator[0].data)
                inds = self.ind_sel.data.flatten()
                grad_ = self.grad.data.reshape(len(inds),-1)
                for i in range(len(inds)):
                    new_grad[inds[i]] += grad_[i]
                self.creator[0].backward(Tensor(new_grad), self)
            elif self.create_op == 'cross_entropy':
                new_grad = self.softmax_output.copy()
                new_grad[np.arange(len(self.gt)), self.gt] -= 1
                new_grad = Tensor(new_grad)*self.grad
                self.creator[0].backward(new_grad, self)
            elif self.create_op[:5] == 'mean_':
                dim = int(self.create_op[5:])
                copies = self.creator[0].data.shape[dim]
                new = self.grad.expand(dim, copies)
                new.data = new.data*1.0/copies
                self.creator[0].backward(new, self)

    def zero_grad(self):
        self.grad = None
       
    def step(self, alpha):
        if self.grad is None:
            return
        self.data -= self.grad.data*alpha
    
    def __str__(self):
        return str(self.data.__str__())
    
    def __repr__(self):
        return str(self.data.__repr__())
    
    def __getitem__(self, ind):
        if isinstance(ind, int) or isinstance(ind, slice): # n是索引 or 切片
            return self.data[ind]
        elif isinstance(ind, tuple):
            a_ind, b_ind = ind
            return self.data[a_ind, b_ind]
            

In [34]:
# add sigmoid & tanh & word embedding layer
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x.sigmoid()
    
class Tanh(Layer):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x.tanh()

class EmbeddingLayer(Layer):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        w = np.random.rand(vocab_size, hidden_size)
        self.embedding_weights = Tensor(w, autograd=True)
        self.parameters.append(self.embedding_weights)
        
    def forward(self, words):
        return self.embedding_weights.index_select(words)
    
class CrossEntropyLoss(Loss):
    def __init__(self):
        super().__init__()
    
    def __call__(self, pred, target):
        '''
        The function combines softmax & cross entropy
        '''
        return pred.cross_entropy(target)

In [35]:
# op on index_select

In [36]:
x = Tensor(np.eye(5), autograd=True)
x.index_select(Tensor([[1,2,3],[2,3,4]])).backward()
print(x.grad)

[[0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1.]]


In [37]:
x = Tensor(np.random.rand(2,4), autograd=True)
gt = Tensor(np.array([1,0]))
y = x.cross_entropy(gt)
print('x:',x)
print('y:', y)
print('x'' softmax output:', y.softmax_output)
print('gt dist matrix:', y.gt)

x: [[0.88533766 0.67987946 0.45612977 0.48340862]
 [0.78873943 0.22944183 0.8802976  0.31369239]]
y: [1.26928082]
x softmax output: [[0.31904777 0.25979234 0.20770794 0.21345196]
 [0.30401173 0.17377627 0.33316054 0.18905146]]
gt dist matrix: [1 0]


In [38]:
y.backward()
print(x.grad)

[[ 0.31904777 -0.74020766  0.20770794  0.21345196]
 [-0.69598827  0.17377627  0.33316054  0.18905146]]


In [39]:
# put all together(without cross entropy)
import numpy
np.random.seed(0)

data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

embed = EmbeddingLayer(5,3)
model = Sequential([embed, Tanh(), LinearLayer(3,1), Sigmoid()])
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.5)

for i in range(20):
    # zero grad
    optim.zero_grad()
    # predict
    pred = model.forward(data)
    # loss
    loss = criterion(pred, target)
    # backward
    loss.backward()
    # step grad
    optim.step()
    # print loss
    print(loss[0])

1.237625619852779
0.9392089326123719
0.7078099783922267
0.5197284758168169
0.36610028172821235
0.2551263989440395
0.18256411199440728
0.13620932854196413
0.10586065235996045
0.08517312638904313
0.0704825128526079
0.05966252515717865
0.05144032172950523
0.04502472491541149
0.03990542615281734
0.03574191459555345
0.032299977764343464
0.02941412761239609
0.026964621528852706
0.02486297336106265


# neural network with cross entropy loss

In [40]:
# put all together(with cross entropy)
import numpy
np.random.seed(0)

data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

embed = EmbeddingLayer(5,3)
model = Sequential([embed, Tanh(), LinearLayer(3,2)])
criterion = CrossEntropyLoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.25)

for i in range(10):
    # zero grad
    optim.zero_grad()
    # predict
    pred = model.forward(data)
    # loss
    loss = criterion(pred, target)
    # backward
    loss.backward()
    # step grad
    optim.step()
    # print loss
    print(loss[0])

0.7015193954153844
0.661821171510658
0.6180748425684603
0.5504361740525101
0.4526345603907933
0.3424455258888502
0.2500484390577531
0.18608274753695986
0.14402676254263264
0.11577686966363329


In [41]:
def softmax(pred):
    return np.exp(pred.data)/np.exp(pred.data).sum(axis=1, keepdims=True)
def predict(softmax_output):
    return softmax_output.argmax(axis=1)
print(pred)
print(softmax(pred))
print('target:',target)
print('predict:', predict(softmax(pred)))

[[ 2.08439576e+00 -1.64601813e-03]
 [-1.19118162e+00  9.18184926e-01]
 [ 2.08439576e+00 -1.64601813e-03]
 [-1.19118162e+00  9.18184926e-01]]
[[0.88953909 0.11046091]
 [0.10818977 0.89181023]
 [0.88953909 0.11046091]
 [0.10818977 0.89181023]]
target: [[0]
 [1]
 [0]
 [1]]
predict: [0 1 0 1]


# now it is for rnn

In [42]:
class RNNCell(Layer):
    def __init__(self, embedding_size, hidden_size, vocab_size, activation='sigmoid'):
        super(RNNCell, self).__init__()
        
        self.input_weights = LinearLayer(embedding_size, hidden_size)
        self.hidden_state = LinearLayer(hidden_size, hidden_size)
        self.output_weights = LinearLayer(hidden_size, vocab_size)
        
        if activation == 'sigmoid':
            self.activation = Sigmoid()
        else:
            self.activation = Tanh()
            
        self.parameters = self.input_weights.get_parameters()+self.hidden_state.get_parameters()+self.output_weights.get_parameters()
        
    def forward(self, input, hidden):
        word_input = self.input_weights.forward(input)
        cur_hidden = self.hidden_state.forward(hidden) + word_input
        cur_hidden = self.activation.forward(cur_hidden)
        output = self.output_weights.forward(cur_hidden)
        
        return output, cur_hidden

class RNN_Model(Sequential):
    def __init__(self, embedding_size, hidden_size, vocab_size):
        super(RNN_Model, self).__init__()
        
        self.word_embedding = EmbeddingLayer(vocab_size, embedding_size)
        self.rnn = RNNCell(embedding_size, hidden_size, vocab_size)
        
        self.add(self.word_embedding)
        self.add(self.rnn)
        
    def forward(self, input, hidden):
        word_embeds = self.word_embedding.forward(input)
        output, hidden = self.rnn.forward(word_embeds, hidden)
        
        return output, hidden
        

In [43]:
class QA_Dataset:
    def __init__(self, path):
        self.path = path
        self.start_token = '<START>'
        self.pad_token = '<PAD>'
        self.word2ind = {}
        self.ind2word = {}
        self.corpus = []
    
    def remove_noneed(self, r):
        no_needs = ['0','1','2','3','4','5','6','7','8','9','\n','\t1', '\t2','\t3','\t4','\t5''\t6','\t7','\t8','\t9', '.','?','\t']
        for n in no_needs:
            r = r.replace(n, '')
        return r

    def parse(self):
        f = open(path)
        raw = f.readlines()
        f.close()
        self.corpus = [self.remove_noneed(r).split(' ')[1:] for r in raw]
        
        # add pad & start token
        ind = 0
        self.word2ind[self.pad_token] = ind
        self.ind2word[ind] = self.pad_token
        ind = 1
        self.word2ind[self.start_token] = ind
        self.ind2word[ind] = self.start_token
        ind = 2
        
        self.max_len = 0
        for sent in self.corpus:
            if self.max_len < len(sent):
                self.max_len = len(sent)
                
            for w in sent:
                if w not in self.word2ind:
                    self.word2ind[w] = ind
                    self.ind2word[ind] = w
                    ind += 1
        #self.max_len += 1  # add start token
        np.random.shuffle(self.corpus)
        
    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        sent = self.corpus[idx]
        sent_inds = np.array([self.word2ind[self.pad_token]]*(self.max_len-len(sent))+[self.word2ind[w] for w in sent])
        return sent_inds
    
    def get_sent(self, idx):
        return self.corpus[idx]
    
    def get_vob_len(self):
        return len(self.word2ind)
    
    def get_start_token(self):
        return self.start_token


# do explore in dataset

In [44]:
from torch.utils.data import DataLoader
path = '../tasksv11/en/qa1_single-supporting-fact_train.txt'
qa_ds = QA_Dataset(path)
qa_ds.parse()
print('max length is', qa_ds.max_len)

max length is 6


In [45]:
batch_size = 1
train_dataloader = DataLoader(qa_ds, batch_size=batch_size, shuffle=True)  #20000
print(qa_ds.get_sent(0), 'ind:', qa_ds[0])
print('we have', len(qa_ds),'training samples.')

['Mary', 'travelled', 'to', 'the', 'kitchen'] ind: [ 0  2 18  4  5 20]
we have 3000 training samples.


In [46]:
batch_size = 16
train_dataloader = DataLoader(qa_ds, batch_size=batch_size, shuffle=True)  #20000
batch_x = iter(train_dataloader).next()
print(batch_x)
print('-'*64)
for idx, one_sent in enumerate(batch_x):
    print(idx, sep ='.', end=' ')
    for one_word in one_sent:
        print(qa_ds.ind2word[one_word.item()],sep=' ', end=' ')
    print('')


tensor([[ 0, 14, 18,  4,  5, 19],
        [ 0,  0, 10, 11, 14,  6],
        [ 0, 14, 17,  4,  5,  6],
        [ 0,  2,  8,  4,  5, 15],
        [ 0,  0, 10, 11,  7,  6],
        [ 0, 14,  8,  4,  5, 20],
        [ 0,  0, 10, 11, 12,  9],
        [ 0, 14, 18,  4,  5, 15],
        [ 0, 14,  3,  4,  5, 15],
        [ 0, 14,  8,  4,  5,  9],
        [ 0,  7, 17,  4,  5, 16],
        [ 0,  0, 10, 11, 12,  9],
        [ 0,  2,  8,  4,  5,  6],
        [ 0,  2,  8,  4,  5, 20],
        [ 0,  0, 10, 11, 14,  9],
        [ 0,  0, 10, 11, 12,  9]])
----------------------------------------------------------------
0 <PAD> Sandra travelled to the bedroom 
1 <PAD> <PAD> Where is Sandra bathroom 
2 <PAD> Sandra journeyed to the bathroom 
3 <PAD> Mary went to the garden 
4 <PAD> <PAD> Where is John bathroom 
5 <PAD> Sandra went to the kitchen 
6 <PAD> <PAD> Where is Daniel hallway 
7 <PAD> Sandra travelled to the garden 
8 <PAD> Sandra moved to the garden 
9 <PAD> Sandra went to the hallway 
10 <PAD> 

In [72]:
epoches = 15
batch_size = 100
path = '../tasksv11/en/qa1_single-supporting-fact_train.txt'
qa_ds = QA_Dataset(path)
qa_ds.parse()
train_dataloader = DataLoader(qa_ds, batch_size=batch_size, shuffle=True)  #20000

word_embedding_size = 64
hidden_size = 64
vocab_size = qa_ds.get_vob_len()
model = RNN_Model(word_embedding_size, hidden_size, vocab_size)
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.001)

for epoch in range(epoches):
    total_acc = 0
    total_loss = 0
    counter = 0
    for one_bs in train_dataloader:
        hidden = Tensor(np.zeros((batch_size, hidden_size)), autograd=True)
        
        optim.zero_grad()
        
        for i in range(qa_ds.max_len-1):
            x = one_bs[:,i]
            y = one_bs[:,i+1]

            input = Tensor(x.detach().cpu().numpy(), autograd=True)
            target = Tensor(y.detach().cpu().numpy(), autograd=True)

            pred, hidden = model.forward(input, hidden)
            
            loss = criterion(pred, target)
            acc = (pred.data.argmax(axis=1) == target.data).mean()
            total_loss += loss[0]
            total_acc += acc
            counter += 1
        
            loss.backward()
        optim.step()
            
    print('In epoch %d, nn gets loss %.4f, acc %.4f' % (epoch, total_loss/counter, total_acc/counter))

TypeError: super(type, obj): obj must be an instance or subtype of type

In [48]:
pred.data.argmax(axis=1)

array([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19])

In [49]:
target

array([ 6,  6,  6, 19,  9,  9,  9,  6,  6, 15,  6,  9, 16,  6,  6, 15,  9,
        6, 15, 16,  6, 16, 20,  6, 15, 20,  9, 20,  9,  6,  6, 16, 15, 15,
       16, 16, 20, 16, 16, 19, 20, 19, 15,  6, 20, 20, 15, 16, 19, 20,  6,
       15, 20,  9, 16,  6, 19, 20, 15, 15, 15, 20,  6,  6, 15, 15,  9, 15,
        6,  9, 19, 15, 16,  6, 16, 19, 15,  9, 15, 20,  9, 15, 15,  9, 20,
        6, 20, 16,  6, 15,  6,  6, 20, 16, 19, 20, 20,  6, 19, 19])

# Get one word, then predict one word

In [50]:
test_num = 5
for i in range(test_num):
    sent = qa_ds[i]
    print('Gt sentence:', ' '.join(qa_ds.get_sent(i)))
    pred_sent = ""
    hidden = Tensor(np.zeros((1, hidden_size)), autograd=True)
    for j in range(len(sent)-1):
        x = sent[j]
        gt = sent[j+1]
        
        input = Tensor([x], autograd=True)
        pred, hidden = model.forward(input, hidden)
        pred_y = pred.data.argmax(axis=1)[0]
        print('Get', qa_ds.ind2word[x],'Pred:', qa_ds.ind2word[pred_y], 'Gt:', qa_ds.ind2word[gt])
        pred_sent += qa_ds.ind2word[pred_y] +" "
    print('Pred sentence:', pred_sent)
    print('-'*32)

Gt sentence: Mary moved to the garden
Get <PAD> Pred: <PAD> Gt: Mary
Get Mary Pred: journeyed Gt: moved
Get moved Pred: to Gt: to
Get to Pred: the Gt: the
Get the Pred: bathroom Gt: garden
Pred sentence: <PAD> journeyed to the bathroom 
--------------------------------
Gt sentence: Where is Mary kitchen
Get <PAD> Pred: <PAD> Gt: <PAD>
Get <PAD> Pred: Where Gt: Where
Get Where Pred: is Gt: is
Get is Pred: Mary Gt: Mary
Get Mary Pred: bathroom Gt: kitchen
Pred sentence: <PAD> Where is Mary bathroom 
--------------------------------
Gt sentence: John went to the kitchen
Get <PAD> Pred: <PAD> Gt: John
Get John Pred: journeyed Gt: went
Get went Pred: to Gt: to
Get to Pred: the Gt: the
Get the Pred: bathroom Gt: kitchen
Pred sentence: <PAD> journeyed to the bathroom 
--------------------------------
Gt sentence: Where is Daniel office
Get <PAD> Pred: <PAD> Gt: <PAD>
Get <PAD> Pred: Where Gt: Where
Get Where Pred: is Gt: is
Get is Pred: Mary Gt: Daniel
Get Daniel Pred: bathroom Gt: office
Pre

# Get words(or context, or sequence), then predict one word

In [55]:
test_num = 5
for i in range(test_num):
    sent = qa_ds[i]
    print('Gt sentence:', ' '.join(qa_ds.get_sent(i)))
    context = ""
    hidden = Tensor(np.zeros((1, hidden_size)), autograd=True)
    for j in range(len(sent)-1):
        x = sent[j]
        gt = sent[j+1]
        context += qa_ds.ind2word[x]+" "
        
        input = Tensor([x], autograd=True)
        pred, hidden = model.forward(input, hidden)
        
    pred_y = pred.data.argmax(axis=1)[0]
    print('Context:', context,'Pred:', qa_ds.ind2word[pred_y], '-- Gt:', qa_ds.ind2word[gt])
    print('-'*64)

Gt sentence: Mary moved to the garden
Context: <PAD> Mary moved to the  Pred: bathroom -- Gt: garden
----------------------------------------------------------------
Gt sentence: Where is Mary kitchen
Context: <PAD> <PAD> Where is Mary  Pred: bathroom -- Gt: kitchen
----------------------------------------------------------------
Gt sentence: John went to the kitchen
Context: <PAD> John went to the  Pred: bathroom -- Gt: kitchen
----------------------------------------------------------------
Gt sentence: Where is Daniel office
Context: <PAD> <PAD> Where is Daniel  Pred: bathroom -- Gt: office
----------------------------------------------------------------
Gt sentence: Where is Sandra bedroom
Context: <PAD> <PAD> Where is Sandra  Pred: bathroom -- Gt: bedroom
----------------------------------------------------------------
