### BASIC NEURAL NETWORK TEMPLATE

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Single-layer linear network (no hidden layer, no activation) trained one
# sample at a time; the mean squared error of an epoch is recorded every
# 1000 epochs and plotted at the end.
epochs_num, lr = 100000, 0.01
errs, iters = [], []

# Inputs and targets are each scaled by their own maximum value.
inputs = np.array([[19, 28, 37], [46, 55, 64], [73, 82, 91]], dtype=float)
inputs /= inputs.max()
outputs = np.array([[91, 82, 73], [64, 55, 46], [37, 28, 19]], dtype=float)
outputs /= outputs.max()

n_inputs, n_outputs = inputs.shape[1], outputs.shape[1]
weights = np.random.random((n_inputs, n_outputs))

for iteration in range(epochs_num):
    total_err = 0
    for i in range(len(inputs)):
        # Slicing with i:i+1 keeps the sample 2-D, shape (1, n_inputs).
        layer0 = inputs[i:i + 1]
        layer1 = layer0.dot(weights)
        layer1_delta = outputs[i:i + 1] - layer1
        total_err += (layer1_delta ** 2).sum()
        # Delta is (target - prediction), so the update is added.
        weights += lr * layer0.T.dot(layer1_delta)
    if (iteration + 1) % 1000:
        continue
    # Only every 1000th epoch reaches this point.
    iters.append(iteration)
    errs.append(total_err / len(inputs))

plt.plot(iters, errs)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()
print("Final loss:", total_err / len(inputs))

### OPTIMIZED NEURAL NETWORK WITH HIDDEN LAYERS

#### Activation functions
**tanh**:
`def tanh(x): return np.tanh(x)`

**relu**:
`def relu(x): return (x * (x > 0))`

**sigmoid**:
`def sigmoid(x): return (1 / (1 + np.exp(-x)))`

**softmax**:
`def softmax(x): return (np.exp(x) / np.sum(np.exp(x), axis = 1, keepdims = True))`


#### Activation2Deriv functions
**tanh2deriv**:
`def tanh2deriv(x): return (1 - x**2) `

**relu2deriv**:
`def relu2deriv(x): return x > 0`

**sigmoid2deriv**:
`def sigmoid2deriv(x): return x * (1 - x)`

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def relu(x):
    """Rectified linear unit: keep positive entries of x, zero out the rest."""
    positive = x > 0
    return x * positive
def relu2deriv(x):
    """ReLU derivative as a boolean mask: True where x is strictly positive."""
    is_positive = x > 0
    return is_positive

# Two-layer network (ReLU hidden layer with a random 0/1 dropout mask) trained
# one sample at a time; the mean squared error of an epoch is recorded every
# 1000 epochs and plotted at the end.
epochs_num, lr, hidsize = 100000, 0.01, 20
errs, iters = [], []

# Inputs and targets are each scaled by their own maximum value.
inputs = np.array([[19, 28, 37], [46, 55, 64], [73, 82, 91]], dtype=float)
inputs /= inputs.max()
outputs = np.array([[91, 82, 73], [64, 55, 46], [37, 28, 19]], dtype=float)
outputs /= outputs.max()

n_inputs, n_outputs = inputs.shape[1], outputs.shape[1]
weights01 = np.random.random((n_inputs, hidsize))
weights12 = np.random.random((hidsize, n_outputs))

for iteration in range(epochs_num):
    total_err = 0
    for i in range(len(inputs)):
        # Forward pass; i:i+1 keeps the sample 2-D, shape (1, n_inputs).
        layer0 = inputs[i:i + 1]
        layer1 = relu(layer0.dot(weights01))
        # Each hidden unit is kept (1) or dropped (0) with equal probability.
        dropout_mask = np.random.randint(2, size=layer1.shape)
        layer1 *= dropout_mask
        layer2 = layer1.dot(weights12)

        # Backward pass: both deltas are computed before any weight changes.
        layer2_delta = outputs[i:i + 1] - layer2
        hidden_signal = layer2_delta.dot(weights12.T)
        layer1_delta = hidden_signal * relu2deriv(layer1) * dropout_mask
        total_err += (layer2_delta ** 2).sum()

        # Deltas are (target - prediction), so the updates are added.
        weights12 += lr * layer1.T.dot(layer2_delta)
        weights01 += lr * layer0.T.dot(layer1_delta)
    if (iteration + 1) % 1000 == 0:
        iters.append(iteration)
        errs.append(total_err / len(inputs))

plt.plot(iters, errs)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()
print("Final loss:", total_err / len(inputs))

### CONVOLUTIONAL NEURAL NETWORK (CNN)

#### Approach:

**Layer0 -> Layer1**

Instead of taking the whole layer0 and multiplying it by weights01, we split layer0 into a bunch of sections of size ```(kernel_rows, kernel_cols)```. We then take the resulting sections tensor of shape ```(images_n, sections_n, 1, kernel_rows, kernel_cols)``` and flatten it so that its shape becomes ```(images_n * sections_n, kernel_rows * kernel_cols)```. We do this because kernels (a kind of "weights" for finding signs, i.e. features, in the data, which is usually an image) return a list of probabilities, one per sign, for a single section. So we pass the kernels an input shaped like ```(n_of_all_sections_in_images_in_batch, size_of_each_section)```, and they return something shaped like ```(n_of_all_sections_in_images_in_batch, probability_for_each_sign)```.


**Layer1 -> Layer2**

After we get list of probabilities for signs for each section, we reshape it in something like ```(images_n, sections_n * kernels_n)``` (it's layer1) and feed it into weights12, after which we get list of predictions for each input (image, not section!) with shape ```(images_n, output.shape[1])```


**Calculating deltas**

To get layer2_delta we, as always, subtract our prediction from the actual output. To get layer1_delta, we dot layer2_delta with weights12 transposed, i.e. ```layer2_delta.dot(weights12.T)```. As always, we also multiply it by the derivative of the activation function, and if we are using a dropout_mask, we multiply by that too. So in the end we should get something like ```layer1_delta = layer2_delta.dot(weights12.T) * tanh2deriv(layer1) * dropout_mask```.


**Backpropagation**

To backpropagate through weights12, we dot layer1 transposed with layer2_delta, multiply the result by the learning rate, and update weights12 by this value, so we write something like ```weights12 += layer1.T.dot(layer2_delta) * alpha```

To backpropagate through the kernels, we reshape layer1_delta to shape ```(n_of_all_sections_in_images_in_batch, probability_for_each_sign)```, dot the transposed flattened input with it, multiply the result by the learning rate, and update the kernels by this value, so we write something like ```kernels += flattened_input.T.dot(layer1_delta.reshape(n_of_all_sections_in_images_in_batch, probability_for_each_sign)) * alpha```

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist

def tanh(x):
    """Hyperbolic tangent of x (thin wrapper around ``np.tanh``)."""
    activated = np.tanh(x)
    return activated
def tanh2deriv(x):
    """Return ``1 - x**2``.

    This equals the tanh derivative when x is already a tanh *output*.
    """
    squared = x ** 2
    return 1 - squared
def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row is shifted by its own maximum before exponentiation. The shift
    does not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would produce ``nan`` probabilities) when
    the inputs are large.

    Args:
        x: array of shape (rows, classes).

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

def get_image_sections (image, row_size, column_size):
    """Cut a batch of images into overlapping windows.

    Args:
        image: array indexed as (batch, rows, columns).
        row_size: window height.
        column_size: window width.

    Returns:
        A list of arrays, each of shape (batch, 1, row_size, column_size),
        ordered row-major by the window's top-left corner. Start offsets run
        over ``range(rows - row_size)`` and ``range(columns - column_size)``.
    """
    row_starts = range(image.shape[1] - row_size)
    column_starts = range(image.shape[2] - column_size)
    return [
        image[:, top:top + row_size, left:left + column_size].reshape(-1, 1, row_size, column_size)
        for top in row_starts
        for left in column_starts
    ]

# Convolution-style MNIST classifier: 3x3 kernels are applied to every image
# section, the tanh activations (with dropout) feed a softmax output layer,
# and train/test accuracy is measured every 1000 iterations.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

train_size, test_size = 1000, 3000

# Flatten each image to 784 values scaled by 1/255 and one-hot encode labels.
train_images, train_labels = (x_train[0:train_size].reshape(train_size, 28*28) / 255, np.zeros((len(y_train[0:train_size]),10)))
for i,j in enumerate(y_train[0:train_size]):
    train_labels[i][j] = 1

test_images, test_labels = (x_test[0:test_size].reshape(test_size, 28*28) / 255, np.zeros((len(y_test[0:test_size]), 10)))
for i,j in enumerate(y_test[0:test_size]):
    test_labels[i][j] = 1

row_size, column_size, kernels_n = 3, 3, 16
# Must match the number of sections produced by get_image_sections
# ((28 - row_size) * (28 - column_size)) times the number of kernels.
hidsize = (28 - row_size) * (28 - column_size) * kernels_n

kernels = 0.02 * np.random.random((row_size * column_size, kernels_n)) - 0.01
weights12 = 0.2 * np.random.random((hidsize, 10)) - 0.1

iterations, alpha, batch_size = 100000, 2, 100
# Three accumulators need three initial lists (the original supplied two).
test_accuracy, train_accuracy, iters = [], [], []


def _count_correct(images, labels):
    """Count images whose argmax prediction matches the one-hot label.

    Uses the current global ``kernels`` and ``weights12``; no dropout and no
    softmax are applied (softmax would not change the argmax).
    """
    correct = 0
    for image in range(len(images)):
        layer0 = images[image:image + 1].reshape(-1, 28, 28)
        sects = get_image_sections(layer0, row_size, column_size)
        expanded_input = np.concatenate(sects, axis = 1)
        flattened_input = expanded_input.reshape(expanded_input.shape[0] * expanded_input.shape[1], -1)
        layer1 = tanh(flattened_input.dot(kernels).reshape(1, -1))
        layer2 = layer1.dot(weights12)
        correct += int(np.argmax(layer2) == np.argmax(labels[image:image + 1]))
    return correct


for iteration in range (iterations):
    for batch_index in range (train_images.shape[0]//batch_size):
        batch_start, batch_end = batch_index * batch_size, (batch_index + 1) * batch_size
        layer0 = train_images[batch_start:batch_end].reshape(-1, 28, 28)
        sects = get_image_sections(layer0, row_size, column_size)
        expanded_input = np.concatenate(sects, axis = 1)
        # One row per (image, section) pair, one column per pixel of a section.
        flattened_input = expanded_input.reshape(expanded_input.shape[0] * expanded_input.shape[1], -1)
        layer1 = tanh(flattened_input.dot(kernels)).reshape(batch_size, -1)
        dropout_mask = np.random.randint(2, size=layer1.shape)
        # Half the units are dropped on average, so survivors are doubled.
        layer1 *= dropout_mask * 2
        layer2 = softmax(layer1.dot(weights12))
        # layer2.shape[0] equals batch_size, so this divides by batch_size ** 2
        # (kept from the original scaling).
        layer2_delta = (train_labels[batch_start:batch_end] - layer2) / (batch_size * layer2.shape[0])
        # Backpropagate the output *error* (layer2_delta), not the raw
        # prediction (layer2) as the original did.
        layer1_delta = layer2_delta.dot(weights12.T) * dropout_mask * tanh2deriv(layer1)
        weights12 += layer1.T.dot(layer2_delta) * alpha
        kernels += flattened_input.T.dot(layer1_delta.reshape(-1, kernels_n)) * alpha

    if not ((iteration + 1) % 1000):
        train_correct_cnt = _count_correct(train_images, train_labels)
        test_correct_cnt = _count_correct(test_images, test_labels)
        print(test_correct_cnt)
        test_accuracy.append(test_correct_cnt / test_size)
        # Record the train *count* ratio (the original divided the list itself).
        train_accuracy.append(train_correct_cnt / train_size)
        iters.append(iteration)

plt.plot(iters, test_accuracy, label = "Test_acc")
plt.plot(iters, train_accuracy, label = "Train_acc")
# Without a legend the labels given to plot() are never displayed.
plt.legend()
plt.show()
# The original printed the undefined name `correct_cnt`.
print(f"Final test accuracy: {test_accuracy[-1]}")

## INTRODUCTION TO NLP


### SENTIMENT ANALYSIS

In [None]:
import numpy as np

def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Tiny sentiment classifier: each sentence is the sum of its word embeddings
# (rows of weights01) passed through a sigmoid hidden layer and a sigmoid
# output unit.
train_input = ["it has fur", "its fur is really nice", "you are a nice guy", "she is talkative", "good boy"]
train_output = [1, 1, 0, 0, 1]

allwords = set(" ".join(train_input).split())
# Sorting makes the word -> index mapping independent of set iteration order,
# which can change between interpreter runs.
vocab = {word: i for i, word in enumerate(sorted(allwords))}

iterations, alpha, hidsize = 100000, 0.1, 30

weights01 = 0.2 * np.random.random((len(vocab), hidsize)) - 0.1
weights12 = 0.2 * np.random.random((hidsize, 1)) - 0.1

# The word indices of each sentence never change, so build them once instead
# of on every epoch.
sentence_indices = [[vocab[word] for word in sentence.split()] for sentence in train_input]

for iteration in range (iterations):
    # Iterating the pairs directly avoids a loop variable named `input`,
    # which would shadow the builtin.
    for layer0, target in zip(sentence_indices, train_output):
        layer1 = sigmoid(np.sum(weights01[layer0],axis = 0))
        layer2 = sigmoid(layer1.dot(weights12))

        # Deltas are (target - prediction), so the updates below are added.
        layer2_delta = target - layer2
        layer1_delta = layer2_delta.dot(weights12.T)

        weights12 += np.outer(layer1, layer2_delta) * alpha
        weights01[layer0] += layer1_delta * alpha

        print(layer2_delta**2)

### BAG OF WORDS (MISSING WORD PREDICTION)

In [None]:
import numpy as np
import math
from collections import Counter
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def find_similar(target):
    """Return the 10 vocabulary words whose embeddings are closest to target's.

    Reads the module-level ``vocab`` (word -> row index) and ``weights01``
    (embedding matrix). Each score is the negated Euclidean distance, so the
    closest words come first.

    Returns:
        A list of up to 10 ``(word, negative_distance)`` tuples.
    """
    def negative_distance(row):
        gap = weights01[vocab[target]] - weights01[row]
        return -math.sqrt(sum(gap ** 2))

    ranking = Counter({word: negative_distance(row) for word, row in vocab.items()})
    return ranking.most_common(10)
# Missing-word prediction with negative sampling: the mean embedding of the
# context words is scored against the true word plus `negative_size` words
# sampled at random from the corpus.

# The context manager closes the file even if reading fails.
with open('reviews.txt') as f:
    raw_reviews = f.readlines()

allwords = list(set(" ".join(raw_reviews).split()))
vocab = {}
for index, word in enumerate(allwords):
    vocab[word] = index

# input_data holds one list of word indices per review; concatenated_data is
# the whole corpus as a single flat index array (used for negative sampling).
input_data = []
concatenated_data = []
for review in raw_reviews:
    sequence_data = []
    for word in review.split():
        sequence_data.append(vocab[word])
        concatenated_data.append(vocab[word])
    input_data.append(sequence_data)
concatenated_data = np.array(concatenated_data)

hidden_size, negative_size, context_windows_size, iterations, alpha = 30, 8, 4, 3, 0.05
weights01 = np.random.rand(len(vocab), hidden_size)
weights12 = np.random.rand(len(vocab), hidden_size)

# Target vector: position 0 is the true word, the rest are negative samples.
last_layer_target = np.zeros(negative_size + 1)
last_layer_target[0] = 1

# Computed once; the original rebuilt the repeated list on every progress print.
total_steps = len(input_data) * iterations

for iteration, input_sequence in enumerate(input_data * iterations):
    for target in range(len(input_sequence)):
        chosen_words = [input_sequence[target]] + concatenated_data[(np.random.random(negative_size) * len(concatenated_data)).astype('int')].tolist()
        layer_0 = input_sequence[max(0, target - context_windows_size):target] + input_sequence[target + 1:min(len(input_sequence), target + 1 + context_windows_size)]
        if not layer_0:
            # A one-word review has no context; the mean of an empty selection
            # is nan, and that nan would be written into weights12.
            continue
        layer_1 = np.mean(weights01[layer_0], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights12[chosen_words].T))
        # Deltas are (prediction - target), so the updates below are subtracted.
        layer_2_delta = layer_2 - last_layer_target
        layer_1_delta = layer_2_delta.dot(weights12[chosen_words])
        weights01[layer_0] -= layer_1_delta * alpha
        weights12[chosen_words] -= np.outer(layer_2_delta, layer_1) * alpha
    if not iteration % 5000:
        print(f"Progress: {iteration} / {total_steps}")
        print(find_similar("beautiful"))

### RECURRENT NEURAL NETWORK (RNN) (MISSING WORD PREDICTION)

In [None]:
import numpy as np
import random

def softmax(x):
    """Softmax of x, normalised along axis 0.

    The global maximum of x is subtracted before exponentiation.
    """
    exps = np.exp(x - np.max(x))
    return exps / exps.sum(axis=0)

# Linear recurrent network for next-word prediction: the hidden state is
# hidden.dot(recurrent) + embedding of the current word, and each hidden state
# predicts the following word through a softmax over the vocabulary.

# The context manager closes the file even if reading fails.
with open("qa1_single-supporting-fact_train.txt", 'r') as f:
    raw_data = f.readlines()

# Lower-case each line, remove the newline and drop the first space-separated
# token (presumably the line number of the dataset -- confirm against the file).
input_data_raw = list()
for sentence in raw_data:
    input_data_raw.append(sentence.lower().replace("\n", "").split(" ")[1:])

allwords = set()
for sentence in input_data_raw:
    for word in sentence:
        allwords.add(word)

vocab = {}
for index, word in enumerate(allwords):
    vocab[word] = index
# Reverse mapping built once; the original searched list(vocab.values()) for
# every printed word.
index_to_word = {index: word for word, index in vocab.items()}

input_data = []
for sentence in input_data_raw:
    sentence_data = list()
    for word in sentence:
        sentence_data.append(vocab[word])
    input_data.append(sentence_data)

embeding_size = 10

start = np.zeros(embeding_size)
prediction_weights = (np.random.random((embeding_size, len(vocab))) - 0.5) * 0.1
recurrent = np.eye(embeding_size)
embeddings = (np.random.random((len(vocab), embeding_size)) - 0.5) * 0.1
onehots = np.eye(len(vocab))
iterations = 100000
alpha = 0.001

for iteration in range(iterations):
    loss = 0
    input_sequence = input_data[iteration % len(input_data)]
    if not input_sequence:
        # An empty sentence has nothing to predict; the backward pass and the
        # division by the sequence length below would both fail on it.
        continue

    # Forward pass. layers[0] holds only the initial hidden state; every later
    # entry holds the prediction made *before* seeing its word and the hidden
    # state *after* seeing it.
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    for new_word in input_sequence:
        layer = {}
        layer['prediction'] = softmax(layers[-1]['hidden'].dot(prediction_weights))
        layer['prediction_target'] = new_word
        loss += -np.log(layer['prediction'][new_word])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embeddings[new_word]
        layers.append(layer)

    # Backward pass through time, last layer first.
    for layer_i in reversed(range(len(layers))):
        layer = layers[layer_i]
        if layer_i > 0:
            layer['output_delta'] = layer['prediction'] - onehots[layer['prediction_target']]
            prediction_delta = layer['output_delta'].dot(prediction_weights.T)
            if layer_i == len(layers) - 1:
                layer['hidden_delta'] = prediction_delta
            else:
                layer['hidden_delta'] = prediction_delta + layers[layer_i + 1]['hidden_delta'].dot(recurrent.T)
        else:
            layer['hidden_delta'] = layers[layer_i + 1]['hidden_delta'].dot(recurrent.T)

    # Weight updates, each scaled by 1 / sequence length.
    step = alpha / float(len(input_sequence))
    start -= layers[0]['hidden_delta'] * step
    for prevlayer_i, layer in enumerate(layers[1:]):
        prediction_weights -= np.outer(layers[prevlayer_i]['hidden'], layer['output_delta']) * step
        embeddings_i = input_sequence[prevlayer_i]
        embeddings[embeddings_i] -= layer['hidden_delta'] * step
        recurrent -= np.outer(layers[prevlayer_i]['hidden'], layer['hidden_delta']) * step

    if not (iteration % 1000):
        print(f"Perplexity: {np.exp(loss / len(input_sequence))}")
        trues = list()
        preds = list()
        layers = list()
        layer = {}
        layer['hidden'] = start
        layers.append(layer)
        # randrange excludes the upper bound; randint(0, len(input_data))
        # could return len(input_data) and raise IndexError.
        sequence_i = random.randrange(len(input_data))
        print(input_data_raw[sequence_i])
        for new_word in input_data[sequence_i]:
            layer = {}
            layer['prediction'] = softmax(layers[-1]['hidden'].dot(prediction_weights))
            trues.append(index_to_word[new_word])
            preds.append(index_to_word[layer['prediction'].argmax()])
            layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embeddings[new_word]
            layers.append(layer)
        # The first word is skipped: its prediction comes from the start state
        # alone.
        for i in range(1, len(trues)):
            print(f"True: {trues[i]}  Prediction: {preds[i]} ")

Perplexity: 83.07716653366394
['where', 'is', 'mary?', '\tgarden\t11']
True: is  Prediction: kitchen. 
True: mary?  Prediction: 	hallway	2 
True: 	garden	11  Prediction: 	garden	14 
Perplexity: 82.72856350062432
['john', 'went', 'back', 'to', 'the', 'bathroom.']
True: went  Prediction: 	hallway	13 
True: back  Prediction: 	office	2 
True: to  Prediction: 	office	2 
True: the  Prediction: the 
True: bathroom.  Prediction: the 
Perplexity: 82.82384944279562
['mary', 'went', 'to', 'the', 'kitchen.']
True: went  Prediction: 	office	5 
True: to  Prediction: 	office	5 
True: the  Prediction: 	garden	10 
True: kitchen.  Prediction: the 
Perplexity: 82.71018252125117
['sandra', 'went', 'to', 'the', 'bedroom.']
True: went  Prediction: the 
True: to  Prediction: the 
True: the  Prediction: the 
True: bedroom.  Prediction: the 
Perplexity: 81.84271188042023
['daniel', 'went', 'back', 'to', 'the', 'hallway.']
True: went  Prediction: the 
True: back  Prediction: the 
True: to  Prediction: the 
True

## FRAMEWORK

In [54]:
import numpy as np
class Tensor (object):
    """Minimal autograd tensor wrapping a numpy array.

    Every operation on autograd tensors returns a new Tensor that remembers
    its inputs (``creators``) and the operation name (``creation_op``).
    ``backward`` uses that record to push gradients back to the inputs.

    Attributes:
        data: the wrapped ``np.ndarray``.
        autograd: whether this tensor takes part in gradient tracking.
        creation_op: name of the operation that produced this tensor, or None.
        creators: list of input tensors of that operation, or None.
        grad: accumulated gradient (a Tensor), or None before ``backward``.
        children: maps child tensor id -> number of gradients still expected
            from that child.
        id: identifier used as the key in the creators' ``children`` dicts.
    """

    def __init__(self,data, autograd=False,creation_op=None, creators=None, id=None):
        self.data=np.array(data)
        self.creation_op=creation_op
        self.creators=creators
        self.grad=None
        self.children={}
        self.autograd=autograd
        if (id is None):
            # Random ids are not guaranteed unique; collisions are unlikely.
            id = np.random.randint(0,100000)
        self.id = id
        if creators:
            # Register with every creator; a creator used twice (e.g. x * x)
            # expects two gradients from this child.
            for c in creators:
                if self.id not in c.children:
                    c.children[self.id]=1
                else:
                    c.children[self.id]+=1

    def all_children_grads_accounted_for(self):
        """Return True when no child still owes this tensor a gradient."""
        for id,cnt in self.children.items():
            if(cnt != 0):
                return False
        return True

    def backward(self, grad, grad_origin=None):
        """Accumulate ``grad`` and propagate it to this tensor's creators.

        Args:
            grad: Tensor holding the gradient with respect to this tensor.
            grad_origin: the child tensor the gradient comes from, or None
                when the call starts backpropagation or the caller does not
                track the child.

        Raises:
            Exception: if ``grad_origin`` already delivered all the gradients
                it registered for.
        """
        if self.autograd:
            if grad_origin is not None:
                if not self.children[grad_origin.id]:
                    raise Exception("Cannnot backprop more than once")
                else:
                    self.children[grad_origin.id]-=1
            # Tensor defines no in-place add, so `+=` rebinds self.grad to a
            # new Tensor instead of mutating a gradient shared with others.
            if self.grad is None:
                self.grad=grad
            else:
                self.grad+=grad
            # Propagate only once every registered child has reported, unless
            # the caller gave no origin.
            if (self.creators is not None) and (grad_origin is None or self.all_children_grads_accounted_for()):
                if self.creation_op=="add":
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)
                elif self.creation_op=="transpose":
                    self.creators[0].backward(self.grad.transpose())
                elif self.creation_op=="neg":
                    self.creators[0].backward(self.grad.__neg__())
                elif self.creation_op=="mul":
                    self.creators[0].backward(self.grad*self.creators[1], self)
                    self.creators[1].backward(self.grad*self.creators[0], self)
                elif self.creation_op=="sub":
                    self.creators[0].backward(self.grad, self)
                    # Pass the origin for the subtrahend too (as "add" and
                    # "mul" do); otherwise a shared non-leaf subtrahend would
                    # propagate early and count its gradient twice.
                    self.creators[1].backward(self.grad.__neg__(), self)
                elif "sum" in self.creation_op:
                    dim=int(self.creation_op.split("_")[1])
                    ds=self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim,ds))
                elif "expand" in self.creation_op:
                    dim=int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))
                elif self.creation_op == "mm":
                    # NOTE(review): the single-input ops above and "mm" pass
                    # no grad_origin (unchanged from the original), so their
                    # creators propagate immediately on each call.
                    self.creators[0].backward(self.grad.mm(self.creators[1].transpose()))
                    self.creators[1].backward(self.grad.transpose().mm(self.creators[0]).transpose())

    def __add__(self, other):
        """Element-wise sum; tracked only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data+other.data,autograd=True,creation_op="add",creators=[self, other])
        return Tensor(self.data+other.data)

    def __neg__(self):
        """Element-wise negation."""
        if (self.autograd):
            return Tensor(self.data * -1, autograd=True, creation_op="neg",creators=[self])
        return Tensor(self.data*-1)

    def __mul__(self, other):
        """Element-wise product; tracked only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data*other.data,autograd=True,creation_op="mul",creators=[self, other])
        return Tensor(self.data*other.data)

    def __sub__(self, other):
        """Element-wise difference; tracked only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data-other.data,autograd=True,creation_op="sub",creators=[self, other])
        return Tensor(self.data-other.data)

    def sum(self, dim):
        """Sum over axis ``dim``."""
        if (self.autograd):
            return Tensor(self.data.sum(dim),autograd=True,creation_op="sum_"+str(dim),creators=[self])
        return Tensor(self.data.sum(dim))

    def mm(self, x):
        """Matrix product ``self.data.dot(x.data)``; tracked only when both use autograd."""
        if (self.autograd and x.autograd):
            return Tensor(self.data.dot(x.data),autograd=True,creation_op="mm",creators=[self, x])
        return Tensor(self.data.dot(x.data))

    def expand(self, dim,copies):
        """Insert a new axis at position ``dim`` holding ``copies`` repeats of the data."""
        # Repeat every element into a new trailing axis, then move that axis
        # to position `dim`.
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if(self.autograd):
            return Tensor(new_data,autograd=True,creation_op="expand_"+str(dim),creators=[self])
        return Tensor(new_data)

    def transpose(self):
        """Transpose of the wrapped array."""
        if(self.autograd):
            return Tensor(self.data.transpose(),autograd=True,creation_op="transpose",creators=[self])
        return Tensor(self.data.transpose())

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        # The original was named `__str___` (three trailing underscores) and
        # called `self.data__str__()`, so str() fell back to __repr__.
        return str(self.data.__str__())


### MODELS BASED ON FRAMEWORK

In [63]:
#Classic approach

# The cell uses the `np` alias throughout, so import numpy under that name
# (a plain `import numpy` only binds `numpy`).
import numpy as np
np.random.seed(0)

# Two-layer linear network trained on the full batch with hand-written
# gradients; prints the summed squared error before each update.
data=np.array([[0,0],[0,1],[1,0],[1,1]])
target=np.array([[0],[1],[0],[1]])
weights01=np.random.rand(2,3)
weights12=np.random.rand(3,1)
for i in range (10):
    # Forward pass.
    layer0=data
    layer1=layer0.dot(weights01)
    layer2=layer1.dot(weights12)
    diff=layer2-target
    sqdiff=diff**2
    loss=sqdiff.sum(0)
    # Backward pass: layer1delta is computed before weights12 is updated.
    layer1delta=diff.dot(weights12.transpose())
    weights01-=layer0.transpose().dot(layer1delta) * 0.1
    weights12-=layer1.transpose().dot(diff) * 0.1
    print(loss)


[5.06643999]
[0.49599078]
[0.41806719]
[0.35298133]
[0.29725496]
[0.2492326]
[0.20785392]
[0.17231261]
[0.14193745]
[0.1161398]


In [69]:
#Tensor approach

# The cell uses the `np` alias throughout, so import numpy under that name
# (a plain `import numpy` only binds `numpy`).
import numpy as np
np.random.seed(0)

# Same two-layer linear network, with gradients computed by the Tensor class.
data=Tensor([[0,0],[0,1],[1,0],[1,1]], autograd=True)
target=Tensor([[0],[1],[0],[1]], autograd=True)
w=list()
w.append(Tensor(np.random.rand(2,3), autograd=True))
w.append(Tensor(np.random.rand(3,1), autograd=True))
for i in range (10):
    pred=data.mm(w[0]).mm(w[1])
    diff=(pred-target)
    sqdiff=diff*diff
    loss=sqdiff.sum(0)
    # Seed backpropagation with a gradient of ones for the loss.
    loss.backward(Tensor(np.ones_like(loss.data)))
    for w_ in w:
        w_.data -= w_.grad.data * 0.1
        # Zero the gradient so the next iteration does not accumulate onto it.
        w_.grad.data *= 0
    print(loss.data)

[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]
