In [1]:
import nltk

In [2]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/mismayil/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
from nltk.corpus import reuters
text = reuters.raw()

In [4]:
from nltk.tokenize import word_tokenize

def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    return tokens

In [5]:
tokens = tokenize(text)

In [6]:
len(tokens)

1274933

In [8]:
tokens[:50]

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said',
 'they',
 'told',
 'reuter',
 'correspondents',
 'in',
 'asian',
 'capitals',
 'a',
 'move',
 'against',
 'japan',
 'might',
 'boost',
 'protectionist',
 'sentiment',
 'in',
 'the']

In [7]:
def generate_vocab(tokens):
    vocab = dict()
    
    for i, token in enumerate(set(tokens)):
        vocab[token] = i
    
    return vocab

In [8]:
vocab = generate_vocab(tokens)

In [9]:
len(vocab)

27952

In [10]:
import numpy as np
def one_hot_encode(token, vocab):
    vector = np.zeros(len(vocab))
    vector[vocab[token]] = 1
    return vector

In [15]:
def generate_train_data(tokens, vocab, context_size=2):
    V = len(vocab)
    X, y = [], []

    for i, token in enumerate(tokens):
        context = tokens[i-context_size:i] + tokens[i+1:i+context_size+1]
        context_vector = np.zeros(V)
        
        for word in context:
            context_vector += np.array(one_hot_encode(word, vocab))
        context_vector = context_vector / len(context)
        center_vector = one_hot_encode(tokens[i], vocab)
        
        X.append(context_vector)
        y.append(center_vector)
    
    return np.array(X), np.array(y)

def generate_batch_data(tokens, vocab, context_size=2, batch_size=128):
    batches = 0

    while True:
        batch_tokens = tokens[batches*batch_size:(batches+1)*batch_size]
        if batch_tokens:
            yield generate_train_data(batch_tokens, vocab, context_size)
            batches += 1
        else:
            break

In [26]:
tokens = ['I', 'think', 'therefore', 'I', 'am']

In [27]:
vocab = generate_vocab(tokens)

In [28]:
vocab

{'think': 0, 'am': 1, 'therefore': 2, 'I': 3}

In [31]:
one_hot_encode('am', vocab)

array([0., 1., 0., 0.])

In [49]:
X, Y = generate_train_data(tokens, vocab)

In [50]:
X

array([[0.5       , 0.        , 0.25      , 0.33333333, 0.        ],
       [0.        , 0.        , 0.25      , 0.33333333, 0.        ],
       [0.5       , 0.5       , 0.        , 0.33333333, 0.5       ],
       [0.        , 0.5       , 0.5       , 0.        , 0.5       ]])

In [51]:
Y

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0.]])

In [26]:
import numpy as np

class Layer:
    def __init__(self):
        self.input = None
        self.output = None
    
    def forward(self, X):
        raise NotImplementedError
    
    def backward(self):
        raise NotImplementedError

class Linear(Layer):
    def __init__(self, in_dim, out_dim):
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.weights = np.random.rand(in_dim, out_dim)
        self.bias = np.random.rand(1, out_dim)
    
    def forward(self, X):
        self.input = X
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output
    
    def backward(self, out_error, learning_rate=0.05):
        in_error = np.dot(out_error, self.weights.T)
        weights_error = np.dot(self.input.T, out_error)
        bias_error = out_error
        
        self.weights = self.weights - learning_rate * weights_error
        self.bias = self.bias - learning_rate * bias_error
        
        return in_error

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return x > 0

def softmax(x, axis=1):
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)

def cross_entropy(x, target, axis=1):
    return (-1/target.shape[0]) * np.sum(target * np.log(x))

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1-np.tanh(x)**2

class Activation(Layer):
    def __init__(self, activation):
        activation_map = {
            'relu': {
                'func': relu,
                'derivative': relu_derivative
            },
            'tanh': {
                'func': tanh,
                'derivative': tanh_derivative
            }
        }
        self.activation = activation_map[activation]['func']
        self.derivative = activation_map[activation]['derivative']
    
    def forward(self, X):
        self.input = X
        self.output = self.activation(self.input)
        return self.output
    
    def backward(self, out_error, learning_rate):
        return self.derivative(self.input) * out_error

class Loss(Layer):
    def __init__(self):
        super().__init__()
        self.target = None

class CrossEntropyLoss(Loss):
    def __init__(self):
        super().__init__()
        self.softmax_out = None

    def forward(self, X, target):
        self.input = X
        self.target = target
        self.softmax_out = softmax(self.input)
        self.output = cross_entropy(self.softmax_out, self.target)
        return self.output

    def backward(self):
        return (1/self.target.shape[0])*(self.softmax_out - self.target)

class MSELoss(Loss):
    def forward(self, X, target):
        self.input = X
        self.target = target
        self.output = np.mean(np.power(target-X, 2))
        return self.output
    
    def backward(self):
        return (2 * (self.input-self.target)) / self.target.shape[0]

class Model:
    def __init__(self):
        self.layers = []
        self.loss = None
    
    def add(self, layer):
        self.layers.append(layer)
    
    def set_loss(self, loss):
        self.loss = loss
    
    def train(self, X, y, epochs=10, learning_rate=0.05):
        for i in range(epochs):
            predictions = self.predict(X)
            
            loss = self.loss()
            cost = loss.forward(predictions, y)
            
            error = loss.backward()
            
            for layer in reversed(self.layers):
                error = layer.backward(error, learning_rate)
            
            print(f'epoch={i}, loss={cost}')
    
    def predict(self, X):
        output = X
        for layer in self.layers:
            output = layer.forward(output)
        return output

In [79]:
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

model = Model()
model.add(Linear(2, 3))
model.add(Activation('tanh'))
model.add(Linear(3, 1))
model.add(Activation('tanh'))

model.set_loss(MSELoss)
model.train(X, y, epochs=1000, learning_rate=0.1)

predictions = model.predict(X)
print(predictions)

epoch=0, loss=0.3896520997009981
epoch=1, loss=0.38480037198559675
epoch=2, loss=0.37967755673105086
epoch=3, loss=0.37427427407523217
epoch=4, loss=0.36858270723852515
epoch=5, loss=0.36259706916053525
epoch=6, loss=0.3563141026964657
epoch=7, loss=0.3497335923181594
epoch=8, loss=0.3428588552654982
epoch=9, loss=0.3356971694116138
epoch=10, loss=0.3282600850717787
epoch=11, loss=0.3205635605485538
epoch=12, loss=0.31262785882532995
epoch=13, loss=0.30447714815092775
epoch=14, loss=0.29613876458234023
epoch=15, loss=0.2876421210272675
epoch=16, loss=0.27901728421238436
epoch=17, loss=0.2702932851665606
epoch=18, loss=0.2614962748407633
epoch=19, loss=0.2526476776306834
epoch=20, loss=0.24376252539815588
epoch=21, loss=0.23484816897634125
epoch=22, loss=0.2259035627448097
epoch=23, loss=0.2169193042632268
epoch=24, loss=0.20787859094175595
epoch=25, loss=0.1987592335891994
epoch=26, loss=0.18953684005524285
epoch=27, loss=0.18018923652494723
epoch=28, loss=0.17070209860233893
epoch=29,

epoch=960, loss=0.0007115916241867606
epoch=961, loss=0.0007107653914935472
epoch=962, loss=0.0007099410211898666
epoch=963, loss=0.0007091185070576768
epoch=964, loss=0.0007082978429063958
epoch=965, loss=0.0007074790225727641
epoch=966, loss=0.0007066620399206793
epoch=967, loss=0.0007058468888410683
epoch=968, loss=0.0007050335632517157
epoch=969, loss=0.0007042220570971375
epoch=970, loss=0.0007034123643484219
epoch=971, loss=0.0007026044790030907
epoch=972, loss=0.0007017983950849485
epoch=973, loss=0.0007009941066439529
epoch=974, loss=0.0007001916077560555
epoch=975, loss=0.0006993908925230799
epoch=976, loss=0.0006985919550725524
epoch=977, loss=0.0006977947895575983
epoch=978, loss=0.0006969993901567765
epoch=979, loss=0.000696205751073956
epoch=980, loss=0.0006954138665381655
epoch=981, loss=0.0006946237308034802
epoch=982, loss=0.0006938353381488594
epoch=983, loss=0.000693048682878036
epoch=984, loss=0.0006922637593193634
epoch=985, loss=0.000691480561825699
epoch=986, loss

In [28]:
model = Model()
vocab_size = len(vocab)
embedding_size = 10
model.add(Linear(vocab_size, embedding_size))
model.add(Activation('relu'))
model.add(Linear(embedding_size, vocab_size))

model.set_loss(CrossEntropyLoss)

for X, y in generate_batch_data(tokens, vocab):
    model.train(X, y, epochs=1, learning_rate=0.1)

# predictions = model.predict(X)
# print(predictions)

epoch=0, loss=10.803965146945673
epoch=0, loss=10.717711291696356
epoch=0, loss=10.94213276844914
epoch=0, loss=10.73386338092749
epoch=0, loss=10.73069152793749
epoch=0, loss=10.726844508349833
epoch=0, loss=10.71624650221188
epoch=0, loss=10.580714668721834
epoch=0, loss=10.803319664884851
epoch=0, loss=10.67726164571343
epoch=0, loss=10.605684776818466
epoch=0, loss=10.590747177168273
epoch=0, loss=10.844783096060144
epoch=0, loss=10.555626921612205
epoch=0, loss=10.681467881972907
epoch=0, loss=10.6296962326693
epoch=0, loss=10.703142324901497
epoch=0, loss=10.55598889695597
epoch=0, loss=10.401677660358345
epoch=0, loss=10.61549000822691
epoch=0, loss=10.598286894209084
epoch=0, loss=10.504289683296696
epoch=0, loss=10.472057420050366
epoch=0, loss=10.454608306534878
epoch=0, loss=10.36555417044579
epoch=0, loss=10.359146730131911
epoch=0, loss=10.72079548756025
epoch=0, loss=10.524509107598382
epoch=0, loss=10.62787727934509
epoch=0, loss=10.355153317278704
epoch=0, loss=10.29467

epoch=0, loss=8.775674489216499
epoch=0, loss=9.2807712970578
epoch=0, loss=9.210607528407067
epoch=0, loss=8.706976676095788
epoch=0, loss=8.859109452595368
epoch=0, loss=9.031231018785865
epoch=0, loss=8.94110525127719
epoch=0, loss=8.787835097168164
epoch=0, loss=8.8552463229583
epoch=0, loss=9.075761920832644
epoch=0, loss=9.025981446312262
epoch=0, loss=9.029914609725585
epoch=0, loss=9.349298271439228
epoch=0, loss=8.933206194366239
epoch=0, loss=9.460028659421157
epoch=0, loss=9.341601097381579
epoch=0, loss=8.956456347447354
epoch=0, loss=9.2034464447335
epoch=0, loss=9.240982325019054
epoch=0, loss=9.40460646821562
epoch=0, loss=8.988757393805814
epoch=0, loss=8.823014346189757
epoch=0, loss=9.122441215533074
epoch=0, loss=8.88713560531975
epoch=0, loss=8.521330640673606
epoch=0, loss=9.159147992551706
epoch=0, loss=9.262094231482573
epoch=0, loss=9.192710154145832
epoch=0, loss=8.932489364323612
epoch=0, loss=8.972043713699895
epoch=0, loss=9.1633109093931
epoch=0, loss=9.066

epoch=0, loss=8.807200752932044
epoch=0, loss=8.326778923458289
epoch=0, loss=7.8821734846577
epoch=0, loss=9.037437267014479
epoch=0, loss=10.160206554924393
epoch=0, loss=10.167562716929604
epoch=0, loss=10.079208864515627
epoch=0, loss=9.343975270966027
epoch=0, loss=9.714053500083
epoch=0, loss=8.881630632425633
epoch=0, loss=8.442906677164379
epoch=0, loss=8.157797349450043
epoch=0, loss=9.887356227736388
epoch=0, loss=9.660810169265453
epoch=0, loss=9.568223163549469
epoch=0, loss=9.615294997878
epoch=0, loss=9.57397833467119
epoch=0, loss=9.282134188854107
epoch=0, loss=8.002554330984076
epoch=0, loss=8.029509266849983
epoch=0, loss=8.453221188545974
epoch=0, loss=8.09752160741502
epoch=0, loss=8.797901079382568
epoch=0, loss=8.65464773218109
epoch=0, loss=8.92904027426414
epoch=0, loss=8.440700937365772
epoch=0, loss=9.37044062641311
epoch=0, loss=9.330128810289507
epoch=0, loss=9.298548853826391
epoch=0, loss=8.419613590325508
epoch=0, loss=8.70853222273542
epoch=0, loss=8.354

KeyboardInterrupt: 

In [20]:
X, y = next(generate_batch_data(tokens, vocab))

In [22]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
model = Model()
vocab_size = len(vocab)
embedding_size = 10
model.add(Linear(vocab_size, embedding_size))
model.add(Activation('relu'))
model.add(Linear(embedding_size, vocab_size))

model.set_loss(CrossEntropyLoss)
X, y = generate_train_data(tokens[:1000], vocab)
model.train(X, y, epochs=100, learning_rate=0.05)

epoch=0, loss=10.728138439199919
epoch=1, loss=10.722454452383854
epoch=2, loss=10.716771774968786
epoch=3, loss=10.711090398949281
epoch=4, loss=10.705410316345377
epoch=5, loss=10.699731519203311
epoch=6, loss=10.694053999595774
epoch=7, loss=10.688377749622466
epoch=8, loss=10.682702761410516
epoch=9, loss=10.677029027115044
epoch=10, loss=10.671356538919705
epoch=11, loss=10.665685289037045
epoch=12, loss=10.660015269709247
epoch=13, loss=10.65434647320847
epoch=14, loss=10.648678891837534
epoch=15, loss=10.643012517930464
epoch=16, loss=10.637347343853063
epoch=17, loss=10.631683362003487
epoch=18, loss=10.62602056481293
epoch=19, loss=10.620358944746085
epoch=20, loss=10.61469849430205
epoch=21, loss=10.609039206014664
epoch=22, loss=10.603381072453507
epoch=23, loss=10.597724086224313
epoch=24, loss=10.592068239969779
epoch=25, loss=10.586413526370379
epoch=26, loss=10.580759938144906
epoch=27, loss=10.575107468051435
epoch=28, loss=10.569456108887872
epoch=29, loss=10.563805853