# **`MAKEMORE PART2 TURKISH`**

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline  

In [2]:
import csv

with open('turkce_isim.csv', 'r') as f:
    reader = csv.reader(f)
    names = [row[0] for row in reader]
    names.pop(0)

print(names[:8])

['aba', 'abaca', 'abacan', 'abaç', 'abay', 'abayhan', 'abaza', 'abbas']


In [3]:
words = names
words[:5]

['aba', 'abaca', 'abacan', 'abaç', 'abay']

In [4]:
turkish_sort = ['a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o','ö', 'p', 'r', 's', 'ş', 't', 'u','ü', 'v', 'w', 'x', 'y', 'z']
turkish_sort.insert(0,'.')
turkish_sort,len(turkish_sort)
stoi = {ch:i for i,ch in enumerate(turkish_sort)}
itos = {i: st for st, i in stoi.items()}
print(stoi)
print(itos)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'ç': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'ğ': 9, 'h': 10, 'ı': 11, 'i': 12, 'j': 13, 'k': 14, 'l': 15, 'm': 16, 'n': 17, 'o': 18, 'ö': 19, 'p': 20, 'r': 21, 's': 22, 'ş': 23, 't': 24, 'u': 25, 'ü': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, 'z': 31}
{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'ç', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'ğ', 10: 'h', 11: 'ı', 12: 'i', 13: 'j', 14: 'k', 15: 'l', 16: 'm', 17: 'n', 18: 'o', 19: 'ö', 20: 'p', 21: 'r', 22: 's', 23: 'ş', 24: 't', 25: 'u', 26: 'ü', 27: 'v', 28: 'w', 29: 'x', 30: 'y', 31: 'z'}


## **`Dataset`:**
**xx**

In [5]:
block_size = 3
context = [0]*block_size
X, Y = [], []
for w in words[8:14]:
    context = [0]*block_size
    print(w)
    for le in w + '.':        
        ix= stoi[le]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '===>', itos[ix])
        context = context[1:] + [ix]
X = torch.tensor(X)
Y = torch.tensor(Y)

abdal
... ===> a
..a ===> b
.ab ===> d
abd ===> a
bda ===> l
dal ===> .
abdi
... ===> a
..a ===> b
.ab ===> d
abd ===> i
bdi ===> .
abdullah
... ===> a
..a ===> b
.ab ===> d
abd ===> u
bdu ===> l
dul ===> l
ull ===> a
lla ===> h
lah ===> .
abdurrahman
... ===> a
..a ===> b
.ab ===> d
abd ===> u
bdu ===> r
dur ===> r
urr ===> a
rra ===> h
rah ===> m
ahm ===> a
hma ===> n
man ===> .
abdülalim
... ===> a
..a ===> b
.ab ===> d
abd ===> ü
bdü ===> l
dül ===> a
üla ===> l
lal ===> i
ali ===> m
lim ===> .
abdülazim
... ===> a
..a ===> b
.ab ===> d
abd ===> ü
bdü ===> l
dül ===> a
üla ===> z
laz ===> i
azi ===> m
zim ===> .


In [6]:
X.shape,X.dtype, Y.shape, Y.dtype

(torch.Size([52, 3]), torch.int64, torch.Size([52]), torch.int64)

### **`Important`:**
**What I find important about the second part is embedding. In the paper BENGIO at al. used 30 dimensional space for each word. Here Andrej uses 2 dimensional space for a letter. Which was not available in the previous example**

### **`Embeddings`:**
**Just two floating number for and integer(index of a character.)***

In [7]:
C = torch.randn(len(turkish_sort), 2)
C.shape

torch.Size([32, 2])

### **`One hot encoding`:**
**understanding the effect of one hot vector turing matrix multiplication.Simply only `one` in the vector effects the matrix multiplication.**
```python
F.one_hot(torch.tensor(5), num_classes=len(turkish_sort)).float() @ C
```
**just plucks out the 5th index from the `C` lockup tensor**


In [8]:
F.one_hot(torch.tensor(5), num_classes=len(turkish_sort)).float() @ C

tensor([-0.5029, -0.4986])

In [9]:
# for getting more than one index from C then it is possible
# to use list of the indexes to the one_hot function
# the result will be embedded vectors of the indexes
one_hot = F.one_hot(torch.tensor([5, 6]), num_classes=len(turkish_sort)).float() @ C
one_hot


tensor([[-0.5029, -0.4986],
        [ 0.6134, -0.8831]])

In [10]:
# the shape represents at the 0th index the number of embeddings
# and at the 1st index the number of the dimensions of the embedding
one_hot.shape

torch.Size([2, 2])

In [11]:
#As you put your training examples to the lookup table you will get the embeddings of the examples 
C[X].shape, C[X][:3]

(torch.Size([52, 3, 2]),
 tensor([[[ 1.4203, -0.4910],
          [ 1.4203, -0.4910],
          [ 1.4203, -0.4910]],
 
         [[ 1.4203, -0.4910],
          [ 1.4203, -0.4910],
          [-0.5457,  1.6587]],
 
         [[ 1.4203, -0.4910],
          [-0.5457,  1.6587],
          [-0.1320, -0.0494]]]))

In [12]:
# six is for the 3 letter and 2 embeddings
# 100 is arbitrary number of the hidden layer
# b1 is the bias for the hidden layer
W1 = torch.randn(6,100)
b1 = torch.randn(100)

### **`This section look like a mini pytorch course like in the previous lecture mentioning torch.sum() `:**
#### **Understand the `cat`(concatenate) `unbind`, `view` operations:**
**x**


In [13]:
emb = C[X]
emb.shape

torch.Size([52, 3, 2])

In [14]:
x = torch.randn(1, 2)
x

tensor([[ 1.4996, -0.1410]])

#### **Understand the `cat`(concatenate):**
**Pretty straighhforward dimension is the key, '0' means as rows and '1' means as columns if there were two dimensions.**


In [15]:
torch.cat((x,x,x),0)

tensor([[ 1.4996, -0.1410],
        [ 1.4996, -0.1410],
        [ 1.4996, -0.1410]])

#### **Understand the `unbind`:**
**x**

In [16]:
x3 = torch.cat((x,x,x),0)
x3

tensor([[ 1.4996, -0.1410],
        [ 1.4996, -0.1410],
        [ 1.4996, -0.1410]])

In [17]:
torch.unbind(x3,0)

(tensor([ 1.4996, -0.1410]),
 tensor([ 1.4996, -0.1410]),
 tensor([ 1.4996, -0.1410]))

#### **Understand the `view`:**
**x**

In [18]:
# first three characters of a names looks like this for two names:

first_three_of_first  = torch.randn(1,3,2)

first_three_of_second  = torch.randn(1,3,2)

together = torch.cat((first_three_of_first,first_three_of_second),0)
together



tensor([[[ 0.1736,  0.3010],
         [ 0.5734,  0.4294],
         [ 1.4347, -0.9318]],

        [[-0.2327, -0.0586],
         [-0.1716, -0.2636],
         [ 2.1780,  1.3951]]])

In [19]:
#need to conver them to a row.But without needing the first dimension which is 2 that represent different names.
together.view(-1,6)

tensor([[ 0.1736,  0.3010,  0.5734,  0.4294,  1.4347, -0.9318],
        [-0.2327, -0.0586, -0.1716, -0.2636,  2.1780,  1.3951]])

#### **`Important: how bias vector added`:**
**This is how all of the activations added not the same number but the same vector. I guess each number on the vector added to the consecutive neuron activation** 

In [20]:
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
h, h.shape

(tensor([[ 0.0723, -0.5861, -1.0000,  ..., -0.9805,  0.7936,  0.6151],
         [-0.5851, -0.3529, -0.9977,  ..., -0.9802, -0.7314,  0.9998],
         [-0.9896,  0.7565,  0.9536,  ..., -0.2332,  0.9700,  0.9146],
         ...,
         [-0.6427,  0.6573,  0.9874,  ..., -0.5188,  0.1763,  0.9906],
         [ 0.9975,  0.3514,  0.9932,  ...,  0.7096, -0.9998,  0.9555],
         [ 0.2339, -0.0394,  0.8533,  ...,  0.0878, -0.9922, -0.8178]]),
 torch.Size([52, 100]))

In [21]:
W2 =torch.randn(100, len(turkish_sort))
b2 = torch.randn(len(turkish_sort))


In [22]:
logits = h @ W2 + b2
logits, logits.shape

(tensor([[-19.5141,  10.8293,  -1.6469,  ...,  -8.2664,  -3.5521,  -1.6764],
         [ -4.0797,  16.5948,   0.4707,  ...,  -4.4218,   7.5601,  -6.4462],
         [  5.0019,  -1.2570,  -6.4912,  ...,   1.0800,   8.2391,  16.5117],
         ...,
         [  6.6508,   0.2585,  -1.6568,  ...,   3.8307,  12.4436,  13.3933],
         [  5.8239,   2.0466,   3.6497,  ...,  20.4477,   9.0831,  14.5781],
         [  2.8911,   2.3771,  -6.1821,  ...,  12.4109,   3.7470,  12.2366]]),
 torch.Size([52, 32]))

In [23]:
counts = logits.exp()
probs = counts / counts.sum(-1, keepdim=True)
probs, probs.shape

(tensor([[5.8247e-19, 8.7752e-06, 3.3488e-11,  ..., 4.4675e-14, 4.9831e-12,
          3.2516e-11],
         [4.4979e-13, 4.2837e-04, 4.2582e-11,  ..., 3.1947e-13, 5.1066e-08,
          4.2195e-14],
         [9.7736e-06, 1.8701e-08, 9.9693e-11,  ..., 1.9355e-07, 2.4884e-04,
          9.7427e-01],
         ...,
         [2.4071e-04, 4.0302e-07, 5.9368e-08,  ..., 1.4346e-05, 7.8933e-02,
          2.0404e-01],
         [4.4420e-07, 1.0165e-08, 5.0502e-08,  ..., 9.9682e-01, 1.1562e-05,
          2.8148e-03],
         [1.7078e-05, 1.0215e-05, 1.9588e-09,  ..., 2.3272e-01, 4.0193e-05,
          1.9549e-01]]),
 torch.Size([52, 32]))

In [24]:
probs[0].sum()

tensor(1.)

#### **`grad=None`:**
**During the forward pass, as far as I understand we do not record gradient because before backward pass we just make it None and as far as I understand we do not use it. I guess gradien formed hust after loss is calculated. Hope to see it during the backprop lecture** 

In [25]:
torch.arange(len(turkish_sort))

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [26]:
probs[torch.arange(len(Y)), Y], probs[torch.arange(len(Y)), Y].shape

(tensor([8.7752e-06, 4.2582e-11, 5.4013e-09, 3.6486e-01, 3.2696e-07, 4.6897e-08,
         8.7752e-06, 4.2582e-11, 5.4013e-09, 1.3505e-04, 6.0203e-06, 8.7752e-06,
         4.2582e-11, 5.4013e-09, 2.2334e-03, 9.6854e-06, 1.0000e+00, 8.3443e-08,
         4.2777e-18, 1.8286e-04, 8.7752e-06, 4.2582e-11, 5.4013e-09, 2.2334e-03,
         5.3923e-09, 7.9711e-12, 1.4610e-12, 8.9486e-12, 3.3427e-02, 5.4765e-01,
         2.3400e-13, 1.8972e-05, 8.7752e-06, 4.2582e-11, 5.4013e-09, 4.0599e-06,
         3.6201e-08, 7.5608e-15, 2.4239e-05, 4.9091e-07, 1.5448e-04, 6.6624e-05,
         8.7752e-06, 4.2582e-11, 5.4013e-09, 4.0599e-06, 3.6201e-08, 7.5608e-15,
         1.7931e-10, 1.8331e-06, 1.0411e-06, 1.7078e-05]),
 torch.Size([52]))

**these are the probabilities of the correct letters but it looks mostly very low because the model is not trained yet.**


In [27]:
#no need to any additional operation to get the loss because the expected result should be one for the correct index zero for the others.
loss = -probs[torch.arange(len(Y)), Y].log().mean()
loss

tensor(16.1361)

### **`Write the more respectable version`:**
**Before move on,do it by just checking the video.**



In [33]:
X.shape

torch.Size([52, 3])

In [None]:
W1 =torch.randn(6,100)

In [35]:
emb = C[X].view(-1,6)
W1 =torch.randn(6,100)
b1 = torch.randn(100)
W2 = torch.randn(100,32)
b = torch.randn(32)
emb.shape

torch.Size([52, 6])

In [58]:
W1

tensor([[ 8.9217e-01, -6.6178e-01,  6.1124e-01, -6.0231e-01, -1.0537e+00,
         -6.6607e-01, -2.4958e-01, -1.3112e-01, -1.0556e+00, -1.0644e+00,
         -4.0786e-01,  1.8293e-02, -6.6983e-02, -6.4250e-01, -4.3001e-01,
         -6.1038e-02,  1.8944e-01, -7.5578e-01, -8.5329e-01,  6.0211e-01,
          7.1527e-01, -1.6008e+00, -5.7841e-01, -3.9869e-01, -2.2066e-01,
         -1.0323e+00, -8.2252e-01,  1.1279e+00,  1.8025e+00, -1.4047e+00,
          6.1778e-02,  6.4445e-01,  1.0657e+00, -1.8962e-02, -1.2244e+00,
          2.8159e-01, -2.3187e-02,  1.4662e+00, -4.9876e-01, -1.1359e+00,
         -6.7188e-01,  2.4340e-01, -6.6283e-01, -8.6735e-01,  2.6425e+00,
          3.4549e-01,  1.3021e-01, -1.7434e+00,  1.1922e+00,  1.1067e+00,
         -2.3990e-01, -2.8501e-01, -5.5344e-01,  1.2156e+00, -1.8077e-01,
          1.3447e+00, -2.2380e+00, -2.1846e+00, -1.2733e+00,  4.2082e-01,
         -4.6160e-01,  9.3822e-01, -3.7169e-01,  1.4554e+00, -3.0614e+00,
         -1.4998e+00, -7.8042e-01,  2.

In [61]:
emb

tensor([[ 1.6804,  0.0523,  1.6804,  0.0523,  1.6804,  0.0523],
        [ 1.6804,  0.0523,  1.6804,  0.0523, -0.3516, -1.1509],
        [ 1.6804,  0.0523, -0.3516, -1.1509, -0.8601, -0.9394],
        [-0.3516, -1.1509, -0.8601, -0.9394, -0.3316,  1.2129],
        [-0.8601, -0.9394, -0.3316,  1.2129, -0.3516, -1.1509],
        [-0.3316,  1.2129, -0.3516, -1.1509, -0.0508, -0.9616],
        [ 1.6804,  0.0523,  1.6804,  0.0523,  1.6804,  0.0523],
        [ 1.6804,  0.0523,  1.6804,  0.0523, -0.3516, -1.1509],
        [ 1.6804,  0.0523, -0.3516, -1.1509, -0.8601, -0.9394],
        [-0.3516, -1.1509, -0.8601, -0.9394, -0.3316,  1.2129],
        [-0.8601, -0.9394, -0.3316,  1.2129, -0.5565, -1.9247],
        [ 1.6804,  0.0523,  1.6804,  0.0523,  1.6804,  0.0523],
        [ 1.6804,  0.0523,  1.6804,  0.0523, -0.3516, -1.1509],
        [ 1.6804,  0.0523, -0.3516, -1.1509, -0.8601, -0.9394],
        [-0.3516, -1.1509, -0.8601, -0.9394, -0.3316,  1.2129],
        [-0.8601, -0.9394, -0.3316,  1.2

In [71]:
p

tensor([4.0154e-33, 4.7881e-16, 1.6726e-27, 1.3335e-02, 9.5756e-19, 6.3358e-35,
        4.0154e-33, 4.7881e-16, 1.6726e-27, 4.3340e-38, 5.0287e-34, 4.0154e-33,
        4.7881e-16, 1.6726e-27, 1.5675e-34, 2.1281e-30, 3.4889e-20, 7.3150e-30,
        2.4121e-31, 1.5839e-20, 4.0154e-33, 4.7881e-16, 1.6726e-27, 1.5675e-34,
        8.3237e-32, 8.1730e-28, 2.1254e-30, 2.0603e-35, 5.1965e-28, 6.7585e-32,
        1.5540e-22, 1.2176e-29, 4.0154e-33, 4.7881e-16, 1.6726e-27, 1.0678e-18,
        5.3318e-20, 6.5975e-40, 1.7326e-10, 7.3788e-33, 4.1279e-08, 1.6297e-23,
        4.0154e-33, 4.7881e-16, 1.6726e-27, 1.0678e-18, 5.3318e-20, 6.5975e-40,
        1.1998e-18, 1.5697e-41, 9.8666e-01, 1.8612e-30])

In [75]:
h = emb @ W1 + b1
print(h)
logits = h @ W2 + b2
print(f'logits shape: {logits.shape}')
counts = logits.exp()
print(f'counts:{counts}')
probs = counts / counts.sum(-1, keepdim=True)
print(f'probs:{probs}'),print(probs.shape)

loss = -probs[torch.arange(52),Y].mean()
print(f'loss: {loss}')


tensor([[ 3.4950, -0.5382, -0.1393,  ..., -1.0708,  1.8940,  4.4151],
        [ 2.7201, -0.5062,  3.8601,  ..., -1.9793, -0.4609,  3.3421],
        [ 2.0902, -5.7015,  2.8828,  ..., -2.0824, -2.5423,  1.7966],
        ...,
        [ 0.8564, -3.4123, -2.7196,  ...,  1.8478, -1.0885, -0.5635],
        [-0.0869,  5.4036,  4.5189,  ...,  0.0722, -0.4716, -0.3411],
        [ 2.2442, -9.2434,  0.8620,  ..., -2.5186, -1.3786,  2.8398]])
logits shape: torch.Size([52, 32])
counts:tensor([[1.9550e-06, 1.7232e-04, 6.9490e+21,  ..., 1.0960e-09, 1.3534e+19,
         1.7559e-01],
        [7.2328e-08, 1.2383e-20, 2.0548e+13,  ..., 2.0266e-07, 4.0671e+00,
         5.4682e-05],
        [1.2226e-02, 2.5902e-03, 1.3144e+17,  ..., 3.4877e-14, 5.6641e+02,
         1.9602e+07],
        ...,
        [4.1072e+05, 3.0748e+33, 1.2496e+05,  ..., 7.2053e-15, 3.3731e+16,
         3.4145e+18],
        [4.7686e-08, 3.1261e-34, 5.9116e-13,  ..., 4.2657e+06, 7.5231e-23,
         3.4377e-03],
        [7.9875e-02, 1.049

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

.