<a href="https://colab.research.google.com/github/ochekroun/labs/blob/master/IFAGE_Cours_11_G%C3%A9n%C3%A9ration_de_texte_avec_un_r%C3%A9seau_de_neurones_%C3%A0_plusieurs_couches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Load the name/count table from prenoms.csv.
df = pd.read_csv('prenoms.csv')
# Lower-case BEFORE aggregating, so that spellings differing only by case are
# merged into a single row (and their counts summed) instead of surviving as
# duplicate names once lower-cased.
df['name'] = df['name'].str.lower()
df = df.groupby('name', as_index=False).agg({'count': 'sum'})
# Shuffle the rows. The seed is fixed so that the word order, and therefore
# the train/dev/test split built from it later, is the same on every run.
df = df.sample(frac=1.0, random_state=42)
# Keep only the names whose total count is at least 500.
df = df[df['count'] >= 500]
#df = df[:100]  # uncomment to experiment on a small subset
df

In [None]:
# Plain Python list of the names kept in df; these are the training words.
words = df['name'].tolist()
words

In [None]:
# Vocabulary: every distinct character found in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index, numbered from 1; index 0 is reserved for the '.' marker
# that pads the context and terminates each word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
num_chars = 1 + len(chars)
# Reverse lookup: index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
# Vocabulary size: number of distinct characters plus one for the '.' marker.
num_chars

In [None]:
# Index -> character lookup table.
itos

In [None]:
# "Context length": how many characters are used to predict the next one.
block_size = 3
# The context ---> target trace is only printed for the first few words;
# printing it for every character of every word floods the notebook output.
num_traced_words = 2
X, Y = [], []
for word_idx, w in enumerate(words):
  trace = word_idx < num_traced_words
  if trace:
    print(w)
  # Start from a context made only of index 0 (the '.' marker).
  context = [0] * block_size
  # The trailing '.' makes the model learn where a word ends.
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    if trace:
      print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)
# Number of (context, target) training examples.
num = X.shape[0]

In [None]:
# One row per training example, block_size context indices per row.
X.shape

In [None]:
# One target character index per training example.
Y.shape

In [None]:
# Number of dimensions used for the character embedding.
embedding_dimension = 2

# Embedding table: one randomly initialised row per character of the vocabulary.
C = torch.randn(num_chars, embedding_dimension)
C

In [None]:
# "Convert" all the inputs into the embedding space: every character index
# in X is replaced by the corresponding row of C.
emb = C[X]
emb

In [None]:
# (number of examples, block_size, embedding_dimension)
emb.shape

In [None]:
# The inputs can be "flattened" to one row per example, for instance:
# 6 = block_size (3) * embedding_dimension (2).
emb.view(-1, 6)

In [None]:
# "Hidden" layer of the network: 100 neurons, with biases.
# The input width is derived from emb (context length * embedding size)
# rather than hard-coded as 6, so this cell keeps working when block_size or
# embedding_dimension is changed.
flat_dim = emb.shape[1] * emb.shape[2]
W1 = torch.randn((flat_dim, 100))
b1 = torch.randn(100)

# Flatten each example's context embeddings into a single row, then apply
# the affine transformation.
z = emb.view(-1, flat_dim) @ W1 + b1

# Activation computed with tanh.
h = torch.tanh(z)

In [None]:
# (number of examples, 100): one activation per hidden neuron.
h.shape

In [None]:
# tanh outputs of the hidden layer, bounded to [-1, 1].
h

In [None]:
# Output layer of the network: maps the 100 activations of the previous
# layer to one score per character of the vocabulary.
W2 = torch.randn(100, num_chars)
b2 = torch.randn(num_chars)
logits = torch.matmul(h, W2) + b2

In [None]:
# (number of examples, num_chars)
logits.shape

In [None]:
# Raw, unnormalised scores for each candidate next character.
logits

In [None]:
# Softmax written by hand: exponentiate, then normalise each row to sum to 1.
# Subtracting the row-wise maximum first leaves the probabilities
# mathematically unchanged but keeps exp() from overflowing to inf (and the
# division from producing nan) when a logit is large.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)

In [None]:
# Each row is a probability distribution over the next character.
prob

In [None]:
# Same shape as logits: (number of examples, num_chars).
prob.shape

In [None]:
# Number of training examples (the value also stored in `num`).
X.shape[0]

In [None]:
# Mean negative log-likelihood: for each example, pick the probability the
# model assigned to the true next character, take its log, average, negate.
loss = -torch.mean(torch.log(prob[torch.arange(num), Y]))
loss

In [None]:
# F.cross_entropy can be used instead: it works directly on the logits and is faster!
loss = F.cross_entropy(logits, Y)
loss

In [None]:
# Draw 8 random integers between 1 (included) and 3 (excluded).
torch.randint(1, 3, (8,))

In [None]:
# Draw 32 random row indices into X, e.g. to select a minibatch.
torch.randint(0, X.shape[0], (32,))

## Cleaned up

On ajoute 4 choses déjà vues précédemment:
- La séparation train/test/dev
- La gestion du `device`
- Le batching
- La diminution du learning rate

In [None]:
# Device on which the training tensors, parameters and generators below are created.
device = 'cpu'

In [None]:
# build the dataset
block_size = 3  # context length: number of preceding characters fed to the model

def build_dataset(words):
  """Turn a list of words into (context, next-character) training tensors.

  Each word is terminated with '.', and every character of the terminated
  word yields one example: the `block_size` indices that precede it
  (padded on the left with 0) and the index of the character itself.
  Characters are mapped to indices with the module-level `stoi` table.

  Prints the shapes of the two tensors and returns them as `(X, Y)`.
  """
  contexts, targets = [], []
  for word in words:
    window = [0] * block_size
    for symbol in word + '.':
      target = stoi[symbol]
      contexts.append(window)
      targets.append(target)
      # slide the window: drop the oldest index, append the newest
      window = [*window[1:], target]

  X, Y = torch.tensor(contexts), torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle the words with a fixed seed, then split them 80% / 10% / 10%
# into training, dev and test sets.
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

# Move EVERY split to the target device, not only the training one:
# evaluating on dev/test tensors left on another device than the parameters
# fails as soon as `device` is not 'cpu'.
Xtr, Ytr = Xtr.to(device), Ytr.to(device)
Xdev, Ydev = Xdev.to(device), Ydev.to(device)
Xte, Yte = Xte.to(device), Yte.to(device)

In [None]:
# Seeded generator so that the initial parameters are reproducible.
g = torch.Generator(device=device)
g.manual_seed(2147483647)

# Embedding table: one 10-dimensional vector per character.
C = torch.randn(num_chars, 10, generator=g, device=device)
# Hidden layer: 30 inputs (3 context characters * 10 dimensions) -> 200 neurons.
W1 = torch.randn(30, 200, generator=g, device=device)
b1 = torch.randn(200, generator=g, device=device)
# Output layer: 200 neurons -> one score per character.
W2 = torch.randn(200, num_chars, generator=g, device=device)
b2 = torch.randn(num_chars, generator=g, device=device)
parameters = [C, W1, b1, W2, b2]

In [None]:
# Check which device the parameter was created on.
W1.device

In [None]:
# Total number of scalar values across all parameter tensors.
nb_params = sum(p.numel() for p in parameters)
print(f"Notre modèle à {nb_params} paramètres")

In [None]:
# Ask autograd to track gradients for every parameter tensor.
for p in parameters:
  p.requires_grad_(True)

In [None]:
# Training history: log10 loss and step index, filled in by the training loop.
lossi, stepi = [], []

In [None]:
# Number of training examples available for minibatch sampling.
Xtr.shape[0]

In [None]:
# Minibatch stochastic gradient descent over 200,000 steps.
for i in tqdm(range(200_000)):
  # minibatch construct: 32 random row indices into the training set
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
  logits = h @ W2 + b2 # (32, num_chars)
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass: clear the previous gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: the learning rate drops from 0.1 to 0.01 for the second half of
  # training. The step must use `lr`; a hard-coded 0.1 here silently
  # disables that decay.
  lr = 0.1 if i < 100_000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # track stats (log10 of the loss gives a more readable curve)
  stepi.append(i)
  lossi.append(loss.log10().item())

In [None]:
# Training curve: log10 of the minibatch loss recorded at each step.
plt.plot(lossi);

In [None]:
# Loss over the whole training split (N = number of training examples).
# This is evaluation only, so autograd is disabled: no graph is built for
# the full-split forward pass.
with torch.no_grad():
  emb = C[Xtr] # (N, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (N, 200)
  logits = h @ W2 + b2 # (N, num_chars)
  loss = F.cross_entropy(logits, Ytr)
loss

In [None]:
# Perplexity is the exponential of the cross-entropy loss computed above.
perplexity = loss.exp()
print(f'Perplexité: {perplexity:.2f}')

In [None]:
# Loss over the whole dev split (N = number of dev examples).
# The dev tensors are moved to the parameters' device first (a no-op when
# they are already there), and autograd is disabled because this is
# evaluation only.
with torch.no_grad():
  emb = C[Xdev.to(device)] # (N, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (N, 200)
  logits = h @ W2 + b2 # (N, num_chars)
  loss = F.cross_entropy(logits, Ydev.to(device))
loss

In [None]:
# Perplexity is the exponential of the cross-entropy loss computed above.
perplexity = loss.exp()
print(f'Perplexité: {perplexity:.2f}')

In [None]:
# visualize dimensions 0 and 1 of the embedding matrix C for all characters
# (only these two columns of C are plotted)
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
# Write each character, centred, on top of its marker.
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
# NOTE(review): 'minor' is passed positionally; in current matplotlib the
# first parameter of grid() appears to be the on/off flag rather than
# `which` - confirm whether which='minor' was intended.
plt.grid('minor')

In [None]:
# Dedicated seeded generator so the sampled words are reproducible.
g = torch.Generator(device=device)
g.manual_seed(2147483647 + 10)

# Generate 20 words, one character at a time.
for _ in range(20):
    out = []
    context = [0] * block_size # initialize with all ...
    ix = None
    # Keep sampling until index 0 (the '.' end marker) is drawn.
    while ix != 0:
        emb = C[torch.tensor([context])] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        # draw the next character from the predicted distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        # slide the context window and record the sampled character
        context = context[1:] + [ix]
        out.append(ix)

    print(''.join(itos[i] for i in out))