### SentencePiece
découpe les mots. Peut être utilisé en multi-langue (particulièrement pour les langues avec des bases de mots communes)

### WordEmbeddings 
Normalement appliqué après tokenization. Doit être appris sur une langue. Serait-il possible d’apprendre des sentencePice embedding dans plusieurs langue? A tester sur une langue, puis deux, puis 30 (comme MUSE).

### Objectifs : 
* Apprendre sentencePiece
* Apprendre sentencePiece Embeddings
* Comment évaluer les embeddings?
* Apprendre sentencePiece Embeddings en plusieurs langues
* Evaluation?


## Create Embeddings

In [430]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random

In [431]:
embed_size = 10

In [432]:
word_to_id = {"hello": 0, "world": 1} # create dictionnary of words ids
embeds = nn.Embedding(len(word_to_id), embed_size)

In [433]:
word_tensor = torch.tensor([word_to_id["hello"]], dtype=torch.long)
hello_embed = embeds(word_tensor)
print(hello_embed)

tensor([[ 0.0988,  1.3208,  1.4393,  0.7731, -0.3379, -1.2744, -0.4319, -0.5081,
         -0.3132,  1.8581]], grad_fn=<EmbeddingBackward>)


## Pretrained Embeddings with CBOW

In [434]:
text = """Deep learning (also known as deep structured learning or hierarchical learning) 
is part of a broader family of machine learning methods based on learning data representations, 
as opposed to task-specific algorithms. Learning can be supervised, semi-supervised or unsupervised. 
Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural 
networks have been applied to fields including computer vision, speech recognition, natural language 
processing, audio recognition, social network filtering, machine translation, bioinformatics, 
drug design, medical image analysis, material inspection and board game programs, where they have 
produced results comparable to and in some cases superior to human experts. Deep learning models are 
vaguely inspired by information processing and communication patterns in biological nervous systems 
yet have various differences from the structural and functional properties of biological brains 
(especially human brains), which make them incompatible with neuroscience evidences.""".split()

In [435]:
vocab = set(text)
vocab_size = len(vocab)
word2id = {word:i for i,word in enumerate(vocab)}
id2word = {i:word for i,word in enumerate(vocab)}

### Generate data for training

In [436]:
datas = []
for i in range(2, len(text) - 2):
    context = [text[i - 2], text[i - 1],
               text[i + 1], text[i + 2]]
    target = text[i]
    datas.append((context, target))
print(datas[:5])

[(['Deep', 'learning', 'known', 'as'], '(also'), (['learning', '(also', 'as', 'deep'], 'known'), (['(also', 'known', 'deep', 'structured'], 'as'), (['known', 'as', 'structured', 'learning'], 'deep'), (['as', 'deep', 'learning', 'or'], 'structured')]


In [437]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

In [438]:
def get_max_prob_result(input, ix_to_word):
    return ix_to_word[get_index_of_max(input)]

In [439]:
def get_index_of_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i 
    return index

### CBOW model

In [440]:
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size=128):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.projection = nn.Sequential(
                            nn.Linear(embedding_dim, hidden_size),
                            nn.ReLU(),
                            nn.Linear(hidden_size, vocab_size),
                            nn.LogSoftmax(dim = -1)
                        )
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.projection(embeds)
        return out

    def get_word_emdedding(self, word):
        word = torch.LongTensor([word2id[word]])
        return self.embeddings(word).view(1,-1)

In [441]:
model = CBOW(vocab_size, embed_size)

### Learning

In [442]:
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [443]:
for epoch in range(50):
    total_loss = 0
    for context, target in datas:
        context_vector = make_context_vector(context, word2id)  
        model.zero_grad()
        log_probs = model(context_vector)
        loss = loss_function(log_probs, torch.tensor([word2id[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()

        total_loss += loss.data

### Test

In [444]:
context = ['deep','networks']
context_vector = make_context_vector(context, word2id)
a = model(context_vector).data.numpy()
print('Raw text: {}\n'.format(' '.join(text)))
print('Context: {}\n'.format(context))
print('Prediction: {}'.format(get_max_prob_result(a[0], id2word)))

Raw text: Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms. Learning can be supervised, semi-supervised or unsupervised. Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases superior to human experts. Deep learning models are vaguely inspired by information processing and communication patterns in biological nervous systems yet have various differences from the structural and functional properties of biological brains (especially human brain

# Train embeddings with NGram

### Ngram Model

In [445]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, embeddings, context_size, hidden_size=128):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = embeddings
        self.linear1 = nn.Linear(context_size * embeddings.embedding_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

### Training

In [446]:
context_size=4
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(model.embeddings, context_size)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [447]:
for epoch in range(10):
    total_loss = 0
    for context, target in datas:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word2id[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word2id[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[638.649551153183, 634.9874835014343, 631.3503413200378, 627.7379004955292, 624.1504530906677, 620.5849575996399, 617.0376613140106, 613.5099146366119, 610.0003879070282, 606.5009481906891]


In [448]:
context = ['deep','methods','of', 'is']
context_vector = make_context_vector(context, word2id)
a = model(context_vector).data.numpy()
print('Raw text: {}\n'.format(' '.join(text)))
print('Context: {}\n'.format(context))
print('Prediction: {}'.format(get_max_prob_result(a[0], id2word)))

Raw text: Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms. Learning can be supervised, semi-supervised or unsupervised. Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases superior to human experts. Deep learning models are vaguely inspired by information processing and communication patterns in biological nervous systems yet have various differences from the structural and functional properties of biological brains (especially human brain

# SkipGram

In [449]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, emb_dimension):
        """Initialize model parameters. 
        Args: 
            vocab_size: vocab size. 
            emb_dimention: Embedding dimention, typically from 50 to 500. """
        super(SkipGramModel, self).__init__()
        self.emb_size = vocab_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=True)
        self.init_emb()
        
    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_u, pos_v, neg_v):
        """ 
        Args: 
            pos_u: list of center word ids for positive word pairs. 
            pos_v: list of neibor word ids for positive word pairs. 
            neg_v: list of neibor word ids for negative word pairs. 
        """
        losses = []
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        score = torch.mul(emb_u, emb_v).squeeze()
        score = torch.sum(score, dim=1)
        score = F.logsigmoid(score)
        losses.append(sum(score))
        neg_emb_v = self.v_embeddings(neg_v)
        neg_score = torch.bmm(neg_emb_v, emb_u.unsqueeze(2)).squeeze(2)
        neg_score = torch.sum(neg_score, dim=1)
        neg_score = F.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        return -1 * sum(losses)

In [450]:
e = nn.Embedding(100, 10, sparse=True)

In [451]:
t = torch.LongTensor(1)
t[0] = 5

In [452]:
e(t).shape

torch.Size([1, 10])

## dataset

In [469]:
class contextDataset(data.Dataset):
    def __init__(self, text, word2id):
        self.text = text
        self.word2id = word2id
        
    def __len__(self):
        return len(text)
    
    def __getitem__(self, i):
        target = torch.LongTensor([self.word2id[self.text[i]]])
        if i >= 2 and i < len(text) - 2:
            context = [self.text[i - 2], self.text[i - 1],
                      self.text[i + 1], self.text[i + 2]]
        elif i == 1:
            context = [self.text[i - 1], self.text[i + 1],
                      self.text[i + 2], self.text[i + 3]]
        elif i == 0:
            context = [self.text[i + 1], self.text[i + 2], 
                       self.text[i + 3], self.text[i+4]]
        elif i >= len(text) - 2:
            context = [self.text[i-4], self.text[i-3] ,self.text[i - 2], self.text[i - 1]]
        else:
            print("ERROOR :", i, '/', len(self.text))
            return None
        context = [self.word2id[c] for c in context]
        
        neg = random.choices(list(word2id.values()), k=1)
        
        return target, context, neg

In [470]:
cd = contextDataset(text, word2id)

In [471]:
cd[0]

(tensor([2]), [19, 65, 12, 8], [96])

In [472]:
model = SkipGramModel(vocab_size, embed_size)

In [473]:
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [474]:
i = torch.LongTensor(1)
i[0] = cd[0][0]
p = torch.LongTensor(1,4)
p[0] = torch.LongTensor(cd[0][1])
n = torch.LongTensor(1,5)
n[0] = torch.LongTensor(cd[0][2])

In [475]:
model(i,p,n)

tensor(3.4657, grad_fn=<MulBackward0>)

In [476]:
for epoch in range(10):
    total_loss = 0
    for target, context, neg in cd:
        model.zero_grad()
        
        cont = torch.LongTensor(1,len(context))
        cont[0] = torch.LongTensor(context)
        
        n = torch.LongTensor(1,len(neg))
        n[0] = torch.LongTensor(neg)
        
        log_probs = model(target, cont, n)

        log_probs.backward()
        optimizer.step()

        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[638.649551153183, 634.9874835014343, 631.3503413200378, 627.7379004955292, 624.1504530906677, 620.5849575996399, 617.0376613140106, 613.5099146366119, 610.0003879070282, 606.5009481906891, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378, 722.7742567062378]
