# <center>PROJECT SANDBOX</center>

## Documentation
The aim of this notebook is to provide a simple sandbox to test different NN architectures for the project. , here is a doc about the functions imported from `scripts` folder : 

- **`prepare_dataset(device,ratio=0.5,shuffle_ctx=False)`** :
    - **Input**:
        - device : a torch.device object
        - ratio : a float ratio between 0 and 1 that determines the average proportion of modern english verses in the data loader
        - shuffle_ctx : if `True`, shuffle the contexts within a Batch so that half of the `x_1` elements has a wrong context `ctx_1`. Useful to train the context recognizer model.
    - **Return** :
        - a torch Dataset | class : Shakespeare inherited from torch.utils.data.Dataset
        - a python word dictionary (aka tokenizer) | class : dict
    - **Tensors returned when loaded in the dataloader**:
        - x_1 : input verse (modern / shakespearian)
        - x_2 : output verse (modern / shakespearian)

        - ctx_1 = context of the input verse
        - ctx_2 = context of the output verse

        - len_x : length of the input verse
        - len_y : length of the output verse

        - len_ctx_x : length of the input verse context
        - len_ctx_y : length of the output verse context

        - label : label of the input verse (0 : modern, 1 : shakespearian)
        - label_ctx : label of the context (0 : wrong context, 1 : right context)
- **`string2code(string,dict)`** : 
    - **Input**:
        - string : a sentence
        - dict : a tokenizer
    - **Return** :
        - a torch Longtensor (sentence tokenized)
- **`code2string(torch.Longtensor,dict)`** : 
    - **Input**:
        - torch.Longtensor : a sentence tokenized
        - dict : a tokenizer
    - **Return** :
        - a string sentence

## Importing packages

In [1]:
from scripts.data_builders.prepare_dataset import prepare_dataset_ctx,string2code,code2string,assemble

import torch
import torchvision.datasets as datasets
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
import math
from torch.nn import BCELoss,CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device = ",device)

device =  cuda


## Preprocessing data

In [2]:
train_data, dict_words = prepare_dataset_ctx(device,ratio=0.5,shuffle_ctx=True) #check with shift+tab to look at the data structure
batch_size = 16
dict_token = {b:a for a,b in dict_words.items()} #dict for code2string

train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True,collate_fn=train_data.collate)
dict_size = len(dict_words)
d_embedding = 300 #cf. paper Y.Kim 2014 Convolutional Neural Networks for Sentence Classification

print("- dict size : ",dict_size)

Loading ...
- Shakespeare context dataset length :  21079
- Corrupted samples (ignored) :  0
- dict size :  17513


## Designing NN model

# Model 1 : CNN

In [3]:
class CoherenceClassifier(torch.nn.Module):
    def __init__(self,dict_size=dict_size,d_embedding=300):
        super().__init__()
        self.embed_layer=torch.nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)

        self.conv_1 = torch.nn.Conv1d(d_embedding,3,kernel_size = 3, stride = 1)
        self.max_pool = torch.nn.MaxPool1d(3,2)
        self.relu = torch.nn.ReLU()
        self.linear = torch.nn.Linear(3,1)
    
    def forward(self,x):
        x = self.embed_layer(x)
        x = self.conv_1( x.transpose(1,2))
        x = self.max_pool( x )
        x = self.relu( x )
        x = torch.max( x , 2 )[0]
        x = torch.sigmoid(self.linear(x))
        return x

In [4]:
epochs = 100
model = CoherenceClassifier().to(device)
optimizer = optim.Adam(params=model.parameters(),lr=0.01)
loss_func = BCELoss()
n = len(train_data.x) // batch_size

In [5]:
for epoch in range(epochs):
    total_loss = 0
    i = 0
    for _,_ , ctx,_ , _,_ , len_ctx,_, _,label_ctx in train_loader:
        i+=1
        optimizer.zero_grad()
        
        #CNN
        ctx = model.forward(ctx).reshape(-1) #CNN architecture
        
        loss = loss_func( ctx , label_ctx.float() )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        #Vizualization

    print('-' * 35)
    print('| epoch {:3d} | '
          'lr {:02.2f} | '
          'loss {:5.2f}'.format(
            epoch+1, optimizer.state_dict()["param_groups"][0]["lr"],
            round(total_loss,2)))

ValueError: not enough values to unpack (expected 10, got 4)

# Model 2 : LSTM 

In [None]:
class CoherenceClassifier(torch.nn.Module):
    def __init__(self,dict_size=dict_size,d_embedding=300,d_hidden=100):
        super().__init__()
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)
        self.lstm = nn.LSTM(d_embedding,self.d_hidden,dropout=0.,num_layers=1,bidirectional=False)
        #self.bn0 = nn.BatchNorm1d(self.d_hidden)
        self.linear1 = torch.nn.Linear(self.d_hidden,1)
    
    def forward(self,x,len_x):
        x = self.embedding(x)
        x = pack_padded_sequence(x.permute(1,0,2),len_x,enforce_sorted=False)
        _,x = self.lstm(x)
        x = x[0].reshape(-1,self.d_hidden)
        #x = self.bn0(x)
        x = torch.sigmoid( self.linear1(x) ).reshape(-1)
        return x

In [None]:
epochs = 100
model = CoherenceClassifier().to(device)
optimizer = optim.Adam(params=model.parameters(),lr=0.01)
loss_func = BCELoss()
n = len(train_data.x) // batch_size

In [None]:
for epoch in range(epochs):
    total_loss = 0
    i = 0
    for _,_ , ctx,_ , _,_ , len_ctx,_, _,label_ctx in train_loader:
        i+=1
        optimizer.zero_grad()
        
        #LSTM
        ctx = model.forward(ctx,len_ctx) #LSTM architecture
        
        loss = loss_func( ctx , label_ctx.float() )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print('-' * 35)
    print('| epoch {:3d} | '
          'lr {:02.2f} | '
          'loss {:5.2f}'.format(
            epoch+1, optimizer.state_dict()["param_groups"][0]["lr"],
            round(total_loss,2)))

# Model 3 : Transformers

In [None]:
class CoherenceClassifier(nn.Module):

    def __init__(self,dict_size=dict_size, d_embedding=300,  dropout=0.1):
        super(CoherenceClassifier, self).__init__()
        
        self.embedding = nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)
        self.pos_encoder = PositionalEncoding(d_embedding, dropout)
        encoder_layers = TransformerEncoderLayer(d_model=d_embedding, nhead = 4,dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=4)
        
        self.decoder = nn.Linear(d_embedding, 2 )

    def forward(self, x):

        x = self.embedding( x )
        #x = self.pos_encoder( x )
        x = self.transformer_encoder( x )
        x = torch.softmax(torch.tanh(self.decoder( x )),1)
        return x
    
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
epochs = 100
model = CoherenceClassifier().to(device)
optimizer = optim.Adam(params=model.parameters(),lr=0.01)
loss_func = CrossEntropyLoss()
n = len(train_data.x) // batch_size

In [None]:
for epoch in range(epochs):
    total_loss = 0
    i = 0
    for _,_ , ctx,_ , _,_ , len_ctx,_, _,label_ctx in train_loader:
        i+=1
        optimizer.zero_grad()
        
        #Transformer
        ctx = model.forward(ctx) #Transformer architecture
        y = torch.cat([label_ctx.reshape(-1,1),1-label_ctx.reshape(-1,1)],dim=1) # Transformer architecture
        
        loss = loss_func( ctx , y )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print('-' * 35)
    print('| epoch {:3d} | '
          'lr {:02.2f} | '
          'loss {:5.2f}'.format(
            epoch+1, optimizer.state_dict()["param_groups"][0]["lr"],
            round(total_loss,2)))

# Pre-trained model

In [3]:
epochs = 100

#Load model and plug our Embedding in
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased').to(device)
model.bert.embeddings.word_embeddings = nn.Embedding(dict_size+1,768,padding_idx=dict_size).to(device)
model.train()

optimizer = optim.Adam(params=model.parameters(),lr=0.01)
loss_func = CrossEntropyLoss()
n = len(train_data.x) // batch_size

Using cache found in /home/jb/.cache/torch/hub/huggingface_pytorch-transformers_master


In [4]:
for epoch in range(epochs):
    total_loss = 0
    i = 0
    for ctx,pos_token,pos_ctx,label in train_loader:
        i+=1
        optimizer.zero_grad()
        
        #pre-trained BERT
        ctx = model.forward(input_ids=ctx,
                           token_type_ids=pos_ctx,
                            position_ids=pos_token)[0]
        
        loss = loss_func( ctx , label )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        print('| epoch {:3d} | {:5d}/{:5d} batches | '
              'loss {:5.2f} |'.format(
                epoch+1, i, n,loss.item()))
    print('-' * 35)
    print("Epoch ",epoch,"\t",round(total_loss / n,2))

| epoch   1 |     1/ 1317 batches | loss  0.65 |
| epoch   1 |     2/ 1317 batches | loss  1.70 |
| epoch   1 |     3/ 1317 batches | loss  4.12 |
| epoch   1 |     4/ 1317 batches | loss  3.60 |
| epoch   1 |     5/ 1317 batches | loss  0.70 |
| epoch   1 |     6/ 1317 batches | loss  0.76 |
| epoch   1 |     7/ 1317 batches | loss  6.59 |
| epoch   1 |     8/ 1317 batches | loss  2.42 |
| epoch   1 |     9/ 1317 batches | loss  1.08 |
| epoch   1 |    10/ 1317 batches | loss  2.13 |
| epoch   1 |    11/ 1317 batches | loss  0.68 |
| epoch   1 |    12/ 1317 batches | loss  0.63 |
| epoch   1 |    13/ 1317 batches | loss  1.78 |
| epoch   1 |    14/ 1317 batches | loss  0.64 |
| epoch   1 |    15/ 1317 batches | loss  0.77 |
| epoch   1 |    16/ 1317 batches | loss  1.09 |
| epoch   1 |    17/ 1317 batches | loss  0.56 |
| epoch   1 |    18/ 1317 batches | loss  0.82 |
| epoch   1 |    19/ 1317 batches | loss  0.62 |
| epoch   1 |    20/ 1317 batches | loss  0.57 |
| epoch   1 |    21/

| epoch   1 |   169/ 1317 batches | loss  0.57 |
| epoch   1 |   170/ 1317 batches | loss  0.72 |
| epoch   1 |   171/ 1317 batches | loss  0.63 |
| epoch   1 |   172/ 1317 batches | loss  0.88 |
| epoch   1 |   173/ 1317 batches | loss  0.86 |
| epoch   1 |   174/ 1317 batches | loss  0.74 |
| epoch   1 |   175/ 1317 batches | loss  0.66 |
| epoch   1 |   176/ 1317 batches | loss  0.78 |
| epoch   1 |   177/ 1317 batches | loss  0.95 |
| epoch   1 |   178/ 1317 batches | loss  0.77 |
| epoch   1 |   179/ 1317 batches | loss  0.79 |
| epoch   1 |   180/ 1317 batches | loss  0.73 |
| epoch   1 |   181/ 1317 batches | loss  0.82 |
| epoch   1 |   182/ 1317 batches | loss  0.86 |
| epoch   1 |   183/ 1317 batches | loss  1.28 |
| epoch   1 |   184/ 1317 batches | loss  0.90 |
| epoch   1 |   185/ 1317 batches | loss  0.65 |
| epoch   1 |   186/ 1317 batches | loss  0.88 |
| epoch   1 |   187/ 1317 batches | loss  1.02 |
| epoch   1 |   188/ 1317 batches | loss  0.85 |
| epoch   1 |   189/

| epoch   1 |   337/ 1317 batches | loss  0.74 |
| epoch   1 |   338/ 1317 batches | loss  1.00 |
| epoch   1 |   339/ 1317 batches | loss  0.62 |
| epoch   1 |   340/ 1317 batches | loss  0.71 |
| epoch   1 |   341/ 1317 batches | loss  1.04 |
| epoch   1 |   342/ 1317 batches | loss  0.65 |
| epoch   1 |   343/ 1317 batches | loss  0.74 |
| epoch   1 |   344/ 1317 batches | loss  0.80 |
| epoch   1 |   345/ 1317 batches | loss  0.73 |
| epoch   1 |   346/ 1317 batches | loss  0.86 |
| epoch   1 |   347/ 1317 batches | loss  0.73 |
| epoch   1 |   348/ 1317 batches | loss  0.90 |
| epoch   1 |   349/ 1317 batches | loss  0.67 |
| epoch   1 |   350/ 1317 batches | loss  0.65 |
| epoch   1 |   351/ 1317 batches | loss  0.98 |
| epoch   1 |   352/ 1317 batches | loss  0.81 |
| epoch   1 |   353/ 1317 batches | loss  0.77 |
| epoch   1 |   354/ 1317 batches | loss  0.97 |
| epoch   1 |   355/ 1317 batches | loss  0.51 |
| epoch   1 |   356/ 1317 batches | loss  1.21 |
| epoch   1 |   357/

RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 5.94 GiB total capacity; 4.82 GiB already allocated; 11.31 MiB free; 240.13 MiB cached)

# Draft

In [None]:
for ctx,pos_token,pos_ctx,label in train_loader:
    print(code2string(ctx[0],dict_token))
    print(pos_ctx[0])
    print(label[0].item())
    break

In [None]:
torch.LongTensor([0])

In [6]:
50 % 100

50

219