# <center>PROJECT SANDBOX</center>

## Documentation
The aim of this notebook is to provide a simple sandbox to test different NN architectures for the project. , here is a doc about the functions imported from `scripts` folder : 

- **`prepare_dataset(device,ratio=0.5,shuffle_ctx=False)`** :
    - **Input**:
        - device : a torch.device object
        - ratio : a float ratio between 0 and 1 that determines the average proportion of modern english verses in the data loader
        - shuffle_ctx : if `True`, shuffle the contexts within a Batch so that half of the `x_1` elements has a wrong context `ctx_1`. Useful to train the context recognizer model.
    - **Return** :
        - a torch Dataset | class : Shakespeare inherited from torch.utils.data.Dataset
        - a python word dictionary (aka tokenizer) | class : dict
    - **Tensors returned when loaded in the dataloader**:
        - x_1 : input verse (modern / shakespearian)
        - x_2 : output verse (modern / shakespearian)

        - ctx_1 = context of the input verse
        - ctx_2 = context of the output verse

        - len_x : length of the input verse
        - len_y : length of the output verse

        - len_ctx_x : length of the input verse context
        - len_ctx_y : length of the output verse context

        - label : label of the input verse (0 : modern, 1 : shakespearian)
        - label_ctx : label of the context (0 : wrong context, 1 : right context)
- **`string2code(string,dict)`** : 
    - **Input**:
        - string : a sentence
        - dict : a tokenizer
    - **Return** :
        - a torch Longtensor (sentence tokenized)
- **`code2string(torch.Longtensor,dict)`** : 
    - **Input**:
        - torch.Longtensor : a sentence tokenized
        - dict : a tokenizer
    - **Return** :
        - a string sentence

## Importing packages

In [1]:
from scripts.data_builders.prepare_dataset import prepare_dataset,string2code,code2string,assemble

import torch
import torchvision.datasets as datasets
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from torch.nn import BCELoss,CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device = ",device)

device =  cuda


## Preprocessing data

In [2]:
train_data, dict_words = prepare_dataset(device,ratio=0.5,shuffle_ctx=True) #check with shift+tab to look at the data structure
batch_size = 64
dict_token = {b:a for a,b in dict_words.items()} #dict for code2string

train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True,collate_fn=train_data.collate)

Loading ...
- Shakespeare dataset length :  21079
- Corrupted samples (ignored) :  0


## Designing NN model

### Language Model 

In [3]:
dict_size = len(dict_words)
d_embedding = 300 #cf. paper Y.Kim 2014 Convolutional Neural Networks for Sentence Classification


In [4]:
class CoherenceClassifier(torch.nn.Module):
    def __init__(self,dict_size=dict_size,d_embedding=300):
        super().__init__()
        self.embed_layer=torch.nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)

        self.conv_1 = torch.nn.Conv1d(d_embedding,3,kernel_size = 3, stride = 1)
        self.max_pool = torch.nn.MaxPool1d(3,2)
        self.relu = torch.nn.ReLU()
        self.linear = torch.nn.Linear(3,1)
    
    def forward(self,x):
        x = self.embed_layer(x)
        x = self.conv_1( x.transpose(1,2))
        x = self.max_pool( x )
        x = self.relu( x )
        x = torch.max( x , 2 )[0]
        x = torch.sigmoid(self.linear(x))
        return x

        
    

In [5]:
class CoherenceClassifier(torch.nn.Module):
    def __init__(self,dict_size=dict_size,d_embedding=1000,d_hidden=1000):
        super().__init__()
        self.d_hidden = d_hidden
        self.embedding = nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)
        self.lstm = nn.LSTM(d_embedding,self.d_hidden,dropout=0.,num_layers=1,bidirectional=False)
        self.bn0 = nn.BatchNorm1d(self.d_hidden)
        self.linear1 = torch.nn.Linear(self.d_hidden,500)
        self.bn1 = nn.BatchNorm1d(500)
        self.linear2 = torch.nn.Linear(500,100)
        self.bn2 = nn.BatchNorm1d(100)
        self.linear3 = torch.nn.Linear(100,1)
    
    def forward(self,x,len_x):
        x = self.embedding(x)
        x = pack_padded_sequence(x.permute(1,0,2),len_x,enforce_sorted=False)
        _,x = self.lstm(x)
        x = x[0].reshape(-1,self.d_hidden)
        x = self.bn0(x)
        x = torch.relu( self.linear1(x) )
        x = self.bn1(x)
        x = torch.relu( self.linear2(x) )
        x = self.bn2(x)
        x = torch.sigmoid( self.linear3(x) ).reshape(-1)
        return x

In [6]:
class CoherenceClassifier(nn.Module):

    def __init__(self, d_embedding, dict_size, dropout=0.1):
        super(CoherenceClassifier, self).__init__()
        
        self.embedding = nn.Embedding(dict_size+1,d_embedding,padding_idx=dict_size)
        encoder_layers = TransformerEncoderLayer(d_model=d_embedding, nhead = 4)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=4)
        
        self.decoder = nn.Linear(d_embedding, 2 )

    def forward(self, x):

        x = self.embedding(x)
        x = self.transformer_encoder( x )
        x = torch.softmax(torch.tanh(self.decoder( x )),1)
        return x



## Running model

In [7]:
for x,y , ctx_x,ctx_y , len_x,len_y , len_ctx_x,len_ctx_y, label,label_ctx in train_loader:
    
    for i in range(x.shape[0]):
        print("\n- x :")
        print(code2string(x[i],dict_token))
        print("- context of x :")
        print(code2string(ctx_x[i],dict_token))
        print("- context label :",label_ctx[i].item())

        print("- len_ctx_x :")
        print(ctx_x)
#         ipdb.set_trace()
    break


- x :
I’LL DO THAT , BECAUSE I HAVE TO TELL MY BROTHER HOW YOU FORGAVE HIM , ROSALIND .
- context of x :
COME ON , YOU KEEP GETTING PALER . I’LL DO THAT , BECAUSE I HAVE TO TELL MY BROTHER HOW YOU FORGAVE HIM , ROSALIND . I’LL THINK OF SOMETHING .
- context label : 1
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513],
        [    0,    99,   264,  ..., 17513, 17513, 17513]], device='cuda:0')

- x :
BUT , MY GOOD LORD , THE WORDS OF YOU AND LORD BUCKINGHAM ARE AS TRUSTWORTHY TO ME AS IF I HAD SEEN AND HEARD HIM SPEAK MYSELF .
- context of x :
FORGIVE MY BOLDNESS IF I TAKE YOU UP ON YOUR WORDS ABOUT MAKING MAGIC . BUT , MY GOOD LORD , THE WORDS OF YOU AND LORD BUCKINGHAM ARE AS TRUSTWORTHY TO ME AS IF I HAD SEEN AND HEARD HIM

THE CONSTABLE OF FRANCE .
- context of x :
A DOLLAR . THE CONSTABLE OF FRANCE . YOU HAVE SPOKEN TRUER THAN YOU PURPOSED .
- context label : 0
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513],
        [    0,    99,   264,  ..., 17513, 17513, 17513]], device='cuda:0')

- x :
DOST THOU LIE SO LOW ?
- context of x :
I CONJURE THEE BY ALL THE SAINTS IN HEAVEN . DOST THOU LIE SO LOW ? I AM NOT MAD .
- context label : 0
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513]

tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513],
        [    0,    99,   264,  ..., 17513, 17513, 17513]], device='cuda:0')

- x :
IS SOMETHING ABOUT TO HAPPEN THAT WARRANTS WORKING THIS NIGHT AND DAY ?
- context of x :
MY LORD , WE WERE SENT FOR . IS SOMETHING ABOUT TO HAPPEN THAT WARRANTS WORKING THIS NIGHT AND DAY ? WHO CAN EXPLAIN THIS TO ME ?
- context label : 1
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513],
        [    0,    99,   264,  ..., 17513, 17513, 17513]],

EDMUND AND I HAVE TALKED , AND MORE CONVENIENT IS HE FOR MY HAND THAN FOR YOUR LADY’S .
- context of x :
THEREFORE , I BESEECH YOUR HIGHNESS PARDON ME . EDMUND AND I HAVE TALKED , AND MORE CONVENIENT IS HE FOR MY HAND THAN FOR YOUR LADY’S . IT IS WITH A GOOD WILL .
- context label : 0
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 17513, 17513, 17513],
        [    0,   127,     5,  ..., 17513, 17513, 17513],
        ...,
        [    0,   178,    24,  ..., 17513, 17513, 17513],
        [    0,   337,  2384,  ..., 17513, 17513, 17513],
        [    0,    99,   264,  ..., 17513, 17513, 17513]], device='cuda:0')

- x :
THEY SIT CONFERRING BY THE PARLOR FIRE .
- context of x :
NO MATTER WHITHER , SO YOU COME NOT HERE . THEY SIT CONFERRING BY THE PARLOR FIRE . YET THIS I WILL NOT DO , DO HOW I CAN .
- context label : 0
- len_ctx_x :
tensor([[    0,   259,   478,  ..., 17513, 17513, 17513],
        [    0,  1802,   104,  ..., 1751

In [7]:
epochs=100
model=CoherenceClassifier(d_embedding,dict_size).to(device)
optimizer = optim.Adam(params=model.parameters(),lr=0.1)
loss_func = CrossEntropyLoss()

In [22]:
n = len(train_data.x) // batch_size

for epoch in range(epochs):
    total_loss = 0
    
    for x,y , ctx_x,ctx_y , len_x,len_y , len_ctx_x,len_ctx_y, label,label_ctx in train_loader:
        optimizer.zero_grad()
        
        x = model.forward(x) #Transformer architecture
        y = torch.cat([label_ctx.reshape(-1,1),1-label_ctx.reshape(-1,1)],dim=1) # Transformer architecture
        
        #x = model.forward(x,len_x) #LSTM architecture
        
        #x = model.forward(x).reshape(-1) #CNN architecture
        
        loss = loss_func( x , y )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(epoch,"\t",round(total_loss/n,5))

0 	 3.81727
1 	 3.81838
2 	 3.80264
3 	 3.80391
4 	 3.79953
5 	 3.8058
6 	 3.80982
7 	 3.82012
8 	 3.81764
9 	 3.80559
10 	 3.80786
11 	 3.79998
12 	 3.80706
13 	 3.82285
14 	 3.81141


KeyboardInterrupt: 

In [24]:
x.shape

torch.Size([64, 36, 2])