# Preparing Data

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import numpy as np

import random
import math
import time

In [5]:
from functions import *


Seed for reproduction

In [6]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
device

device(type='cuda')

In [9]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')



In [10]:
spacy_de.vocab.length,spacy_en.vocab.length

(680, 772)

In [11]:
with open('train.en') as f:
    trainEn = f.readlines()
with open('train.de') as f:
    trainDe = f.readlines()
with open('val.en') as f:
    valEn = f.readlines()
with open('val.de') as f:
    valDe = f.readlines()

In [12]:
engVocabulary = Vocabulary('en',spacy_en,40)
deVocabulary = Vocabulary('de',spacy_de,40)

In [13]:
for i in trainEn:
  engVocabulary.add_sentence(i)
for i in trainDe:
  deVocabulary.add_sentence(i)
for i in valEn:
  engVocabulary.add_sentence(i)
for i in valDe:
  deVocabulary.add_sentence(i)

In [14]:
engVocabulary.num_words,deVocabulary.num_words

(9951, 19062)

In [15]:
engVocabulary.longest_sentence,deVocabulary.longest_sentence

(41, 44)

In [16]:
deVocabulary.sentence_to_index(trainDe[100],padding=True)

[1,
 461,
 462,
 11,
 30,
 169,
 215,
 50,
 166,
 209,
 59,
 30,
 463,
 464,
 16,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [17]:
engVocabulary.sentence_to_index(trainEn[100])

[1, 447, 241, 17, 21, 169, 155, 298, 36, 116, 373, 448, 14, 2]

In [18]:
src = []
for j in np.array(trainEn):
    src.append(engVocabulary.sentence_to_index(j,padding=True))
src = torch.tensor(src,dtype=torch.long, device=device)
trg = []
for j in np.array(trainDe):
    trg.append(deVocabulary.sentence_to_index(j,padding=True))
trg = torch.tensor(trg,dtype=torch.long, device=device)
trg = trg.T
src = src.T

In [19]:
valsrc = []
for j in np.array(valEn):
    valsrc.append(engVocabulary.sentence_to_index(j,padding=True))
valsrc = torch.tensor(valsrc,dtype=torch.long, device=device)
valtrg = []
for j in np.array(valDe):
    valtrg.append(deVocabulary.sentence_to_index(j,padding=True))
valtrg = torch.tensor(valtrg,dtype=torch.long, device=device)
valtrg = valtrg.T
valsrc = valsrc.T

# Model

Bert as en encoder
https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

In [22]:
class Encoder(nn.Module):
    def __init__(self,
                 input_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 100 #  The position embedding has a "vocabulary" size of 100, which means our model can accept sentences up to 100 tokens long. This can be increased if we want to handle longer sentences.
                 ):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList(
            [
                EncoderLayer(
                    hid_dim,
                    n_heads,
                    pf_dim,
                    dropout,
                    device
                    )
            for _ in range(n_layers)] 
        )
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        batch_size = src.shape[0] #src = [batch size, src len] src_mask = [batch_size, 1, 1, src_len]
        src_len = src.shape[1]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device) # pos = [batch size, src len]
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos)) # src = [batch size, src len, hid dim]
        for layer in self.layers:
            src = layer(src, src_mask) # src = [batch_size, src len, hid dim]
        return src

class EncoderLayer(nn.Module):
    def __init__(
            self,
            hid_dim,
            n_heads,
            pf_dim, 
            dropout,
            device
    ):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim,
            pf_dim,
            dropout
        )
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        #src = [batch size, src len, hid dim]
        #src mask = [batch size, 1, 1, src len]
        
        _src, attention = self.self_attention(src, src, src, src_mask) # self attention
        src = self.self_attn_layer_norm( src + self.dropout(_src)) # dropout, residual connection and layer norm
        # src = [batch size, src len, hid dim]
        # positionwise feedforward
        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src)) # src = [batch size, src len, hid dim]
        return src, attention
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(
            self,
            hid_dim,
            n_heads, # for parallel computing - attention is not calculated alltogether, it is calculated head by head and the concatenated
            dropout,
            device
    ):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads # will give an integer
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
    
    def forward(self, query, key, value, mask = None):

        batch_size = query.shape[0]
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
        Q = self.fc_q(query) # [batch size, query len, hid dim]
        K = self.fc_k(key) # [batch size, key len, hid dim]
        V = self.fc_v(value) # [batch size, value len, hid dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch size, n heads, query len, hid dim]
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch size, n heads, key len, hid dim]
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch size, n heads, value len, hid dim]
        # calculating energy - the un-normalized attention
        energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale #energy = [batch size, n heads, query len, key len]
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10) # fill with zeros
        attention = torch.softmax(energy, dim=-1) # [batch size, n heads, query len, key len] # cada linha tem que somar 1
        x = torch.matmul(self.dropout(attention), V) #x = [batch size, n heads, query len, head dim]
        x = x.view(batch_size, -1, self.hid_dim) # [ batch size, query len, hid dim]
        x = self.fc_o(x) # apply layer to multi head attention [ batch size, query len, hid dim]
        return x, attention

class PositionwiseFeedforwardLayer(nn.Module):
    #Why is this used? Unfortunately, it is never explained in the paper.
    #BERT uses the GELU activation function, which can be used by simply switching torch.relu for F.gelu. Why did they use GELU? Again, it is never explained.
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x))) #[batch size, seq len, pf dim]
        x = self.fc_2(x)
        return x

# Decoder

class Decoder(nn.Module):
    def __init__(
        self,
        output_dim,
        hid_dim, 
        n_layers,
        n_heads,
        pf_dim,
        dropout,
        device,
        max_length = 100 #  The position embedding has a "vocabulary" size of 100, which means our model can accept sentences up to 100 tokens long. This can be increased if we want to handle longer sentences.
    ):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList(
            [
                DecoderLayer(
                    hid_dim,
                    n_heads,
                    pf_dim,
                    dropout,
                    device
                )
            for _ in range(n_layers) ] 
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size,1).to(self.device) # [batch size, trg len]
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos)) # [batch size, trg len, hid dim]
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
            # trg = [batch size, trg len, hid dim]
            # attention = [batch size, n heads, trg len, src len]
        output = self.fc_out(trg) # [batch size, trg len, output dim]
        return output, attention

class DecoderLayer(nn.Module):
    def __init__(
        self,
        hid_dim,
        n_heads,
        pf_dim,
        dropout,
        device
    ):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        # self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg)) # [ batch size, trg len, hid dim]
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask) #attention = [batch size, n heads, trg len, src len]
        # dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg)) # [ batch size, trg len, hid dim]
        # positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        # dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg)) # [batch size, trg len, hid dim]

        return trg, attention
    
class Seq2Seq(nn.Module):
    def __init__(
            self,
            encoder,
            decoder,
            src_pad_idx,
            trg_pad_idx,
            device
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        #src = [batch size, src len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(1) # [batch size, 1, 1, src len]
        return src_mask
    
    def make_trg_mask(self, trg):
        #trg = [batch size, trg len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2) # [batch size, 1, 1, trg len]
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool() #trg_sub_mask = [trg len, trg len] Returns the lower triangular part of the matrix
        trg_mask = trg_pad_mask & trg_sub_mask #trg_mask = [batch size, 1, trg len, trg len]
        return trg_mask
    
    def forward(self, src, trg):
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(src) # self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src, enc_attention = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention



In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [24]:
INPUT_DIM = engVocabulary.num_words+1
OUTPUT_DIM = deVocabulary.num_words+1
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

In [26]:
SRC_PAD_IDX = 0
TRG_PAD_IDX = 0

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device).to(device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device).to(device)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [27]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
            
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(9952, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

In [28]:
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# for the loss we have to ignore de PAD_TOKEN, it is not important
_PAD_TOKEN = 0
criterion = nn.CrossEntropyLoss(ignore_index = _PAD_TOKEN)

In [29]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 16,331,895 trainable parameters


In [30]:
import gc
def train_bert(model, optimizer, criterion, clip, src, trg):

    model.train()

    epoch_loss = 0
    it = 0
    for i in chunks(np.arange(src.shape[1]), 64):
        it += 1
        #print(it)
        
        optimizer.zero_grad()

        output, _ = model(src[:,i].to(device),trg[:,i].to(device))
        #print('Shape')
        #print(output.argmax(2).view(-1, output.shape[1]).shape, trg[:,i].to(device).reshape(-1,1)[:,0].shape)
        #print(output.argmax(2).reshape(-1,1)[:,0].shape, trg[:,i].to(device).reshape(-1,1)[:,0].shape)
        loss = criterion(output.view(-1, output.shape[-1]), trg[:,i].to(device).reshape(-1,1)[:,0])
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

        gc.collect()

    return epoch_loss / 64

def evaluate_bert(model, criterion, src, trg):

  model.eval()

  epoch_loss = 0

  with torch.no_grad():

    for i in chunks(np.arange(src.shape[1]), 64):
      
      output, _ = model(src[:,i].to(device), trg[:,i].to(device)) #turn off teacher forcing
      
      loss = criterion(output.view(-1, output.shape[-1]), trg[:,i].to(device).reshape(-1,1)[:,0])

      epoch_loss += loss.item()
      
      gc.collect()
  return epoch_loss / 64

In [31]:
N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    print('Start training',epoch)
    train_loss = train_bert(model, optimizer, criterion, CLIP, src, trg)
    print('Validating...',epoch)
    valid_loss = evaluate_bert(model, criterion, valsrc, valtrg)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Start training 0
Validating... 0
Epoch: 01 | Time: 2m 25s
	Train Loss: 12.554 | Train PPL: 283217.604
	 Val. Loss: 0.147 |  Val. PPL:   1.158
Start training 1
Validating... 1
Epoch: 02 | Time: 2m 40s
	Train Loss: 2.806 | Train PPL:  16.542
	 Val. Loss: 0.096 |  Val. PPL:   1.101
Start training 2
Validating... 2
Epoch: 03 | Time: 2m 54s
	Train Loss: 1.351 | Train PPL:   3.862
	 Val. Loss: 0.080 |  Val. PPL:   1.083
Start training 3
Validating... 3
Epoch: 04 | Time: 3m 8s
	Train Loss: 0.601 | Train PPL:   1.825
	 Val. Loss: 0.073 |  Val. PPL:   1.076
Start training 4
Validating... 4
Epoch: 05 | Time: 3m 19s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 0.072 |  Val. PPL:   1.075
Start training 5
Validating... 5
Epoch: 06 | Time: 3m 38s
	Train Loss: 0.024 | Train PPL:   1.024
	 Val. Loss: 0.073 |  Val. PPL:   1.076
Start training 6
Validating... 6
Epoch: 07 | Time: 3m 44s
	Train Loss: 0.012 | Train PPL:   1.012
	 Val. Loss: 0.073 |  Val. PPL:   1.076
Start training 7
Validating... 

In [33]:
' '.join([engVocabulary.to_word(int(word)) for word in valsrc[:,500].cpu().numpy()])

'SOS a woman is putting a helmet on a small girl . EOS PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD'

In [32]:
with torch.no_grad():
  output,attention_ = model(valsrc[:,500].view(-1,1).to(device),valtrg[:,500].view(-1,1).to(device))
' '.join([deVocabulary.to_word(int(word)) for word in output.argmax(2)[:,0].cpu().numpy()])

'SOS eine frau zieht einem kleinen mädchen einen helm an . EOS mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann'

In [34]:
' '.join([engVocabulary.to_word(int(word)) for word in valsrc[:,600].cpu().numpy()])

'SOS an artist working on an ice sculpture EOS PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD'

In [35]:
with torch.no_grad():
  output,attention_ = model(valsrc[:,600].view(-1,1).to(device),valtrg[:,600].view(-1,1).to(device))
' '.join([deVocabulary.to_word(int(word)) for word in output.argmax(2)[:,0].cpu().numpy()])

'SOS ein künstler arbeitet an einer eisskulptur EOS mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann mann'

In [38]:
valsrc[:,600].shape

torch.Size([40])

In [44]:
output.shape

torch.Size([40, 1, 19063])

In [45]:
attention_.shape

torch.Size([40, 8, 1, 1])

In [42]:
attention_.squeeze(2).squeeze(2)

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1