# import module

In [7]:
import mediapipe as mp
import numpy as np
import time
import cv2
import torch
import math
import pandas as pd
import os
import time

import utils
import augmentation


# Device that the tensors, model and loss below are moved to via `.to(device)`.
device = torch.device("cpu")
# import gensim

# Data Load

In [8]:
def load_tensor(dir_path=(".", "output", "tensor")):
    """Load every ``hand.pt`` file under the class folders, with integer labels.

    The expected layout is ``<root>/<class folder>/<sample folder>/hand.pt``.
    The class folder name itself is converted with ``int`` and used as the
    label of every sample inside it.

    Original author's note (translated): tensors are loaded starting from the
    numbers taken from the csv, and the returned numbers are kept so the
    Korean value of each loaded sample can be looked up later.

    Args:
        dir_path: Sequence of path components (list or tuple of strings) that
            is joined with ``os.path.join`` to form the root directory.

    Returns:
        ``(h_list, answers)``: the objects returned by ``torch.load`` and the
        matching integer labels, in the same order.

    Raises:
        ValueError: If a selected class folder that contains samples has a
            name that is not an integer.
    """
    root = os.path.join(*dir_path)

    # Only entries 1..8 of the sorted listing are used; entry 0 is skipped
    # (presumably a non-class entry such as a hidden folder -- TODO confirm).
    class_folders = sorted(os.listdir(root))[1:9]
    print(class_folders)

    h_list = []
    answers = []

    # Walk the selected class folders, then every sample folder inside each.
    for class_folder in class_folders:
        class_path = os.path.join(root, class_folder)
        for sample in sorted(os.listdir(class_path)):
            h_list.append(torch.load(os.path.join(class_path, sample, "hand.pt")))
            answers.append(int(class_folder))

    return h_list, answers

In [9]:
def pipe(t, batch_first = False):
    """Flatten each tensor to 2-D, pad them to a common length and move to ``device``.

    Every element of ``t`` is reshaped with ``view`` so that its dimensions 1
    and 2 are merged into one feature axis. The resulting variable-length
    sequences are stacked with ``pad_sequence``.

    Args:
        t: Iterable of tensors with at least three dimensions
            (assumed to be (frames, points, coords) -- TODO confirm).
        batch_first: Forwarded to ``pad_sequence``; selects whether the batch
            axis or the sequence axis comes first in the result.

    Returns:
        The padded tensor, moved to the module-level ``device``.
    """
    flattened = [
        sample.view(-1, sample.shape[1] * sample.shape[2]) for sample in t
    ]
    padded = torch.nn.utils.rnn.pad_sequence(flattened, batch_first=batch_first)
    return padded.to(device)

In [10]:
# Load the hand tensors and their integer labels from the default directory.
hand, nums = load_tensor()

['0', '1', '2', '3', '4', '5', '6', '7']


In [11]:
# Flatten every sample and pad them into one tensor (sequence axis first,
# because pipe's batch_first defaults to False).
hand_tensor = pipe(hand)

In [12]:
# Inspect the shape of the padded tensor.
hand_tensor.shape

torch.Size([71, 954, 126])

In [14]:
# Print how many loaded samples carry each label 0-7.
for i in range(8):
    print("숫자 {0} : {1}개 입니다.".format(i,nums.count(i)))

숫자 0 : 117개 입니다.
숫자 1 : 117개 입니다.
숫자 2 : 117개 입니다.
숫자 3 : 117개 입니다.
숫자 4 : 120개 입니다.
숫자 5 : 123개 입니다.
숫자 6 : 120개 입니다.
숫자 7 : 123개 입니다.


In [7]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence, then apply dropout.

    A table of ``max_len`` positions by ``d_model`` features is precomputed:
    even feature columns hold sines and odd columns hold cosines of
    ``position * div_term``. The table is stored as the non-trainable buffer
    ``pe`` with shape ``(1, max_len, d_model)`` when ``batch_first`` is true,
    otherwise ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs. Both even
            and odd values are supported.
        batch_first: If true, inputs are ``(batch, seq, d_model)``; otherwise
            ``(seq, batch, d_model)``.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed; inputs must not be longer
            than this along their sequence axis.
    """

    def __init__(self, d_model,batch_first = False, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        self.batch_first = batch_first

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the angles are trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a broadcastable batch axis at the position the inputs use.
        pe = pe.unsqueeze(0) if batch_first else pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe)`` using the first ``seq_len`` positions.

        Args:
            x: Tensor laid out according to ``batch_first`` whose last
                dimension is ``d_model``.
        """
        if self.batch_first:
            x = x + self.pe[:, :x.size(1), :]
        else:
            x = x + self.pe[:x.size(0), :]

        return self.dropout(x)

In [8]:
# answer = kors[hand_index]
# print(answer.shape, answer.unique().size)
# The integer labels returned by load_tensor are used as the answers below.
answer = nums

In [9]:
# Scratch cell: a bare literal that assigns nothing.
1

1

In [10]:
# Decoder token ids, laid out as (seq, batch): row 0 holds the
# start-of-sequence id 8 for every sample, row 1 holds the sample's label.
sos_data = torch.full((1, len(answer)), 8, dtype=torch.long)
decode_data = torch.LongTensor(answer).unsqueeze(dim=0)
decode_data = torch.cat((sos_data, decode_data), dim=0).to(device)

In [11]:
# Preview the first and last ten columns of the token ids, plus the shapes.
decode_data[:,:10],decode_data[:,-10:], sos_data.shape , decode_data.shape 
# layout: (seq, batch)

(tensor([[8, 8, 8, 8, 8, 8, 8, 8, 8, 8],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[8, 8, 8, 8, 8, 8, 8, 8, 8, 8],
         [7, 7, 7, 7, 7, 7, 7, 7, 7, 7]]),
 torch.Size([1, 954]),
 torch.Size([2, 954]))

In [12]:
# One-hot targets over 9 classes: row 0 encodes the label, row 1 the eos class.
eos_y_data = torch.zeros(1,len(answer),9)
# shape: (1, len(answer), 9)
y_data = torch.zeros(1,len(answer),9)
# shape: (1, len(answer), 9)

# Stack both rows into a (2, len(answer), 9) tensor.
y_data = torch.cat((y_data,eos_y_data),axis=0).to(device)
y_data[0,torch.arange(len(answer)),np.array(answer)] = 1 
# row 0: one-hot vector of each sample's label

y_data[1,torch.arange(len(answer)),-1] = 1 # row 1: eos is the last class index

In [13]:
# Inspect the decoder input and the one-hot target of sample 10.
y_data.shape, decode_data[:,10] , y_data[:,10] 
# decoder input rows: (sos, answer)   , target rows: (answer , eos)
# sos token id = 8 in decode_data; eos = last class index (8) in y_data

(torch.Size([2, 954, 9]),
 tensor([8, 0]),
 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1.]]))

In [14]:
class TransformerModel_hand(torch.nn.Module):
    """Encoder-decoder Transformer mapping feature sequences to token logits.

    The source sequence is projected to ``d_model`` with a linear layer, the
    target token ids are embedded, both receive the same sequence-first
    positional encoding, and the output of ``torch.nn.Transformer`` is
    projected to ``ntoken`` logits.
    """

    def __init__(self, input_node, d_model, nhead, nhid, nlayers,ntoken ,dropout=0.5  ):
        """
        input_node  number of input features per time step
        d_model     embedding dimension used inside the model
        nhead       number of attention heads
        nhid        feed-forward width (dff)
        nlayers     number of encoder layers; also used as the number of decoder layers
        ntoken      number of tokens (vocabulary size)
        dropout     dropout probability for the transformer and the positional encoding
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        # Source embedding; may be replaced by a GNN later (original author's note).
        self.encode_emb = torch.nn.Linear(input_node, d_model)
        self.decode_emb = torch.nn.Embedding(ntoken, d_model)

        # Positional information for both sequences. `dropout` must be passed
        # by keyword: the second positional parameter of PositionalEncoding is
        # `batch_first`, so passing it positionally selected the batch-first
        # layout (any non-zero dropout is truthy) and left dropout at its
        # default. The corrected `pe` buffer is (max_len, 1, d_model), so
        # state dicts saved with the old layout will not load.
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)

        self.transformer = torch.nn.Transformer(
                d_model = d_model, nhead = nhead,
                num_encoder_layers = nlayers,
                num_decoder_layers = nlayers,
                dim_feedforward = nhid, dropout = dropout
            )

        self.decoder = torch.nn.Linear(d_model, ntoken)
        # Kept for callers that want probabilities; forward returns raw logits.
        self.softmax = torch.nn.Softmax(dim=2)

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0.0 on/below the diagonal, -inf above."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)


    def forward(self, src ,tgt ,src_mask = None, tgt_mask = None):
        """Return logits of shape ``(tgt_seq, batch, ntoken)``.

        Args:
            src: Float tensor ``(src_seq, batch, input_node)``.
            tgt: Long tensor of token ids ``(tgt_seq, batch)``.
            src_mask: Accepted for interface compatibility but NOT applied.
            tgt_mask: Accepted for interface compatibility but NOT applied.

        NOTE(review): both masks are deliberately left unused, exactly as in
        the original code, which always handed ``None`` to the transformer.
        Honouring them would change training behaviour -- confirm before
        wiring them through.
        """
        # src = torch.cat(src,axis=2)
        # (original note: concatenate the feature parts of lt, rt, pt)
        src = self.encode_emb(src)
        src = self.pos_encoder(src)

        tgt = self.decode_emb(tgt)
        tgt = self.pos_encoder(tgt)

        target = self.transformer(src=src,tgt=tgt,src_mask = None, tgt_mask = None)
        output = self.decoder(target)
        # Softmax is intentionally not applied; the output is raw logits.
        return output

In [15]:
# Hyperparameters, model, loss and optimizer.
nodes = 126 # number of input nodes (features) before any GNN stage, should one be added
d_model = 400 #400
nlayers = 4 
nhead = 8
dropout = 0.2
nhid = 2048
tokens = 9# number of tokens, including sos/eos
transmodel = TransformerModel_hand(input_node = nodes,d_model= d_model,nhead = nhead,nhid=nhid,nlayers=nlayers,ntoken=tokens,dropout=dropout).to(device)

# The loss takes raw logits and is compared against the one-hot targets in y_data.
loss_fn =  torch.nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adadelta(transmodel.parameters())

In [16]:
# Causal masks sized to the source and target sequence lengths.
src_mask = transmodel.generate_square_subsequent_mask(hand_tensor.shape[0]).to(device)
# NOTE (original author): the source probably needs a different kind of mask
tgt_mask = transmodel.generate_square_subsequent_mask(decode_data.shape[0]).to(device)

In [17]:
# Mini-batch bookkeeping: batches are slices along axis 1 of hand_tensor.
x = 0
batch_size = 40
# Number of full batches; samples past batch_size*tmp are not covered by them.
tmp = int(hand_tensor.shape[1]/batch_size)
# First batch, used to construct the augmentation object.
src =  (hand_tensor[:,x*batch_size:(x+1)*batch_size]) 

In [18]:
# Augmentation object from the project's `augmentation` module; it is called on
# each source batch during training. The meaning of the 0.5 and 1 arguments is
# defined in that module -- TODO confirm.
aug = augmentation.DataAugmentation(src,0.5,1,device)

In [19]:
# Inspect the shape of the first batch.
src.shape

torch.Size([71, 40, 126])

In [None]:

# Training: for every epoch, iterate over the `tmp` full mini-batches, feed the
# augmented source and the sos token to the model, and fit the first output
# step to the one-hot label.
epochs = 1000
losses = []      # per-epoch SUM of the batch losses (not the mean)
accuracy = []    # per-epoch fraction of correctly predicted samples
transmodel.train()
start = time.time()
for i in range(epochs):
    tmp_time = time.time()
    print("start epoch ",end="| ")
    # print(tmp)
    batch_accuracy = 0
    batch_loss = 0
    for x in range(tmp):
        # print(src[0].shape, tgt.shape)
        # print("{0} {1:.3f}s ".format(x,time.time()-tmp_time),end="")
        # Augmented source batch; decoder input is only row 0 (sos) and the
        # target is only row 0 of y_data (the label one-hot).
        src = aug(hand_tensor[:,x*batch_size:(x+1)*batch_size])
        tgt = decode_data[0,x*batch_size:(x+1)*batch_size].unsqueeze(dim=0)
        y = y_data[0,x*batch_size:(x+1)*batch_size].unsqueeze(dim=0)


        optimizer.zero_grad()

        # src_mask is passed here, but the model's forward hands None to the
        # transformer, so it has no effect.
        predict = transmodel(src,tgt, src_mask )
        
        # Class indices (argmax over the last axis) of prediction and target.
        tmp_pre = predict.squeeze(0).max(dim=-1, keepdim=False)[1]
        tmp_y = y.squeeze(0).max(dim=-1, keepdim=False)[1]
        
        # predict: (seq = 1, batch = batch_size, feature = tokens)
        
        predict = predict.transpose(0,1)
        y = y.transpose(0,1)
        
        loss =loss_fn(predict,y)
        # both operands are now batch-first: (batch, 1, feature)
        
        loss.backward()
        optimizer.step()


        batch_loss += loss.item()
        # Correct predictions = batch size minus the number of mismatched indices.
        batch_accuracy +=  batch_size-(tmp_pre - tmp_y).count_nonzero()
#         break
    # print(batch_loss, batch_accuracy)
    losses.append(batch_loss)
    accuracy.append(batch_accuracy/(batch_size*tmp))
    
    end = time.time()
    # print(predict.shape,predict.squeeze(0).max(dim=-1, keepdim=False)[1],end=" ")
    print("| end epoch: {0}\t| time: {2:.4f}s |  loss : {1:6f} | acc : {3:.2f}%".format(i+1,losses[-1], end-tmp_time, accuracy[-1]*100))   
#     if(losses[-1] < 0.5 and accuracy[-1] > 0.97):
#         break
# `end` is the timestamp taken at the end of the last epoch.
print("total time = {0:.4f}s".format(end-start))

        

In [None]:
# Samples left over after the last full training batch are used for evaluation.
# Unlike training, the source is not passed through `aug` here.
src = hand_tensor[:,batch_size*tmp:]
tgt = decode_data[0,batch_size*tmp:].unsqueeze(dim=0)
y = y_data[0,batch_size*tmp:].unsqueeze(dim=0).transpose(0,1)


In [None]:
# Inspect the shapes of the leftover source, decoder input and target.
src.shape, tgt.shape, y.shape

In [None]:
# Logits for the leftover samples, transposed to batch-first to match `y`.
# NOTE(review): no transmodel.eval() / torch.no_grad() is visible before this
# call -- confirm whether dropout is meant to stay active here.
val_predict = transmodel(src,tgt,src_mask).transpose(0,1)

In [None]:
# Loss on the leftover samples, using the same criterion as training.
val_loss = loss_fn(val_predict, y)

In [None]:
# Display the loss as a Python float.
val_loss.item()

In [None]:
 # Side by side: predicted class indices, decoder input tokens, true class indices.
 val_predict.transpose(0,1).squeeze(0).max(dim=-1, keepdim=False)[1] , tgt , y.transpose(0,1).squeeze(0).max(dim=-1, keepdim=False)[1]

In [None]:
# Raw logits of the first leftover sample.
val_predict[0,0]

In [None]:
# Pair every class index with its logit for the first leftover sample, then
# rank the pairs from the highest logit to the lowest.
t = [(i,x.item()) for i,x in enumerate(val_predict[0,0])]
t = sorted(t , key=lambda x:-x[1] )
t