# Import Modules

## Standard Modules

In [1]:
import re 
import os

import pickle as pkl

import collections

from functools import reduce
from collections import Counter

## External Modules

In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm, trange

## DeepPavlov Modules

In [3]:
import deeppavlov

from deeppavlov.models.preprocessors.bert_preprocessor import BertNerPreprocessor

[nltk_data] Downloading package punkt to /home/alem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/alem/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/alem/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


## Pytorch-Pretrained-Bert

In [4]:
import pytorch_pretrained_bert

In [5]:
from pytorch_pretrained_bert import BertTokenizer

In [6]:
import keras
import keras_preprocessing

Using TensorFlow backend.


In [7]:
from keras.preprocessing.sequence import pad_sequences

In [8]:
pytorch_pretrained_bert.tokenization.__file__

'/home/alem/anaconda3/lib/python3.7/site-packages/pytorch_pretrained_bert/tokenization.py'

In [9]:
keras.preprocessing.sequence.__file__

'/home/alem/anaconda3/lib/python3.7/site-packages/keras/preprocessing/sequence.py'

In [10]:
keras_preprocessing.__file__

'/home/alem/anaconda3/lib/python3.7/site-packages/keras_preprocessing/__init__.py'

## Pytorch

In [11]:
import torch
from torch.utils.data import Dataset

# Constants

In [12]:
ABS_PATH = '/home/alem/Alem_Sagandykov_Documents/Alem_Social/Location_Identifier/Named_Entity_Recognition/data/'

In [13]:
collections3_v2_path = 'Russian/collection3_v2/pickle/dataset.pkl'

In [14]:
akerke_tagged_complaints_path = 'Russian/Alem_Tagged_Complaints/akerke_tagged/pickle/dataset.pkl'

In [15]:
ru_bert_path = '/home/alem/Alem_Sagandykov_Documents/Alem_Social/HERMES/'

In [16]:
vocab_path = 'Production/vocab.txt'
config_path = 'Production/bert_config.json'
ru_bert_pytorch_weights_path_pth = 'pytorch_dump/deeppavlov_pretrained_rubert.pth'
ru_bert_pytorch_weights_path_bin = 'pytorch_dump/rubert_cased_L-12_H-768_A-12_pt/pytorch_model.bin'

 # Pipeline

## NER Dataset

In [101]:
class NER_Dataset(Dataset):

    def __init__(self,dataset, transform = None):
        self.dataset = dataset
        self.transform = transform

    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        tokens, tags, index = self.dataset[idx]

        sample = {
            'tokens' : tokens,
            'tags' : tags,
            'sample_index' : index
        }

        if self.transform:
            return self.transform(sample)
        
        return sample

## Data Pipeline Class

In [93]:
def print_results(sample):
    
    fmt = '{:20} : {:5} : {}'
    
    tokens, tags, index = list(sample.keys())
    
    for token,tag in zip(sample[tokens], sample[tags]):
        print(fmt.format(token, tag, sample[index]))
    
    print()

In [19]:
class Data_Pipeline:
    
    def __init__(self, *args):
        self.content_pipeline = list(args)
    
    
    def __call__(self, obj):
        
        return reduce(lambda x,y : y(x), [obj] + self.content_pipeline)

## Bert Markers Adder

In [20]:
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab

In [21]:
vocab = load_vocab(os.path.join(ru_bert_path, vocab_path))

In [82]:
class Bert_Markers_Adder:
    
    def __init__(self, verbose):
        self.verbose = verbose    
    
    def __call__(self, sample):
        
        sample['tokens'] = ['[CLS]'] + sample['tokens'] + ['[SEP]']
        sample['tags'] = ['X'] + sample['tags'] + ['X']
        
        if self.verbose:
            print_results(sample)
        
        return sample

In [83]:
def add_artefacts_tags(tags, cased_index, cased_tag = 'X'):
    for cased_index in cased_index:
        tags.insert(cased_index, cased_tag)
    
    return tags

def convert_tags_to_ids(tags, tags_dict_index):
    
    return [tags_dict_index[tag] for tag in tags]

In [106]:
class Pytorch_Wordpiece_Tokenizer:
    
    def __init__(self, tokenizer, verbose, cased_token = '#'):
        
        self.verbose = verbose
        self.tokenizer = tokenizer
        self.cased_token = cased_token
    
    def __call__(self, sample):
        
        cased_tokens = tokenizer.tokenize(' '.join(sample['tokens']))
        cased_index = [i for i in range(len(cased_tokens)) if self.cased_token in cased_tokens[i]]
        cased_tags = add_artefacts_tags(sample['tags'], cased_index)
        
        sample['tokens'], sample['tags'] = cased_tokens, cased_tags
        
        cased_sample = {
            'cased_tokens' : cased_tokens,
            'cased_tags' : cased_tags,
            'sample_index' : sample['sample_index']
        }
        
        del sample
        
        if self.verbose:
            print_results(cased_sample)
        
        return cased_sample
        

In [115]:
class Token_To_Id_Transformer:
    
    def __init__(self, tags_dict_index, verbose):
        self.verbose = verbose
        self.tokenizer = tokenizer
        self.tags_dict_index = tags_dict_index
        
    def __call__(self, cased_sample):
        
        cased_tokens_ids = self.tokenizer.convert_tokens_to_ids(cased_sample['cased_tokens'])
        cased_tags_ids = convert_tags_to_ids(cased_sample['cased_tags'], self.tags_dict_index)
        
        cased_sample['tokens_ids'], cased_sample['tags_ids'], cased_sample['sample_index_ids'] = cased_tokens_ids, \
                                                                                                 cased_tags_ids, \
                                                                                                 cased_sample['sample_index']
        
        del cased_sample['cased_tokens'], cased_sample['cased_tags'], cased_sample['sample_index']
        
        print(cased_sample.keys())
        
        if self.verbose:
            print_results(cased_sample)
        
        return cased_sample

In [205]:
class Pad_Trunc_Sequence_Getter:
    
    def __init__(self, verbose, max_len, padding_truncating, values, vocab, tags_dict_index):
        
        self.verbose = verbose
        self.max_len = max_len
        self.padding_truncating = padding_truncating
        self.values = values
        self.vocab = vocab
        self.tags_dict_index = tags_dict_index
    
    def __call__(self, cased_sample):
        pad_trunc_tokens_ids = pad_sequences([cased_sample['tokens_ids']], 
                                             maxlen = self.max_len, 
                                             truncating = self.padding_truncating, 
                                             padding = self.padding_truncating, 
                                             value = self.values['tokens'])
        
        pad_trunc_tags_ids = pad_sequences([cased_sample['tags_ids']], 
                                           maxlen=self.max_len, 
                                           truncating = self.padding_truncating, 
                                           padding = self.padding_truncating, 
                                           value = self.values['tags'])
        
        if len(cased_sample['tokens_ids']) > self.max_len:
            pad_trunc_tokens_ids = list(pad_trunc_tokens_ids[0][:-1]) + [self.vocab['[SEP]']]
            pad_trunc_tags_ids = list(pad_trunc_tags_ids[0][:-1]) + [self.tags_dict_index['X']]
        
        else:
            pad_trunc_tokens_ids = pad_trunc_tokens_ids[0]
            pad_trunc_tags_ids = pad_trunc_tags_ids[0]
            
        
        cased_sample['pad_trunc_tokens_ids'], cased_sample['pad_trunc_tags_ids'] = pad_trunc_tokens_ids, \
                                                                                   pad_trunc_tags_ids
        
        cased_sample['pad_trunc_sample_index_ids'] = [cased_sample['sample_index_ids']]
        
        del cased_sample['tokens_ids'], cased_sample['tags_ids'], cased_sample['sample_index_ids']
        
        if self.verbose:
            print_results(cased_sample)
        
        
        return cased_sample
        

In [121]:
def make_long_tensor(cased_sample):
    return {key : torch.LongTensor(cased_sample[key]) for key in cased_sample}

In [146]:
def make_attention_mask(cased_sample):
    cased_sample['pad_trunc_attention_mask'] = [float(token_id>0) for token_id in cased_sample['pad_trunc_tokens_ids']]
    
    return cased_sample

# Prepare Data

In [27]:
with open(os.path.join(ABS_PATH, collections3_v2_path), 'rb') as file:
    collection3_v2_dataset = pkl.load(file)

In [28]:
with open(os.path.join(ABS_PATH, akerke_tagged_complaints_path), 'rb') as file:
    akerke_tagged_complaints_dataset = pkl.load(file)

In [29]:
fmt = '{:5} : {:}'

In [30]:
for key in collection3_v2_dataset:
    print(fmt.format(key, len(collection3_v2_dataset[key])))

train : 8461
valid : 1982
test  : 1790


In [31]:
for key in akerke_tagged_complaints_dataset:
    print(fmt.format(key, len(akerke_tagged_complaints_dataset[key])))

train : 5878
test  : 1633
valid : 654


In [66]:
train_data = list(collection3_v2_dataset['train']) + list(akerke_tagged_complaints_dataset['train'])
test_data = list(collection3_v2_dataset['test']) + list(akerke_tagged_complaints_dataset['test'])
valid_data = list(collection3_v2_dataset['valid']) + list(akerke_tagged_complaints_dataset['valid'])

In [76]:
train_data = [(train_data[i][0], train_data[i][1], i) for i in range(len(train_data))]
test_data = [(test_data[i][0], test_data[i][1], i) for i in range(len(test_data))]
valid_data = [(valid_data[i][0], valid_data[i][1], i) for i in range(len(valid_data))]

In [32]:
train_tokens = np.array([sample[0] for sample in collection3_v2_dataset['train']] + 
                       list([sample[0] for sample in akerke_tagged_complaints_dataset['train']]))


train_tags = np.array([sample[1] for sample in collection3_v2_dataset['train']] + 
                       list([sample[1] for sample in akerke_tagged_complaints_dataset['train']]))

In [33]:
test_tokens = np.array([sample[0] for sample in collection3_v2_dataset['test']] + 
                       list([sample[0] for sample in akerke_tagged_complaints_dataset['test']]))


test_tags = np.array([sample[1] for sample in collection3_v2_dataset['test']] + 
                       list([sample[1] for sample in akerke_tagged_complaints_dataset['test']]))

In [34]:
valid_tokens = np.array([sample[0] for sample in collection3_v2_dataset['valid']] + 
                       list([sample[0] for sample in akerke_tagged_complaints_dataset['valid']]))


valid_tags = np.array([sample[1] for sample in collection3_v2_dataset['valid']] + 
                       list([sample[1] for sample in akerke_tagged_complaints_dataset['valid']]))

# HyperParametres

In [35]:
MAX_LEN = 512
BATCH_SIZE = 32

# Tokenization

In [36]:
bert_ner_preprocessor = BertNerPreprocessor(vocab_file = os.path.join(ru_bert_path, vocab_path),
                                            max_seq_length = None,
                                            provide_subword_tags = True)




In [37]:
output_train = bert_ner_preprocessor(tokens = [' '.join(token) for token in train_tokens] ,
                                     tags = train_tags,)

In [89]:
tokens, subword_tokens, subword_tok_ids, subword_masks, nonmasked_tags = output_train

In [108]:
print(tokens[5])

['Кроме', 'того', ',', 'полковник', 'полиции', 'Виктор', 'Пауков', 'назначен', 'начальником', 'управления', 'внутренних', 'дел', 'по', 'Центральному', 'административному', 'округу', 'главного', 'управления', 'МВД', 'РФ', 'по', 'Москве', '.']


In [110]:
print(train_tags[5])

['O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'B-ORG', 'B-LOC', 'O', 'B-LOC', 'O']


In [95]:
print(subword_tokens[5])

['[CLS]', 'Кроме', 'того', ',', 'полковник', 'полиции', 'Виктор', 'Паук', '##ов', 'назначен', 'начальником', 'управления', 'внутренних', 'дел', 'по', 'Центральному', 'административному', 'округу', 'главного', 'управления', 'МВД', 'РФ', 'по', 'Москве', '.', '[SEP]']


In [111]:
print(nonmasked_tags[5])

['X', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'B-ORG', 'B-LOC', 'O', 'B-LOC', 'O', 'X']


In [112]:
fmt = '{:20} : {}'
for token, tag in zip(subword_tokens[5], nonmasked_tags[5]):
    print(fmt.format(token, tag))

[CLS]                : X
Кроме                : O
того                 : O
,                    : O
полковник            : O
полиции              : O
Виктор               : B-PER
Паук                 : I-PER
##ов                 : X
назначен             : O
начальником          : O
управления           : O
внутренних           : O
дел                  : O
по                   : O
Центральному         : B-LOC
административному    : I-LOC
округу               : I-LOC
главного             : O
управления           : O
МВД                  : B-ORG
РФ                   : B-LOC
по                   : O
Москве               : B-LOC
.                    : O
[SEP]                : X


# Testing

## Transform

In [38]:
tags2index = {
    'X' : 0, 'O' : 1,
    'B-ORG' : 2, 'I-ORG' : 3,
    'B-PER' : 4, 'I-PER' : 5,
    'B-LOC' : 5, 'I-LOC' : 6
}

In [39]:
tokenizer = BertTokenizer(vocab_file = os.path.join(ru_bert_path, vocab_path), 
                          do_lower_case = False, 
                          do_basic_tokenize = False)

In [206]:
data_pipeline = Data_Pipeline(
    Bert_Markers_Adder(verbose = False),
    
    Pytorch_Wordpiece_Tokenizer(tokenizer, verbose = False),
    
    Token_To_Id_Transformer(tags2index, verbose = False),  
    
    Pad_Trunc_Sequence_Getter(max_len = 128, 
                              padding_truncating = 'post',
                              vocab = vocab,
                              verbose = False,
                              tags_dict_index = tags2index,
                              values = {
                                  'tokens' : 0,
                                  'tags' : 1
                              }),
    
    make_attention_mask,
    make_long_tensor,
#     debug
)

In [49]:
len(train_tokens[6])

25

In [50]:
len(train_tokens[3])

158

In [51]:
test_sample_padding = {
    'tokens' : train_tokens[6],
    'tags' : train_tags[6]
}

In [52]:
test_sample_truncating = {
    'tokens' : train_tokens[3],
    'tags' : train_tags[3]
}

In [53]:
transformed_test_sample_padding = data_pipeline(test_sample_padding)

[CLS]                : X
Премьер              : O
-                    : O
министром            : O
Италии               : B-LOC
назначен             : O
Марио                : B-PER
Монти                : I-PER
Президент            : O
Италии               : B-LOC
Джорджо              : B-PER
Наполитано           : I-PER
назначил             : O
новым                : O
премьер              : O
-                    : O
министром            : O
страны               : O
известного           : O
экономиста           : O
,                    : O
бывшего              : O
еврокомиссара        : O
Марио                : B-PER
Монти                : I-PER
.                    : O
[SEP]                : X

[CLS]                : X
Премьер              : O
-                    : O
министром            : O
Италии               : B-LOC
назначен             : O
Марио                : B-PER
Монти                : I-PER
Президент            : O
Италии               : B-LOC
Джорджо              : B-P

In [54]:
transformed_test_sample_truncating = data_pipeline(test_sample_truncating)

[CLS]                : X
В                    : O
частности            : O
,                    : O
генерал              : O
-                    : O
майор                : O
полиции              : O
Василий              : B-PER
Олейник              : I-PER
назначен             : O
начальником          : O
управления           : O
Министерства         : B-ORG
внутренних           : I-ORG
дел                  : I-ORG
(                    : I-ORG
МВД                  : I-ORG
)                    : I-ORG
РФ                   : B-LOC
по                   : O
Еврейской            : B-LOC
автономной           : I-LOC
области              : I-LOC
,                    : O
генерал              : O
-                    : O
майор                : O
полиции              : O
Илья                 : B-PER
Ольховский           : I-PER
-                    : O
министром            : O
внутренних           : O
дел                  : O
по                   : O
Республике           : B-LOC
Хакасия        

In [55]:
transformed_test_sample_truncating

{'pad_trunc_tokens_ids': [101,
  781,
  7849,
  128,
  9456,
  130,
  16890,
  12187,
  16007,
  91320,
  9201,
  17249,
  9278,
  15865,
  14263,
  3977,
  120,
  12618,
  122,
  7546,
  1516,
  71602,
  36656,
  4161,
  128,
  9456,
  130,
  16890,
  12187,
  23026,
  74689,
  2241,
  130,
  17383,
  14263,
  3977,
  1516,
  20137,
  79396,
  128,
  9456,
  130,
  16890,
  12187,
  8441,
  5232,
  43721,
  130,
  17383,
  14263,
  3977,
  1516,
  63660,
  20137,
  128,
  9456,
  130,
  16890,
  12187,
  32957,
  25792,
  130,
  17249,
  9278,
  12618,
  7546,
  1516,
  32732,
  4161,
  128,
  9456,
  130,
  16890,
  12187,
  13440,
  32167,
  130,
  17249,
  9278,
  12618,
  7546,
  1516,
  73701,
  876,
  26649,
  128,
  9456,
  130,
  16890,
  12187,
  8522,
  92123,
  22447,
  130,
  17249,
  12628,
  9278,
  12618,
  7546,
  1516,
  31705,
  4161,
  128,
  9456,
  130,
  16890,
  12187,
  8441,
  23166,
  8621,
  130,
  17249,
  9278,
  12618,
  7546,
  1516,
  27822,
  4161,
  1

In [56]:
transformed_test_sample_truncating.keys()

dict_keys(['pad_trunc_tokens_ids', 'pad_trunc_tags_ids'])

In [57]:
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))

In [58]:
index2tags = dict(zip(tags2index.values(), tags2index.keys()))

In [59]:
tags2index

{'X': 0,
 'O': 1,
 'B-ORG': 2,
 'I-ORG': 3,
 'B-PER': 4,
 'I-PER': 5,
 'B-LOC': 5,
 'I-LOC': 6}

In [60]:
index2tags

{0: 'X', 1: 'O', 2: 'B-ORG', 3: 'I-ORG', 4: 'B-PER', 5: 'B-LOC', 6: 'I-LOC'}

In [61]:
reverse_vocab[17249]

'начальником'

In [64]:
fmt_big = '{:20} : {:}'

In [65]:
for token_id, tag_id in zip(transformed_test_sample_truncating['pad_trunc_tokens_ids'], 
                            transformed_test_sample_truncating['pad_trunc_tags_ids']):
    
    print(fmt_big.format(reverse_vocab[token_id], index2tags[tag_id]))

[CLS]                : X
В                    : O
частности            : O
,                    : O
генерал              : O
-                    : O
майор                : O
полиции              : O
Василий              : B-PER
Олейник              : B-LOC
назначен             : O
начальником          : O
управления           : O
Министерства         : B-ORG
внутренних           : I-ORG
дел                  : I-ORG
(                    : I-ORG
МВД                  : I-ORG
)                    : I-ORG
РФ                   : B-LOC
по                   : O
Еврейской            : B-LOC
автономной           : I-LOC
области              : I-LOC
,                    : O
генерал              : O
-                    : O
майор                : O
полиции              : O
Илья                 : B-PER
Ольхов               : B-LOC
##ский               : X
-                    : O
министром            : O
внутренних           : O
дел                  : O
по                   : O
Республике         

In [248]:
transformed_test_sample_truncating['pad_trunc_tokens_ids'][0]

101

In [249]:
vocab['[SEP]']

102

## NER Dataset + Transform

In [207]:
train_dataset = NER_Dataset(dataset = train_data, transform = data_pipeline)
test_dataset = NER_Dataset(dataset = test_data, transform = data_pipeline)
valid_dataset = NER_Dataset(dataset = valid_data, transform = data_pipeline)

In [208]:
a = train_dataset[0]

dict_keys(['tokens_ids', 'tags_ids', 'sample_index_ids'])


In [209]:
a.keys()

dict_keys(['pad_trunc_tokens_ids', 'pad_trunc_tags_ids', 'pad_trunc_sample_index_ids', 'pad_trunc_attention_mask'])

In [210]:
for key in a:
    print(key, ' ', a[key].shape)

pad_trunc_tokens_ids   torch.Size([128])
pad_trunc_tags_ids   torch.Size([128])
pad_trunc_sample_index_ids   torch.Size([1])
pad_trunc_attention_mask   torch.Size([128])


In [73]:
len(a['pad_trunc_tokens_ids'])
len(a['pad_trunc_tags_ids'])

128

In [151]:
a['pad_trunc_attention_mask']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [126]:
a['pad_trunc_tokens_ids']

tensor([   101,  96420,    156,    785,    132,  19929,  41569,  16416,  13642,
         12187,    851, 113791,  23644,   1506,   4750,  21301,  34775,    128,
         26252,    851,  12618,   1516,  36400,    866,   7546,    132,    102,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [129]:
reverse_vocab[0]

'[PAD]'

In [168]:
a = torch.Tensor(32,128)

In [169]:
a.shape

torch.Size([32, 128])

In [157]:
a.view(-1).shape

torch.Size([4096])

In [170]:
b = []

In [171]:
b.extend(list(a))

In [164]:
len(b)

32

In [165]:
len(b[0])

128

In [166]:
type(b[0])

torch.Tensor

# Metrics

In [167]:
from seqeval.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score

In [None]:
classification_report()

In [None]:
accuracy_score()

In [173]:
from ml_helpers import *

In [None]:
Model_Helper()

In [175]:
a = torch.LongTensor(32,128)

In [176]:
print(a.shape)

torch.Size([32, 128])


In [178]:
print(a.shape)

torch.Size([32, 128])


In [187]:
f = '{:35} : {}'

In [188]:
print(f.format('tensor', a.shape))

tensor                              : torch.Size([32, 128])


In [184]:
print('tensor : ',a.shape)

tensor :  torch.Size([32, 128])


In [212]:
a = torch.LongTensor([0])

In [213]:
a.shape

torch.Size([1])

In [214]:
a

tensor([0])

In [216]:
a.numpy()

array([0])

In [223]:
def parse_index_to_tags(batch_tags):

    return [[index2tags[index] for index in tags] for tags in batch_tags.numpy()]

In [224]:
parse_index_to_tags(a)

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'

In [230]:
a = torch.Tensor(np.ones((32,128,7)))

In [226]:
a = a.to('cuda:0')

In [229]:
a.cpu().numpy()

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)

In [231]:
a.shape

torch.Size([32, 128, 7])

In [232]:
a.argmax(2)

tensor([[6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6],
        ...,
        [6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6],
        [6, 6, 6,  ..., 6, 6, 6]])

In [239]:
outputs = torch.FloatTensor(32,128,8)
target = torch.LongTensor(32,128)

In [240]:
loss = nn.CrossEntropyLoss()
output = loss(outputs.view(-1,8), target.view(-1))
output.backward()

IndexError: Target 140289048331280 is out of bounds.

In [241]:
outputs.view(-1,8).shape

torch.Size([4096, 8])

In [242]:
target.view(-1).shape

torch.Size([4096])

In [248]:
a = torch.LongTensor(32,128,8)

In [244]:
b = []

In [245]:
b.extend(a)

In [247]:
len(b)

32

In [249]:
c = torch.max(a, 2)

In [252]:
c[0]

tensor([[8820700510885670734, 6869205789060890664, 8388363798176071720,
          ..., 4121418380024487980, 8367756544705568800,
         8030883942550741024],
        [6716054724018057517, 8227628345583432736, 8079572507866260000,
          ..., 7883942713142634615, 8319104452899529063,
         8032487034614476659],
        [8391737100187234933, 8391722768054251363, 8459856049076183072,
          ..., 8247626271117287456, 8462656364839138158,
         8391166443842270053],
        ...,
        [8389758648283180576, 8751730065470156393, 8026270154363904061,
          ...,                   0,                   0,
                           0],
        [                  0,                   0,                   0,
          ...,                   0,                   0,
                           0],
        [                  0,                   0,                   0,
          ...,                   0,                   0,
                           0]])

In [253]:
c[1]

tensor([[6, 3, 0,  ..., 2, 1, 0],
        [6, 6, 7,  ..., 4, 7, 7],
        [0, 3, 0,  ..., 6, 2, 7],
        ...,
        [4, 6, 4,  ..., 7, 7, 7],
        [7, 7, 7,  ..., 7, 7, 7],
        [7, 7, 7,  ..., 7, 7, 7]])

In [254]:
d = torch.argmax(a, 2)

In [255]:
d

tensor([[6, 3, 0,  ..., 2, 1, 0],
        [6, 6, 7,  ..., 4, 7, 7],
        [0, 3, 0,  ..., 6, 2, 7],
        ...,
        [4, 6, 4,  ..., 7, 7, 7],
        [7, 7, 7,  ..., 7, 7, 7],
        [7, 7, 7,  ..., 7, 7, 7]])