In [2]:
import torch

In [6]:
import copy

In [3]:
word_weight = torch.nn.Parameter( torch.Tensor( 100, 100 ) )

In [7]:
org = copy.deepcopy( word_weight)

In [5]:
word_weight.data.data.normal_(0, 0.05)

torch.Size([100, 100])

---

In [None]:
#pip install git+https://github.com/haven-jeon/PyKoSpacing.git
from pykospacing import Spacing
from konlpy.tag import Okt


In [None]:
DL_model = ReviewHAN()

In [None]:
DL_model()

In [4]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlret`rieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

## 0. 패키지 및 함수

In [5]:
#pip install kss
import kss
import nltk
from nltk.tokenize import sent_tokenize
from konlpy.tag import Okt
from gensim.models import Word2Vec
#nltk.download('punkt')
#import seaborn as sns


## 1. 데이터 전처리

In [7]:
#sns.distplot(sentiment_data['review'][:500].apply(lambda x: len(kss.split_sentences(x))))

In [None]:
import random
ind = random.randint(0, 10000)
print(ind)
temp_X = [ okt.morphs(corpus[ind][i]) for i in range(len(corpus[ind])) ]
temp_X = [''.join(temp_X[i]) for i in range(len(temp_X))]
temp_X = [spacing(temp_X[i] ) for i in range(len(temp_X))]
temp_X

- 형태소 분석기로 stem 추출했더니 잘못 나오는 경우가 있다...
- pykospacing 이 완벽한 것은 아님..

In [None]:
sentiment_data = pd.read_excel( 'sentiment_dataset.xlsx')
corpus = sentiment_data['review'].apply(lambda x: kss.split_sentences(x)).tolist()
sentiment_data['polarity'] = sentiment_data['polarity'].replace('negative', 0).replace('positive', 1)
res = process_NVA(corpus)
all_sentences = list(chain(*res))
w2v_model = Word2Vec(sentences = all_sentences, size = 100, window = 3, min_count = 30, workers = 4, sg = 0)
df_in = do_preprocessing(res, sentiment_data, w2v_model)

#### w2v

## 2. 데이터 로더

In [261]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import RandomSampler

In [279]:
class HAN_dataset(Dataset):
    def __init__(self, review_df):
        self.review = review_df.reset_index(drop = True)
        
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        label = self.review.loc[idx, 'polarity']
        w2v = self.review.loc[idx, 'w2v']
        num_words = self.review.loc[idx, 'num_words']
        num_sent = self.review.loc[idx, 'num_sent']
        
        sample = {'w2v': w2v, 
                 'label': label, 
                 'num_sent' : num_sent, 
                 'num_words': num_words}
        return sample

In [920]:
def han_collate_fn(samples):
    labels = [sample['label'] for sample in samples ]
    w2v = [sample['w2v'] for sample in samples]
    doc_lengths = [sample['num_sent'] for sample in samples]
    sent_lengths = [sample['num_words'] for sample in samples]
    
    bsz = len(labels)
    batch_max_doc_length = max(doc_lengths)
    batch_max_sent_length = max( [max(sl) if sl else 0 for sl in sent_lengths])
    docs_tensor = torch.zeros((bsz, batch_max_doc_length, batch_max_sent_length, 100), 
                             dtype = torch.float)
    sent_lengths_tensor = torch.zeros((bsz, batch_max_doc_length)).long()
    
    for doc_idx, doc in enumerate(w2v):
        doc_length = doc_lengths[doc_idx]
        sent_lengths_tensor[doc_idx, :doc_length] = torch.Tensor(sent_lengths[doc_idx])
        for sent_idx, sent in enumerate(doc):
            sent_length = sent_lengths[doc_idx][sent_idx]
            docs_tensor[doc_idx, sent_idx, :sent_length, :] = torch.FloatTensor(sent)
            
    return ( docs_tensor, torch.Tensor(labels), torch.Tensor(doc_lengths), sent_lengths_tensor)

In [921]:
class HanDataLoader(DataLoader):
    def __init__(self, dataset, params_dict, shuffle = True):
        self.n_samples = len(dataset)
        self.init_kwargs = {
            'dataset': dataset, 
            'batch_size' : params_dict['batch_size'],
            'collate_fn' : han_collate_fn, 
            'shuffle': shuffle
        }
        super().__init__(**self.init_kwargs)

In [9]:
import matplotlib
from IPython.display import HTML

import torch


In [898]:
def map_sentence_to_color(words, scores, sent_score):
    sentencemap = matplotlib.cm.get_cmap('Blues')
    wordmap = matplotlib.cm.get_cmap('PuBu')
#     result = '<p><span style="margin:30px; padding:5px; background-color: {}">'\
#         .format(matplotlib.colors.rgb2hex(sentencemap(sent_score))[:3])
    result = '<p><span style="margin:1px; padding:2px; background-color: {}">'\
       .format(matplotlib.colors.rgb2hex(sentencemap(sent_score)[:3]))
    template = '<span class = "barcode"; style ="color: black; background-color: {}">{}</span>'
    for word, score in zip(words, scores):
        color = matplotlib.colors.rgb2hex(wordmap(score)[:3])
        result += template.format(color, '&nbsp' + word + '&nbsp')
    result += '</span><p>'
    return result

In [588]:
def get_test_sample(df_test, idx):
    orig_doc = df_test.loc[idx, 'tokenized']
    doc, num_sents, num_words = df_test.loc[idx, 'w2v'], df_test.loc[idx, 'num_sent'], df_test.loc[idx, 'num_words']
    ground_truth = df_test.loc[idx, 'polarity']
    samples = [{'w2v': doc, 
             'label': ground_truth, 
             'num_sent' : num_sents, 
             'num_words': num_words}]
    docs_tensor, labels, doc_lengths, sent_lengths = han_collate_fn(samples)
    
    x_s = (docs_tensor, doc_lengths, sent_lengths )
    return orig_doc, x_s

In [568]:
han_encoder = ReviewHAN()

In [977]:
ind = random.randint(0, len(df_test)-1)
result = visualize_att(han_encoder, df_test, ind)

In [919]:
display(HTML(result))

In [929]:
ind = random.randint(0, len(df_test) )
print(ind)
orig_doc, x_s = get_test_sample(df_test, ind)
docs, doc_lengths, sent_lengths = x_s
print(orig_doc)
v, a_it, a_i = han_encoder(docs, doc_lengths, sent_lengths)
print('attention_sentence: ', a_i.data.tolist())
print('attention_word: ', a_it.data.tolist())

words = orig_doc
sent_score = a_i.tolist()[0]
word_score = a_it.tolist()[0]
result = "<h2>Attention Visualization</h2>"
for sent, word_att, sent_att in zip(words, word_score, sent_score):
    result += map_sentence_to_color( sent, word_att, sent_att)
    

display(HTML(result))

594
[['품절', '대란', '워', '색상', '발색', '감촉', '좋아요'], ['저', '커피잔', '안', '묻어나는', '틴트', '찾고', '있는데', '이건', '묻어나요']]
attention_sentence:  [[0.457771897315979, 0.542228102684021]]
attention_word:  [[[0.14150747656822205, 0.15101945400238037, 0.14407497644424438, 0.14840368926525116, 0.1267567127943039, 0.15417300164699554, 0.13406464457511902, 0.0, 0.0], [0.11969892680644989, 0.10887161642313004, 0.10335748642683029, 0.11200614273548126, 0.11016161739826202, 0.10936056822538376, 0.11279682070016861, 0.11514297872781754, 0.10860376060009003]]]


In [582]:
from IPython.core.display import display, HTML

In [566]:
with open('example.html', 'w') as f:
    f.write(result)


In [487]:
han_encoder = ReviewHAN().to('cuda:0')
x_s = get_x_s(sample_x_s)
docs, doc_lengths, sent_lengths = x_s
v, a_it, a_i = han_encoder(docs, doc_lengths, sent_lengths)

In [492]:
a_it.shape

torch.Size([4, 3, 14])

In [13]:
params_dict = {}
params_dict['batch_size'] = 4
df_train, df_test = train_test_split(df_in)
dl1 = HanDataLoader(HAN_dataset(df_train), params_dict)
#han_dat = HAN_dataset(df_in)
sample_x_s = iter(dl1).__next__()


NameError: name 'train_test_split' is not defined

NameError: name 'df_in' is not defined

In [10]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence


In [11]:
embed_dim = 100
word_gru_h_dim = 100
word_gru_n_layers = 2
sent_gru_h_dim = 100
sent_gru_n_layers = 2
word_att_dim = 200
sent_att_dim = 200
dropval = 0.2
dropgru_s = 0.2
dropgru_w = 0.2

sent_gru = nn.GRU( 2 * word_gru_h_dim, sent_gru_h_dim, 
                                num_layers = sent_gru_n_layers, batch_first = True,
                                bidirectional = True, dropout = dropgru_s)
sent_layer_norm = nn.LayerNorm( 2 * sent_gru_h_dim, elementwise_affine= True)
sent_attention = nn.Linear(2 * sent_gru_h_dim, sent_att_dim)
sentence_context_vector = nn.Linear(sent_att_dim, 1, bias = False)

        # word
word_gru = nn.GRU(embed_dim, word_gru_h_dim, num_layers = word_gru_n_layers, 
                              batch_first = True, bidirectional = True, dropout = dropgru_w)
word_layer_norm = nn.LayerNorm( 2*word_gru_h_dim, elementwise_affine=True)
word_attention = nn.Linear( 2 * word_gru_h_dim, word_att_dim)
word_context_vector = nn.Linear(word_att_dim, 1, bias = False)

In [12]:
#x_s = get_x_s(sample_x_s)
#docs, doc_lengths, sent_lengths = x_s
docs, doc_lengths, sent_lengths = sample_x_s[0], sample_x_s[2], sample_x_s[3]

NameError: name 'sample_x_s' is not defined

In [459]:
#1. Packing
## 1-1 reorder
doc_lengths, doc_perm_idx = doc_lengths.sort(dim = 0, descending = True)
docs = docs[doc_perm_idx]
sent_lengths = sent_lengths[doc_perm_idx]

## 1-2 packing
packed_sents = pack_padded_sequence(docs, lengths=doc_lengths.tolist(), batch_first = True)
packed_sent_lengths = pack_padded_sequence( sent_lengths, lengths= doc_lengths.tolist(), 
                                          batch_first=True)
valid_bsz_sent = packed_sents.batch_sizes

# 2. Word Attention
## 2-1. packing input data
sents, sent_lengths = packed_sents.data, packed_sent_lengths.data
# reorder
sent_lengths, sent_perm_idx = sent_lengths.sort(dim = 0, descending = True)
sents = sents[sent_perm_idx]

# embedding done already, do dropout
#sents = self.Dropout(sents)
packed_words = pack_padded_sequence( sents, lengths = sent_lengths.tolist(), batch_first=True)
valid_bsz_word = packed_words.batch_sizes

##2-2 NN
# hidden layer
h_it, _ = word_gru( packed_words )
h_it_normed = word_layer_norm(h_it.data)
h_it_pad, _ = pad_packed_sequence ( h_it, batch_first = True )
# attention module
u_it = torch.tanh( word_attention( h_it_normed.data ))
u_it_cv = word_context_vector( u_it ).squeeze(1)

In [460]:
# attention weights
a_it_exp = torch.exp( u_it_cv - u_it_cv.max() )
a_it_exp_pad, _ = pad_packed_sequence( PackedSequence( a_it_exp, valid_bsz_word), 
                                     batch_first= True)
a_it = a_it_exp_pad / torch.sum( a_it_exp_pad, dim = 1, keepdim = True)
# output
s_i = (h_it_pad * a_it.unsqueeze(2)).sum(dim = 1)

In [461]:
## 2-3 reorder
_, sent_unperm_idx = sent_perm_idx.sort(dim = 0, descending = False)
s_i = s_i[sent_unperm_idx] 
a_it = a_it[sent_unperm_idx] 

# 3. Sentence Attention
sents, word_att_weights = s_i, a_it
#sents = self.Dropout(sents)

# 3-1 NN
# hidden layer
h_i, _ = sent_gru(PackedSequence(sents, valid_bsz_sent))
h_i_normed = sent_layer_norm( h_i.data )
h_i_pad, _ = pad_packed_sequence( h_i, batch_first = True )

In [462]:
# attention module
u_i = torch.tanh( sent_attention( h_i_normed.data ))
u_i_cv = sentence_context_vector(u_i).squeeze(1)
# attention weights
a_i_exp = torch.exp( u_i_cv - u_i_cv.max() )
a_i_exp_pad, _ = pad_packed_sequence( PackedSequence(a_i_exp, valid_bsz_sent), 
                                    batch_first = True )
sent_att_weights = a_i_exp_pad / torch.sum( a_i_exp_pad, dim = 1, keepdim = True)

In [463]:
# document vector
v = ( h_i_pad * sent_att_weights.unsqueeze(2)).sum(dim = 1)

# 3-2 reorder
word_att_weights, _ = pad_packed_sequence( PackedSequence( word_att_weights, valid_bsz_sent), 
                                         batch_first = True)
_, doc_unperm_idx = doc_perm_idx.sort(dim = 0, descending = False)

# 4. Final Output
v = v[doc_unperm_idx] 
a_it = word_att_weights[ doc_unperm_idx ] 
a_i = sent_att_weights[ doc_unperm_idx ]

## 3. 학습

In [None]:
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

In [None]:
train_data['document'].nunique(), train_data['label'].nunique()
train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거

stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# 형태소 분석기 OKT를 사용한 토큰화 작업 (다소 시간 소요)
okt = Okt()
tokenized_data = []
for sentence in train_data['document'][:10000]:
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    tokenized_data.append(temp_X)


In [None]:
model = Word2Vec(sentences = tokenized_data, size = 100, window = 5, min_count = 5, workers = 4, sg = 0)


In [None]:
sample_dat = train_data.values[[220]][0][1]

In [None]:
model.wv.vectors.shape

In [None]:
import numpy as np

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
sent_tokeni

In [None]:
okt.morphs(sample_dat)

In [None]:
w2v = []
for element in okt.morphs(sample_dat):
    try:
        w2v.append( model[element] )
    except:
        w2v.append( np.zeros(100) )

In [None]:
w2v

In [None]:
w2v.astype(np.float32)

In [None]:
# 220, 163

In [None]:
class ReviewDataset(Dataset)

In [None]:
sample_dat 

In [None]:
for i, cont in enumerate(train_data.values):
    print(i, cont)
#train_data.values[[10]][0][1]