# Imports

In [36]:
import pandas as pd
import numpy as np
import codecs

import MeCab
import nltk
from nltk.stem.porter import PorterStemmer
import neologdn
from bs4 import BeautifulSoup
import re

from gensim.models import Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from keras import backend as K
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
from keras_preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras import regularizers
from keras.layers import Input, Concatenate, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization, Activation, Dropout, LSTM, Bidirectional
from keras.layers import CuDNNLSTM, Conv2D, MaxPool2D
from keras_preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.constraints import unit_norm 
from keras.layers import GaussianNoise
from keras.models import load_model
import keras

from AdamW_Keras.AdamW import AdamW

import lightgbm as lgb
from sklearn import model_selection
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support


from annoy import AnnoyIndex

# Load Data

In [37]:
#seriesのid, content, genreの情報を読み込み -> np.array
series_path = '../series-20190310.csv'
with codecs.open(series_path, 'r', 'utf-8', 'ignore') as f:
    series = pd.read_csv(f)
series_id = series['series_id'].values
series_content = series['series_content'].values
genre_id = series['genre_id'].values

# Global Variables

In [38]:
MAX_SEQUENCE_LENGTH = 300
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.3

In [39]:
DROPOUT_RATE = 0.2

# Helper Functions

In [40]:
stopwords = []
with codecs.open("../Japanese.txt", 'r', 'utf-8', 'ignore') as fp:
    stopwords = fp.read().splitlines() 

In [41]:
def analyzer(text):
    tagger = MeCab.Tagger('')
    CONTENT_WORD_POS = ('名詞', '動詞', '形容詞', '副詞')
    stemmer = PorterStemmer()
    jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^　「」！？。]*[！？。]')
    
    
    def normalize_text(s):
        
        def maketrans(f, t):
            return {ord(x): ord(y) for x, y in zip(f, t)}

        
        normalized_text = neologdn.normalize(s)
        soup = BeautifulSoup(normalized_text)
        cleaned_text = soup.get_text()
        
        cleaned_text = re.sub(r'http\S+', '', cleaned_text)
        cleaned_text = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', cleaned_text)
        cleaned_text = re.sub('[﹣－ｰ—―─━ー]+', 'ー', cleaned_text)
        cleaned_text = re.sub('[~∼∾〜〰～]', '', cleaned_text)
        cleaned_text = re.sub('[0-9]', '', cleaned_text)
        cleaned_text = cleaned_text.translate(maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣','！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))
        cleaned_text = re.sub(re.compile("[!-/:-@[-`{-~]"), '', cleaned_text)
        cleaned_text = re.sub(re.compile('[!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣]'), '', cleaned_text)
        cleaned_text = re.sub(re.compile('[■□◆◇◯“…【】『』！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」]'), '', cleaned_text)
        cleaned_text = re.sub('[’]', '\'', cleaned_text)
        cleaned_text = re.sub('[”]', '"', cleaned_text)
        cleaned_text = cleaned_text.replace('\n','')
        cleaned_text = cleaned_text.replace('\r','')
        
        return cleaned_text
            
    
    def tokenize(s):
        lines = tagger.parse(s).splitlines()[:-1]
        words = []
        for line in lines:
            if line == '':
                continue
            else:
                try:
                    surface, feature = line.split('\t')
                    if feature.startswith(CONTENT_WORD_POS) and ',非自立,' not in feature:
                        words.append(surface)

                except:
                    print(line)
                
        return words
    
    def sent_tokenize(s):
        sentences = jp_sent_tokenizer.tokenize(s)
        return sentences
    
    def lemmatize(words):
        
        def isEnglish(s):
                try:
                    s.encode(encoding='utf-8').decode('ascii')
                except UnicodeDecodeError:
                    return False
                else:
                    return True
                
        lemmas = []
        for word in cleaned_words:
            if not isEnglish(word):
                try:
                    for line in tagger.parse(word).splitlines()[:-1]:
                        surface, feature = line.split('\t')
                        if feature.split(',')[6] != '*':
                            lemmas.append(feature.split(',')[6])
                            
                except:
                    print(word)
            else:
                lemmas.append(word)
                    
        return lemmas

    cleaned_text = normalize_text(text)
    words = tokenize(cleaned_text)
    cleaned_words = list(filter(lambda x: x not in stopwords, words))
    cleaned_words =  list(filter(None, cleaned_words))
    lemmas = lemmatize(cleaned_words)
    
    return lemmas

In [42]:
def evaluate_performance(X, y):
    
    clf = LogisticRegression()
    
    scoring = {"p": "precision_macro",
               "r": "recall_macro",
               "f":"f1_macro",
               "a": "accuracy"
              }
    
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    
    results = model_selection.cross_validate(clf,
                                             X,
                                             y,
                                             scoring=scoring,
                                             cv=cv,
                                             return_train_score=True
                                            )
    return(results)

# Preprocessing

In [43]:
corpus = [analyzer(text) for text in series_content]

In [44]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(corpus)

In [45]:
sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [46]:
enc = LabelEncoder().fit(genre_id)
tmp_labels = enc.transform(genre_id)

In [47]:
targets = np.array(tmp_labels).reshape(-1)
labels = np.eye(17)[targets]

In [48]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]

In [49]:
labels = labels[indices]

In [50]:
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [51]:
macronum = len(set(tmp_labels))

In [52]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
# x_train = data
# y_train = labels
# x_val = x_train
# y_val = y_train

# Word2Vec

In [53]:
features_num = 300
min_word_count = 2
context = 15
downsampling = 1e-3
epoch_num = 100

In [54]:
word2vec = Word2Vec(sentences=corpus,
                    iter=epoch_num,
                    size=features_num,
                    min_count=min_word_count,
                    window=context,
                    sample=downsampling
                   )

# Initial Embeddings from Word2Vec

In [55]:
embeddings_index = {}
words = word2vec.wv.vocab

for word in words:
    embeddings_index[word] = word2vec[word]

  """


In [56]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Model

In [57]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True
                           )

In [58]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

batch_norm = BatchNormalization()(embedded_sequences)


l_lstm = Bidirectional(CuDNNLSTM(150,
                            kernel_regularizer=regularizers.l2(1e-6),
                            recurrent_regularizer=regularizers.l2(1e-6),
                           ), name='dense_encoding')(batch_norm)

l_lstm = Dropout(DROPOUT_RATE)(l_lstm)

output = Dense(macronum, activation='softmax')(l_lstm)

model = Model(sequence_input, output)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 300)          8103000   
_________________________________________________________________
batch_normalization_2 (Batch (None, 300, 300)          1200      
_________________________________________________________________
dense_encoding (Bidirectiona (None, 300)               542400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 17)                5117      
Total params: 8,651,717
Trainable params: 8,651,117
Non-trainable params: 600
________________________________________________________________

In [59]:
optimizer = keras.optimizers.adam(decay=1e-6)

In [60]:
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

In [61]:
model_filepath= "./models/model-08-best.h5"

In [62]:
checkpoint = ModelCheckpoint(model_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [63]:
callbacks_list = [checkpoint]

In [64]:
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, batch_size=300, callbacks=callbacks_list, verbose=1)

Train on 5051 samples, validate on 2164 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.44177, saving model to ./models/model-08-best.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.44177 to 0.54760, saving model to ./models/model-08-best.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.54760 to 0.61091, saving model to ./models/model-08-best.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.61091 to 0.66128, saving model to ./models/model-08-best.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.66128 to 0.68900, saving model to ./models/model-08-best.h5
Epoch 6/100

Epoch 00006: val_acc improved from 0.68900 to 0.72043, saving model to ./models/model-08-best.h5
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.72043
Epoch 8/100

Epoch 00008: val_acc improved from 0.72043 to 0.72597, saving model to ./models/model-08-best.h5
Epoch 9/100

Epoch 00009: val_acc improved from 0.72597 to 0.74168, saving model to ./models/model-08-best.h5
Epoch 10/100


Epoch 00040: val_acc did not improve from 0.76479
Epoch 41/100

Epoch 00041: val_acc did not improve from 0.76479
Epoch 42/100

Epoch 00042: val_acc did not improve from 0.76479
Epoch 43/100

Epoch 00043: val_acc did not improve from 0.76479
Epoch 44/100

Epoch 00044: val_acc did not improve from 0.76479
Epoch 45/100

Epoch 00045: val_acc did not improve from 0.76479
Epoch 46/100

Epoch 00046: val_acc did not improve from 0.76479
Epoch 47/100

Epoch 00047: val_acc did not improve from 0.76479
Epoch 48/100

Epoch 00048: val_acc did not improve from 0.76479
Epoch 49/100

Epoch 00049: val_acc did not improve from 0.76479
Epoch 50/100

Epoch 00050: val_acc did not improve from 0.76479
Epoch 51/100

Epoch 00051: val_acc did not improve from 0.76479
Epoch 52/100

Epoch 00052: val_acc did not improve from 0.76479
Epoch 53/100

Epoch 00053: val_acc did not improve from 0.76479
Epoch 54/100

Epoch 00054: val_acc did not improve from 0.76479
Epoch 55/100

Epoch 00055: val_acc did not improve fr

<keras.callbacks.History at 0x7f7d441e7780>

# Model Evaluation

In [65]:
model = load_model('./models/model-07-best.h5')

In [66]:
eval_sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index
eval_data = pad_sequences(eval_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [67]:
eval_y = genre_id

In [68]:
vectorizer = Model(inputs=model.input, outputs=model.get_layer('dense_encoding').output)

In [69]:
eval_x = vectorizer.predict(eval_data)

ValueError: Error when checking input: expected input_1 to have shape (1000,) but got array with shape (300,)

In [None]:
np.mean(evaluate_performance(eval_x, eval_y)['test_f'])

# Nearest Neighbors Search

In [None]:
annoy_index = AnnoyIndex(300, metric='angular')

In [None]:
for i, vector in enumerate(eval_x):
    annoy_index.add_item(i, vector)

In [None]:
annoy_index.build(100)

In [None]:
def get_nearest_neighbors(index, n):
    print("Original series:\n{0}\n\n".format(series_content[index]))
    retrieved_series = annoy_index.get_nns_by_vector(eval_x[index], n)
    for i, series_index in enumerate(retrieved_series):
        print("Top {0} Nearest Series:\n{1}\n Genre: {2}\n\n".format(i,series_content[series_index], genre_id[series_index]))

In [None]:
get_nearest_neighbors(550,100)