In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# # MeCabをcolabで使えるようにする
# !apt install aptitude
# !aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# !pip install mecab-python3

In [0]:
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import FastText
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from torch.autograd import Variable

import pickle

# import MeCab
import re, os
from glob import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import linecache
import pandas as pd

In [0]:
drive_dir = "drive/My Drive/Colab Notebooks/"
livedoor_data_dir = drive_dir + "livedoor_data/"
word_embedding_dir = drive_dir + "word_embedding_models/"

tagger = MeCab.Tagger("-Owakati")

def make_wakati(sentence):
    sentence = tagger.parse(sentence)
    sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
    wakati = sentence.split(" ")
    wakati = list(filter(("").__ne__, wakati))
    return wakati

categories = [name for name in os.listdir(livedoor_data_dir + 'text') if os.path.isdir(livedoor_data_dir + "text/" +name)]
print(categories)

# row = []
# for cat in categories:
#     path = livedoor_data_dir + "text/" + cat + "/*.txt"
#     files = glob(path)
#     for text_name in tqdm(files):
#         title = linecache.getline(text_name, 3)
#         tmp = [title, categories.index(cat)]
#         row.append(tmp)
# livedoor_df = pd.DataFrame(row, columns=["title", "category"])

# with open(livedoor_data_dir + "livedoor_datasets.pickle", 'wb') as w:
#     pickle.dump(livedoor_df, w)

with open(livedoor_data_dir + "livedoor_datasets.pickle", 'rb') as f:
  livedoor_df = pickle.load(f)

cat_idx = []
for row in livedoor_df.iterrows():
  cat = row[1]['category']
  idx = categories.index(cat)
  cat_idx.append(idx)

join_df = pd.DataFrame(cat_idx, columns=['category_id'])
livedoor_df = pd.concat([livedoor_df, join_df], axis=1)

train_df, test_df = train_test_split(livedoor_df[['title', 'category_id']], train_size=0.7)

train_df.to_csv(livedoor_data_dir + 'train_ja.tsv', sep="\t", index=False, header=False)
test_df.to_csv(livedoor_data_dir + 'test_ja.tsv', sep="\t", index=False, header=False)

['it-life-hack', 'kaden-channel', 'livedoor-homme', 'topic-news', 'peachy', 'sports-watch', 'dokujo-tsushin', 'smax', 'movie-enter']


In [0]:
TEXT = data.Field(sequential=True, tokenize=make_wakati, lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False)

train, test = data.TabularDataset.splits(
        path=livedoor_data_dir, train='train_ja.tsv', test='test_ja.tsv', format='tsv',
        fields=[('Text', TEXT), ('Label', LABEL)])

print(vars(train[0]))

{'Text': ['コス', 'プレイヤー', 'の', '応募', '作品', 'を', 'そのまま', 'アプリ', 'に', 'コスプレフォト', 'の', '世界', 'が', '楽しめる', 'コス', 'プレ', '時計', 'アプリ', 'アプリ'], 'Label': '7'}


In [0]:
japanese_fasttext_vectors = Vectors(name=word_embedding_dir + "wiki.ja.vec")
TEXT.build_vocab(train, vectors=japanese_fasttext_vectors, min_freq=1)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, test_iter = data.Iterator.splits((train, test), batch_sizes=(100, 100), device=device, repeat=False,sort=False)

In [0]:
class AttentionLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tag_size):
        super(AttentionLSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.embed.weight.data.copy_(TEXT.vocab.vectors)
        self.embed.requred_grad_ = False
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tag_size)
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        embeds = self.embed(sentence)
        out, hs = self.gru(embeds)

        s = torch.bmm(out, torch.transpose(out,1,2))
        attention_weight = self.softmax(s)
        c = torch.zeros(out.size()[0], 1, self.hidden_dim, device=device)
        for i in range(attention_weight.size()[2]):
          unsq_weight = attention_weight[:,:,i].unsqueeze(2)
          weighted_hs = out * unsq_weight
          weight_sum = torch.sum(weighted_hs, axis=1).unsqueeze(1)
          c = torch.cat([c, weight_sum], dim=1)
        c = c[:,1:,:]
        c = c.sum(dim=1)

        tag_space = self.hidden2tag(c)
        tag_score = self.log_softmax(tag_space)

        return tag_score, attention_weight


EMBEDDING_DIM = TEXT.vocab.vectors.size()[1]
HIDDEN_DIM = 128
VOCAB_SIZE = TEXT.vocab.vectors.size()[0]
TAG_SIZE = len(categories)
# to(device)でモデルがGPU対応する
model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [0]:
losses = []
for epoch in range(100):
    all_loss = 0
    correct = 0
    train_num = 0
    for idx, batch in enumerate(train_iter):
        batch_loss = 0

        model.zero_grad()
        title_tensor = batch.Text[0]
        category_tensor = batch.Label

        score, hs = model(title_tensor)

        batch_loss = loss_function(score, category_tensor)
        batch_loss.backward()
        optimizer.step()
        
        _, predicts = torch.max(score, 1)
        for j, ans in enumerate(category_tensor):
            if predicts[j].item() == ans.item():
                correct += 1
        train_num += category_tensor.size()[0]


        all_loss += batch_loss.item()
    print("epoch", epoch, "\t" , "loss", all_loss, "\t", "acc", correct / train_num)
    if all_loss < 0.1: break
print("done.")

epoch 0 	 loss 204.69387102127075 	 acc 0.1654077086964943
epoch 1 	 loss 102.4802680015564 	 acc 0.2775518109626186
epoch 2 	 loss 91.43012070655823 	 acc 0.39105171411969786
epoch 3 	 loss 76.30656278133392 	 acc 0.5186906837110207
epoch 4 	 loss 59.62267470359802 	 acc 0.6292852992446252
epoch 5 	 loss 46.011005878448486 	 acc 0.705210149138098
epoch 6 	 loss 32.0744503736496 	 acc 0.7917877203176448
epoch 7 	 loss 23.487269461154938 	 acc 0.8562851055587837
epoch 8 	 loss 19.94648975133896 	 acc 0.8723610304086771
epoch 9 	 loss 13.742605410516262 	 acc 0.9178772031764478
epoch 10 	 loss 8.967100329697132 	 acc 0.9486732519852799
epoch 11 	 loss 10.866885278373957 	 acc 0.9304667828781716
epoch 12 	 loss 6.58900348842144 	 acc 0.9631996901026535
epoch 13 	 loss 3.450178973376751 	 acc 0.9843114468332365
epoch 14 	 loss 2.4140785094350576 	 acc 0.9908967654464459
epoch 15 	 loss 1.7827170873060822 	 acc 0.9941894247530505
epoch 16 	 loss 2.6162975020706654 	 acc 0.9877977919814062
e

KeyboardInterrupt: ignored

In [0]:
test_num = len(test_df)
a = 0
with torch.no_grad():
    # title_batch, category_batch = train2batch(test_x, test_y)

    # for i in range(len(title_batch)):
    for idx, batch in enumerate(test_iter):
        title_tensor = batch.Text[0] #torch.tensor(title_batch[i], device=device)
        category_tensor = batch.Label #torch.tensor(category_batch[i], device=device)

        score, hs = model(title_tensor)
        _, predicts = torch.max(score, 1)
        for j, ans in enumerate(category_tensor):
            if predicts[j].item() == ans.item():
                a += 1
print("predict : ", a / test_num)

predict :  0.6398553999096249
