In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# !mv text drive/My\ Drive/Colab\ Notebooks/

In [0]:
# MeCabをcolabで使えるようにする
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3

In [0]:
# データをDataFrameにまとめる

import os
from glob import glob
import pandas as pd
import linecache
from tqdm import tqdm_notebook as tqdm

drive_dir = "drive/My Drive/Colab Notebooks/"

# カテゴリを配列で取得
categories = [name for name in os.listdir(drive_dir + 'text') if os.path.isdir(drive_dir + "text/" +name)]
print(categories)

import pickle

with open(drive_dir + "livedoor_datasets.pickle", 'rb') as f:
  datasets = pickle.load(f)
  
# row = []
# for cat in tqdm(categories):
#     path = drive_dir + "text/" + cat + "/*.txt"
#     files = glob(path)
#     for text_name in tqdm(files):
#         title = linecache.getline(text_name, 3)
#         tmp = [title, cat]
#         row.append(tmp)
# #         s = pd.Series([title, cat], index=datasets.columns)
# #         datasets = datasets.append(s, ignore_index=True)
# datasets = pd.DataFrame(row, columns=["title", "category"])

['it-life-hack', 'kaden-channel', 'livedoor-homme', 'topic-news', 'peachy', 'sports-watch', 'dokujo-tsushin', 'smax', 'movie-enter']


In [0]:
# データ確認
datasets = datasets.sample(frac=1).reset_index(drop=True)
datasets.head()

Unnamed: 0,title,category
0,【話題】市場はスマホへ？　ドコモが人気の従来型携帯廃止を発表\n,kaden-channel
1,【ニュース】日本上陸で話題の動画配信サービスHuluがブラビアでも視聴可能に\n,kaden-channel
2,好きなところを温めてほぐすマッサージ器は車載もOK!【売れ筋チェック】\n,kaden-channel
3,竜巻被害を報じるフジ報道番組で、安藤アナの笑顔が映る放送事故\n,topic-news
4,JYJユチョン主演のスリリングな恋愛ドラマ『ミス・リプリー』レンタル開始決定\n,peachy


In [0]:
# 形態素解析を定義

import MeCab
import re

tagger = MeCab.Tagger("-Owakati")

def make_wakati(sentence):
    # MeCabで分かち書き
    sentence = tagger.parse(sentence)
    # 半角全角英数字除去
    sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    # 記号もろもろ除去
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
    # スペースで区切って形態素の配列へ
    wakati = sentence.split(" ")
    # 空の要素は削除
    wakati = list(filter(("").__ne__, wakati))
    return wakati

# 単語数取得
word2index = {}
# 系列を揃えるためのパディング
word2index.update({"<pad>":0})

for title in datasets["title"]:
    wakati = make_wakati(title)
    for word in wakati:
        if word in word2index: continue
        word2index[word] = len(word2index)
print("vocab size : ", len(word2index))

vocab size :  13230


In [0]:
# データのバッチ化

from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

cat2index = {}
for cat in categories:
    if cat in cat2index: continue
    cat2index[cat] = len(cat2index)

def sentence2index(sentence):
    wakati = make_wakati(sentence)
    return [word2index[w] for w in wakati]
    
def category2index(cat):
    return [cat2index[cat]]

index_datasets_title_tmp = []
index_datasets_category = []
max_len = 0
for title, category in zip(datasets["title"], datasets["category"]):
  index_title = sentence2index(title)
  index_category = category2index(category)
  index_datasets_title_tmp.append(index_title)
  index_datasets_category.append(index_category)
  if max_len < len(index_title):
    max_len = len(index_title)

# 系列の長さを揃えるために短い系列にパディングを追加
# seq2seqのときみたいに後ろパディングしたらLSTMの順伝搬の結果が全部同じになってしまったので、前パディングにしたらうまくいった
index_datasets_title = []
for title in index_datasets_title_tmp:
  for i in range(max_len - len(title)):
    title.insert(0, 0)
#     title.append(0)
  index_datasets_title.append(title)

  
train_x, test_x, train_y, test_y = train_test_split(index_datasets_title, index_datasets_category, train_size=0.7)



def train2batch(title, category, batch_size=100):
  title_batch = []
  category_batch = []
  title_shuffle, category_shuffle = shuffle(title, category)
  for i in range(0, len(title), batch_size):
    title_batch.append(title_shuffle[i:i+batch_size])
    category_batch.append(category_shuffle[i:i+batch_size])
  return title_batch, category_batch

In [0]:
# モデル定義

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size=100):
        super(LSTMClassifier, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, dropout=0.5)
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        _, lstm_out = self.lstm(embeds)
        bilstm_out = torch.cat([lstm_out[0][0], lstm_out[0][1]], dim=1)
        tag_space = self.hidden2tag(bilstm_out)
        tag_scores = self.softmax(tag_space.squeeze())
        return tag_scores

# ハイパーパラメータ、損失関数、最適化など
EMBEDDING_DIM = 200
HIDDEN_DIM = 300
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

  "num_layers={}".format(dropout, num_layers))


In [0]:
losses = []
for epoch in range(100):
    all_loss = 0
    title_batch, category_batch = train2batch(train_x, train_y)
    for i in range(len(title_batch)):
        batch_loss = 0

        model.zero_grad()

        title_tensor = torch.tensor(title_batch[i], device=device)
        category_tensor = torch.tensor(category_batch[i], device=device).squeeze()
        out = model(title_tensor)

        batch_loss = loss_function(out, category_tensor)
        batch_loss.backward()
        optimizer.step()

        all_loss += batch_loss.item()
    print("epoch", epoch, "\t" , "loss", all_loss)
    if all_loss < 0.1: break
print("done.")



epoch 0 	 loss 86.37613916397095
epoch 1 	 loss 55.229271829128265
epoch 2 	 loss 37.06739562749863
epoch 3 	 loss 23.231920212507248
epoch 4 	 loss 12.926898941397667
epoch 5 	 loss 6.392280258238316
epoch 6 	 loss 4.309775417670608
epoch 7 	 loss 2.9130836687982082
epoch 8 	 loss 1.4029346471652389
epoch 9 	 loss 0.6516141341999173
epoch 10 	 loss 0.40866265119984746
epoch 11 	 loss 0.33226508961524814
epoch 12 	 loss 0.29431027243845165
epoch 13 	 loss 0.2709219478419982
epoch 14 	 loss 0.2584600468399003
epoch 15 	 loss 0.24211731820832938
epoch 16 	 loss 0.23587584507185966
epoch 17 	 loss 0.2299350556859281
epoch 18 	 loss 0.2202752007287927
epoch 19 	 loss 0.219766568421619
epoch 20 	 loss 0.2107653882703744
epoch 21 	 loss 0.21127265034010634
epoch 22 	 loss 0.20408812331152149
epoch 23 	 loss 0.20183917827671394
epoch 24 	 loss 0.21720245279720984
epoch 25 	 loss 0.2130082876101369
epoch 26 	 loss 0.1950056822533952
epoch 27 	 loss 0.1972068732138723
epoch 28 	 loss 0.19494405

In [0]:
test_num = len(test_x)
a = 0
with torch.no_grad():
    title_batch, category_batch = train2batch(test_x, test_y)
    
    for i in range(len(title_batch)):
        title_tensor = torch.tensor(title_batch[i], device=device)
        category_tensor = torch.tensor(category_batch[i], device=device)
        
        out = model(title_tensor)
        _, predicts = torch.max(out, 1)
        for j, ans in enumerate(category_tensor):
            if predicts[j].item() == ans.item():
                a += 1
print("predict : ", a / test_num)



predict :  0.6773610483506552
