In [7]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# MeCabをcolabで使えるようにする
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3

In [11]:
# データをDataFrameにまとめる

import os
from glob import glob
import pandas as pd
import linecache
from tqdm import tqdm_notebook as tqdm

drive_dir = "drive/My Drive/Colab Notebooks/"
categories = [name for name in os.listdir(drive_dir + 'text') if os.path.isdir(drive_dir + "text/" +name)]
print(categories)


import pickle

with open(drive_dir + "livedoor_datasets.pickle", 'rb') as f:
  datasets = pickle.load(f)

['it-life-hack', 'kaden-channel', 'livedoor-homme', 'topic-news', 'peachy', 'sports-watch', 'dokujo-tsushin', 'smax', 'movie-enter']


In [9]:
# 形態素解析を定義

import MeCab
import re

tagger = MeCab.Tagger("-Owakati")

def make_wakati(sentence):
    # MeCabで分かち書き
    sentence = tagger.parse(sentence)
    # 半角全角英数字除去
    sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    # 記号もろもろ除去
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
    # スペースで区切って形態素の配列へ
    wakati = sentence.split(" ")
    # 空の要素は削除
    wakati = list(filter(("").__ne__, wakati))
    return wakati

# 単語数取得
word2index = {}
# 系列を揃えるためのパディング
word2index.update({"<pad>":0})

for title in datasets["title"]:
    wakati = make_wakati(title)
    for word in wakati:
        if word in word2index: continue
        word2index[word] = len(word2index)
print("vocab size : ", len(word2index))

vocab size :  13230


In [0]:
# データのバッチ化

from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

cat2index = {}
for cat in categories:
    if cat in cat2index: continue
    cat2index[cat] = len(cat2index)

def sentence2index(sentence):
    wakati = make_wakati(sentence)
    return [word2index[w] for w in wakati]
    
def category2index(cat):
    return [cat2index[cat]]

index_datasets_title_tmp = []
index_datasets_category = []
max_len = 0
for title, category in zip(datasets["title"], datasets["category"]):
  index_title = sentence2index(title)
  index_category = category2index(category)
  index_datasets_title_tmp.append(index_title)
  index_datasets_category.append(index_category)
  if max_len < len(index_title):
    max_len = len(index_title)

# 系列の長さを揃えるために短い系列にパディングを追加
# seq2seqのときみたいに後ろパディングしたらLSTMの順伝搬の結果が全部同じになってしまったので、前パディングにしたらうまくいった
index_datasets_title = []
for title in index_datasets_title_tmp:
  for i in range(max_len - len(title)):
    title.insert(0, 0)
#     title.append(0)
  index_datasets_title.append(title)

  
train_x, test_x, train_y, test_y = train_test_split(index_datasets_title, index_datasets_category, train_size=0.7)



def train2batch(title, category, batch_size=100):
  title_batch = []
  category_batch = []
  title_shuffle, category_shuffle = shuffle(title, category)
  for i in range(0, len(title), batch_size):
    title_batch.append(title_shuffle[i:i+batch_size])
    category_batch.append(category_shuffle[i:i+batch_size])
  return title_batch, category_batch

In [0]:
# モデル定義

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size=100):
        super(LSTMClassifier, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        _, lstm_out = self.lstm(embeds)
        bilstm_out = torch.cat([lstm_out[0][0], lstm_out[0][1]], dim=1)
        tag_space = self.hidden2tag(bilstm_out)
        tag_scores = self.softmax(tag_space.squeeze())
        return tag_scores

# ハイパーパラメータ、損失関数、最適化など
EMBEDDING_DIM = 200
HIDDEN_DIM = 300
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)