# livedoorニュースの分類
## MeCabのインストール

In [None]:
# Install the MeCab system packages (plus download/build utilities) with apt.
!apt install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Install the MeCab Python binding, pinned to version 0.7.
!pip install mecab-python3==0.7

## データファイルのダウンロードと解凍

In [None]:
import tarfile 
from urllib.request import urlretrieve

# Where the corpus archive lives, where to save it, and where to unpack it.
FILE_URL = 'https://www.rondhuit.com/download/ldcc-20140209.tar.gz'
FILE_PATH = '/content/ldcc-20140209.tar.gz'
EXTRACT_DIR = '/content'

# Download the archive to FILE_PATH.
urlretrieve(FILE_URL, FILE_PATH)

mode = "r:gz"
# The context manager closes the archive even if extraction raises, so the
# file handle is never leaked.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a source you trust.
with tarfile.open(FILE_PATH, mode) as tar:
    tar.extractall(EXTRACT_DIR)

In [None]:
# List the contents of the extracted directory as a quick sanity check.
!ls -l /content/text

## 記事ファイルの読み込み

In [None]:
import os
import pandas as pd

# Root of the extracted corpus: one sub-directory per category.
base_dir = '/content/text'

# Collect each field in a plain list and build the Series once at the end.
# Growing an empty pd.Series one element at a time with .at re-allocates on
# every insert and triggers the "empty Series dtype" deprecation warning.
categories = []
urls = []
times_published = []
titles = []
texts = []

# sorted() makes the row order reproducible; os.listdir() order is arbitrary.
for name in sorted(os.listdir(base_dir)):
    category_dir = os.path.join(base_dir, name)
    # Only sub-directories are treated as categories; plain files are skipped.
    if not os.path.isdir(category_dir):
        continue
    for file in sorted(os.listdir(category_dir)):
        # Each category directory carries a licence file that is not an article.
        if file == 'LICENSE.txt':
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # (assumes the corpus files are UTF-8 -- TODO confirm).
        with open(os.path.join(category_dir, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # File layout: line 0 = URL, line 1 = timestamp, line 2 = title,
        # remaining lines = article body.
        categories.append(name)
        urls.append(lines[0].rstrip())
        times_published.append(lines[1].rstrip())
        titles.append(lines[2].rstrip())
        texts.append(''.join(lines[3:]))

# Same variable names and Series names as before, so later cells are unaffected.
category = pd.Series(categories, name='category', dtype=object)
url = pd.Series(urls, name='url', dtype=object)
time_published = pd.Series(times_published, name='time_published', dtype=object)
title = pd.Series(titles, name='title', dtype=object)
text = pd.Series(texts, name='text', dtype=object)

# Kept for backward compatibility: the number of articles that were read.
index = len(categories)

In [None]:
# Place the per-field Series side by side as the columns of one DataFrame.
article_columns = [category, url, time_published, title, text]
df = pd.concat(article_columns, axis=1)

In [None]:
# Preview the first rows of the assembled DataFrame.
df.head()

## MeCabによるテキストのトークン化

In [None]:
import MeCab

# Single shared tagger, created with the '-Owakati' output option.
tagger = MeCab.Tagger('-Owakati')


def tokenize_japanese(text):
    """Return the result of parsing ``text`` with the module-level MeCab tagger."""
    parsed = tagger.parse(text)
    return parsed

In [None]:
# Replace every article body with its tokenized form.
# NOTE: this overwrites the column, so re-running the cell tokenizes
# already-tokenized text.
df['text'] = [tokenize_japanese(article) for article in df['text']]

In [None]:
# Preview the DataFrame again, now with the tokenized text column.
df.head()

## KerasのTokenizerにより単語を整数にエンコード

In [None]:
import tensorflow as tf

MAX_WORDS = 20000 # encode only the 20,000 most frequent words

# Keras tokenizer limited to MAX_WORDS via num_words.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS)

# Build the vocabulary from the tokenized article texts.
tokenizer.fit_on_texts(df['text'])

In [None]:
# Convert each article text into a list of integer word ids.
df['sequence'] = tokenizer.texts_to_sequences(df['text'])

In [None]:
# Keep a handle on the tokenizer's word -> integer id mapping.
word2int = tokenizer.word_index

In [None]:
# Show the encoded sequence stored under index label 0.
print(df['sequence'][0])

## Kerasのpad_sequencesで長さを統一

In [None]:
# Common sequence length passed to pad_sequences as maxlen.
MAX_LENGTH = 5000

# Bring every encoded article to the same length (MAX_LENGTH) using the
# default padding/truncating settings of pad_sequences.
x_seq = tf.keras.preprocessing.sequence.pad_sequences(df['sequence'], maxlen=MAX_LENGTH)

## カテゴリーを整数にエンコーディング

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps category names to integer labels.
le = LabelEncoder()

# Fit on the category column and store the integer labels alongside it.
df['encoded_label'] = le.fit_transform(df['category'])

## （１）テキストをマルチホットエンコーディング

（サンプル数×最大単語数）のオールゼロ行列を準備する。

In [None]:
import numpy as np

# All-zero matrix with one row per article and one column per word id (MAX_WORDS columns).
words_multi_hot = np.zeros((x_seq.shape[0], MAX_WORDS))
# Display the shape as the cell output.
words_multi_hot.shape

テキスト中にある単語のコードをインデックスにして１を立てる。

In [None]:
# Set a 1 in each row at every word id that occurs in that article's sequence.
for i in range(x_seq.shape[0]):
    words_multi_hot[i, x_seq[i]] = 1

# x_seq was zero-padded by pad_sequences, so the loop above also sets column 0
# for every padded article. Id 0 is the padding value, not a word (the Keras
# Tokenizer never assigns index 0), so clear that column.
words_multi_hot[:, 0] = 0

In [None]:
# Inspect the multi-hot vector of the first article.
words_multi_hot[0]

In [None]:
# Number of distinct categories; this is the width of the softmax output.
num_categories = len(np.unique(df['category']))

# Two-layer dense classifier over the multi-hot vectors.
model = tf.keras.models.Sequential()

# The input width is tied to MAX_WORDS (the column count of words_multi_hot)
# rather than a repeated literal, so changing the vocabulary cap cannot leave
# the model out of sync with the data. The previously created standalone
# Input tensor was never attached to the model and has been dropped.
model.add(tf.keras.layers.Dense(256, activation='relu', input_shape=(MAX_WORDS,)))
model.add(tf.keras.layers.Dense(num_categories, activation='softmax'))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the articles for validation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the category proportions the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    words_multi_hot, df['encoded_label'],
    test_size=0.3,
    random_state=42,
    stratify=df['encoded_label'],
)

In [None]:
# Integer labels -> sparse categorical cross-entropy; accuracy is tracked as 'acc'.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

# Training settings gathered in one place; the held-out split doubles as
# validation data.
fit_options = dict(
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Pull the per-epoch curves recorded by the preceding fit() call.
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# One figure per metric, each showing the training and validation curves.
curve_pairs = (
    ((acc, 'Train Accuracy'), (val_acc, 'Validation Accuracy')),
    ((loss, 'Train Loss'), (val_loss, 'Validation Loss')),
)
for pair in curve_pairs:
    for values, label in pair:
        plt.plot(epochs, values, label=label)
    plt.legend()
    plt.show()

## （２）単語埋め込み

In [None]:
# Size of each word embedding vector.
emb_dim = 128

# Embedding -> average over the sequence axis -> softmax classifier.
# NOTE(review): mask_zero is not set, so padded positions (id 0) are embedded
# and averaged together with real tokens -- confirm this is intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(MAX_WORDS, emb_dim, input_length=MAX_LENGTH),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(num_categories, activation='softmax'),
])

In [None]:
# Hold out 30% of the padded sequences for validation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the category proportions the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x_seq, df['encoded_label'],
    test_size=0.3,
    random_state=42,
    stratify=df['encoded_label'],
)

In [None]:
# Integer labels -> sparse categorical cross-entropy; accuracy is tracked as 'acc'.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

# Training settings gathered in one place; the held-out split doubles as
# validation data.
fit_options = dict(
    batch_size=32,
    epochs=50,
    validation_data=(x_test, y_test),
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Pull the per-epoch curves recorded by the preceding fit() call.
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# One figure per metric, each showing the training and validation curves.
curve_pairs = (
    ((acc, 'Train Accuracy'), (val_acc, 'Validation Accuracy')),
    ((loss, 'Train Loss'), (val_loss, 'Validation Loss')),
)
for pair in curve_pairs:
    for values, label in pair:
        plt.plot(epochs, values, label=label)
    plt.legend()
    plt.show()

## RNN（LSTM）を使ってみる

In [None]:
# Size of each word embedding vector.
EMB_DIMS = 128

# Embedding -> dropout -> LSTM -> softmax classifier.
# The dropout noise_shape has 1 on the sequence axis, so one mask per
# embedding channel is shared across all time steps.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(MAX_WORDS, EMB_DIMS, input_length=MAX_LENGTH),
    tf.keras.layers.Dropout(rate=0.4, noise_shape=(None, 1, EMB_DIMS)),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(num_categories, activation='softmax'),
])

# Integer labels -> sparse categorical cross-entropy; accuracy is tracked as 'acc'.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Hold out 30% of the padded sequences for validation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the category proportions the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x_seq, df['encoded_label'],
    test_size=0.3,
    random_state=42,
    stratify=df['encoded_label'],
)

# Train the LSTM model, validating on the held-out split after each epoch.
history = model.fit(x_train, y_train,
                    batch_size=30,
                    epochs=20,
                    validation_data=(x_test, y_test))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Pull the per-epoch curves recorded by the preceding fit() call.
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# One figure per metric, each showing the training and validation curves.
curve_pairs = (
    ((acc, 'Train Accuracy'), (val_acc, 'Validation Accuracy')),
    ((loss, 'Train Loss'), (val_loss, 'Validation Loss')),
)
for pair in curve_pairs:
    for values, label in pair:
        plt.plot(epochs, values, label=label)
    plt.legend()
    plt.show()