# TF-Hubを使った単純なテキスト分類(日本語版)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

## Data
青空文庫のデータを使用

### aozoraディレクトリ内にある著者リスト
- 26: 中原中也
- 64: 樋口一葉
- 81: 宮沢賢治
- 121: 新美南吉
- 148: 夏目漱石
- 879: 芥川竜之介
- 885: 与謝野晶子

In [None]:
# Load data from under the aozora directory.
def load_directory_data(directory, id):
  """Read every file in ``directory/id`` into a single-column DataFrame.

  Args:
    directory: Path of the directory that holds one sub-directory per
      author id.
    id: Name of the author sub-directory to read (callers in this file pass
      strings such as "885"). The name shadows the ``id`` builtin but is
      kept so the signature stays unchanged.

  Returns:
    A ``pandas.DataFrame`` with one ``"sentence"`` column holding the full
    contents of each file, one row per file, in ``os.listdir`` order.
  """
  author_dir = os.path.join(directory, id)
  texts = []
  for file_name in os.listdir(author_dir):
    with tf.gfile.GFile(os.path.join(author_dir, file_name), "r") as handle:
      texts.append(handle.read())
  return pd.DataFrame.from_dict({"sentence": texts})

def load_dataset(directory, pos_id="885", neg_id="121"):
  """Build a shuffled two-author dataset from one split directory.

  Args:
    directory: Split directory containing one sub-directory per author id.
    pos_id: Author sub-directory whose texts are labelled ``polarity == 1``.
      Defaults to "885", the id that used to be hard-coded here.
    neg_id: Author sub-directory whose texts are labelled ``polarity == 0``.
      Defaults to "121", the id that used to be hard-coded here.

  Returns:
    A DataFrame with the columns produced by ``load_directory_data`` plus an
    integer ``"polarity"`` column, with rows shuffled and re-indexed from 0.
  """
  pos_df = load_directory_data(directory, pos_id)
  neg_df = load_directory_data(directory, neg_id)
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  # sample(frac=1) returns all rows in random order; resetting the index
  # discards the pre-shuffle row positions.
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

def load_datasets(force_download=False):
  """Load the training and test splits from ``./aozora/txt/``.

  Args:
    force_download: Accepted but not used by this function; the data is read
      from the local directories and nothing is downloaded here.

  Returns:
    A ``(train_df, test_df)`` tuple of DataFrames built by ``load_dataset``.
  """
  return (
      load_dataset('./aozora/txt/train/'),
      load_dataset('./aozora/txt/test/'),
  )

# Set TensorFlow's logging threshold to ERROR to keep the cell output short.
tf.logging.set_verbosity(tf.logging.ERROR)

# Load both splits from disk, then preview the first rows of the training set
# (the bare expression on the last line is what the notebook displays).
train_df, test_df = load_datasets()
train_df.head()

## Models
### Input functions

In [None]:
# Each input function receives the whole DataFrame as features (so the
# "polarity" column is present next to "sentence") and the "polarity" column
# as the label.

# Training input: shuffled, with num_epochs=None so the number of passes over
# the data is not capped here.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["polarity"],
    num_epochs=None,
    shuffle=True,
)

# Input over the training set in its stored order (shuffle=False).
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["polarity"],
    shuffle=False,
)

# Input over the test set in its stored order (shuffle=False).
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=test_df,
    y=test_df["polarity"],
    shuffle=False,
)

### Feature columns
モジュールにnnlm-ja-dim50を使用([https://tfhub.dev/google/nnlm-ja-dim50/1](https://tfhub.dev/google/nnlm-ja-dim50/1))

In [None]:
# Text-embedding feature column for the "sentence" feature, backed by the
# TF-Hub module at the URL below (nnlm-ja-dim50, version 1 according to the
# URL path).
embedded_text_feature_column = hub.text_embedding_column(
    key="sentence",
    module_spec="https://tfhub.dev/google/nnlm-ja-dim50/1")

### Estimator
TensorFlowの高レベルAPIを使用([DNNClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier))

In [None]:
# Two-class DNN classifier fed by the embedded text column, with hidden layer
# sizes 500 and 100 and an Adagrad optimizer at learning rate 0.003.
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    feature_columns=[embedded_text_feature_column],
    n_classes=2,
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003),
)

### Training
学習を実行

In [None]:
# Train for 1000 steps using the training input function.
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the returned value; keep it while this runs as a notebook cell.
estimator.train(input_fn=train_input_fn, steps=1000);

## Prediction
トレーニングセットとテストセットの両方に対して評価を実行し、正解率(accuracy)を表示

In [None]:
# Evaluate the trained estimator on each split; each call returns a dict of
# metrics from which the "accuracy" entry is printed below.
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

print("Training set accuracy: {accuracy}".format(
    accuracy=train_eval_result["accuracy"]))
print("Test set accuracy: {accuracy}".format(
    accuracy=test_eval_result["accuracy"]))

### Confusion matrix

In [None]:
def get_predictions(estimator, input_fn):
  """Collect the predicted class id of every example yielded by the estimator.

  Args:
    estimator: Object whose ``predict(input_fn=...)`` yields one mapping per
      example, each with a ``"class_ids"`` sequence.
    input_fn: Input function forwarded unchanged to ``estimator.predict``.

  Returns:
    A list holding the first entry of ``"class_ids"`` for each prediction,
    in the order the predictions were produced.
  """
  predicted_ids = []
  for prediction in estimator.predict(input_fn=input_fn):
    predicted_ids.append(prediction["class_ids"][0])
  return predicted_ids

# Tick labels for the confusion-matrix plot, ordered by class id: position i
# names the class whose polarity is i. load_dataset labels author "121" as
# polarity 0 and author "885" as polarity 1, so "121" must come first;
# the previous order ("885", "121") swapped the labels on both axes.
LABELS = [
    "121", "885"
]

# Build the confusion-matrix op inside a fresh graph rather than the current
# default graph.
with tf.Graph().as_default():
  # First argument: true labels from the test set; second argument: class ids
  # predicted by the estimator for the same test input.
  cm = tf.confusion_matrix(test_df["polarity"], 
                           get_predictions(estimator, predict_test_input_fn))

  # Run the op in a session to obtain the matrix values.
  with tf.Session() as session:
    cm_out = session.run(cm)

# Normalize the confusion matrix so that each row sums to 1.
# NOTE(review): a row whose sum is 0 (a true class with no test examples)
# would divide by zero here and yield NaN — confirm this cannot happen.
cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

# Draw the normalized matrix with LABELS as tick labels on both axes; rows are
# labelled "True" and columns "Predicted". The trailing semicolons are kept
# from the notebook cell.
sns.heatmap(cm_out, annot=True, xticklabels=LABELS, yticklabels=LABELS);
plt.xlabel("Predicted");
plt.ylabel("True");