In [1]:
#ニュース記事からの単語の抽出とカウント

In [2]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Set up the categorized Reuters corpus reader over the training/test files.
ma_reuters = LazyCorpusLoader(
    'ma_reuters',
    CategorizedPlaintextCorpusReader,
    '(training|test).*',
    cat_file='cats.txt',
    encoding='ISO-8859-2',
)

# All file ids in the corpus.
documents = ma_reuters.fileids()
print(f"{len(documents)} total articles")

# Split document ids by their file-id prefix.
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]
print(f"{len(train_docs_id)} training data")
print(f"{len(test_docs_id)} testing data")

# Raw article text for each split, in the same order as the id lists.
train_docs = list(map(ma_reuters.raw, train_docs_id))
test_docs = list(map(ma_reuters.raw, test_docs_id))

# Report how many categories the corpus defines, then list them.
categories = ma_reuters.categories()
num_categories = len(categories)
print(num_categories, " categories")
print(categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [4]:
# Silence FutureWarning messages; this filter is installed before the
# TensorFlow import below so it is already active when that import runs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build a TF1-compatible session whose GPU options set the per-process memory
# fraction to 0.3 and enable allow_growth.
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3  
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

# Keras setup: register the session configured above with the Keras backend.
import keras
# Earlier backend import attempts, kept for reference:
#from keras import backend as K
#import keras.backend.tensorflow_backend as K
from tensorflow.compat.v1.keras import backend as K
K.set_session(session) 

In [5]:
# Count word frequencies over all news articles (input to the word -> index dictionary).
import numpy as np
from nltk import word_tokenize
import collections
import re

maxlen = 20  # running maximum of kept tokens per document; never drops below 20
min_length = 3  # keep only tokens that are at least three characters long
word_counter = collections.Counter()
docs = [train_docs, test_docs]

# A token is kept when it starts with an ASCII letter (match() anchors at the start).
token_pattern = re.compile('[a-zA-Z]+')

for corpus_split in docs:
    for article in corpus_split:
        # Lower-case every token, then apply the letter / length filter.
        lowered = (raw_token.lower() for raw_token in word_tokenize(article))
        kept_tokens = [
            token for token in lowered
            if token_pattern.match(token) and len(token) >= min_length
        ]
        # Track the longest filtered document seen so far.
        maxlen = max(maxlen, len(kept_tokens))
        # Add one count per occurrence.
        word_counter.update(kept_tokens)

print("maxlen = ",maxlen)
print(" Word count = ", len(word_counter),' ',type(word_counter))

maxlen =  1094
 Word count =  32662   <class 'collections.Counter'>


In [6]:
print("語彙生成 creating vocabulary...")
VOCAB_SIZE = 25000  # maximum Reuters vocabulary size; less frequent words are ignored
# Missing keys resolve to 0, the index reserved for unknown words.
word2index = collections.defaultdict(int)
# Indices start at 1 and follow descending frequency.
for rank, (token, _frequency) in enumerate(word_counter.most_common(VOCAB_SIZE), start=1):
    word2index[token] = rank
vocab_sz = len(word2index) + 1  # +1 for index 0
# Reverse lookup: index -> word.
index2word = dict((idx, token) for token, idx in word2index.items())
index2word[0] = "_UNK_"  # unknown word
print("len(word2index) = ", len(word2index))
print("index2word[1] = ",index2word[1])

語彙生成 creating vocabulary...
len(word2index) =  25000
index2word[1] =  the


In [7]:
#順引き辞書と逆引き辞書の作成

In [8]:
print("語彙生成 creating vocabulary...")
VOCAB_SIZE = 27000  # maximum Reuters vocabulary size; less frequent words are ignored
# Forward dictionary (word -> index). Missing keys resolve to 0 (unknown word).
word2index = collections.defaultdict(int)
most_frequent = word_counter.most_common(VOCAB_SIZE)  # (word, count) pairs, most frequent first
word2index.update(
    (token, position + 1) for position, (token, _count) in enumerate(most_frequent)
)
vocab_sz = len(word2index) + 1  # +1 for index 0
# Reverse dictionary (index -> word).
index2word = {}
for token, idx in word2index.items():
    index2word[idx] = token
index2word[0] = "_UNK_"  # unknown word
print("len(word2index) = ", len(word2index))
print("index2word[1] = ",index2word[1])

語彙生成 creating vocabulary...
len(word2index) =  27000
index2word[1] =  the


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
# Encode each document's category list as a multi-hot row.
# The binarizer is fitted on the training documents and reused for the test documents.
mlb = MultiLabelBinarizer()
train_categories = list(map(ma_reuters.categories, train_docs_id))
train_labels = mlb.fit_transform(train_categories)
test_categories = list(map(ma_reuters.categories, test_docs_id))
test_labels = mlb.transform(test_categories)

In [10]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from nltk.corpus import stopwords

print("訓練用データの単語列生成 creating word sequences...")

min_length = 3  # keep only tokens that are at least three characters long
cachedStopWords = stopwords.words("english")
# Set copy of the stop-word list for constant-time membership tests.
_stop_word_set = set(cachedStopWords)
# A token is kept when it starts with an ASCII letter (match() anchors at the start).
_token_pattern = re.compile('[a-zA-Z]+')


def _texts_to_word_ids(texts):
    """Convert raw article texts into lists of vocabulary indices.

    Each text is tokenized with NLTK's word tokenizer and lower-cased. Stop
    words, tokens not starting with an ASCII letter, and tokens shorter than
    ``min_length`` are dropped. Surviving tokens are mapped through
    ``word2index``; tokens outside the vocabulary map to 0.

    Args:
        texts: iterable of raw document strings.

    Returns:
        A list with one list of integer word ids per input text.
    """
    sequences = []
    for text in texts:
        lowered = (token.lower() for token in word_tokenize(text))
        kept_tokens = [
            token for token in lowered
            if token not in _stop_word_set
            and _token_pattern.match(token)
            and len(token) >= min_length
        ]
        # word2index is a defaultdict(int): indexing it with [] would insert
        # every out-of-vocabulary word with value 0, and any later loop over
        # word2index.items() would then see those entries. .get() yields the
        # same 0 without mutating the dictionary.
        sequences.append([word2index.get(token, 0) for token in kept_tokens])
    return sequences


xs_train = _texts_to_word_ids(train_docs)

# Pad every sequence to maxlen. maxlen was measured before stop-word removal,
# so no sequence built here is longer than it.
X_train = pad_sequences(xs_train, maxlen=maxlen)
Y_train = train_labels  # multi-label targets (multi-hot rows), so no one-hot conversion
print("訓練データ（データ＋ラベル）")
print("X_train",X_train.dtype," ",type(X_train)," ",X_train.shape)
print("Y_train",Y_train.dtype," ",type(Y_train)," ",Y_train.shape)

print("テスト用データの単語列生成 creating word sequences...")
xs_test = _texts_to_word_ids(test_docs)

X_test = pad_sequences(xs_test, maxlen=maxlen)  # same padding length as the training data
Y_test = test_labels  # multi-label targets (multi-hot rows), so no one-hot conversion
print("テストデータ（データ＋ラベル）")
print("X_test",X_test.dtype," ",type(X_test)," ",X_test.shape)
print("Y_test",Y_test.dtype," ",type(Y_test)," ",Y_test.shape)

訓練用データの単語列生成 creating word sequences...
訓練データ（データ＋ラベル）
X_train int32   <class 'numpy.ndarray'>   (7713, 1094)
Y_train int64   <class 'numpy.ndarray'>   (7713, 55)
テスト用データの単語列生成 creating word sequences...
テストデータ（データ＋ラベル）
X_test int32   <class 'numpy.ndarray'>   (2987, 1094)
Y_test int64   <class 'numpy.ndarray'>   (2987, 55)


In [11]:
# Short aliases for the padded inputs and the label matrices.
Xtrain, Xtest = X_train, X_test
Ytrain, Ytest = Y_train, Y_test
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

# Pretrained word-embedding (distributed representation) model file.
WORD2VEC_MODEL = "GoogleNews-vectors-negative300.bin.gz"

# Maximum vocabulary size.
VOCAB_SIZE = 27000

# Dimensionality of the Google News word embeddings (300).
EMBED_SIZE = 300

(7713, 1094) (2987, 1094) (7713, 55) (2987, 55)


In [12]:
from gensim.models import KeyedVectors
# Load the pretrained vectors and build the initial embedding matrix:
# one row per vocabulary index, all zeros until filled in below.
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)
embedding_weights = np.zeros((vocab_sz, EMBED_SIZE))
for word, index in word2index.items():
    if index == 0:
        # Real vocabulary entries are numbered from 1. An entry with index 0
        # can only be an out-of-vocabulary word that was inserted by a
        # defaultdict lookup; copying its vector would overwrite row 0, which
        # is the row used for unknown words and for padding.
        continue
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        # Word has no pretrained vector: its row stays all-zero.
        continue

print("Embedding_weight matrix size = ", embedding_weights.shape)

OSError: Not a gzipped file (b've')

In [12]:
#CNNの構築

In [13]:
from keras.layers import Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.layers import Input, Dense
from keras.models import Model

NUM_CLASSES = 55   # width of the output layer (one sigmoid unit per category)
NUM_FILTERS = 256  # number of convolution filters
NUM_WORDS = 5      # convolution kernel size, in tokens

# Input: one padded sequence of word ids per document.
inputs = Input(shape=(maxlen,))

# Embedding initialised from the pretrained matrix and fine-tuned during training.
embedded = Embedding(
    vocab_sz,
    EMBED_SIZE,
    input_length=maxlen,
    weights=[embedding_weights],
    trainable=True,
)(inputs)
embedded = SpatialDropout1D(0.3)(embedded)

# Convolution over windows of NUM_WORDS tokens, then max over the sequence axis.
conv_features = Conv1D(
    filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="elu"
)(embedded)
pooled = GlobalMaxPooling1D()(conv_features)

# Fully connected head with dropout.
hidden = Dense(512, activation="elu")(pooled)
hidden = Dropout(0.4)(hidden)

# Independent sigmoid per class, since a document can carry several labels.
outputs = Dense(NUM_CLASSES, activation="sigmoid")(hidden)
model3 = Model(inputs=[inputs], outputs=[outputs])
model3.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1096)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1096, 300)         8100300   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1096, 300)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 1092, 256)         384256    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               131584    
_________________________________________________________________
dropout (Dropout)            (None, 512)              

In [14]:
# NOTE(review): the targets are multi-hot rows trained with binary
# cross-entropy, while "categorical_accuracy" scores only the arg-max class.
# The metric is left unchanged so history keys and reported numbers stay
# comparable; consider a multi-label metric instead.
model3.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["categorical_accuracy"])

NUM_EPOCHS = 40
BATCH_SIZE = 128

import os
from tensorflow.keras.callbacks import ModelCheckpoint

fpath = 'h5/Reuters-CNN-w-{epoch:02d}-{loss:.4f}-{val_loss:.4f}.h5'
# Make sure the checkpoint directory exists before training starts; writing an
# .h5 file into a missing directory can fail in the middle of the fit.
os.makedirs(os.path.dirname(fpath), exist_ok=True)

# Write a checkpoint whenever the validation loss improves.
callbacks = [
    keras.callbacks.ModelCheckpoint(fpath, monitor='val_loss', save_best_only=True),
]

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection is driven by the same data later used for evaluation.
history3 = model3.fit(
    Xtrain, Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    callbacks=callbacks,
    validation_data=(Xtest, Ytest))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [19]:
# Save model3 (weights, full model, architecture JSON) and the history3 metrics.
import pickle

# NOTE(review): absolute, user-specific location; consider a configurable
# output directory. Every artefact shares this prefix.
save_prefix = '/Users/kosukehama/Python_DataScience/MA_Reuters-CNN-2020-8-4'

model3.save_weights(save_prefix + '-weights.h5')
model3.save(save_prefix + '.h5')

# Per-epoch training metrics.
with open(save_prefix + '.pkl', 'wb') as h_file:
    pickle.dump(history3.history, h_file)

# Architecture only; the context manager closes (and flushes) the file.
json_str = model3.to_json()
with open(save_prefix + '.json', 'w') as json_file:
    json_file.write(json_str)


3165

In [26]:
# Rebuild the model from its saved JSON architecture.
from keras.models import model_from_json
with open('/Users/kosukehama/Python_DataScience/MA_Reuters-CNN-2020-8-4.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the saved weights into the rebuilt model.
# NOTE(review): the message below says "best model", but this weights file is
# the one written by save_weights() after training, not one of the h5/
# checkpoints selected on validation loss.
loaded_model.load_weights("/Users/kosukehama/Python_DataScience/MA_Reuters-CNN-2020-8-4-weights.h5")
print("Loaded the best model from disk")

# A model rebuilt from JSON has to be compiled again before evaluate().
loaded_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["categorical_accuracy"])

# evaluate() returns [loss, categorical_accuracy] for the compile() call above.
# The result is kept in a plain variable only; it is no longer also attached
# to the model object as an attribute named `score`.
score = loaded_model.evaluate(Xtest, Ytest, verbose=0)

print()
print("\nテストデータの損失: {:.4f} (カテゴリカル精度: {:.3f}) ".format(score[0], score[1]))

Loaded the best model from disk


テストデータの損失: 0.0205 (カテゴリカル精度: 0.867) 
