# Loading the subset of Reuters news by NLTK

In [1]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Loading the corpus
ma_reuters = LazyCorpusLoader(
    'ma_reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
    cat_file='cats.txt', encoding='ISO-8859-2')

# Load MA_Reuters
documents = ma_reuters.fileids()
print (str(len(documents)) + " total articles")
# extracting training and testing data (document ID)
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print (str(len(train_docs_id)) + " training data")
print (str(len(test_docs_id)) + " testing data")
# Training and testing data
train_docs = [ma_reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [ma_reuters.raw(doc_id) for doc_id in test_docs_id]
 
# print the total number of categories
categories = ma_reuters.categories()
num_categories = len(categories)
print (num_categories, " categories")
print (categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


# [1] Sample Code Check
## Coffee category data example

In [2]:
# raw document example（'coffee category')
# Documents in a category
category_docs = ma_reuters.fileids("money-fx");
document_id = category_docs[0] # The first document
# print the inside document
print (ma_reuters.raw(document_id))

BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER
  The Bundesbank accepted bids for 6.1
  billion marks at today's tender for a 28-day securities
  repurchase pact at a fixed rate of 3.80 pct, a central bank
  spokesman said.
      Banks, which bid for a total 12.2 billion marks liquidity,
  will be credited with the funds allocated today and must buy
  back securities pledged on May 6.
      Some 14.9 billion marks will drain from the market today as
  an earlier pact expires, so the Bundesbank is effectively
  withdrawing a net 8.1 billion marks from the market with
  today's allocation.
      A Bundesbank spokesman said in answer to enquiries that the
  withdrawal of funds did not reflect a tightening of credit
  policy, but was to be seen in the context of plentiful
  liquidity in the banking system.
      Banks held an average 59.3 billion marks at the Bundesbank
  over the first six days of the month, well clear of the likely
  April minimum reserve requirement of 51 billion mark

# Tokenization with NLTK, TF-IDF vectorizer with scikit-learn

In [3]:
from nltk import word_tokenize
import re # regular expression
 
def tokenize(text): # returning tokens
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))

    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter (lambda token: p.match(token) and len(token) >= min_length, words))
    return filtered_tokens

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
# fit_transform
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# transform
vectorised_test_documents = vectorizer.transform(test_docs)
print("converted to TF-IF model")
print("training document dimension ：",vectorised_train_documents.shape)
print("testing document dimension：",vectorised_test_documents.shape)

converted to TF-IF model
training document dimension ： (7713, 26985)
testing document dimension： (2987, 26985)


# SVM classification

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

import numpy as np
# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.86
Hamming Loss: 0.005


# [2] Jaccard Index; What category is the highest, and the lowest?

In [5]:
jaccard_indices = np.round(jaccard_score(test_labels, OVR_predictions, average=None),3) # Code 1
jaccard_indices_with_category_name = list(zip(jaccard_indices, categories)) # Code 2
jaccard_indices_with_category_name.sort()

print(jaccard_indices_with_category_name) # Code 3

[(0.0, 'lead'), (0.0, 'pet-chem'), (0.0, 'soy-oil'), (0.0, 'strategic-metal'), (0.0, 'yen'), (0.125, 'silver'), (0.2, 'fuel'), (0.231, 'soy-meal'), (0.273, 'sorghum'), (0.316, 'meal-feed'), (0.333, 'hog'), (0.36, 'rice'), (0.407, 'livestock'), (0.414, 'cpi'), (0.421, 'carcass'), (0.421, 'veg-oil'), (0.435, 'alum'), (0.436, 'soybean'), (0.444, 'rapeseed'), (0.462, 'zinc'), (0.471, 'gas'), (0.471, 'nat-gas'), (0.5, 'oilseed'), (0.5, 'retail'), (0.5, 'wpi'), (0.524, 'jobs'), (0.55, 'cotton'), (0.556, 'iron-steel'), (0.579, 'reserves'), (0.6, 'palm-oil'), (0.621, 'interest'), (0.625, 'bop'), (0.625, 'gold'), (0.643, 'barley'), (0.651, 'money-fx'), (0.653, 'dlr'), (0.66, 'ship'), (0.667, 'tin'), (0.672, 'trade'), (0.697, 'wheat'), (0.703, 'money-supply'), (0.724, 'corn'), (0.727, 'orange'), (0.75, 'gnp'), (0.75, 'housing'), (0.75, 'rubber'), (0.757, 'sugar'), (0.81, 'crude'), (0.833, 'copper'), (0.833, 'ipi'), (0.836, 'grain'), (0.931, 'coffee'), (0.94, 'acq'), (0.944, 'cocoa'), (0.969, 'ea

In [6]:
# check if 'categories' and jaccard_indices is same order
import itertools
[list(itertools.compress(categories, [True if j==1 else False for j in i])) for i in test_labels] == [ma_reuters.categories(doc_id) for doc_id in test_docs_id] # Code 4

True

Code 1で算出したJaccard係数と，"Loading the subset of Reuters news by NLTK"で準備してある変数categoriesの順序が一致するか，Code 4で調べた．
Code 4では，全てのテストデータについて，バイナライズされた多値ラベル表現と実際のラベルの順序関係が一致しているかを調べている．
Jaccard係数の算出では，バイナライズされた多値ラベル表現を用いているため，出力はこの表現と同順である．
Code 4の出力がTrueであることから，Code 1のjaccard_indicesとcategoriesは同順序で保持されていることを確認した．

Code 2でjaccard_indicesとcategoriesを結合したリストを生成し，これをJaccard係数でソートすることにより，Code 3の出力を得た．
この結果，最もJaccard係数の高いカテゴリーは”earn”であり，最も低いカテゴリーは”lead”，”pet-chem”，”soy-oil”，”strategic-metal”，”yen”（全てJaccard係数が0）であった．

# [3] Other method for multi-label classification

In [7]:
EMBED_SIZE = 300
def getDocumentVectors(docs, model):
    vectors = np.zeros((len(docs),EMBED_SIZE))
    for i, text in enumerate(docs):
        tokenized_text = tokenize(text)
        for word in tokenized_text:
            try:
                vectors[i, :] = np.add(vectors[i, :], model[word])
            except KeyError:
                pass
        if len(tokenized_text) > 0:
            vectors[i, :] = np.divide(vectors[i, :], len(tokenized_text))
    return vectors

In [8]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

wordvectors_google = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin.gz", binary=True)

In [10]:
# fit_transform
vectorised_train_documents_google = getDocumentVectors(train_docs,wordvectors_google)
# transform
vectorised_test_documents_google = getDocumentVectors(test_docs,wordvectors_google)
print("converted to the mean of Word2Vec model")
print("training document dimension ：",vectorised_train_documents_google.shape)
print("testing document dimension：",vectorised_test_documents_google.shape)

converted to the mean of Word2Vec model
training document dimension ： (7713, 300)
testing document dimension： (2987, 300)


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# multi-class, multi-label classification and prediction
OVR_classifier_google = OneVsRestClassifier(SVC()) 
OVR_classifier_google.fit(vectorised_train_documents_google, train_labels)
OVR_predictions_google = OVR_classifier_google.predict(vectorised_test_documents_google)

import numpy as np
# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions_google, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions_google),3))

Jaccard coef: 0.773
Hamming Loss: 0.007


In [14]:
jaccard_indices_google = np.round(jaccard_score(test_labels, OVR_predictions_google, average=None),3) # Code 1
jaccard_indices_with_category_name_google = list(zip(jaccard_indices_google, categories)) # Code 2
jaccard_indices_with_category_name_google.sort()

print(jaccard_indices_with_category_name_google) # Code 3

[(0.0, 'alum'), (0.0, 'barley'), (0.0, 'carcass'), (0.0, 'fuel'), (0.0, 'gnp'), (0.0, 'iron-steel'), (0.0, 'lead'), (0.0, 'meal-feed'), (0.0, 'orange'), (0.0, 'pet-chem'), (0.0, 'retail'), (0.0, 'rice'), (0.0, 'rubber'), (0.0, 'silver'), (0.0, 'sorghum'), (0.0, 'soy-meal'), (0.0, 'soy-oil'), (0.0, 'strategic-metal'), (0.0, 'tin'), (0.0, 'wpi'), (0.0, 'yen'), (0.0, 'zinc'), (0.033, 'nat-gas'), (0.05, 'cotton'), (0.091, 'soybean'), (0.1, 'palm-oil'), (0.104, 'oilseed'), (0.108, 'veg-oil'), (0.111, 'copper'), (0.167, 'bop'), (0.167, 'ipi'), (0.176, 'gas'), (0.222, 'cocoa'), (0.25, 'housing'), (0.269, 'livestock'), (0.31, 'cpi'), (0.333, 'hog'), (0.333, 'reserves'), (0.371, 'money-supply'), (0.375, 'corn'), (0.444, 'rapeseed'), (0.484, 'gold'), (0.489, 'dlr'), (0.5, 'sugar'), (0.524, 'jobs'), (0.544, 'interest'), (0.571, 'coffee'), (0.588, 'money-fx'), (0.622, 'wheat'), (0.628, 'ship'), (0.631, 'trade'), (0.687, 'grain'), (0.768, 'crude'), (0.907, 'acq'), (0.964, 'earn')]


Word2Vecを用いて各ドキュメントに含まれる単語の分散表現を取得し，その平均をドキュメントの分散表現とした．
分類器は[1]と同様にSVMを用いた．したがって，[1]の結果と[3]の結果を比較することができる．

Word2Vecを用いた文書分類は，TF-IDFを用いた場合と比べ，性能が低下した．
特に，カテゴリ別のJaccard係数を比べると，Word2Vecを用いた場合に0.0となっているカテゴリが増えている．
Word2Vecは未知語に対して分散表現を与えることができない．
そのため，