# 1 順番に実行し、多値クラス多値ラベル問題を解決できることを確認せよ。

In [1]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Loading the corpus
ma_reuters = LazyCorpusLoader(
    'ma_reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
    cat_file='cats.txt', encoding='ISO-8859-2')

# Load MA_Reuters
documents = ma_reuters.fileids()
print (str(len(documents)) + " total articles")
# extracting training and testing data (document ID)
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print (str(len(train_docs_id)) + " training data")
print (str(len(test_docs_id)) + " testing data")
# Training and testing data
train_docs = [ma_reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [ma_reuters.raw(doc_id) for doc_id in test_docs_id]
 
# print the total number of categories
categories = ma_reuters.categories()
num_categories = len(categories)
print (num_categories, " categories")
print (categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [2]:
# raw document example（'coffee category')
# Documents in a category
category_docs = ma_reuters.fileids("iron-steel");
document_id = category_docs[0] # The first document
# print the inside document
print (ma_reuters.raw(document_id))

CANADA PLANS TO MONITOR STEEL IMPORTS, EXPORTS, TRADE MINISTER SAYS

  CANADA PLANS TO MONITOR STEEL IMPORTS, EXPORTS, TRADE MINISTER SAYS
  




In [5]:
from nltk import word_tokenize
import re # regular expression
 
def tokenize(text): # returning tokens
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))

    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter (lambda token: p.match(token) and len(token) >= min_length, words))
    return filtered_tokens

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
# fit_transform
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# transform
vectorised_test_documents = vectorizer.transform(test_docs)
print("converted to TF-IF model")
print("training document dimension ：",vectorised_train_documents.shape)
print("testing document dimension：",vectorised_test_documents.shape)

converted to TF-IF model
training document dimension ： (7713, 26979)
testing document dimension： (2987, 26979)


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

import numpy as np
# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.86
Hamming Loss: 0.005


# 2 Jaccard 係数（資料第6回参照）が最も高いカテゴリーと最も低いカテゴリーが何であったかのべよ

In [7]:
j_scores = jaccard_score(test_labels, OVR_predictions, average=None)
j_maxs = [categories[i] for i, v in enumerate(j_scores) if v == max(j_scores)]
print("max categories:",j_maxs)
j_mins = [categories[i] for i, v in enumerate(j_scores) if v == min(j_scores)]
print("min categories:",j_mins)

max categories: ['earn']
min categories: ['lead', 'pet-chem', 'soy-oil', 'strategic-metal', 'yen']


# 3 TF-IDFモデル＋SVM以外の組み合わせで本マルチラベル問題に対応できる手法を適宜ためし、実行結果をのべよ

### 分類器をパーセプトロンにして実行

In [8]:
from sklearn.linear_model import Perceptron

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(Perceptron(random_state=41)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.856
Hamming Loss: 0.006


### 分類器をロジスティック回帰にして実行

In [9]:
from sklearn.linear_model import LogisticRegression

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(LogisticRegression(random_state=41)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.709
Hamming Loss: 0.009


### 分類器を多層パーセプトロンにして実行

In [17]:
from sklearn.neural_network import MLPClassifier

# multi-class, multi-label classification and prediction
MLP_Classifier = MLPClassifier(random_state=41, max_iter=300)
MLP_Classifier.fit(vectorised_train_documents, train_labels)
MLP_predictions = MLP_Classifier.predict(vectorised_test_documents)

# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, MLP_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, MLP_predictions),3))

Jaccard coef: 0.85
Hamming Loss: 0.006




時間がかかったわりに性能が上がらなかった。

### Tf-Idf からOkapi BM25にモデルを変更

In [29]:
#BSD 3-Clause License

#Copyright (c) 2018, Sho IIZUKA
#All rights reserved.

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:

#* Redistributions of source code must retain the above copyright notice, this
#  list of conditions and the following disclaimer.

#* Redistributions in binary form must reproduce the above copyright notice,
#  this list of conditions and the following disclaimer in the documentation
#  and/or other materials provided with the distribution.

#* Neither the name of the copyright holder nor the names of its
#  contributors may be used to endorse or promote products derived from
#  this software without specific prior written permission.

#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
#FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
#DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
#SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
#CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X


In [30]:

from sklearn.feature_extraction.text import CountVectorizer
    
class BM25Vectorizer(CountVectorizer):
    def __init__(self, *, input='content', encoding='utf-8',
             decode_error='strict', strip_accents=None, lowercase=True,
             preprocessor=None, tokenizer=None, analyzer='word',
             stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), max_df=1.0, min_df=1,
             max_features=None, vocabulary=None, binary=False,
             dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True,
             sublinear_tf=False):

        super().__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self._tfidf = BM25Transformer(True)

In [32]:
def tokenize(text): # returning tokens
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))

    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter (lambda token: p.match(token) and len(token) >= min_length, words))
    return filtered_tokens

# Okapi BM25 vectorizer
vectorizer = BM25Vectorizer(stop_words='english', tokenizer=tokenize)
# fit_transform
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# transform
vectorised_test_documents = vectorizer.transform(test_docs)
print("converted to Okapi BM25 model")
print("training document dimension ：",vectorised_train_documents.shape)
print("testing document dimension：",vectorised_test_documents.shape)

converted to Okapi BM25 model
training document dimension ： (7713, 26979)
testing document dimension： (2987, 26979)


In [37]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41,max_iter=100000)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

import numpy as np
# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.856
Hamming Loss: 0.005


In [34]:
from sklearn.linear_model import Perceptron

# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(Perceptron(random_state=41)) 
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

# Jaccard coefficient
from sklearn.metrics import jaccard_score
print ("Jaccard coef:",np.round(jaccard_score(test_labels, OVR_predictions, average='samples'),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.831
Hamming Loss: 0.008


TfIdf からOkapi BM25にモデルを変更したが、max_iterを増やさないといけなかったりと性能が落ちた