In [1]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Build a lazily-loaded categorized reader over the 'ma_reuters' corpus.
# Only files matching '(training|test).*' are exposed, and the
# document -> category mapping is read from cats.txt.
ma_reuters = LazyCorpusLoader(
    'ma_reuters',
    CategorizedPlaintextCorpusReader,
    '(training|test).*',
    cat_file='cats.txt',
    encoding='ISO-8859-2',
)

# All document IDs known to the corpus.
documents = ma_reuters.fileids()
print(str(len(documents)) + " total articles")

# Partition the document IDs into training and testing sets by ID prefix.
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]
print(str(len(train_docs_id)) + " training data")
print(str(len(test_docs_id)) + " testing data")

# Raw article text, in the same order as the corresponding ID lists.
train_docs = list(map(ma_reuters.raw, train_docs_id))
test_docs = list(map(ma_reuters.raw, test_docs_id))

# Report how many categories the corpus defines and list them.
categories = ma_reuters.categories()
num_categories = len(categories)
print(num_categories, " categories")
print(categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [2]:
# Raw document example from the 'jobs' category.
# Fetch every document ID filed under that category ...
category_docs = ma_reuters.fileids("jobs")
# ... take the first one ...
document_id = category_docs[0]
# ... and print its raw text.
print(ma_reuters.raw(document_id))

GERMAN INDUSTRIAL EMPLOYMENT SEEN STAGNATING
  The number of workers employed in
  the West German industrial sector stagnated in the last quarter
  of 1986 as a 50,000 increase in overall employment benefited
  only the services branch, the DIW economic institute said.
      A DIW report added the general downturn in the economy
  since last Autumn had had a negative effect on the willingness
  of firms to take on workers. It referred to a marked downturn
  in the number of workers taken on in the capital goods sector.
      New orders for manufacturing industry goods have mostly
  fallen or stagnated in recent months, but data for February
  finally showed a reversal of the trend, with a 1.9 pct rise.
  




In [3]:
from nltk import word_tokenize
import re # regular expression
 
def tokenize(text):
    """Split ``text`` into lowercase word tokens for the TF-IDF vectorizer.

    The text is tokenized with NLTK's ``word_tokenize`` and every token is
    lowercased. A token is kept only when it begins with an ASCII letter
    (``re.match`` anchors at the start of the token, so later characters are
    not checked) and is at least three characters long.

    Returns:
        list of the surviving tokens, in their original order.
    """
    min_length = 3
    starts_with_letters = re.compile('[a-zA-Z]+')

    kept_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        if starts_with_letters.match(token) and len(token) >= min_length:
            kept_tokens.append(token)
    return kept_tokens

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that uses the custom `tokenize` function and
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
# Fit on the training documents only, so the vocabulary and IDF weights
# are learned without looking at the test set ...
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# ... then map the test documents onto that same fitted vocabulary.
vectorised_test_documents = vectorizer.transform(test_docs)
# Status message (previously misspelled as "TF-IF").
print("converted to TF-IDF model")
# Shapes are (number of documents, vocabulary size).
print("training document dimension ：",vectorised_train_documents.shape)
print("testing document dimension：",vectorised_test_documents.shape)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


converted to TF-IF model
training document dimension ： (7713, 26986)
testing document dimension： (2987, 26986)


In [4]:
import numpy as np
from sklearn.metrics import hamming_loss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# `sklearn.metrics.jaccard_similarity_score` was deprecated in scikit-learn
# 0.21 and removed in 0.23; its replacement for multilabel data is
# `jaccard_score(..., average='samples')`. Keep the old name bound either way
# so every cell that calls `jaccard_similarity_score` keeps working.
try:
    from sklearn.metrics import jaccard_score
except ImportError:
    # scikit-learn < 0.21: only the legacy function exists.
    from sklearn.metrics import jaccard_similarity_score
else:
    def jaccard_similarity_score(y_true, y_pred):
        """Sample-averaged Jaccard index for multilabel indicator matrices.

        Stand-in for the removed ``sklearn.metrics.jaccard_similarity_score``,
        implemented with ``jaccard_score(average='samples')``.

        NOTE(review): the legacy function scored a sample with no true and no
        predicted labels as 1.0, whereas ``jaccard_score`` treats that case as
        a zero division. Presumably every document here carries at least one
        category, so the two should agree on this corpus -- confirm.
        """
        return jaccard_score(y_true, y_pred, average='samples')

# Encode each document's category list as one row of a binary indicator
# matrix; the set of labels is learned from the training documents.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

# Multi-class, multi-label classification: one linear SVM per category
# (one-vs-rest), seeded so repeated runs give the same model.
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41))
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

# Jaccard coefficient, averaged over test documents.
print ("Jaccard coef:",np.round(jaccard_similarity_score(test_labels, OVR_predictions),3))

# Hamming loss: fraction of individual label assignments that are wrong.
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))

Jaccard coef: 0.86
Hamming Loss: 0.005


In [5]:
# Track the categories with the highest and lowest Jaccard score.
# The trackers start at -inf / +inf so the first category evaluated always
# seeds both extremes. The previous 0.500001 / 0.5 sentinels silently left
# the category name empty whenever no score fell on the expected side of 0.5.
Jmax = float("-inf")
Jmax_category = ""
Jmin_category = ""
Jmin = float("inf")
for c in categories:
    # All document IDs filed under this category (training and test).
    category_docs_c = ma_reuters.fileids(str(c))
    print("category:" + c)
    # Evaluate only on this category's held-out test documents.
    test_docs_id_c = list(filter(lambda doc: doc.startswith("test"), category_docs_c))
    test_docs_c = [ma_reuters.raw(doc_id) for doc_id in test_docs_id_c]
    # Reuse the already-fitted vectorizer; do not refit on test data.
    vectorised_test_document_c = vectorizer.transform(test_docs_c)
    print("  testing document dimension：",vectorised_test_document_c.shape)
    OVR_predictions_c = OVR_classifier.predict(vectorised_test_document_c)
    # Ground-truth indicator rows cover all labels of each document, not
    # just the current category.
    test_label_c = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id_c])
    JacCoef = np.round(jaccard_similarity_score(test_label_c, OVR_predictions_c),3)
    HamLoss = np.round(hamming_loss(test_label_c, OVR_predictions_c),3)
    print ("  Jaccard coef:",JacCoef)
    print ("  Hamming Loss:",HamLoss)
    # Strict comparisons: on ties the earliest category is kept.
    if JacCoef > Jmax:
        Jmax = JacCoef
        Jmax_category = c
    if JacCoef < Jmin:
        Jmin = JacCoef
        Jmin_category = c

category:acq


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


  testing document dimension： (719, 26986)
  Jaccard coef: 0.941
  Hamming Loss: 0.002
category:alum
  testing document dimension： (23, 26986)
  Jaccard coef: 0.467
  Hamming Loss: 0.016
category:barley
  testing document dimension： (14, 26986)
  Jaccard coef: 0.76
  Hamming Loss: 0.038
category:bop
  testing document dimension： (30, 26986)
  Jaccard coef: 0.603
  Hamming Loss: 0.018
category:carcass
  testing document dimension： (18, 26986)
  Jaccard coef: 0.427
  Hamming Loss: 0.033
category:cocoa
  testing document dimension： (18, 26986)
  Jaccard coef: 0.926
  Hamming Loss: 0.002
category:coffee
  testing document dimension： (28, 26986)
  Jaccard coef: 0.935
  Hamming Loss: 0.004
category:copper
  testing document dimension： (18, 26986)
  Jaccard coef: 0.75
  Hamming Loss: 0.013
category:corn
  testing document dimension： (56, 26986)
  Jaccard coef: 0.715
  Hamming Loss: 0.032
category:cotton
  testing document dimension： (20, 26986)
  Jaccard coef: 0.618
  Hamming Loss: 0.054
cate

In [6]:
# Report the best- and worst-scoring categories found by the per-category loop.
print("".join(["Highest Jaccard score is ", Jmax_category, ": ", str(Jmax)]))
print("".join(["Lowest Jaccard score is ", Jmin_category, ": ", str(Jmin)]))

Highest Jaccard score is earn: 0.976
Lowest Jaccard score is strategic-metal: 0.0


In [7]:
# Document IDs filed under the best- and worst-scoring categories.
jmax_docs = ma_reuters.fileids(str(Jmax_category))
jmin_docs = ma_reuters.fileids(str(Jmin_category))
# NOTE(review): despite the `test_` prefix, these lists keep the IDs that
# start with "train", so the counts below are training-set sizes.
# Confirm that this is the intended split.
test_docs_id_jmax = [doc for doc in jmax_docs if doc.startswith("train")]
test_docs_id_jmin = [doc for doc in jmin_docs if doc.startswith("train")]
# Print "<category>: <number of selected documents>" for each extreme.
print(Jmax_category + ": " + str(len(test_docs_id_jmax)))
print(Jmin_category + ": " + str(len(test_docs_id_jmin)))

earn: 2877
strategic-metal: 16


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the same TF-IDF features and indicator labels.
# The forest is seeded (same seed as the LinearSVC above) so the reported
# scores are reproducible; unseeded, they change on every run.
forest = RandomForestClassifier(n_estimators=100, random_state=41)
forest = forest.fit(vectorised_train_documents, train_labels)

In [19]:
# Score the random forest on the held-out test set with the same two
# metrics used for the one-vs-rest SVM.
forest_predictions = forest.predict(vectorised_test_documents)

forest_jaccard = jaccard_similarity_score(test_labels, forest_predictions)
print("Jaccard coef:", np.round(forest_jaccard, 3))

forest_hamming = hamming_loss(test_labels, forest_predictions)
print("Hamming Loss:", np.round(forest_hamming, 3))

Jaccard coef: 0.679
Hamming Loss: 0.009
