In [1]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Loading the corpus
ma_reuters = LazyCorpusLoader(
    'ma_reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
    cat_file='cats.txt', encoding='ISO-8859-2')

# Load MA_Reuters
documents = ma_reuters.fileids()
print (str(len(documents)) + " total articles")
# extracting training and testing data (document ID)
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print (str(len(train_docs_id)) + " training data")
print (str(len(test_docs_id)) + " testing data")
# Training and testing data
train_docs = [ma_reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [ma_reuters.raw(doc_id) for doc_id in test_docs_id]
 
# print the total number of categories
categories = ma_reuters.categories()
num_categories = len(categories)
print (num_categories, " categories")
print (categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id])

In [3]:
from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec.load('apnews_dbow/doc2vec.bin')



In [4]:
train_vector = [model.infer_vector(doc) for doc in train_docs]
test_vector = [model.infer_vector(doc) for doc in test_docs]

In [5]:
print(len(train_vector[0]))

300


In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
# multi-class, multi-label classification and prediction
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41)) 
OVR_classifier.fit(train_vector, train_labels)
OVR_predictions = OVR_classifier.predict(test_vector)
# Evaluation
import numpy as np
# Jaccard coefficient
from sklearn.metrics import jaccard_similarity_score
#print ("Jaccard coef:",np.round(jaccard_similarity_score(test_labels, OVR_predictions, average='samples'),3))
print ("Jaccard coef:",np.round(jaccard_similarity_score(test_labels, OVR_predictions),3))

# Hamming Loss
from sklearn.metrics import hamming_loss
print ("Hamming Loss:",np.round(hamming_loss(test_labels, OVR_predictions),3))
#test_pred = rf_ma_reuters.predict(test_docs)
#print(classification_report(test_labels, test_pred, target_names=categories, digits=5))

Jaccard coef: 0.501
Hamming Loss: 0.015


In [8]:
Jmax = 0.500001
Jmax_category = ""
Jmin_category = ""
Jmin = 0.5
for c in categories:
    category_docs_c = ma_reuters.fileids(str(c));
    print("category:" + c)
    test_docs_id_c = list(filter(lambda doc: doc.startswith("test"), category_docs_c))
    test_docs_c = [ma_reuters.raw(doc_id) for doc_id in test_docs_id_c]
    # transform
    vectorised_test_document_c = [model.infer_vector(doc) for doc in test_docs_c]
    print("  testing document dimension：",len(vectorised_test_document_c))
    OVR_predictions_c = OVR_classifier.predict(vectorised_test_document_c)
    test_label_c = mlb.transform([ma_reuters.categories(doc_id) for doc_id in test_docs_id_c])
    JacCoef = np.round(jaccard_similarity_score(test_label_c, OVR_predictions_c),3)
    HamLoss = np.round(hamming_loss(test_label_c, OVR_predictions_c),3)
    print ("  Jaccard coef:",JacCoef)
    print ("  Hamming Loss:",HamLoss)
    if(JacCoef > Jmax):
        Jmax = JacCoef
        Jmax_category = c
    if(JacCoef < Jmin):
        Jmin = JacCoef
        Jmin_category = c

category:acq
  testing document dimension： 719
  Jaccard coef: 0.665
  Hamming Loss: 0.009
category:alum
  testing document dimension： 23
  Jaccard coef: 0.0
  Hamming Loss: 0.03
category:barley
  testing document dimension： 14
  Jaccard coef: 0.02
  Hamming Loss: 0.084
category:bop
  testing document dimension： 30
  Jaccard coef: 0.0
  Hamming Loss: 0.038
category:carcass
  testing document dimension： 18
  Jaccard coef: 0.0
  Hamming Loss: 0.055
category:cocoa
  testing document dimension： 18
  Jaccard coef: 0.028
  Hamming Loss: 0.023
category:coffee
  testing document dimension： 28
  Jaccard coef: 0.0
  Hamming Loss: 0.025
category:copper
  testing document dimension： 18
  Jaccard coef: 0.019
  Hamming Loss: 0.03
category:corn
  testing document dimension： 56
  Jaccard coef: 0.029
  Hamming Loss: 0.073
category:cotton
  testing document dimension： 20
  Jaccard coef: 0.004
  Hamming Loss: 0.08
category:cpi
  testing document dimension： 28
  Jaccard coef: 0.0
  Hamming Loss: 0.034
cat

In [9]:
print("Highest Jaccard score is " + Jmax_category + ": " + str(Jmax))
print("Lowest Jaccard score is " + Jmin_category + ": " + str(Jmin))

Highest Jaccard score is earn: 0.912
Lowest Jaccard score is alum: 0.0


In [13]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000,max_depth=5) 
forest = forest.fit(train_vector, train_labels)

In [14]:
forest_predictions = forest.predict(test_vector)
print ("Jaccard coef:",np.round(jaccard_similarity_score(test_labels, forest_predictions),3))
print ("Hamming Loss:",np.round(hamming_loss(test_labels, forest_predictions),3))

Jaccard coef: 0.495
Hamming Loss: 0.014
