## Classifying Reuters-21578 collection with Python
Original article: https://miguelmalvarez.com/2016/11/07/classifying-reuters-21578-collection-with-python/

Modified by: M. Amintoosi

In [7]:
import nltk
# Fetch the NLTK data packages used by the rest of the notebook. Each call
# logs its progress; the value of the last call is what the cell displays.
nltk.download('stopwords')  # stop-word lists, read later via stopwords.words("english")
nltk.download('reuters')  # the Reuters corpus accessed through nltk.corpus.reuters
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize needs — confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [1]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
 
cachedStopWords = stopwords.words("english")
 
# Helpers shared by every tokenize() call. They are built once here instead of
# once per token (stemmer) or once per call (regex, stop-word lookup table).
_STEMMER = PorterStemmer()
# Set membership is O(1); scanning the original list was O(len(list)) per word.
_STOP_WORDS = frozenset(cachedStopWords)
# Used with .match(), so it only requires that a token *starts* with a letter.
_STARTS_WITH_LETTER = re.compile('[a-zA-Z]+')


def tokenize(text, min_length=3):
  """Split ``text`` into lower-cased, stop-word-free, Porter-stemmed tokens.

  Processing order: tokenise with ``word_tokenize``, lower-case each word,
  drop words found in ``cachedStopWords``, stem what is left, then keep only
  stems that begin with an ASCII letter and are at least ``min_length``
  characters long.

  Args:
    text: The raw document string to tokenise.
    min_length: Minimum length (in characters) a stem must have to be kept.
      Defaults to 3, the value that was previously hard-coded.

  Returns:
    A list of stems, in document order.
  """
  lowered = (word.lower() for word in word_tokenize(text))
  # Stop words are removed before stemming, so the comparison is made against
  # the unstemmed, lower-cased word.
  stems = (_STEMMER.stem(word) for word in lowered
           if word not in _STOP_WORDS)
  return [stem for stem in stems
          if _STARTS_WITH_LETTER.match(stem) and len(stem) >= min_length]

In [2]:
from nltk.corpus import stopwords, reuters
# from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# English stop-word list; passed to the vectoriser below.
stop_words = stopwords.words("english")

# All file ids known to the corpus reader.
documents = reuters.fileids()

# Split the ids by their prefix: ids starting with "train" form the training
# set, ids starting with "test" form the test set.
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]

# Raw text of every document, in the same order as the id lists.
train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))
 
# Tokenisation
# Build a TF-IDF vectoriser that delegates tokenisation to the custom
# tokenize() function defined earlier in this notebook.
# NOTE(review): tokenize() already removes stop words before stemming, yet
# stop_words is passed here as well. The truncated "'stop_words.' % sorted(
# inconsistent))" fragment in this cell's output looks like the tail of
# scikit-learn's stop-word inconsistency warning — confirm, and decide whether
# stop_words should be dropped (doing so may change the learned vocabulary).
vectorizer = TfidfVectorizer(stop_words=stop_words,
                             tokenizer=tokenize)
# Alternative kept for reference: plain NLTK tokenisation without stemming.
#                              tokenizer=word_tokenize)
 
# Learn and transform train documents
# The vectoriser is fitted on the training documents only; the test documents
# are then transformed with that already-fitted vectoriser.
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)
 
# Encode the (possibly multiple) categories of each document as a binary
# indicator matrix. The binarizer is fitted on the training labels and then
# reused, unchanged, for the test labels.
mlb = MultiLabelBinarizer()
train_label_sets = list(map(reuters.categories, train_docs_id))
test_label_sets = list(map(reuters.categories, test_docs_id))
train_labels = mlb.fit_transform(train_label_sets)
test_labels = mlb.transform(test_label_sets)
 
# One-vs-rest wrapper around a linear SVM with a fixed random_state;
# fit() returns the fitted estimator, so construction and training are chained.
classifier = OneVsRestClassifier(LinearSVC(random_state=42)).fit(
    vectorised_train_documents, train_labels)

# Predicted label indicator matrix for the test documents.
predictions = classifier.predict(vectorised_test_documents)

  'stop_words.' % sorted(inconsistent))


In [17]:
# Look at the first training document's TF-IDF row twice: as a dense array
# and in the sparse (row, column) -> weight listing.
tmp = vectorised_train_documents[0, :]
print(tmp.toarray(), tmp, sep="\n")

[[0. 0. 0. ... 0. 0. 0.]]
  (0, 1421)	0.2290813551945604
  (0, 3529)	0.2815691787361101
  (0, 15546)	0.06537563190866612
  (0, 16709)	0.054046168604127816
  (0, 3900)	0.023754129808964037
  (0, 18510)	0.0418993311187372
  (0, 20058)	0.06734000282564423
  (0, 20674)	0.04175490718197429
  (0, 575)	0.05323819068788935
  (0, 5295)	0.04300561184179896
  (0, 16830)	0.025476637359625777
  (0, 5448)	0.028244773563675808
  (0, 9373)	0.023911353274921356
  (0, 8775)	0.028406190037275698
  (0, 14590)	0.03407710883503197
  (0, 3619)	0.05966114382643919
  (0, 18309)	0.1145406775972802
  (0, 620)	0.0317956666309888
  (0, 12648)	0.035412330583819596
  (0, 8507)	0.05496217841627037
  (0, 10273)	0.025126103475552445
  (0, 15491)	0.0418993311187372
  (0, 3633)	0.31777993287677625
  (0, 17001)	0.2167873893572331
  (0, 15980)	0.04929645159726099
  :	:
  (0, 12887)	0.1764032327692246
  (0, 4589)	0.10398384446706203
  (0, 4812)	0.04267091699411684
  (0, 19091)	0.036213132075115624
  (0, 4099)	0.058801077589

In [3]:
# Display the test matrix's repr (shape, dtype, stored elements, sparse format).
vectorised_test_documents

<3019x20684 sparse matrix of type '<class 'numpy.float64'>'
	with 128979 stored elements in Compressed Sparse Row format>

In [6]:
# Display the training matrix's repr (shape, dtype, stored elements, sparse format).
vectorised_train_documents

<7769x20684 sparse matrix of type '<class 'numpy.float64'>'
	with 370244 stored elements in Compressed Sparse Row format>

In [4]:
from sklearn.metrics import f1_score,precision_score,recall_score
 
# Report precision, recall and F1 on the test set for both averaging modes.
# The micro and macro reports were two copy-pasted stanzas; a single loop keeps
# them in sync. Printed text and the final values of precision / recall / f1
# (those of the last mode, "macro") are the same as before.
for average, heading in (("micro", "Micro-average quality numbers"),
                         ("macro", "Macro-average quality numbers")):
    precision = precision_score(test_labels, predictions, average=average)
    recall = recall_score(test_labels, predictions, average=average)
    f1 = f1_score(test_labels, predictions, average=average)

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
          .format(precision, recall, f1))

Micro-average quality numbers
Precision: 0.9455, Recall: 0.8013, F1-measure: 0.8674
Macro-average quality numbers
Precision: 0.6493, Recall: 0.3948, F1-measure: 0.4665


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [5]:
# import scipy.io as sio
# sio.savemat('rt.mat', {'X_train':vectorised_train_documents,'train_labels':train_labels,'X_test':vectorised_test_documents,'test_labels':test_labels})

In [22]:
# Toy illustration of multilabel binarisation. A fresh binarizer is used on
# purpose: calling fit_transform on the shared `mlb` would re-fit it on these
# toy labels and discard the Reuters category mapping learned earlier.
MultiLabelBinarizer().fit_transform([(1, 2), (3,), (1, 4)])

array([[1, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 1]])