### Reuters Dataset

In [3]:
# Data Parsing

import os
import itertools
from bs4 import BeautifulSoup
import time

start = time.time()

dataset = "./reuters21578/"
documents = [file for file in os.listdir(dataset) if file.endswith('.sgm')]

reuters_news = []
for document in documents: # store separately single documents from each sgm file in X
    with open(dataset+document,"r",encoding='latin-1') as f:
        content = f.read()
    reuters_news.append(content.split("</REUTERS>"))
reuters_news = list(itertools.chain.from_iterable(reuters_news))

X_train = []
X_test = []
y_train = []
y_test = []

# Following "The Modified Apte ("ModApte") Split"
for article in reuters_news:
    if  'LEWISSPLIT="TRAIN"' in article and 'TOPICS="YES"' in article:
        X_train.append(article)
    elif 'LEWISSPLIT="TEST"' in article and 'TOPICS="YES"' in article:
        X_test.append(article)


for index,doc in enumerate(X_train):
    soup = BeautifulSoup(doc, "html.parser")
    X_train[index] = (soup.find("text").text)
    topic = str(soup.find_all('topics'))
    x = [tag.split("</d>")[0] for tag in topic.split("<d>") if "</d>" in tag] # split topics if many in a single tag
    y_train.append(x)

for index,doc in enumerate(X_test):
    soup = BeautifulSoup(doc, "html.parser")
    X_test[index] = (soup.find("text").text)
    topic = str(soup.find_all('topics'))
    x = [tag.split("</d>")[0] for tag in topic.split("<d>") if "</d>" in tag]
    y_test.append(x)

end = time.time()
print("Time for data parsing: ", end - start, "seconds")

Time for data parsing:  9.937043905258179 seconds


In [4]:
# Feature extraction with Tfidf Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

start = time.time()

vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[A-Za-z]+\b')
vectorizer.fit(X_train)
vectorizer.fit(X_test)

# transforming the data
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

end = time.time()
print("Time for feature extraction: ", end-start, "seconds")


Time for feature extraction:  2.4923529624938965 seconds


  .format(sorted(unknown, key=str)))


In [5]:
# Train models

from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB # probabilistic
from sklearn.svm import LinearSVC   #non-probabilistic

start = time.time()
reuters_naive = OneVsRestClassifier(MultinomialNB(alpha=0.01)) # probabilistic
reuters_naive.fit(X_train_tfidf,y_train)
end = time.time()
print("Time for probabilistic modeling: ", end-start, "seconds")

start = time.time()
reuters_svc = OneVsRestClassifier(LinearSVC()) # non-probabilistic
reuters_svc.fit(X_train_tfidf,y_train)
end = time.time()
print("Time for non-probabilistic modeling: ", end-start, "seconds")

Time for probabilistic modeling:  0.6419467926025391 seconds
Time for non-probabilistic modeling:  3.024366855621338 seconds


In [6]:
# Evaluate models + Report

from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import classification_report

start = time.time()
pred_naive_reuters = reuters_naive.predict(X_test_tfidf)
pred_svc_reuters = reuters_svc.predict(X_test_tfidf)
end = time.time()
print("Time for prediction: ", end-start, "seconds")

targets = mlb.classes_

print(classification_report(y_test,pred_naive_reuters, target_names = targets))
print("Accuracy of the probabilistic model: ", accuracy_score(y_test, pred_naive_reuters))
print("Hamming loss of the probabilistic model: ", hamming_loss(y_test, pred_naive_reuters))
print("Zero-one loss of the probabilistic model: ", zero_one_loss(y_test, pred_naive_reuters))
print(classification_report(y_test,pred_svc_reuters, target_names = targets))
print("Accuracy of the non-probabilistic model: ", accuracy_score(y_test, pred_svc_reuters))
print("Hamming loss of the non-probabilistic model: ", hamming_loss(y_test, pred_svc_reuters))
print("Zero-one loss of the probabilistic model: ", zero_one_loss(y_test, pred_svc_reuters))

Time for prediction:  0.23117899894714355 seconds
                 precision    recall  f1-score   support

            acq       0.94      0.86      0.90       719
           alum       0.39      0.30      0.34        23
        austdlr       0.00      0.00      0.00         0
         barley       0.40      0.57      0.47        14
            bop       0.70      0.47      0.56        30
            can       0.00      0.00      0.00         0
        carcass       0.30      0.50      0.37        18
     castor-oil       0.00      0.00      0.00         1
     castorseed       0.00      0.00      0.00         0
     citruspulp       0.00      0.00      0.00         0
          cocoa       0.70      0.78      0.74        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.68      0.93      0.79        28
         copper       0.30      0.50      0.37        18
     copra-cake       0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 20newsgroups

In [7]:
# Parse data
from sklearn.datasets import fetch_20newsgroups

train_set = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_set = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))


# Vectorization

vectorizer2 = TfidfVectorizer(stop_words = 'english',ngram_range=(1,3))
train_vectors = vectorizer2.fit_transform(train_set.data)
test_vectors = vectorizer2.transform(test_set.data)


# Train models

start = time.time()
naive20 = MultinomialNB(alpha= 0.01) 
naive20.fit(train_vectors, train_set.target)
end = time.time()
print("Time for probabilistic modeling: ", end-start, "seconds")

start = time.time()
svc20 = LinearSVC()
svc20.fit(train_vectors, train_set.target)
end = time.time()
print("Time for non-probabilistic modeling: ", end-start, "seconds")


# Evaluation

start = time.time()
naive20_eval = naive20.predict(test_vectors)
svc20_eval = svc20.predict(test_vectors)
end = time.time()
print("Time for prediction: ", end-start, "seconds")

# Report

targets2 = test_set.target_names
print(classification_report(test_set.target,naive20_eval,target_names=targets2))
print("Zero-one loss of the probabilistic model: ", zero_one_loss(test_set.target, naive20_eval))
print(classification_report(test_set.target,svc20_eval,target_names=targets2))
print("Zero-one loss of the non-probabilistic model: ", zero_one_loss(test_set.target, svc20_eval))

Time for probabilistic modeling:  1.8166279792785645 seconds
Time for non-probabilistic modeling:  9.393901109695435 seconds
Time for prediction:  0.3518869876861572 seconds
                          precision    recall  f1-score   support

             alt.atheism       0.72      0.39      0.51       319
           comp.graphics       0.65      0.72      0.68       389
 comp.os.ms-windows.misc       0.69      0.55      0.61       394
comp.sys.ibm.pc.hardware       0.63      0.72      0.67       392
   comp.sys.mac.hardware       0.76      0.66      0.71       385
          comp.windows.x       0.79      0.76      0.78       395
            misc.forsale       0.78      0.77      0.78       390
               rec.autos       0.78      0.71      0.75       396
         rec.motorcycles       0.81      0.72      0.76       398
      rec.sport.baseball       0.93      0.79      0.85       397
        rec.sport.hockey       0.59      0.94      0.72       399
               sci.crypt       0.