# Representation of the collection

In [6]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

def parse_collection():
    documents = reuters.fileids()
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    
    train_labels = [reuters.categories(doc_id) for doc_id in train_docs_id]
    test_labels = [reuters.categories(doc_id) for doc_id in test_docs_id]
    
    return (train_docs, test_docs, train_labels, test_labels)

def represent_collection(representer, train_docs, test_docs, train_labels, test_labels):
    # Learn and transform train documents
    vectorised_train_documents = representer.fit_transform(train_docs)
    vectorised_test_documents = representer.transform(test_docs)

    # Transform multilabel labels
    mlb = MultiLabelBinarizer()
    multilabel_train_labels = mlb.fit_transform(train_labels)
    multilable_test_labels = mlb.transform(test_labels)
    
    return (vectorised_train_documents, vectorised_test_documents, multilabel_train_labels, multilable_test_labels)

# Tokenisation 
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)

train_docs, test_docs, train_labels, test_labels = parse_collection()
train_docs, test_docs, train_labels, test_labels = represent_collection(vectorizer, 
                                                                        train_docs, test_docs, 
                                                                        train_labels, test_labels)

print("{} training documents".format(train_docs.shape[0]))
print("{} testing documents".format(test_docs.shape[0]))

7769 training documents
3019 testing documents


# Classification training and prediction

In [8]:
def train_and_predict(classifier, train_docs, train_labels, test_documents):
    classifier.fit(train_docs, train_labels)
    predictions = classifier.predict(test_docs)
    return predictions

classifier = OneVsRestClassifier(LinearSVC(random_state=42))
predictions = train_and_predict(classifier, train_docs, train_labels, test_docs)

print("{} labels predicted for {} documents".format(sum([sum(prediction) for prediction in predictions]),
                                                    test_docs.shape[0]))

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

# How well have we done?

This section will evaluate our solution using Precision, Recall and F1.

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Show our quality
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, 
                                                                     recall, 
                                                                     f1))
print()

precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, 
                                                                     recall, 
                                                                     f1))
print()
