# Representation of the collection

In [None]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

def parse_collection():
    """Load the Reuters corpus and split it into training and test parts.

    The split is driven by the file-id prefix: ids starting with ``"train"``
    go to the training part and ids starting with ``"test"`` go to the test
    part.

    Returns:
        A 4-tuple ``(train_docs, test_docs, train_labels, test_labels)``.
        The ``*_docs`` items are lists with the raw text of each document;
        the ``*_labels`` items are parallel lists holding whatever
        ``reuters.categories`` reports for each document id.
    """
    file_ids = reuters.fileids()
    train_ids = [file_id for file_id in file_ids if file_id.startswith("train")]
    test_ids = [file_id for file_id in file_ids if file_id.startswith("test")]

    # Raw texts first, then the category lists, each in file-id order.
    train_docs = list(map(reuters.raw, train_ids))
    test_docs = list(map(reuters.raw, test_ids))

    train_labels = list(map(reuters.categories, train_ids))
    test_labels = list(map(reuters.categories, test_ids))

    return train_docs, test_docs, train_labels, test_labels

def represent_collection(representer, train_docs, test_docs, train_labels, test_labels):
    """Vectorise the documents and binarise their label sets.

    ``representer`` is fitted on the training documents only and then reused
    to transform the test documents. The labels are handled the same way by a
    fresh ``MultiLabelBinarizer``: fitted on ``train_labels`` and then applied
    to ``test_labels``.

    Args:
        representer: Object exposing ``fit_transform`` and ``transform``.
        train_docs: Training documents handed to ``fit_transform``.
        test_docs: Test documents handed to ``transform``.
        train_labels: Per-document label collections for the training set.
        test_labels: Per-document label collections for the test set.

    Returns:
        A 4-tuple ``(train_matrix, test_matrix, train_targets, test_targets)``.
    """
    # Documents: learn the representation from the training set only.
    train_matrix = representer.fit_transform(train_docs)
    test_matrix = representer.transform(test_docs)

    # Labels: the label vocabulary also comes from the training set only.
    binarizer = MultiLabelBinarizer()
    train_targets = binarizer.fit_transform(train_labels)
    test_targets = binarizer.transform(test_labels)

    return train_matrix, test_matrix, train_targets, test_targets

# Classification training and prediction

In [None]:
def train_and_predict(classifier, train_docs, train_labels, test_docs):
    """Fit ``classifier`` on the training data and predict the test data.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        train_docs: Training inputs passed to ``fit``.
        train_labels: Training targets passed to ``fit``.
        test_docs: Inputs passed to ``predict`` after fitting.

    Returns:
        Whatever ``classifier.predict(test_docs)`` returns.
    """
    # The return value of fit() is deliberately ignored; only the fitted
    # state of the classifier matters.
    classifier.fit(train_docs, train_labels)
    return classifier.predict(test_docs)

# How well have we done?

This section evaluates our solution using precision, recall, and F1-measure, each reported with both micro- and macro-averaging.

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def quality_metrics(labels, predictions, average='micro'):
    """Score ``predictions`` against ``labels``.

    Args:
        labels: Reference labels, passed as the first argument to each scorer.
        predictions: Predicted labels, passed as the second argument.
        average: Averaging mode forwarded unchanged to every scorer
            (``'micro'`` unless the caller says otherwise).

    Returns:
        A list ``[precision, recall, f1]``, in that order.
    """
    # Order matters: callers unpack the result positionally.
    scorers = (precision_score, recall_score, f1_score)
    return [scorer(labels, predictions, average=average) for scorer in scorers]

In [None]:
def evaluate_model(representer, classifier, description):
    """Run one complete experiment and report its quality on stdout.

    The collection is parsed, vectorised with ``representer``, and used to
    train ``classifier``; the test-set predictions are then scored with both
    micro- and macro-averaged precision, recall and F1.

    Args:
        representer: Vectoriser handed to ``represent_collection``.
        classifier: Estimator handed to ``train_and_predict``.
        description: Free-text label printed after ``"Running: "``.

    Returns:
        A dict with keys ``'micro'`` and ``'macro'``, each mapping to the
        ``[precision, recall, f1]`` list produced by ``quality_metrics``.
    """
    # Representation
    raw_train, raw_test, raw_train_labels, raw_test_labels = parse_collection()
    train_matrix, test_matrix, train_targets, test_targets = represent_collection(
        representer, raw_train, raw_test, raw_train_labels, raw_test_labels)
    test_count = test_matrix.shape[0]
    print("{} training documents".format(train_matrix.shape[0]))
    print("{} testing documents".format(test_count))

    # Training and prediction
    predictions = train_and_predict(classifier, train_matrix, train_targets, test_matrix)
    # Total number of labels assigned, summed row by row over the predictions.
    assigned_labels = sum(sum(row) for row in predictions)
    print("{} labels predicted for {} documents"
          .format(assigned_labels, test_count))
    print()

    # Evaluation
    scores = {
        'micro': quality_metrics(test_targets, predictions),
        'macro': quality_metrics(test_targets, predictions, average='macro'),
    }

    # Reporting
    metrics_line = "Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
    print("Running: "+description)
    for heading, key in (("Micro-average quality numbers", 'micro'),
                         ("Macro-average quality numbers", 'macro')):
        print(heading)
        print(metrics_line.format(*scores[key]))
        print()
    return scores
    
# Seed handed to every estimator below that accepts one.
random_state = 42
# English stop-word list, loaded once and shared by all three vectorisers.
stop_words = stopwords.words("english")

# First model: TF-IDF with default settings + one-vs-rest linear SVM.
vectorizer = TfidfVectorizer(stop_words=stop_words)
classifier = OneVsRestClassifier(LinearSVC(random_state=random_state))
q1 = evaluate_model(vectorizer, classifier, 'TFIDF LinearSVC')

# Second model: TF-IDF with document-frequency cut-offs, sublinear tf and
# L2 normalisation + one-vs-rest linear SVM.
vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=2,
                             max_df=0.90,
                             use_idf=True, sublinear_tf=True,
                             norm='l2')
classifier = OneVsRestClassifier(LinearSVC(random_state=random_state))
# The description now states min_df=2, matching the vectoriser above
# (it previously claimed "minDF 3").
q2 = evaluate_model(vectorizer, classifier, 'TFIDF minDF 2, max DF 90, L2, sublinear, LinearSVC')

# Third model: same weighting, vocabulary capped at 3000 features + kNN.
vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=2,
                             max_df=0.90, max_features=3000,
                             use_idf=True, sublinear_tf=True,
                             norm='l2')
# n_neighbors is passed through the constructor. KNeighborsClassifier has no
# random_state parameter, so the attribute that used to be assigned here had
# no effect and has been dropped.
classifier = KNeighborsClassifier(n_neighbors=45)
q3 = evaluate_model(vectorizer, classifier, 'minDF 2, max DF 90, L2, sublinear, kNN 45')

# Raw metric dictionaries for the three runs, in order.
print(q1)
print(q2)
print(q3)
