# Classification

In [46]:
from nltk.corpus import reuters
import spacy
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

# Shared spaCy pipeline (small English model); tokenize() below calls it to
# obtain lemmas and stop-word flags for every token.
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Turn raw text into a list of filtered lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words are dropped, and a lemma is kept only when it starts with at least
    one ASCII letter and is at least three characters long.

    Note that the pattern is anchored at the start of the lemma only, so a
    lemma such as ``abc123`` is kept.

    Args:
        text: The document text to tokenize.

    Returns:
        A list of lemma strings, in document order.
    """
    shortest_kept = 3
    leading_letters = re.compile('[a-zA-Z]+')

    kept = []
    for word in nlp(text):
        if word.is_stop:
            continue
        lemma = word.lemma_
        if leading_letters.match(lemma) and len(lemma) >= shortest_kept:
            kept.append(lemma)
    return kept

def represent(train_docs, test_docs, representer, train_ids=None, test_ids=None):
    """Vectorise the documents and binarise their Reuters category labels.

    Args:
        train_docs: Training documents, in the form ``representer`` accepts.
        test_docs: Test documents, in the form ``representer`` accepts.
        representer: Object exposing ``fit_transform`` and ``transform``;
            it is fitted on ``train_docs`` only.
        train_ids: Reuters file ids used to look up the training labels.
            Defaults to the notebook-level ``train_docs_id``.
        test_ids: Reuters file ids used to look up the test labels.
            Defaults to the notebook-level ``test_docs_id``.

    Returns:
        A tuple ``(vectorised_train_documents, train_labels,
        vectorised_test_documents, test_labels)``.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # keep working exactly as before.
    if train_ids is None:
        train_ids = train_docs_id
    if test_ids is None:
        test_ids = test_docs_id

    # Learn the representation on the training set, then apply it to both sets
    vectorised_train_documents = representer.fit_transform(train_docs)
    vectorised_test_documents = representer.transform(test_docs)

    # Binarise the multilabel targets; the label set is learned from the
    # training ids and reused for the test ids.
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_ids])
    test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_ids])

    return (vectorised_train_documents, train_labels, vectorised_test_documents, test_labels)

def evaluate(test_labels, predictions):
    """Print micro- and macro-averaged precision, recall and F1.

    Args:
        test_labels: Ground-truth labels, passed as the first argument to
            the scikit-learn metric functions.
        predictions: Predicted labels, passed as the second argument.

    Returns:
        None. The scores are printed, not returned.
    """
    report_line = "Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
    sections = (
        ('micro', "Micro-average quality numbers"),
        ('macro', "Macro-average quality numbers"),
    )

    for averaging, heading in sections:
        # Compute all three scores before printing, one averaging mode at a time.
        scores = [
            metric(test_labels, predictions, average=averaging)
            for metric in (precision_score, recall_score, f1_score)
        ]
        print(heading)
        print(report_line.format(*scores))

In [47]:
# Every file id known to the NLTK Reuters corpus reader.
documents = reuters.fileids()

# Split the ids by their prefix: ids starting with "train" vs. "test".
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]

# Raw text of each document, in the same order as the id lists.
train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))

In [48]:
# Candidate pipeline: TF-IDF features built with the custom tokenizer,
# classified by one LinearSVC per label (one-vs-rest).
candidate = dict(
    representer=TfidfVectorizer(tokenizer=tokenize),
    estimator=OneVsRestClassifier(LinearSVC(random_state=42)),
)

# NOTE: train_docs / test_docs are rebound here from the raw texts to the
# vectorised documents returned by represent().
(train_docs, train_labels,
 test_docs, test_labels) = represent(train_docs, test_docs, candidate['representer'])

In [49]:
# Fit the estimator on the training data, predict the test set, and print
# the quality numbers.
predictions = candidate['estimator'].fit(train_docs, train_labels).predict(test_docs)
evaluate(test_labels, predictions)

Micro-average quality numbers
Precision: 0.9468, Recall: 0.8032, F1-measure: 0.8691
Macro-average quality numbers
Precision: 0.6397, Recall: 0.3984, F1-measure: 0.4693


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Test with word2vec

In [None]:
import os

from gensim.scripts.glove2word2vec import glove2word2vec

# Directory that holds the GloVe vectors, and the name of the file inside it.
glove_path = './'
# TODO(review): placeholder file name -- the original cell never defined
# glove_filename; set this to the GloVe file actually downloaded.
glove_filename = 'glove.6B.100d.txt'

# glove2word2vec needs the path of the GloVe file itself, not its directory.
glove_input_file = os.path.join(glove_path, glove_filename)
word2vec_output_file = glove_filename+'.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# KeyedVectors was used without being imported anywhere in the notebook.
from gensim.models import KeyedVectors

# glove_filename must already be defined (name of the GloVe vectors file).
word2vec_output_file = glove_filename+'.word2vec'
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

#Show a word embedding
print('King: ',model.get_vector('king'))

# Analogy query: king + woman - man
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print('Most similar word to King + Woman - Man: ', result)


In [None]:
candidate = {'representer': TfidfVectorizer(tokenizer=tokenize),
             'estimator': OneVsRestClassifier(LinearSVC(random_state=42))}

# train_docs / test_docs were rebound to vectorised matrices by the earlier
# represent() call, so a fresh vectoriser cannot consume them. Reload the raw
# texts from the document ids before vectorising again.
raw_train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
raw_test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

train_docs, train_labels, test_docs, test_labels = represent(raw_train_docs, raw_test_docs, candidate['representer'])