In [1]:
import collections
import spacy
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.mixture.gaussian_mixture import GaussianMixture

In [2]:
spacy_nlp = spacy.load('en_core_web_sm')
def tokenize(article):
    try:
        article_lines = article.readlines()
        new_lines = []
        for idx, line in enumerate(article_lines):
            tokens = line.split(':')
            if ' ' not in tokens[0] and tokens[0] not in ['Subject', 'Highlights']:
                continue
            elif tokens[0] == 'Subject':
                line = line[8:]
            if line[0] in ['>', '@']:
                continue
            new_lines.append(line)
        doc = spacy_nlp('\n'.join(new_lines))
        tokens = [token.lemma_.lower().strip() for token in doc if not token.is_stop | token.is_punct | token.is_space | (token.lemma_ == '-PRON-') | token.like_num | token.like_email | (not token.is_alpha)]
        return tokens
    except:
        return []

In [3]:
hockey_docs = [' '.join(tokenize(open('News/hockey/{}'.format(f), encoding='utf8'))) for f in os.listdir('News/hockey/')]
auto_docs = [' '.join(tokenize(open('News/autos/{}'.format(f), encoding='utf8'))) for f in os.listdir('News/autos/')]
space_docs = [' '.join(tokenize(open('News/space/{}'.format(f), encoding='utf8'))) for f in os.listdir('News/space/')]
labels = [0] * len(hockey_docs) + [1] * len(auto_docs) + [2] * len(space_docs)

In [4]:
corpus = hockey_docs + auto_docs + space_docs
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(hockey_docs + auto_docs + space_docs)

In [40]:
km_model = KMeans(n_clusters=3)
km_model.fit_transform(X)

clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

print("Micro F-1 Score: %0.3f" % metrics.f1_score(labels, km_model.labels_, average='micro'))
print("Macro F-1 Score: %0.3f" % metrics.f1_score(labels, km_model.labels_, average='macro'))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km_model.labels_))

Micro F-1 Score: 0.334
Macro F-1 Score: 0.291
Adjusted Rand-Index: 0.699


In [41]:
gmm_model = GaussianMixture(n_components=3, covariance_type='diag')
gmm_model.fit(X.toarray())
pred = gmm_model.predict(X.toarray())

print("Micro F-1 Score: %0.3f" % metrics.f1_score(labels, pred, average='micro'))
print("Macro F-1 Score: %0.3f" % metrics.f1_score(labels, pred, average='macro'))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, pred))

Micro F-1 Score: 0.330
Macro F-1 Score: 0.294
Adjusted Rand-Index: 0.733
