**functions**

In [21]:
import nltk
from nltk.stem.snowball import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# todo we can propose a simple "stemmer" :)

stemming_algorithm = "<stemming algorithm>"
def tokenizer_porter_stemmer(text):
    global stemming_algorithm
    stemming_algorithm = "PorterStemmer + SimpleTokenizer"
    stems = stem(tokenize(text), PorterStemmer())
    return stems

def tokenizer_snowball_stemmer(text):
    global stemming_algorithm
    stemming_algorithm = "SnowballStemmer + SimpleTokenizer"
    stems = stem(tokenize(text), SnowballStemmer("english"))
    return stems

def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # tokens = nltk.word_tokenize(text)
    return tokens

def stem(tokens, stemmer):
    # each token (word?) will be stemmed
    # stemmed = []
    # for item in tokens:
    #     stemmed.append(stemmer.stem(item))
    # return stemmed
    return [stemmer.stem(t) for t in tokens]

def filter_tokens(tokens, regex = '[a-zA-Z]'):
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search(regex, token):
            filtered_tokens.append(token)
            # filtered_tokens.append(re.sub(r"[^A-Za-z]", "", token))
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

**fetch classic database and store it in disk**

In [9]:
from classicdb.fetch import fetch_classic
import numpy as np

dataset = fetch_classic(subset='all', categories=None, shuffle=False)
labels = dataset.target
unique_labels = np.unique(labels)
true_k = unique_labels.shape[0]

print("%d documents" % len(dataset.data))
print("unique labels: " + str(unique_labels))

7095 documents
unique labels: [0 1 2 3]


**extracting features, PORTER stemming**

In [25]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer


print("Extracting features from the training dataset ")
t0 = time()
vectorizer = TfidfVectorizer(stop_words='english',
                             use_idf=True,
                             max_features=None,
                             tokenizer=tokenizer_porter_stemmer,
                             ngram_range=(1, 1))
X = vectorizer.fit_transform(dataset.data)
print("using: " + stemming_algorithm)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

terms = vectorizer.get_feature_names()
print("some feature terms:")
print(terms[0:1000])
print()

Extracting features from the training dataset 


using: PorterStemmer + SimpleTokenizer
done in 25.241794s
n_samples: 7095, n_features: 24489

some feature terms:
['!', '$', '%', '&', "'", "''", "''+p", "'60", "'68", "'70", "'70'", "'78", "'a", "'achondroplast", "'adequ", "'agnos", "'agnosic'", "'air", "'alali", "'alveolar", "'anyhow", "'appar", "'averag", "'azaserin", "'basic", "'bend", "'better", "'bibliograph'", "'bibliometr", "'bodi", "'build", "'built-in", "'campo", "'catch-up", "'cell", "'child", "'choke", "'citat", "'citeabl", "'cleft", "'cold", "'commando's'", "'commun", "'compar", "'complement", "'complet", "'comput", "'computer-bas", "'condit", "'core", "'correct", "'cosmopolitan", "'cram", "'crisis-st", "'criteria", "'cut-off", "'densiti", "'describ", "'dialysi", "'displac", "'document", "'documentalist", "'duct", "'earli", "'eclips", "'elementari", "'equival", "'ex", "'exact", "'exact'", "'failur", "'fat'", "'first", "'flat", "'flow", "'forc", "'freez", "'general-dynam", "'giant", "'group", "'growth", "'half", "'half-lif"

**extracting features, SNOWBALL stemming**

In [26]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

print("Extracting features from the training dataset")
t0 = time()
vectorizer = TfidfVectorizer(stop_words='english',
                             use_idf=True,
                             max_features=None,
                             tokenizer=tokenizer_snowball_stemmer,
                             ngram_range=(1, 1))
X = vectorizer.fit_transform(dataset.data)
print("using: " + stemming_algorithm)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

terms = vectorizer.get_feature_names()
print("some feature terms:")
print(terms[0:1000])
print()

Extracting features from the training dataset


using: SnowballStemmer + SimpleTokenizer
done in 31.469388s
n_samples: 7095, n_features: 24084

some feature terms:
['!', '$', '%', '&', "'", "''", "'+p", "'a", "'n", "'s", "'v", '(', ')', '*', '***medlearn***', '**-at', '*c', '*ln', '+', '++', '+-', '+-f', '+100', '+16mv', '+18degre', '+56', '+8mv', '+o', '+q', '+sin', '+size', '+u', ',', ',l', '-', '--', '-.5*ln', '-0.0904', '-0.199', '-1', '-1.087', '-1.2', '-1.23849', '-10', '-13', '-2', '-25', '-26', '-3', '-340', '-35', '-4', '-40', '-40degre', '-4degre', '-5', '-55', '-60', '-67.6', '-90', '-a', '-amid', '-an', '-bx', '-cholografin', '-construct', '-crysta-', '-crystallin', '-dash', '-distribut', '-georg', '-ii', '-impress', '-in', '-irradi', '-keto', '-ketoglutar', '-label', '-ln', '-m', '-methyl-m-trifluoromethylphenethylamin', '-multiprogram', '-n', '-o', '-p', '-plane', '-point', '-selenouracil', '-selenourea', '-semivertex-angl', '-space', '-sphingosin', '-stabil', '-stabl', '-t', '-u', '-with', '-yy', '.', '.-j', '..', '..

In [27]:
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans

# from sklearn.metrics.pairwise import cosine_similarity
# dist = 1 - cosine_similarity(tfidf_matrix)

km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, verbose=False)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("true labels vs cluster labels")
print(labels[0:50])
print(km.labels_[0:50])
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
print()

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.153s

true labels vs cluster labels
[3 0 0 1 2 0 0 1 0 3 0 0 2 0 0 1 3 2 1 2 2 0 3 0 0 3 3 0 0 0 3 0 3 0 3 0 3
 3 1 0 2 3 0 3 0 0 0 2 2 0]
[1 3 0 3 1 0 2 3 3 0 2 2 1 3 0 3 1 1 3 1 1 2 1 0 2 1 1 0 3 0 0 3 1 0 1 3 1
 1 3 0 1 1 2 1 3 2 2 1 1 2]

Homogeneity: 0.509
Completeness: 0.488
V-measure: 0.498
Adjusted Rand-Index: 0.359


Silhouette Coefficient: 0.013

Top terms per cluster:
Cluster 0: comput . program method fortran algol , use techniqu algorithm
Cluster 1: . , flow pressur boundari number effect layer equat method
Cluster 2: algorithm ) ( ] [ integr function $ matrix polynomi
Cluster 3: , . inform librari program use languag comput data retriev
