In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import wikipedia
from kmeans import KMeans

In [2]:
titles = [
    'Linear algebra',
    'Data Science',
    'Artificial intelligence',
    'European Central Bank',
    'Financial technology',
    'International Monetary Fund',
    'Basketball',
    'Swimming',
    'Cricket'
]

In [3]:
def load_data():
    articles = [wikipedia.page(
        title, preload=True).content for title in titles]
    vectorizer = TfidfVectorizer(stop_words={'english'})
    x_train = vectorizer.fit_transform(articles).toarray()
    y_train = np.arange(len(titles))

    return (x_train, y_train), vectorizer

In [5]:
(x_train, y_train), vectorizer = load_data()

In [6]:
def main():
    print("Data loaded, Finding Clusters ...")
    k = [4, 8,12]
    losses = []
    for num_clusters in k:
        kmeans = KMeans(x_train, y_train, num_clusters=num_clusters,
                        seed='cluster', tol=1e-9, max_iter=200)
        kmeans.fit(verbose=False)
        print("Clusters found, printing results ...")
        losses.append(kmeans.calc_loss())
        clusters = [[] for i in range(num_clusters)]
        for i,title in enumerate(titles):
            index = kmeans.cluster_labels[i]
            clusters[index].append(title)
        print("Clusters:")
        for i, cluster in enumerate(clusters):
            print("Cluster {}: {}".format(i, cluster))


In [7]:
main()

Data loaded, Finding Clusters ...
Total Iterations: 1, Loss: 0.22791402402506136
Clusters found, printing results ...
Clusters:
Cluster 0: ['Linear algebra', 'European Central Bank', 'International Monetary Fund', 'Basketball', 'Cricket']
Cluster 1: ['Financial technology']
Cluster 2: ['Data Science', 'Artificial intelligence']
Cluster 3: ['Swimming']
Total Iterations: 1, Loss: 0.028789961445557916
Clusters found, printing results ...
Clusters:
Cluster 0: ['Artificial intelligence']
Cluster 1: ['European Central Bank', 'International Monetary Fund']
Cluster 2: ['Cricket']
Cluster 3: ['Financial technology']
Cluster 4: ['Linear algebra']
Cluster 5: ['Data Science']
Cluster 6: ['Basketball']
Cluster 7: ['Swimming']
Total Iterations: 1, Loss: 0.07497952570859467
Clusters found, printing results ...
Clusters:
Cluster 0: ['Data Science']
Cluster 1: ['Financial technology', 'International Monetary Fund']
Cluster 2: ['Swimming']
Cluster 3: ['European Central Bank']
Cluster 4: []
Cluster 5: []