In [None]:
import numpy as np
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AffinityPropagation 
from sklearn.cluster import Birch
from sklearn.metrics import adjusted_rand_score
from sklearn import metrics
from pprint import pprint


# Fetch the 20 Newsgroups corpus with a fixed shuffle seed.
categories = None  # no category filter is passed on to the loader
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

n_documents = len(dataset.data)
n_categories = len(dataset.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

# Ground-truth newsgroup ids and the number of distinct groups; both are read
# as globals by the clustering helpers defined further down.
labels = dataset.target
true_k = len(np.unique(labels))

# TF-IDF representation of the raw posts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset.data)
n_samples, n_features = X.shape
print("n_samples: %d, n_features: %d" % (n_samples, n_features))

In [None]:
def print_scores(input_data,targets,model_labels,sample_size):
    """Print external and internal quality scores for one clustering.

    Parameters
    ----------
    input_data : array-like or scipy sparse matrix
        Feature matrix the clustering was computed on. Only the internal
        indices (silhouette, Davies-Bouldin) read it.
    targets : array-like
        Ground-truth class label of every sample.
    model_labels : array-like
        Cluster id the model assigned to every sample.
    sample_size : int or None
        Passed to ``silhouette_score`` as the size of the subsample the
        coefficient is estimated on.

    Notes
    -----
    ``davies_bouldin_score`` validates its input as a dense array and raises
    on sparse matrices. For sparse ``input_data`` the score is therefore
    reported as unavailable instead of aborting the whole report.
    """
    # Local import: this notebook does not import scipy directly, but
    # scikit-learn depends on it, so it is always installed alongside.
    from scipy.sparse import issparse

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(targets, model_labels))
    print("Completeness: %0.3f" % metrics.completeness_score(targets, model_labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(targets, model_labels))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(targets, model_labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(input_data, model_labels, sample_size=sample_size))
    if issparse(input_data):
        # Densifying a documents x vocabulary matrix is usually not
        # affordable, so skip the metric rather than calling .toarray().
        print("Davies-Bouldin score: n/a (requires dense input)")
    else:
        print("Davies-Bouldin score: %0.3f"
              % metrics.davies_bouldin_score(input_data, model_labels))
    print()

def affinity_propagation(similarity_matrix,labels,dataset,input_data=None):
    """Cluster documents with affinity propagation and group them by exemplar.

    Parameters
    ----------
    similarity_matrix : array-like, square
        Precomputed pairwise similarities between all documents.
    labels : array-like
        Ground-truth class label per document, used only for scoring.
    dataset : object with a ``data`` sequence
        ``dataset.data[i]`` is the text of document ``i``, in the same order
        as the rows of ``similarity_matrix``.
    input_data : feature matrix, optional
        Features handed to ``print_scores`` for the internal indices.
        Defaults to the notebook-level TF-IDF matrix ``X``, which is what
        this function always used before the parameter existed.

    Returns
    -------
    dict
        Maps the text of each exemplar document to the list of document
        texts assigned to its cluster. Empty when the model produced no
        exemplars.
    """
    if input_data is None:
        # Backward-compatible fallback to the module-level feature matrix.
        input_data = X

    print("affinity_propagation")
    model = AffinityPropagation(affinity="precomputed", damping=0.5)
    t0 = time()
    model.fit(similarity_matrix)
    print("done in %0.3fs" % (time() - t0))

    exemplar_indices = model.cluster_centers_indices_
    if len(exemplar_indices) == 0:
        # No exemplars means every sample is labelled -1: the scores are
        # undefined and indexing the exemplar array below would raise.
        print("affinity propagation found no exemplars (no convergence); nothing to score")
        return {}

    print_scores(input_data,labels,model.labels_,1000)

    clusters = {}
    for sentence, cluster_id in zip(dataset.data, model.labels_):
        # Key each cluster by the text of its exemplar document.
        exemplar_text = dataset.data[exemplar_indices[cluster_id]]
        clusters.setdefault(exemplar_text, []).append(sentence)

    print(len(clusters))
    return clusters

def kmeans_performance(input_data, random_state=42):
    """Fit MiniBatchKMeans on ``input_data`` and print timing and scores.

    Parameters
    ----------
    input_data : feature matrix
        Samples to cluster; also passed on to ``print_scores``.
    random_state : int or None, default 42
        Seed for the k-means++ initialisation and mini-batch sampling, so a
        re-run of the notebook prints the same scores. Pass ``None`` for the
        previous unseeded behaviour.

    Reads the notebook-level globals ``true_k`` (number of clusters) and
    ``labels`` (ground-truth classes used for scoring).
    """
    print("MiniBatchKMeans " )
    model = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                            batch_size=10, random_state=random_state)
    t0 = time()
    model.fit(input_data)
    print("done in %0.3fs" % (time() - t0))
    print_scores(input_data,labels,model.labels_,1000)

    
def birch_performace(input_data):
    """Fit Birch on ``input_data`` and print timing and clustering scores.

    Reads the notebook-level globals ``true_k`` (number of clusters) and
    ``labels`` (ground-truth classes used for scoring).

    Parameters
    ----------
    input_data : feature matrix
        Samples to cluster; also passed on to ``print_scores``.
    """
    print("Birch")
    model = Birch(branching_factor=10, n_clusters=true_k, threshold=1, compute_labels=True)
    t0 = time()
    model.fit(input_data)
    print("done in %0.3fs" % (time() - t0))
    # `labels` must stay a global read here: assigning to it anywhere in this
    # function would make it local and the call below would raise
    # UnboundLocalError. The silhouette sample size matches the other
    # evaluations; 10 samples cannot represent the requested clusters.
    print_scores(input_data,labels,model.labels_,1000)
   

In [None]:

# Dense document-by-document similarity: dot products of the TF-IDF rows
# (cosine similarity under TfidfVectorizer's default L2 normalisation).
# NOTE: this is an n_samples x n_samples dense array, so memory grows
# quadratically with the number of documents.
# `.toarray()` replaces the `.A` shorthand, which recent SciPy releases no
# longer provide; `@` makes the matrix product explicit.
similarity_matrix = (X @ X.T).toarray()

kmeans_performance(X)
clusters = affinity_propagation(similarity_matrix,labels,dataset)
birch_performace(X)
