In [3]:
from sklearn.datasets import fetch_20newsgroups, fetch_mldata
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import confusion_matrix
from numpy import linalg as LA
import numpy as np
import sys
sys.path.append('../data/fashion-mnist/utils')
import mnist_reader
import scipy

In [8]:
newsgroups = fetch_20newsgroups(subset='all', data_home='../data/')
vectortype = TfidfVectorizer()
news_vectored_result = vectortype.fit_transform(newsgroups.data)
news_target = newsgroups.target
mnist_full = fetch_mldata('MNIST original', data_home='../data/')
#mnist_full = fetch_mldata('MNIST original', data_home='/home/pussyeater6969/data/')
mnist_data = mnist_full.data
mnist_target = mnist_full.target
fashion_data, fashion_target = mnist_reader.load_mnist('../data/fashion-mnist/data/fashion')

In [49]:
def Find_Gini_Purity(target_cluster, predicted_cluster, k):
    matrix = confusion_matrix(target_cluster, predicted_cluster)
    N = []
    M = []
    P = []
    G = []
    G_temp = 0
    rows = matrix.shape[0]
    columns = matrix.shape[1]

    for i in range(rows):
        N.append(sum(matrix[i]))
    
    for j in range(columns):
        M.append(sum(matrix[:][j]))

    for j in range(columns):
        P.append(max(matrix[:][j]))
        G_temp = 0
        for i in range(rows):
            if M[j] == 0:
                G_temp += (matrix[i][j] / 1) ** 2
            else:
                G_temp += (matrix[i][j] / M[j]) ** 2
        G.append(1-G_temp)
    
    G_temp = 0
    for j in range(columns):
        G_temp += G[j] * M[j]
    
    #print(Matrix)
    #print(M)
    Purity = sum(P) / sum(M)
    Gini = G_temp / sum(M)
    
    return Gini, Purity

In [13]:
def K_Means(data, target, k):

    rng = np.random.RandomState()
    i = rng.permutation(data.shape[0])[:k]
    centroids = data[i]
    p_list = []
    datapoints = data.shape[0]
    while True:
        
        cluster = pairwise_distances_argmin(data, centroids)
        
        new_centroids = np.array([data[cluster == i].mean(0)
                                for i in range(k)])
        
        new_centroids = np.reshape(new_centroids, (k , -1))
        
        
        performance = 0
        for j in range(0, datapoints):
            if scipy.sparse.issparse(data):
                performance += LA.norm(data[j].toarray() - new_centroids[cluster[j]])
            else:
                performance += LA.norm(data[j] - new_centroids[cluster[j]])
                
        p_list.append(performance)

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
     
    gini, purity = Find_Gini_Purity(target, cluster, k)
    
    return centroids, cluster, p_list, gini, purity

In [None]:
K = 10
news_centroids, news_cluster, news_p_list, news_gini, news_purity = K_Means(news_vectored_result, news_target, K)
print("News group")
print("K = ", K)
print("Iterations : ", len(news_p_list)+1)
print("Purity : ", news_purity)
print("Gini : ", news_gini)
print("Final Performance : ", news_p_list[-1])

News group
K =  10
Iterations :  19
Purity :  0.688103576356
Gini :  0.432431363672
Final Performance :  18240.7152413


In [23]:
K = 20
news_centroids, news_cluster, news_p_list, news_gini, news_purity = K_Means(news_vectored_result, news_target, K)
print("News group")
print("K = ", K)
print("Iterations : ", len(news_p_list)+1)
print("Purity : ", news_purity)
print("Gini : ", news_gini)
print("Final Performance : ", news_p_list[-1])

News group
K =  20
Iterations :  78
Purity :  0.407991085642
Gini :  0.723122509103
Final Performance :  18135.1050452


In [54]:
K = 40
news_centroids, news_cluster, news_p_list, news_gini, news_purity = K_Means(news_vectored_result, news_target, K)
print("News group")
print("K = ", K)
print("Iterations : ", len(news_p_list)+1)
print("Purity : ", news_purity)
print("Gini : ", news_gini)
print("Final Performance : ", news_p_list[-1])

News group
K =  40
Iterations :  35
Purity :  0.387403162475
Gini :  0.777174656944
Final Performance :  18020.1662916


In [25]:
K = 5
mnist_centroids, mnist_cluster, mnist_p_list, mnist_gini, mnist_purity = K_Means(mnist_data, mnist_target, K)
print("MNIST group")
print("K = ", K)
print("Iterations : ", len(mnist_p_list)+1)
print("Purity : ", mnist_purity)
print("Gini : ", mnist_gini)
print("Final Performance : ", mnist_p_list[-1])

MNIST group
K =  5
Iterations :  46
Purity :  0.788585714286
Gini :  0.323015264899
Final Performance :  116179533.618


In [26]:
K = 10
mnist_centroids, mnist_cluster, mnist_p_list, mnist_gini, mnist_purity = K_Means(mnist_data, mnist_target, K)
print("MNIST group")
print("K = ", K)
print("Iterations : ", len(mnist_p_list)+1)
print("Purity : ", mnist_purity)
print("Gini : ", mnist_gini)
print("Final Performance : ", mnist_p_list[-1])

MNIST group
K =  10
Iterations :  55
Purity :  0.5994
Gini :  0.514749157541
Final Performance :  110537136.326


In [53]:
K = 20
mnist_centroids, mnist_cluster, mnist_p_list, mnist_gini, mnist_purity = K_Means(mnist_data, mnist_target, K)
print("MNIST group")
print("K = ", K)
print("Iterations : ", len(mnist_p_list)+1)
print("Purity : ", mnist_purity)
print("Gini : ", mnist_gini)
print("Final Performance : ", mnist_p_list[-1])

MNIST group
K =  20
Iterations :  84
Purity :  0.381942857143
Gini :  0.847886812369
Final Performance :  103658157.818


In [28]:
K = 5
fashion_centroids, fashion_cluster, fashion_p_list, fashion_gini, fashion_purity = K_Means(fashion_data, fashion_target, K)
print("Fashion group")
print("K = ", K)
print("Iterations : ", len(fashion_p_list)+1)
print("Purity : ", fashion_purity)
print("Gini : ", fashion_gini)
print("Final Performance : ", fashion_p_list[-1])

Fashion group
K =  5
Iterations :  71
Purity :  0.694166666667
Gini :  0.418044811111
Final Performance :  93325113.8172


In [29]:
K = 10
fashion_centroids, fashion_cluster, fashion_p_list, fashion_gini, fashion_purity = K_Means(fashion_data, fashion_target, K)
print("Fashion group")
print("K = ", K)
print("Iterations : ", len(fashion_p_list)+1)
print("Purity : ", fashion_purity)
print("Gini : ", fashion_gini)
print("Final Performance : ", fashion_p_list[-1])

Fashion group
K =  10
Iterations :  61
Purity :  0.585033333333
Gini :  0.544208627778
Final Performance :  84670470.0561


In [52]:
K = 20
fashion_centroids, fashion_cluster, fashion_p_list, fashion_gini, fashion_purity = K_Means(fashion_data, fashion_target, K)
print("Fashion group")
print("K = ", K)
print("Iterations : ", len(fashion_p_list)+1)
print("Purity : ", fashion_purity)
print("Gini : ", fashion_gini)
print("Final Performance : ", fashion_p_list[-1])

Fashion group
K =  20
Iterations :  77
Purity :  0.40135
Gini :  0.836956941667
Final Performance :  76830657.6526
