In [9]:
# CAP 5630 HW4
# Josh E.

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
import random
import sys

# assumes csv's are in the same directory
train_data = pd.read_csv(os.getcwd() + "\\data.csv", header=None)

for col in train_data.columns:
    if train_data.loc[:, col].mean() == 0 or train_data.loc[:, col].max == train_data.loc[:, col].min():
        train_data.drop(columns=[col], inplace=True)
train_data.columns = range(train_data.shape[1])

labels = pd.read_csv(os.getcwd() + "\\label.csv", header=None)

'''
K-Means Algorithm

Input: K, data matrix
Output: K centers, cluster number/labels of each point

Algorithm:

    1. Initialization:
        - Randomly select K points as K centers
    
    2. Repeat:
        - Assignment: Assign data points to K clusters based on distance between centers and points
        - Update: Compute the new centers(means)
        
    3. Stop Critera:
        - the centers/means do not change (the assignment does not change)

'''

from sklearn.metrics.pairwise import cosine_similarity

def k_means(K, dataset, labels, dist_function):
    
    # initialization
    centers = []
    for i in range(K):
        centers.append(dataset.iloc[math.floor(random.random() * dataset.shape[0])])
    centers = np.array(centers)
    dataset = dataset.to_numpy()
    labels = labels.to_numpy()
    
    iteration = 0
    
    # repeat
    while True:
        iteration+=1
        print("iteration", iteration ,"start")
        clusters = []
        for i in range(centers.shape[0]):
            clusters.append(np.empty([0,dataset.shape[1]]))
        #print(np.reshape(dataset[0],(1, dataset.shape[1])).shape)

        for i in range(dataset.shape[0]):
            distances = np.empty([0])
            for j in range(K):
                distances = np.append(distances, np.array(dist_function(dataset[i], centers[j])))
            cluster_index = np.where(distances==distances.min())[0][0]
            clusters[cluster_index] = np.vstack((clusters[cluster_index], np.array(dataset[i])))
            
        '''for i in range(centers.shape[0]):
            print(clusters[i].shape)'''
        
        new_centers = np.zeros(centers.shape)
        for i in range(new_centers.shape[0]):
            new_centers[i] = np.mean(clusters[i], axis=0)
            
        SSE_centers=0
        SSE_new_centers=0
        for i in range(len(clusters)):
            for j in range(clusters[i].shape[0]):
                SSE_centers=SSE_centers + math.pow(dist_function(clusters[i][j], centers[i]), 2)
                SSE_new_centers= SSE_new_centers + math.pow(dist_function(clusters[i][j], new_centers[i]), 2)
        print("SSE Centers:", SSE_centers)
        print("SSE New Centers:", SSE_new_centers)
        
        # stop criteria
        if (np.equal(centers, new_centers)).all():
            print(centers)
            print("loop end")
            break
        else:
            print(new_centers)
            print("end iteration")
            centers = new_centers 
    
    predicted_labels = []
    for i in range(dataset.shape[0]):
        distances = []
        for j in range(K):
            distances.append(dist_function(dataset[i], centers[j]))
        cluster_index = distances.index(min(distances))
        predicted_labels.append(cluster_index)
        
    from sklearn.metrics import accuracy_score
    
    print("Accuracy: ", accuracy_score(labels, predicted_labels))
    print("Accuracy: ", accuracy_score(predicted_labels, labels))
    
k_means(10, train_data, labels, math.dist)

iteration 1 start
SSE Centers: 43439298360.0
SSE New Centers: 27768843327.520596
[[0.         0.         0.         ... 0.14449541 0.05963303 0.00688073]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.45025381 0.15076142 0.        ]
 ...
 [0.         0.         0.         ... 0.45707997 0.1298606  0.        ]
 [0.0496732  0.30849673 0.20653595 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
end iteration
iteration 2 start
SSE Centers: 26990948224.733078
SSE New Centers: 26575939138.94486
[[0.         0.         0.         ... 1.21032505 0.3126195  0.00573614]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.24966262 0.134278   0.        ]
 [0.04986877 0.30971129 0.20734908 ... 0.         0.         0.        ]
 [0.  

SSE Centers: 25686376735.98637
SSE New Centers: 25680778854.733574
[[0.         0.         0.         ... 1.09139426 0.3509006  0.00400267]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04928664 0.30609598 0.20492866 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
end iteration
iteration 18 start
SSE Centers: 25675875372.84819
SSE New Centers: 25669250623.313858
[[0.         0.         0.         ... 1.08058124 0.34742404 0.00396301]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04896907 0.30412371 0.20360825 ... 0.         0.         0.        ]
 [0.         0.    

SSE Centers: 25583651335.987366
SSE New Centers: 25583423338.624443
[[0.         0.         0.         ... 1.09285237 0.35136941 0.00400802]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04423749 0.27473807 0.18393481 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
end iteration
iteration 34 start
SSE Centers: 25583276791.553875
SSE New Centers: 25582856646.354877
[[0.         0.         0.         ... 1.09212283 0.35113485 0.00400534]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04434072 0.27537923 0.18436406 ... 0.         0.         0.        ]
 [0.         0.  

In [10]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
import random
import sys

# assumes csv's are in the same directory
train_data = pd.read_csv(os.getcwd() + "\\data.csv", header=None)

for col in train_data.columns:
    if train_data.loc[:, col].mean() == 0 or train_data.loc[:, col].max == train_data.loc[:, col].min():
        train_data.drop(columns=[col], inplace=True)
train_data.columns = range(train_data.shape[1])

labels = pd.read_csv(os.getcwd() + "\\label.csv", header=None)

'''
K-Means Algorithm

Input: K, data matrix
Output: K centers, cluster number/labels of each point

Algorithm:

    1. Initialization:
        - Randomly select K points as K centers
    
    2. Repeat:
        - Assignment: Assign data points to K clusters based on distance between centers and points
        - Update: Compute the new centers(means)
        
    3. Stop Critera:
        - the centers/means do not change (the assignment does not change)

'''

from sklearn.metrics.pairwise import cosine_similarity

def k_means(K, dataset, labels, dist_function):
    
    # initialization
    centers = []
    for i in range(K):
        centers.append(dataset.iloc[math.floor(random.random() * dataset.shape[0])])
    centers = np.array(centers)
    dataset = dataset.to_numpy()
    labels = labels.to_numpy()
    
    iteration = 0
    
    # repeat
    while True:
        iteration+=1
        print("iteration", iteration ,"start")
        clusters = []
        for i in range(centers.shape[0]):
            clusters.append(np.empty([0,dataset.shape[1]]))
        #print(np.reshape(dataset[0],(1, dataset.shape[1])).shape)

        #separates points in clusters
        for i in range(dataset.shape[0]):
            distances = np.empty([0])
            for j in range(K):
                distances = np.append(distances, np.array(dist_function(dataset[i], centers[j])))
            cluster_index = np.where(distances==distances.min())[0][0]
            clusters[cluster_index] = np.vstack((clusters[cluster_index], np.array(dataset[i])))
            
        '''for i in range(centers.shape[0]):
            print(clusters[i].shape)'''
        
        new_centers = np.zeros(centers.shape)
        for i in range(new_centers.shape[0]):
            new_centers[i] = np.mean(clusters[i], axis=0)
            
        SSE_centers=0
        SSE_new_centers=0
        for i in range(len(clusters)):
            for j in range(clusters[i].shape[0]):
                SSE_centers=SSE_centers + math.pow(dist_function(clusters[i][j], centers[i]), 2)
                SSE_new_centers= SSE_new_centers + math.pow(dist_function(clusters[i][j], new_centers[i]), 2)
        print("SSE Centers:", SSE_centers)
        print("SSE New Centers:", SSE_new_centers)
        
        # stop criteria
        if (np.equal(centers, new_centers)).all():
            print(centers)
            print("loop end")
            break
        else:
            print(new_centers)
            print("end iteration")
            centers = new_centers 
            
    predicted_labels = []
    for i in range(dataset.shape[0]):
        distances = []
        for j in range(K):
            distances.append(dist_function(dataset[i], centers[j]))
        cluster_index = distances.index(min(distances))
        predicted_labels.append(cluster_index)
        
    from sklearn.metrics import accuracy_score
    
    print("Accuracy: ", accuracy_score(labels, predicted_labels))
    print("Accuracy: ", accuracy_score(predicted_labels, labels))
    
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(X, Y):
    X = np.reshape(X, (1, train_data.shape[1]))
    Y = np.reshape(Y, (1, train_data.shape[1]))
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0
    return 1-cosine_similarity(X, Y)[0][0]
k_means(10, train_data, labels, cosine_sim)



iteration 1 start
SSE Centers: 1750.2957423783037
SSE New Centers: 919.7047452235496
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.01967892 0.12221647 0.08182289 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.70670391 0.30726257 0.        ]
 ...
 [0.         0.         0.         ... 0.94561934 0.35146022 0.0060423 ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
end iteration
iteration 2 start
SSE Centers: 838.6036849474584
SSE New Centers: 800.0546590613
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.31157635 0.13546798 0.        ]
 ...
 [0.         0.         0.         ... 0.96854305 0.30877483 0.00496689]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.  

SSE Centers: 696.1664851609437
SSE New Centers: 696.2270049166721
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 23 start
SSE Centers: 696.2150471054283
SSE New Centers: 696.2834230158267
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 24 start
SSE Centers: 696.2733400718205
SSE New Centers: 696.3256842201897
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 25 start
SSE Centers: 696.3175805290529
SSE New Centers: 696.343615868302
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration


SSE Centers: 697.1356022008475
SSE New Centers: 697.1629429032249
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 56 start
SSE Centers: 697.1623449118576
SSE New Centers: 697.176711957767
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 57 start
SSE Centers: 697.1761515397471
SSE New Centers: 697.1852992107118
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration
iteration 58 start
SSE Centers: 697.1851815111233
SSE New Centers: 697.1911105726462
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
end iteration


In [5]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
import random
import sys

# assumes csv's are in the same directory
train_data = pd.read_csv(os.getcwd() + "\\data.csv", header=None)

for col in train_data.columns:
    if train_data.loc[:, col].mean() == 0 or train_data.loc[:, col].max == train_data.loc[:, col].min():
        train_data.drop(columns=[col], inplace=True)
train_data.columns = range(train_data.shape[1])

labels = pd.read_csv(os.getcwd() + "\\label.csv", header=None)

'''
K-Means Algorithm

Input: K, data matrix
Output: K centers, cluster number/labels of each point

Algorithm:

    1. Initialization:
        - Randomly select K points as K centers
    
    2. Repeat:
        - Assignment: Assign data points to K clusters based on distance between centers and points
        - Update: Compute the new centers(means)
        
    3. Stop Critera:
        - the centers/means do not change (the assignment does not change)

'''

from sklearn.metrics.pairwise import cosine_similarity

def k_means(K, dataset, labels, dist_function):
    
    # initialization
    centers = []
    for i in range(K):
        centers.append(dataset.iloc[math.floor(random.random() * dataset.shape[0])])
    centers = np.array(centers)
    dataset = dataset.to_numpy()
    labels = labels.to_numpy()
    
    iteration = 0
    
    # repeat
    while True:
        iteration+=1
        print("iteration", iteration ,"start")
        clusters = []
        for i in range(centers.shape[0]):
            clusters.append(np.empty([0,dataset.shape[1]]))
        #print(np.reshape(dataset[0],(1, dataset.shape[1])).shape)

        #separates points in clusters
        for i in range(dataset.shape[0]):
            distances = np.empty([0])
            for j in range(K):
                distances = np.append(distances, np.array(dist_function(dataset[i], centers[j])))
            cluster_index = np.where(distances==distances.min())[0][0]
            clusters[cluster_index] = np.vstack((clusters[cluster_index], np.array(dataset[i])))
            
        '''for i in range(centers.shape[0]):
            print(clusters[i].shape)'''
        
        new_centers = np.zeros(centers.shape)
        for i in range(new_centers.shape[0]):
            new_centers[i] = np.mean(clusters[i], axis=0)
            new_centers[i][np.isnan(new_centers[i])] = 0
        
        SSE_centers=0
        SSE_new_centers=0
        for i in range(len(clusters)):
            for j in range(clusters[i].shape[0]):
                SSE_centers=SSE_centers + math.pow(dist_function(clusters[i][j], centers[i]), 2)
                SSE_new_centers= SSE_new_centers + math.pow(dist_function(clusters[i][j], new_centers[i]), 2)
        print("SSE Centers:", SSE_centers)
        print("SSE New Centers:", SSE_new_centers)
        
        # stop criteria
        if (np.equal(centers, new_centers)).all():
            print(centers)
            print("loop end")
            break
        else:
            print(new_centers)
            print("end iteration")
            centers = new_centers 
            
    dummy_centers=[]
    while True:
        max_shape = 0
        max_shape_index = 0
        for i in range(centers.shape[0]):
            if centers[i].shape[0] > max_shape:
                max_shape = centers[i].shape[0]
                max_shape_index = i
        dummy_centers.append(centers[max_shape_index])
        if len(dummy_centers) == K:
            break;
    centers = dummy_centers
    centers = np.array(centers)
            
    predicted_labels = []
    for i in range(dataset.shape[0]):
        distances = []
        for j in range(K):
            distances.append(dist_function(dataset[i], centers[j]))
        cluster_index = distances.index(min(distances))
        predicted_labels.append(cluster_index)
        
    from sklearn.metrics import accuracy_score
    
    print("Accuracy: ", accuracy_score(labels, predicted_labels))
    print("Accuracy: ", accuracy_score(predicted_labels, labels))
    
from scipy.spatial.distance import jaccard

def jac(X, Y):
    X = np.reshape(X, (1, train_data.shape[1]))
    Y = np.reshape(Y, (1, train_data.shape[1]))
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0
    return 1-jaccard(X, Y)
k_means(10, train_data, labels, jac)


iteration 1 start
SSE Centers: 0.026880253578650946
SSE New Centers: 1.2583713811100863e-05
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.20355412 0.03473344 0.        ]
 [0.         0.         0.         ... 0.59460834 0.2456765  0.00305188]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.0960961  0.         0.        ]]
end iteration
iteration 2 start
SSE Centers: 0.0
SSE New Centers: 0.0
[[0.0038 0.0236 0.0158 ... 0.1636 0.0526 0.0006]
 [0.     0.     0.     ... 0.     0.     0.    ]
 [0.     0.     0.     ... 0.     0.     0.    ]
 ...
 [0.     0.     0.     ... 0.     0.     0.    ]
 [0.     0.     0.     ... 0.     0.     0.    ]
 [0.     0.     0.     ... 0.     0.     0.    ]]
end iteration
iteration 3 start
SSE Centers: 0.0
SSE New Centers: 0.0
[[0.0038 0.0236 0.0158 .