### CA2 K-Means

In [38]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.linalg import norm


Load the four category files into pandas DataFrames

In [39]:
# Relative paths of the four category data files.
# NOTE: despite the `df` prefix these names hold path strings, not dataframes.
df1, df2, df3, df4 = (
    'CA2data/' + stem
    for stem in ('animals', 'countries', 'fruits', 'veggies')
)

In [59]:
# Read the four category files (space-delimited, no header row) ...
df_animals = pd.read_csv(df1, delimiter = ' ', header = None)
df_countries = pd.read_csv(df2, delimiter = ' ', header = None)
df_fruits = pd.read_csv(df3, delimiter = ' ', header = None)
df_veggies = pd.read_csv(df4, delimiter = ' ', header = None)

# ... and keep each file's column-0 entries so rows can be remapped to a
# numeric class later.
# NOTE: despite the *_unique names, these lists are not de-duplicated.
animals_unique = list(df_animals[0])
countries_unique = list(df_countries[0])
fruits_unique = list(df_fruits[0])
veggies_unique = list(df_veggies[0])

In [41]:
# Stack the four category frames row-wise into one pandas dataframe, in the
# order animals, countries, fruits, veggies.
frames = [
    df_animals,
    df_countries,
    df_fruits,
    df_veggies,
]
combined_df = pd.concat(frames, axis = 0)

Column 0 holds the text labels, which need to be converted into numeric class values. First convert the dataframe into a NumPy array:

In [42]:
# Convert the combined pandas dataframe into a numpy array. Column 0 still
# holds the label read from the files; the clustering functions below treat
# the remaining columns as the features.
# NOTE: from here on `df` is a numpy array, not a pandas dataframe.
df = combined_df.to_numpy()

In [43]:
# The data is labelled and the classes are known, so replace the categorical
# label in column 0 with a numeric class id for accuracy measurement.
# Groups are tried in the order listed and the first match wins; a label found
# in none of the lists is left unchanged.
label_groups = (
    (animals_unique, 0),
    (countries_unique, 1),
    (fruits_unique, 2),
    (veggies_unique, 3),
)
for row in df:
    for names, class_id in label_groups:
        if row[0] in names:
            row[0] = class_id  # writes through to `df`
            break

Initialize Centroids

In [44]:
# k-means++-style seeding to choose our starting centroid(s)
def kmeansplusplus(dataset, k):
    """Pick ``k`` starting centroids by farthest-point seeding.

    The first centroid is a row of ``dataset`` drawn uniformly at random with
    ``np.random.choice``. Every further centroid is the row whose squared
    Euclidean distance to its nearest already-chosen centroid is largest
    (ties go to the lowest row index). This is the deterministic
    "farthest-first" variant, not the D^2-weighted random sampling of
    textbook k-means++.

    Parameters
    ----------
    dataset : np.ndarray, shape (n_points, n_features)
        Numeric feature rows (an object-dtype array of numbers also works).
    k : int
        Number of centroids to return; must be at least 1.

    Returns
    -------
    np.ndarray, shape (k, n_features)
        Copies of the selected rows, in selection order, with the dtype of
        ``dataset``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (previously such a value looped forever).
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    # first centroid: one uniformly random row
    first = np.random.choice(dataset.shape[0], 1, replace = False)
    chosen = [int(first[0])]

    # squared distance from every point to its closest chosen centroid;
    # kept as a running minimum so each new centroid costs a single pass
    nearest = ((dataset - dataset[chosen[0]])**2).sum(axis = 1).astype(float)
    while len(chosen) < k:
        # the next centroid is the point farthest from all current centroids
        farthest = int(np.argmax(nearest))
        chosen.append(farthest)
        to_new = ((dataset - dataset[farthest])**2).sum(axis = 1).astype(float)
        nearest = np.minimum(nearest, to_new)

    # fancy indexing returns a copy, so callers may update centroids in place
    return dataset[chosen]

##### K-Means

In [45]:
def kmeans(dataset, k, normalisation):
    """Cluster ``dataset`` with k-means and score the result with B-Cubed.

    Column 0 of ``dataset`` is taken as the true class label and the remaining
    columns as the features. Points are assigned to the nearest centroid by
    Euclidean distance and each centroid is moved to the mean of its members
    until the assignment stops changing.

    Parameters
    ----------
    dataset : np.ndarray, shape (n_points, 1 + n_features)
        Labelled data. The array is not modified; a shuffled copy is used.
    k : int
        Number of clusters.
    normalisation : str
        ``'l2'`` scales every feature vector to unit Euclidean length before
        clustering; any other value leaves the features unchanged.

    Returns
    -------
    tuple of float
        ``(precision_avg, recall_avg, f_scores_avg)``: the B-Cubed precision,
        recall and F-score averaged over all points.

    Notes
    -----
    NumPy's global random state is reseeded with 1 on every call so that the
    shuffle and the centroid seeding are repeatable.
    """
    np.random.seed(1)
    # shuffle a private copy so the caller's array keeps its row order
    shuffled = np.array(dataset)
    np.random.shuffle(shuffled)
    # set classes as labels
    classes = shuffled[:, 0]
    # remove classification feature; cast so the maths runs on floats even
    # when the input is an object-dtype array
    traindata = shuffled[:, 1:].astype(float)
    if normalisation == 'l2':
        # normalise each vector to unit length
        lengths = norm(traindata, axis = 1, keepdims = True)
        lengths[lengths == 0] = 1.0  # leave all-zero vectors as they are
        traindata = traindata / lengths
    # initialize centroids (float copy, so mean updates are not truncated)
    centroids = np.array(kmeansplusplus(traindata, k), dtype = float)
    # establish initial clusters membership as zeros
    clusters = np.zeros(traindata.shape[0], dtype = int)
    # distance of every point (row) to every centroid (column)
    euclidean = np.zeros((traindata.shape[0], k))
    while True:
        # remember the previous iteration's memberships
        old_clusters = clusters.copy()
        for i in range(k):
            euclidean[:, i] = ((traindata - centroids[i])**2).sum(axis = 1)**0.5
        # update clusters: each point joins its nearest centroid
        clusters = np.argmin(euclidean, axis = 1)
        # update centroids from new clusters
        for i in range(k):
            members = traindata[clusters == i]
            # an empty cluster keeps its centroid instead of becoming NaN
            if len(members):
                centroids[i, :] = members.mean(axis = 0)
        # stop once no point changed cluster
        if np.array_equal(clusters, old_clusters):
            break

    # B-Cubed scores, one entry per point:
    #   precision = share of the point's cluster that has the point's class
    #   recall    = share of the point's class that is in the point's cluster
    class_totals = Counter(classes.tolist())
    precision = []
    recall = []
    for num in range(k):
        labels = classes[clusters == num].tolist()
        in_cluster = Counter(labels)
        for label in labels:
            precision.append(in_cluster[label]/len(labels))
            recall.append(in_cluster[label]/class_totals[label])
    precision_avg = sum(precision)/len(precision)
    recall_avg = sum(recall)/len(recall)

    # F Score: harmonic mean of each point's precision and recall
    f_scores = [(2*p*r)/(p + r) for p, r in zip(precision, recall)]
    f_scores_avg = sum(f_scores)/len(f_scores)

    return precision_avg, recall_avg, f_scores_avg


##### K-Medians

In [47]:
def kmedians(dataset, k, normalisation):
    """Cluster ``dataset`` with k-medians and score the result with B-Cubed.

    Column 0 of ``dataset`` is taken as the true class label and the remaining
    columns as the features. Points are assigned to the nearest centroid by
    Manhattan distance and each centroid is moved to the per-feature median of
    its members until the assignment stops changing.

    Parameters
    ----------
    dataset : np.ndarray, shape (n_points, 1 + n_features)
        Labelled data. The array is not modified; a shuffled copy is used.
    k : int
        Number of clusters.
    normalisation : str
        ``'l2'`` scales every feature vector to unit Euclidean length before
        clustering; any other value leaves the features unchanged.

    Returns
    -------
    tuple of float
        ``(precision_avg, recall_avg, f_scores_avg)``: the B-Cubed precision,
        recall and F-score averaged over all points.

    Notes
    -----
    NumPy's global random state is reseeded with 1 on every call so that the
    shuffle and the centroid seeding are repeatable.
    """
    np.random.seed(1)
    # shuffle a private copy so the caller's array keeps its row order
    shuffled = np.array(dataset)
    np.random.shuffle(shuffled)
    # set classes as labels
    classes = shuffled[:, 0]
    # remove classification feature; cast so the maths runs on floats even
    # when the input is an object-dtype array
    traindata = shuffled[:, 1:].astype(float)
    if normalisation == 'l2':
        # normalise each vector to unit length
        lengths = norm(traindata, axis = 1, keepdims = True)
        lengths[lengths == 0] = 1.0  # leave all-zero vectors as they are
        traindata = traindata / lengths
    # initialize centroids (float copy, so median updates are not truncated)
    centroids = np.array(kmeansplusplus(traindata, k), dtype = float)
    # establish initial clusters membership as zeros
    clusters = np.zeros(traindata.shape[0], dtype = int)
    # distance of every point (row) to every centroid (column)
    manhattan = np.zeros((traindata.shape[0], k))
    while True:
        # remember the previous iteration's memberships
        old_clusters = clusters.copy()
        for i in range(k):
            manhattan[:, i] = np.abs(traindata - centroids[i]).sum(axis = 1)
        # update clusters: each point joins its nearest centroid
        clusters = np.argmin(manhattan, axis = 1)
        # update centroids from new clusters: per-feature median of the members
        for i in range(k):
            members = traindata[clusters == i]
            # an empty cluster keeps its centroid instead of becoming NaN
            if len(members):
                centroids[i, :] = np.median(members, axis = 0)
        # stop once no point changed cluster
        if np.array_equal(clusters, old_clusters):
            break

    # B-Cubed scores, one entry per point:
    #   precision = share of the point's cluster that has the point's class
    #   recall    = share of the point's class that is in the point's cluster
    class_totals = Counter(classes.tolist())
    precision = []
    recall = []
    for num in range(k):
        labels = classes[clusters == num].tolist()
        in_cluster = Counter(labels)
        for label in labels:
            precision.append(in_cluster[label]/len(labels))
            recall.append(in_cluster[label]/class_totals[label])
    precision_avg = sum(precision)/len(precision)
    recall_avg = sum(recall)/len(recall)

    # F Score: harmonic mean of each point's precision and recall
    f_scores = [(2*p*r)/(p + r) for p, r in zip(precision, recall)]
    f_scores_avg = sum(f_scores)/len(f_scores)

    return precision_avg, recall_avg, f_scores_avg

In [48]:
def results(function, k, normalisation, dataset = None):
    """Run a clustering function for 1..k clusters, plot and return the scores.

    Parameters
    ----------
    function : callable
        Called as ``function(data, n_clusters, normalisation)`` and expected to
        return a ``(precision, recall, f_score)`` triple.
    k : int
        Largest number of clusters to try; every value from 1 to ``k`` is run.
    normalisation : str
        Passed through to ``function`` unchanged.
    dataset : np.ndarray, optional
        Labelled data to cluster. Defaults to the notebook-level ``df``.

    Returns
    -------
    list of tuple
        One ``(precision, recall, f_score)`` tuple per cluster count, in order.
    """
    if dataset is None:
        dataset = df

    cluster_counts = []
    scores = []
    for n_clusters in range(1, k + 1):
        # pass a copy so a function that reorders or modifies its input cannot
        # affect later runs or the caller's array
        p, r, f = function(dataset.copy(), n_clusters, normalisation)
        scores.append((p, r, f))
        cluster_counts.append(n_clusters)

    fig, ax = plt.subplots(figsize=(8,6))
    series = (
        ('green', 'Precision', [s[0] for s in scores]),
        ('red', 'Recall', [s[1] for s in scores]),
        ('blue', 'F-Score', [s[2] for s in scores]),
    )
    for colour, label, values in series:
        ax.scatter(cluster_counts, values, c = colour, label = label)
    ax.set_title('Overall Scores')
    ax.set_xlabel('k-Clusters')
    ax.set_ylabel('Scoring Range')
    ax.legend()
    plt.show()
    return scores


In [58]:
# Uncomment one line at a time to run each of the four experiments (k = 1..9):

# results(kmeans, 9, 'none')
# results(kmeans, 9, 'l2')
# results(kmedians, 9, 'none')
# results(kmedians, 9, 'l2')

In [None]:
# NOTE(review): `sample` is not defined in any cell visible above, so unless it
# is defined elsewhere this never-executed cell raises NameError on
# Restart & Run All. Presumably a leftover scratch cell -- confirm and delete.
list(sample.items())[0][1]['activation']