# K-means

In [1]:
import numpy as np
import pandas as pd
import random
random.seed(0)
import numpy as np
np.random.seed(0)
import csv

## Implementing k-means

In [2]:
def openfile(filename):
    """
    Read a whitespace-separated point file into a DataFrame.

    Every non-blank line must start with two numeric fields (x and y).
    A third numeric field, when present, becomes the row's initial label;
    otherwise the label is NaN. Blank lines are skipped.

    Parameters
    ----------
    filename : str
        File path relative to the current directory ('./' is prepended).

    Returns
    -------
    pandas.DataFrame
        Columns 'x', 'y' and 'label', one row per data line.

    Raises
    ------
    ValueError
        If a field cannot be converted to float.
    IndexError
        If a non-blank line holds fewer than two fields.
    """
    path = './' + filename
    x = []
    y = []
    label = []
    with open(path) as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                # Blank line (typically a trailing newline at end of file):
                # there is nothing to parse, so do not index into it.
                continue
            values = [float(field) for field in fields]
            x.append(values[0])
            y.append(values[1])
            # Unlabelled input has only two fields; mark the label as missing.
            label.append(values[2] if len(values) > 2 else np.nan)
    return pd.DataFrame({'x': x, 'y': y, 'label': label})

In [3]:
def init_centroid(k, df):
    """
    Build the k starting centroids.

    Centroid i is the mean of k values drawn from each of the 'x' and 'y'
    columns with ``random_state=i``, shifted by +0.5 on both axes.
    The returned dict is keyed by '0' .. str(k - 1).
    """
    shift = 0.5

    def seeded_mean(column, seed):
        # Both columns are sampled with the same seed and the same size.
        drawn = df[column].sample(n=k, random_state=seed)
        return np.mean(drawn.tolist())

    return {
        str(seed): (seeded_mean('x', seed) + shift,
                    seeded_mean('y', seed) + shift)
        for seed in range(k)
    }

In [4]:
def calc_label(x, y, centroid):
    """
    Return the integer id of the centroid closest to the point (x, y).

    Distances are Euclidean. On a tie the centroid that comes first in the
    dict wins. A centroid whose distance is NaN is never selected, and None
    is returned when no centroid qualifies (including an empty dict).
    """
    nearest = None
    shortest = np.inf
    for name, point in centroid.items():
        dx = x - point[0]
        dy = y - point[1]
        separation = np.sqrt(dx ** 2 + dy ** 2)
        if not separation < shortest:
            # Not strictly closer (or NaN): keep the current best.
            continue
        shortest = separation
        nearest = int(name)
    return nearest

In [5]:
# calc label
def k_means(df, centroid):
    """
    Label every row of df with its nearest centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first three columns are x, y and label, in that order.
        It is not modified.
    centroid : dict
        Maps a centroid id (string form of an int) to an (x, y) pair.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'x', 'y' and 'label', where 'label' holds the
        value returned by calc_label() for each row.
    """
    # copy=True: without it to_numpy() may return a view of df's own data
    # (read-only under pandas Copy-on-Write), and labels are written in place.
    data = df.to_numpy(copy=True)
    for row in data:
        # Each row is a view into `data`, so this assignment updates it.
        row[2] = calc_label(row[0], row[1], centroid)
    # Rebuild a DataFrame from the labelled array.
    return pd.DataFrame({
        'x': data[:, 0].tolist(),
        'y': data[:, 1].tolist(),
        'label': data[:, 2].tolist(),
    })

In [6]:
def calc_new_centroid(df, centroid) -> dict:
    """
    Recompute every centroid as the mean position of the rows labelled with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'x', 'y' and 'label' columns.
    centroid : dict
        Current centroids, keyed by the string form of the integer label.
        It is not modified.

    Returns
    -------
    dict
        New dict with the same keys. A centroid that has no rows assigned
        keeps its previous position.
    """
    updated = dict(centroid)
    # Iterate the centroid ids themselves, not a module-level cluster count.
    for key in centroid:
        members = df[df['label'] == int(key)]
        if members.empty:
            # The mean of an empty cluster is NaN, and NaN never compares equal
            # to itself, so an equality-based convergence check on the
            # centroids could never succeed. Keep the old position instead.
            continue
        updated[key] = (np.mean(members['x'].tolist()),
                        np.mean(members['y'].tolist()))
    return updated

In [7]:
def _same_centroids(left, right):
    """Return True when both centroid dicts match, treating NaN as equal to NaN."""
    if left.keys() != right.keys():
        return False
    for key, point in left.items():
        for a, b in zip(point, right[key]):
            if a != b and not (np.isnan(a) and np.isnan(b)):
                return False
    return True


def main(df, centroid, max_iter=None):
    """
    Alternate labelling and centroid updates until the centroids stop moving.

    Parameters
    ----------
    df : pandas.DataFrame
        Points to cluster, with columns x, y and label.
    centroid : dict
        Initial centroids, keyed by the string form of the integer label.
    max_iter : int, optional
        Upper bound on the number of centroid updates. None (the default)
        means run until convergence.

    Returns
    -------
    pandas.DataFrame
        The most recently labelled frame returned by k_means().

    Prints 'iterations: <n>', where n is the number of centroid updates made.
    """
    count = 0
    while True:
        # Assign each point to its nearest centroid.
        new_df = k_means(df, centroid.copy())

        # Move the centroids to the mean of their assigned points.
        new_centroid = calc_new_centroid(new_df, centroid.copy())

        # A plain `==` is False forever once a coordinate is NaN (NaN != NaN),
        # which would make this loop endless; compare NaN-aware instead.
        if _same_centroids(new_centroid, centroid):
            break

        centroid = new_centroid
        df = new_df
        count += 1
        if max_iter is not None and count >= max_iter:
            break

    print('iterations:', count)
    return new_df

In [8]:
def output_pred(df, name=None):
    """
    Write the first three columns of df to a tab-separated text file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first three columns are written, one row per line.
    name : str, optional
        Base name of the output file. When omitted, the module-level
        ``filename`` variable is used.

    The file is written to './output/<name>_output.txt'. The './output'
    directory is created when it does not exist.
    """
    import os

    if name is None:
        # Fall back to the global set by the calling cell.
        name = filename
    os.makedirs('./output', exist_ok=True)
    with open('./output/' + name + '_output.txt', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        # Select columns by position in one step. Indexing a row Series with
        # an integer key (df.iloc[i][0]) is deprecated for non-integer indexes.
        writer.writerows(df.iloc[:, [0, 1, 2]].to_numpy())

## main

In [9]:
# Cluster the points in ./input/Clustering_test1 into k = 4 groups and write
# the labelled rows out. `filename` must stay a module-level name because
# output_pred() reads it to build the output file path.
filename = 'Clustering_test1'
path = './input/'+filename
data = openfile(path)  # openfile() prepends './' to the path it is given
k = 4
centroid = init_centroid(k, data)
new_df = main(data, centroid)  # prints the iteration count
output_pred(new_df)

iterations: 6


In [10]:
# Cluster the points in ./input/Clustering_test2 into k = 4 groups and write
# the labelled rows out. `filename` must stay a module-level name because
# output_pred() reads it to build the output file path.
filename = 'Clustering_test2'
path = './input/'+filename
data = openfile(path)  # openfile() prepends './' to the path it is given
k = 4
centroid = init_centroid(k, data)
new_df = main(data, centroid)  # prints the iteration count
output_pred(new_df)

iterations: 15


In [11]:
# Cluster the points in ./input/Clustering_test3 into k = 4 groups and write
# the labelled rows out. `filename` must stay a module-level name because
# output_pred() reads it to build the output file path.
filename = 'Clustering_test3'
path = './input/'+filename
data = openfile(path)  # openfile() prepends './' to the path it is given
k = 4
centroid = init_centroid(k, data)
new_df = main(data, centroid)  # prints the iteration count
output_pred(new_df)

iterations: 14


In [12]:
# Cluster the points in ./input/Clustering_test4 into k = 4 groups and write
# the labelled rows out. `filename` must stay a module-level name because
# output_pred() reads it to build the output file path.
filename = 'Clustering_test4'
path = './input/'+filename
data = openfile(path)  # openfile() prepends './' to the path it is given
k = 4
centroid = init_centroid(k, data)
new_df = main(data, centroid)  # prints the iteration count
output_pred(new_df)

iterations: 10


In [13]:
# Cluster the points in ./input/Clustering_test5 into k = 4 groups and write
# the labelled rows out. `filename` must stay a module-level name because
# output_pred() reads it to build the output file path.
filename = 'Clustering_test5'
path = './input/'+filename
data = openfile(path)  # openfile() prepends './' to the path it is given
k = 4
centroid = init_centroid(k, data)
new_df = main(data, centroid)  # prints the iteration count
output_pred(new_df)

iterations: 15
