# Program Assignment #3 - K-means clustering & Support Vector Machine
---

## Name: 李勝維
## Student ID: 0711239
---

In [256]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Seed NumPy's global RNG so the dataset shuffles and the random K-means
# initialisation further down give the same result on every fresh run.
np.random.seed(0)

# Data input & Data Preprocessing

In [257]:
from sklearn.preprocessing import LabelEncoder
import csv

# --- Wheat Seeds dataset ---------------------------------------------------
# The file is tab-separated, but some fields are separated by more than one
# tab, which csv.reader reports as empty-string fields.  Drop those while
# parsing every remaining field as a float, and skip lines that end up empty
# (e.g. a trailing blank line) so the resulting array is never ragged.
with open("data/seeds_dataset.txt") as f:
    rows = [
        [float(field) for field in row if field != ""]
        for row in csv.reader(f, delimiter="\t")
    ]
rows = [row for row in rows if row]
wheat_raw_data = np.array(rows, dtype=np.float32)
np.random.shuffle(wheat_raw_data)  # shuffle the samples (rows) in place
# The last column is the class label, everything before it is a feature.
wheat_x, wheat_y = wheat_raw_data[..., :-1], wheat_raw_data[..., -1]
# Shift the labels down by one so they can be used as 0-based indices later
# (presumably 1..3 -> 0..2 -- confirm against the data file).
wheat_y = (wheat_y - 1).astype(np.int32)

# --- Ionosphere dataset ----------------------------------------------------
# header=None: by default read_csv turns the first line into column names,
# which silently drops one sample when the file has no header row.
# NOTE(review): assumes data/ionosphere.data is the raw UCI file, which ships
# without a header -- confirm.
ion_raw_data = pd.read_csv("data/ionosphere.data", header=None).values
np.random.shuffle(ion_raw_data)  # shuffle the samples (rows) in place
# The last column is the class label ('g' / 'b'), the rest are features.
ion_x, ion_y = ion_raw_data[..., :-1].astype(np.float32), ion_raw_data[..., -1]
ion_y_encoder = LabelEncoder()  # classes are sorted, so 'b' -> 0, 'g' -> 1
ion_y = ion_y_encoder.fit_transform(ion_y)

['b' 'g']


# K-means Clustering

In [258]:
# K-means (Lloyd's algorithm) on the wheat features, followed by a vote that
# maps every cluster to the class label it represents.
N_CLUSTERS = 3  # K
MAX_ITERATIONS = 1000

# Start from N_CLUSTERS *distinct* samples.  Sampling with replacement could
# pick the same row twice, which leaves one cluster empty and turns its
# centroid into NaN.
init_idx = np.random.choice(wheat_x.shape[0], size=N_CLUSTERS, replace=False)
centers = [wheat_x[j] for j in init_idx]
assignment = None
for _ in range(MAX_ITERATIONS):
    # assignment step: squared Euclidean distance of every sample to every
    # center, shape (n_samples, N_CLUSTERS); each sample joins the closest one.
    sq_dist = ((wheat_x[:, np.newaxis, :] - np.stack(centers)) ** 2).sum(axis=2)
    new_assignment = np.argmin(sq_dist, axis=1)
    if assignment is not None and np.array_equal(new_assignment, assignment):
        # Nothing moved, so the centers would not change either: converged.
        break
    assignment = new_assignment
    # sets[k] holds the indices (into wheat_x) of the samples in cluster k.
    sets = [np.flatnonzero(assignment == k).tolist() for k in range(N_CLUSTERS)]

    # update step: move every center to the mean of its members.  An empty
    # cluster keeps its previous center instead of becoming NaN.
    centers = [
        wheat_x[members].mean(axis=0) if members else centers[k]
        for k, members in enumerate(sets)
    ]

# get label by voting
# vote_counts[c, l] = number of samples in cluster c whose true label is l.
vote_counts = np.array(
    [np.bincount(wheat_y[members], minlength=N_CLUSTERS) for members in sets]
)
# Match clusters to labels one-to-one, strongest vote first, so that every
# label ends up with a center.  When each cluster already votes for a
# different label this is exactly "cluster -> its majority label"; when two
# clusters vote for the same label the weaker one takes its best remaining
# label instead of leaving a None entry behind.
KMEANS_centers = [None] * N_CLUSTERS
remaining = vote_counts.astype(np.int64)
for _ in range(N_CLUSTERS):
    cluster, label = np.unravel_index(np.argmax(remaining), remaining.shape)
    KMEANS_centers[label] = centers[cluster]
    # Real counts are >= 0, so -1 takes this cluster and label out of the race.
    remaining[cluster, :] = -1
    remaining[:, label] = -1

# utility for predicting
def KMeans_prediction(x, centers=None):
    """Predict a label for every sample from its nearest K-means center.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Samples to classify.
    centers : sequence of 1-D arrays, optional
        ``centers[j]`` is the center that stands for label ``j``.  Defaults
        to the module-level ``KMEANS_centers``.

    Returns
    -------
    numpy.ndarray of int32, shape (n_samples,)
        Index of the closest center (squared Euclidean distance) for each
        sample; ties go to the lowest index.

    Raises
    ------
    ValueError
        If any entry of ``centers`` is ``None`` (a label without a center).
    """
    if centers is None:
        centers = KMEANS_centers
    if any(c is None for c in centers):
        raise ValueError("every label needs a center, but at least one is None")

    x = np.asarray(x)
    if x.shape[0] == 0:
        # No samples: nothing to broadcast against, return an empty result.
        return np.empty(0, dtype=np.int32)

    # (n_samples, 1, n_features) - (n_centers, n_features) broadcasts to
    # (n_samples, n_centers, n_features); summing the last axis gives the
    # squared distance of every sample to every center.
    sq_dist = ((x[:, np.newaxis, :] - np.stack(centers)) ** 2).sum(axis=2)
    return np.argmin(sq_dist, axis=1).astype(np.int32)

# print(KMeans_prediction(wheat_x))

# from sklearn.cluster import KMeans
# K = KMeans(3, random_state=0)
# K.fit(wheat_x)
# print(K.predict(wheat_x))
# print(wheat_y)