In [None]:
import os
# The Jupyter notebook is launched from your $HOME directory.
# Change the working directory to the AAPP-Pytorch directory,
# which was created in your username directory under /scratch/vp91.
# os.path.expandvars substitutes $USER from the environment into the path.
os.chdir(os.path.expandvars("/scratch/vp91/$USER/AAPP-Pytorch"))


# Unsupervised Learning - Clustering  


In [None]:
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from kmeans_pytorch import kmeans, kmeans_predict

## K-means Clustering  

1. Set K – number of clusters
2. Randomly assign k points as the centroids of the clusters
3. Measure distance between point a and the k clusters
4. Assign point a to the cluster with the minimum distance
5. Repeat 3-4 for all data points
6. Recalculate the cluster centroid
7. Repeat 5-6 until the clusters don’t change anymore
8. Calculate total clusters variance
9. Repeat 2-8 N times; the result is the clustering with the minimum variance


In [None]:
# Fix the NumPy seed so the synthetic data is identical on every run.
np.random.seed(123)

# Problem size: number of training points, feature dimensions, clusters.
data_size, dims, num_clusters = 1000, 2, 3

# Training set: standard-normal samples scaled by 1/6, wrapped as a tensor.
x = torch.from_numpy(np.random.randn(data_size, dims) / 6)

# Five more points drawn the same way; they are passed to kmeans_predict later.
y = torch.from_numpy(np.random.randn(5, dims) / 6)

In [None]:
# set device: the first CUDA GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# k-means: cluster the synthetic points on the selected device and unpack
# the two results into the ids for x and the cluster centres
cluster_ids_x, cluster_centers = kmeans(
    X=x,
    num_clusters=num_clusters,
    distance='euclidean',
    device=device,
)

In [None]:
# predict cluster ids for y, using the centres obtained from the fit on x
cluster_ids_y = kmeans_predict(
    X=y,
    cluster_centers=cluster_centers,
    distance='euclidean',
    device=device,
)

In [None]:
# plot: draw everything on one explicit Axes object
fig, ax = plt.subplots(figsize=(4, 3), dpi=160)

# points of x, coloured by their cluster id
ax.scatter(x[:, 0], x[:, 1], c=cluster_ids_x, cmap='cool')
# points of y, same colour map but drawn with an 'X' marker
ax.scatter(y[:, 0], y[:, 1], c=cluster_ids_y, cmap='cool', marker='X')
# cluster centres as semi-transparent white discs with a black outline
ax.scatter(
    cluster_centers[:, 0], cluster_centers[:, 1],
    c='white', alpha=0.6, edgecolors='black', linewidths=2,
)

ax.axis([-1, 1, -1, 1])
fig.tight_layout()
plt.show()

## K-means CPU vs GPU

In [None]:
# Benchmark configuration: feature dimensions and number of clusters.
dims = 2
num_clusters = 3

# Number of points generated for each benchmark run.
data_sizes = [100_000, 1_000_000, 5_000_000, 10_000_000]

In [None]:
# Wall-clock time of one kmeans run per data size, on the GPU and on the CPU.
gpu_times = []
cpu_times = []

# The GPU run needs a CUDA device. Without one, NaN is recorded instead of
# crashing, so gpu_times stays the same length as data_sizes and cpu_times.
has_gpu = torch.cuda.is_available()
if not has_gpu:
    print('CUDA is not available - GPU timings are recorded as nan')

for data_size in data_sizes:
    print(f'\ndata size: {data_size}')

    # data: fresh standard-normal samples scaled by 1/6, as a tensor
    x = np.random.randn(data_size, dims) / 6
    x = torch.from_numpy(x)

    # gpu
    if has_gpu:
        start_gpu = time()
        kmeans_gpu = kmeans(X=x, num_clusters=num_clusters, device=torch.device('cuda:0'))
        gpu_time = time() - start_gpu
    else:
        gpu_time = float('nan')
    gpu_times.append(gpu_time)
    print(f'gpu time: {gpu_time}')

    # cpu: same data, same number of clusters
    start_cpu = time()
    kmeans_cpu = kmeans(X=x, num_clusters=num_clusters, device=torch.device('cpu'))
    cpu_time = time() - start_cpu
    cpu_times.append(cpu_time)
    print(f'cpu time: {cpu_time}')

In [None]:
# plot: one line per device, run time against data size
fig, ax = plt.subplots(figsize=(6, 3), dpi=160)
for label, times, colour in (
    ('gpu', gpu_times, 'xkcd:vermillion'),
    ('cpu', cpu_times, 'xkcd:neon blue'),
):
    ax.plot(data_sizes, times, marker='o', label=label, color=colour)

# put a tick at each benchmarked data size
ax.set_xticks(data_sizes)
ax.legend(fontsize=12)
ax.grid(alpha=0.2)
ax.set_xlabel('data size', fontsize=14)
ax.set_ylabel('time (s)', fontsize=14)
plt.show()

In [None]:
# Load the CalCOFI bottle CSV into a DataFrame and preview its first rows.
# `pd` is pandas, imported in the notebook's import cell.
dataframe_raw = pd.read_csv('/scratch/vp91/AAPP2023/Data/CalCOFI_bottle.csv')
dataframe_raw.head()

In [None]:
# Display pandas' summary statistics for the raw frame.
dataframe_raw.describe()

In [None]:
# Keep only the seven variables used in this section, then show how many
# missing values each of them contains.
dataframe = dataframe_raw.filter(
    items=[
        'T_degC', 'Depthm', 'Salnty', 'O2ml_L', 'STheta', 'O2Sat', 'Oxy_µmol/Kg',
    ]
)
dataframe.isna().sum()

In [None]:
def customize_dataset(data):
    """Select the seven variables of interest and mean-impute missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Only the columns 'T_degC', 'Depthm', 'Salnty',
        'O2ml_L', 'STheta', 'O2Sat' and 'Oxy_µmol/Kg' are kept; any of
        them that is absent from ``data`` is simply skipped. ``data``
        itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns, in the order listed
        above, where each NaN in a numeric column has been replaced by
        that column's mean. A column with no non-missing values keeps
        its NaNs because its mean is undefined.
    """
    # select only 7 variables; filter() returns a new frame, so the caller's
    # data stays untouched without an explicit deep copy
    dataframe = data.filter(items=['T_degC', 'Depthm', 'Salnty', 'O2ml_L', 'STheta', 'O2Sat', 'Oxy_µmol/Kg'])
    # fill na values with the per-column mean; fillna aligns the Series of
    # means on column names, replacing the deprecated groupby(..., axis=1)
    return dataframe.fillna(dataframe.mean(numeric_only=True))

In [None]:
# Run the cleaning function on the working frame, then display the
# per-column count of missing values that remain.
dataframe = customize_dataset(dataframe)
dataframe.isna().sum()

In [None]:
# Display the cleaned frame.
dataframe

In [None]:
# Build the two-column (temperature, depth) feature matrix for clustering.
# NOTE(review): the name `input` shadows Python's builtin; it is kept here
# because other cells may refer to it.
input = dataframe.loc[:, ["T_degC", "Depthm"]].to_numpy()
# Wrap the NumPy array as a tensor for kmeans.
x = torch.from_numpy(input)
input.shape

In [None]:

# k-means on the temperature/depth tensor, reusing num_clusters and device
# from the earlier cells
cluster_ids_x, cluster_centers = kmeans(
    X=x,
    num_clusters=num_clusters,
    distance='euclidean',
    device=device,
)