### kMeans
Label "unlabeled" data by finding clusters  
<img src="../doc/41_kmeans.png" alt="kMeans" width="300"/>

#### Algorithm
1. Create k cluster centerpoints (randomly over dataset)
2. Assign each point to the cluster whose centerpoint is closest to it
3. Compute the centroid of the points in each cluster
4. Repeat from step 2, using the centroids as the new centerpoints, until the assignments no longer change

#### Finding k in kMeans: Elbow-Method
Choose the k at which the distance of the cluster points to their center (measured as inertia, the sum of squared distances) stops decreasing significantly.

In [None]:
# import
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the prepared autos dataset and preview its first rows
csv_path = "../res/autos_prepared.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Visualize data
%matplotlib inline
import matplotlib.pyplot as plt

plt.xlabel('Year')
plt.ylabel('Price in €')
plt.scatter(df["yearOfRegistration"], df["price"])

In [None]:
# Select the two columns that are used as clustering features
feature_columns = ["yearOfRegistration", "price"]
x = df[feature_columns]

In [None]:
# Standardize the features: learn the scaling parameters from x first,
# then apply them; the fitted scaler is kept to undo the scaling later
scaler = StandardScaler().fit(x)
x_transformed = scaler.transform(x)

In [None]:
# Cluster the scaled data into 3 groups using kMeans.
# random_state is fixed so that labels and centers are reproducible across
# runs (kMeans starts from randomly chosen centerpoints); n_init is set
# explicitly because its default differs between scikit-learn versions.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(x_transformed)

# Cluster index assigned to each row
print(model.labels_)
# Cluster centers converted from scaled space back to the original units
print(scaler.inverse_transform(model.cluster_centers_))

In [None]:
# Visualize clustered data
%matplotlib inline
import matplotlib.pyplot as plt

plt.xlabel('Year')
plt.ylabel('Price in €')
plt.scatter(df["yearOfRegistration"], df["price"], c = model.labels_)

# Plot centers (untransform)
kmeans_centers = scaler.inverse_transform(model.cluster_centers_)
plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1], c = 'red', marker = 'x')

In [None]:
# Elbow-Method: fit kMeans for k = 2..9 and record the inertia of each fit.
# The models are fitted inline instead of being bound to `model`, so the
# 3-cluster `model` used by the cluster visualization is not overwritten.
# random_state is fixed for reproducible inertias; n_init is explicit because
# its default differs between scikit-learn versions.
inertias = [
    KMeans(n_clusters=n, n_init=10, random_state=42).fit(x_transformed).inertia_
    for n in range(2, 10)
]

In [None]:
# Visualize clustered data
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(range(2, 10), inertias)