## K-Means Clustering

- Centroids
- Equidistant line
- Calculate the center of mass
- Elbow Method (WCSS)
- K-Means++
- Create a dependent variable


### Importing the libraries


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


### Importing the dataset


In [None]:
# Clustering is unsupervised, so there is NO dependent variable to extract a priori.
# Only two columns are kept as features; that choice is a didactic simplification.
feature_column_positions = [3, 4]
dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, feature_column_positions].values

### Using the elbow method to find the optimal number of clusters

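- WCSS (within-cluster sum of squares) is the sum of the squared distances between each observation point of a cluster and that cluster's centroid, added up over all clusters:
  $\mathrm{WCSS} = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2$, where $\mu_k$ is the centroid of cluster $C_k$.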
- The "elbow" point on the plot indicates where the rate of decrease in WCSS starts to slow down, suggesting the optimal number of clusters. If you don't see a clear elbow, you may need to explore a wider range of cluster numbers or try different clustering algorithms.


In [None]:

from sklearn.cluster import KMeans

# Candidate numbers of clusters to evaluate; 1 to 10 is a reasonable starting point.
# Defined once so the fitting loop and the plot's x-axis cannot drift apart.
cluster_counts = range(1, 11)

# wcss[j] holds the within-cluster sum of squares for cluster_counts[j].
wcss = []
for n_clusters in cluster_counts:
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases (10 restarts -> 'auto'), so leaving it implicit makes the curve
    # depend on the installed version and triggers a FutureWarning on some of them.
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++',
                    n_init=10, random_state=42)
    kmeans.fit(X)
    # inertia_ is the fitted model's sum of squared distances of the samples
    # to their closest centroid, i.e. the WCSS for this number of clusters.
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters and look for the "elbow".
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Runtime noted in the original run: ~6.2s (machine-dependent).

### Training the K-Means model on the dataset


In [None]:
# Number of clusters chosen from the elbow plot produced by the previous cell.
optimal_k = 5

# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 restarts -> 'auto'), so leaving it implicit makes the fitted clusters depend
# on the installed version.
kmeans = KMeans(n_clusters=optimal_k, init='k-means++',
                n_init=10, random_state=42)

# Build the dependent variable: fit_predict trains the model on X and returns
# the index of the cluster each row of X was assigned to.
y_kmeans = kmeans.fit_predict(X)


In [None]:
# Show the cluster index assigned to each row of X by fit_predict.
print(y_kmeans)


### Visualizing the clusters

The number of clusters is the number of distinct values that the dependent variable (the cluster label created above) can take.


In [None]:
# One (colour, legend label) pair per cluster, listed in cluster-index order.
# A single loop replaces five copy-pasted scatter calls.
cluster_styles = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]

# X[rows, columns]: the boolean mask picks the rows assigned to one cluster;
# column 0 is plotted on the x-axis and column 1 on the y-axis.
for cluster_index, (colour, label) in enumerate(cluster_styles):
    members = y_kmeans == cluster_index
    plt.scatter(X[members, 0], X[members, 1], s=100, c=colour, label=label)

# Centroids are drawn last, with a larger marker, on top of the data points.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            s=300, c='yellow', label='Centroids')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
