In [1]:
# General Use
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 3D Visualization 
import plotly as py
import plotly.graph_objs as go
import plotly.offline as pyo

#bd
from sqlalchemy import create_engine

In [2]:
dfp = pd.read_sql("SELECT * FROM venta",index_col="cod_cliente",con=engine)
dfp.head()

NameError: name 'engine' is not defined

In [None]:
df = pd.read_csv("Mall_Customers.csv", index_col="CustomerID")
df.head(20)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.drop(columns=['Gender'], inplace=True)

In [None]:
sns.distplot(df["Age"])

In [None]:
sns.distplot(df["Annual Income (k$)"])

In [None]:
sns.distplot(df["Spending Score (1-100)"])

### K-means Clustering

K-means is one of the most known algorithms to find clusters, works with large datasets while being fast and efficient. This technique was developed in 1957!

- Centroid-based clustering algorithm
- Maximize intra-cluster similarity
- Minimize inter-Cluster similarity

To start we need to set the number of clusters for the K-Means algorithm - How many clusters do we want? - Usually represented by K.

1. Then, the algorithm will randomly select K centers, one for each cluster of data (which are called centroids)
2. Assign each instance to the closest centroid.
3. Calculate the centroids of each cluster formed
4. Return to step 2
5. End when the centroids no longer change

Randomly selected centroids impact the results ? Yes, but with scikit-learn

### Client Segmentation with Annual Income and Spending Score

In [None]:
df1 = df[["Annual Income (k$)", "Spending Score (1-100)"]]

In [None]:
df1.plot(kind="scatter",  # tipo de grafica 
        x="Annual Income (k$)",
        y="Spending Score (1-100)",
        figsize=(10, 7))  # tamano de la grafica
plt.show()

NameError: name 'df1' is not defined

In [None]:
sum_of_sqr_dist = {} # diccionario para la suma de la distancia al cuadrada

for k in range(1, 10): # bucle para el valor de k numero de grupos
    km = KMeans(n_clusters=k, init='k-means++', max_iter=1000) # k-means de la biblioteca de scikit-learn, definimos n grupos y init inicializa los centroides  
    km = km.fit(df1) # ajuste del modelo
    sum_of_sqr_dist[k] = km.inertia_ # calculo de la suma de la distancia

In [None]:
sns.pointplot(x=list(sum_of_sqr_dist.keys()), y=list(sum_of_sqr_dist.values())) # grafica de la suma de la distancia al cuadrada 
plt.xlabel("Number of Clusters (K)") # etiqueta eje x 
plt.ylabel("Sum of Square Distances") # etiqueta eje y 
plt.title("Elbow Method for Optimal K") # titulo
plt.show() # mostrar grafica 

In [None]:
Model = KMeans(n_clusters=5, # numero de grupos 
             init='k-means++', # inicializacion
             max_iter=1000) # maximo de iteraciones
Model.fit(df1)

In [None]:
print("Labels", Model.labels_)

In [None]:
print("Centroids", Model.cluster_centers_)

In [None]:
centroids = Model.cluster_centers_

In [None]:
df1_cluster = df1.copy()
df1_cluster["Cluster"] = Model.fit_predict(df1)

In [None]:
df1_cluster.head()

In [None]:
Color = ['red', 'blue', 'black', 'gold', 'navy']
df1_cluster['Color'] = df1_cluster['Cluster'].map(lambda p:Color[p])

In [None]:
plt.figure(figsize=(20, 10))
plt.scatter(df1["Annual Income (k$)"],
           df1["Spending Score (1-100)"],
           c=df1_cluster["Color"])

plt.scatter(centroids[:, 0], centroids[:, 1], c='green', s=250)
plt.show()

In [None]:
labels = Model.labels_

In [None]:
silhouette_score(df1, labels)

In [None]:
silhouette = {} # Almacenamiento de valores 

for k in range(2,8):# el rango es de 2 a 7
    km = KMeans(n_clusters=k, init='k-means++', max_iter=1000) #
    km.fit(df1)
    silhouette[k] = silhouette_score(df1, km.labels_)

In [None]:
sns.pointplot(x=list(silhouette.keys()), y=list(silhouette.values()))
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Scores")
plt.title("Silhouette Scores for Each K")
plt.show()

In [None]:
df2= df.copy()

In [None]:
sum_of_sqr_dist = {}

for k in range(1, 10):
    km = KMeans(n_clusters=k, init='k-means++', max_iter=1000)
    km = km.fit(df2)
    sum_of_sqr_dist[k] = km.inertia_

In [None]:
sns.pointplot(x=list(sum_of_sqr_dist.keys()), y=list(sum_of_sqr_dist.values()))
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Sum of Square Distances")
plt.title("Elbow Method for Optimal K")
plt.show()

In [None]:
Model2 = KMeans(n_clusters=6, init='k-means++', max_iter=1000)
Model2.fit(df2)

In [None]:
df2['Cluster'] = Model2.fit_predict(df2)
df2.head()

In [None]:
labels = Model2.labels_
centroids = Model2.cluster_centers_

In [None]:
df2['labels'] = labels

trace = go.Scatter3d(
                    x = df2['Age'],
                    y = df2['Annual Income (k$)'],
                    z = df2['Spending Score (1-100)'],
                    mode = 'markers',
                    marker=dict(color=df2['labels'], size = 5, line=dict(color=df2['labels'], width=12), opacity=0.8)
                    )
    
data = [trace]
layout = go.Layout(
                    title='Clusters',
                    scene= dict(
                                xaxis = dict(title = 'Age'),
                                yaxis = dict(title = 'Annual Income (k$)'),
                                zaxis = dict(title = 'Spending Score (1-100)'))
                                )

fig = go.Figure(data=data, layout=layout)
pyo.offline.iplot(fig)
plt.show()