In [2]:
#we import all packages needed
import pandas as pd
import numpy as np
import matplotlib

# Seed NumPy's global random number generator with 42 so that any code drawing
# from np.random produces the same values on every run.
np.random.seed(42)

In [None]:
# Open the Excel workbook. The context manager closes the underlying file
# handle as soon as the sheet has been parsed.
with pd.ExcelFile("top 800 portatiles 2020.xlsx") as xls:
    print(xls.sheet_names)  # list the sheets available in the workbook
    portatiles = xls.parse("Hoja1")  # load the sheet used for the analysis

# Preview the dataset that is going to be used instead of printing the whole frame.
portatiles.head()

In [4]:
# Split the columns into numerical and categorical variables. The explicit
# .copy() makes both frames independent of `portatiles`, so the cleaning steps
# applied to them later never operate on (or warn about) a view of the raw data.
datos_numericos = portatiles.select_dtypes(include=[int, float]).copy()
datos_categoricos = portatiles.select_dtypes(include=[object, "category"]).copy()

In [None]:
# Fill empty numerical cells with the mean of their column. A single
# frame-level fillna replaces the per-column `inplace=True` calls: those act on
# a temporary Series, which pandas warns about and which does not update
# `datos_numericos` at all when Copy-on-Write is enabled.
datos_numericos = datos_numericos.fillna(datos_numericos.mean())

In [6]:
# Rescale every numerical column to the [0, 1] range so that no variable
# dominates the distance computations of k-means merely because of its units.
from sklearn.preprocessing import MinMaxScaler

datos_numericos_normalizado = pd.DataFrame(
    MinMaxScaler().fit_transform(datos_numericos),
    columns=datos_numericos.columns,
    # Keep the original row labels: the scaler returns a bare array, and the
    # column-wise concat performed afterwards aligns rows by index.
    index=datos_numericos.index,
)

In [7]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each variable, so a variable with k levels yields k-1 indicator columns.
datos_categoricos_codificados = pd.get_dummies(datos_categoricos, drop_first=True)

In [8]:
# Build the final dataset by placing the normalized numerical columns and the
# encoded categorical columns side by side (column-wise concatenation, axis=1).
portatiles_procesado = pd.concat([datos_numericos_normalizado, datos_categoricos_codificados], axis=1)

In [None]:
# Show the dimensions and the first rows of the final dataset. Only the last
# expression of a cell is displayed automatically, so the shape is printed.
print(portatiles_procesado.shape)
portatiles_procesado.head()

In [10]:
#we import kmeans algorithm
from sklearn.cluster import KMeans

In [None]:
# Create and fit the k-means estimator; change n_clusters to try a different
# number of groups. n_init is set explicitly because its default changed from
# 10 to "auto" in scikit-learn 1.4, which would otherwise make the resulting
# clusters depend on the installed version.
estimador_kmedias = KMeans(n_clusters=8, n_init=10, random_state=42)

estimador_kmedias.fit(portatiles_procesado)

In [None]:
# Retrieve the cluster label that the fitted estimator assigned to each row
clusters = estimador_kmedias.labels_
clusters

In [None]:
def resumen_cluster(cluster_id):
    """Summarise one cluster of ``portatiles``.

    Parameters
    ----------
    cluster_id
        Label of the cluster to describe; rows are selected where
        ``clusters == cluster_id``.

    Returns
    -------
    dict
        The most frequent value (mode) of every categorical column, the mean
        of every numerical column, and the key ``"cluster_id"`` holding the
        requested label.
    """
    # Rows of the original (unscaled) dataset that were assigned to this cluster.
    cluster = portatiles[clusters == cluster_id]

    # mode() may return several rows when there are ties; the first one is used.
    # A selection without rows yields no mode at all, so fall back to an empty
    # summary instead of failing on the [0] lookup.
    modas = cluster[datos_categoricos.columns].mode().to_dict(orient="records")
    resumen = modas[0] if modas else {}

    # numeric_only=True restricts the mean to numerical columns; without it
    # pandas >= 2.0 raises TypeError because of the text columns.
    resumen.update(cluster.mean(numeric_only=True).to_dict())
    resumen["cluster_id"] = cluster_id
    return resumen

# Summary of the cluster labelled 0
resumen_cluster(0)

In [14]:
# Helper for viewing several cluster summaries next to each other
def comparar_clusters(*cluster_ids):
    """Return a table comparing the summaries of the given clusters.

    Each cluster id is summarised with ``resumen_cluster``; the result has one
    column per cluster id and one row per summarised variable.
    """
    tabla = pd.DataFrame([resumen_cluster(cid) for cid in cluster_ids])
    return tabla.set_index("cluster_id").transpose()

In [None]:
# Compare all the clusters found: np.unique returns the distinct labels, which
# are unpacked as separate arguments.
comparar_clusters(*np.unique(clusters))