### KMeans para agrupar datos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [13]:
def clean_df(df):
    '''
    Reglas de descarte
        - drop if culture_cl=="Chanca"
        -  drop if culture_cl=="Lima"
        -  drop if culture_cl=="Cajamarca" & period_n==5
        -  drop if culture_cl=="Cajamarca" & period_n==7
        -  drop if culture_cl=="Pukara" & period_n==6
        -  drop if culture_cl=="Salinar" & period_n==6
        -  drop if culture_cl=="Sican" & period_n==8
        - drop if culture_cl=="Vicus" & period_n==6
        - drop if region is null
        - drop if culture_cl is null
        - drop if period_n==5
    '''
    df = df[df.culture_cl != 'Chanca']
    df = df[df.culture_cl != 'Lima']
    df = df[~((df.culture_cl == 'Cajamarca') & (df.cronology_time == 5))]
    df = df[~((df.culture_cl == 'Cajamarca') & (df.cronology_time == 7))]
    df = df[~((df.culture_cl == 'Pukara') & (df.cronology_time == 6))]
    df = df[~((df.culture_cl == 'Salinar') & (df.cronology_time == 6))]
    df = df[~((df.culture_cl == 'Sican') & (df.cronology_time == 8))]
    df = df[~((df.culture_cl == 'Vicus') & (df.cronology_time == 6))]
    df = df[~df.region.isnull()]
    df = df[~df.culture_cl.isnull()]
    df = df[df.cronology_time != 5]

    df = df.reset_index(drop=True)
    
    return df

### Importamos el dataset consolidado

In [14]:
dataset_df = pd.read_excel('../../data/dataset_v4.xlsx')
dataset_df.head()

Unnamed: 0.1,Unnamed: 0,catalogation_id,cronology,cronology_time,culture_cl,morfofunctional_category,description,principal_scene,decoration_tecnique_external_body_section1,color_external_body_section1,...,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105,file_path,image_path
0,0,ML020107,Horizonte Medio,7,Sican,botella doble cuerpo asa puente cintada silbadora,botella doble cuerpo asa puente cintada silbad...,,pintado escultorico,crema y naranja,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020107a.jpg
1,1,ML020108,Horizonte Medio,7,Sican,botella doble pico asa puente cintada escultorica,botella doble pico asa puente cintada escultor...,,pintado escultorico,rojo y naranja,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020108a.jpg
2,2,ML020109,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020109a.jpg
3,3,ML020110,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020110a.jpg
4,4,ML020111,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y marron,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020111a.jpg


### Importamos los embeddings

In [15]:
dataset_filename = 'bert_embeddings.csv'
embedding_column = 'embedding'
sep_char = ';'

In [16]:
df = pd.read_csv(f'../../data/{dataset_filename}', sep=sep_char)
df

Unnamed: 0.1,Unnamed: 0,catalogation_id,culture_cl,description,embedding
0,0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306..."
1,1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387..."
2,2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677..."
3,3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069..."
4,4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455..."
...,...,...,...,...,...
33576,33577,ML038832,Tiahuanaco,plato con diseños geometricos de lineas horizo...,"[-0.21575417, -0.6014442, -0.5351323, 0.295663..."
33577,33578,ML038833,Tiahuanaco,plato con diseños geometricos de eses ( s) y l...,"[-0.59130156, -1.0794916, -0.2997423, -0.03917..."
33578,33579,ML015075,Cajamarca,cuenco escultorico que representa a un felino ...,"[0.10485042, -0.74276745, -0.27508265, -0.3266..."
33579,33580,ML015241,Cajamarca,cuenco con representacion de cabeza estilizada...,"[0.7909232, -0.19150409, -0.6404954, -0.548680..."


Agregamos la columna de cronología en caso no exista

In [17]:
if 'cronology_time' not in df.columns.tolist():
    df['cronology_time'] = dataset_df['cronology_time']
if 'region' not in df.columns.tolist():
    df['region'] = dataset_df['region']

Agregamos las columnas de traits al dataset

In [18]:
trait_columns = [col for col in dataset_df.columns.tolist() if 'trait_' in col]
for t in trait_columns:
    df[t] = dataset_df[t]
df = df.drop('Unnamed: 0', axis=1, errors='ignore')
display(df.head())
len(df)

  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]
  df[t] = dataset_df[t]


Unnamed: 0,catalogation_id,culture_cl,description,embedding,cronology_time,region,trait_color_amarillo,trait_color_beige,trait_color_blanco,trait_color_crema,...,trait_n87,trait_n88,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105
0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387...",7,Costa Norte,0,0,0,0,...,0,0,0,0,1,1,1,0,1,0
2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0


33581

### Limpiamos el dataset de embeddings

In [19]:
df = clean_df(df)
df

Unnamed: 0,catalogation_id,culture_cl,description,embedding,cronology_time,region,trait_color_amarillo,trait_color_beige,trait_color_blanco,trait_color_crema,...,trait_n87,trait_n88,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105
0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387...",7,Costa Norte,0,0,0,0,...,0,0,0,0,1,1,1,0,1,0
2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455...",7,Costa Norte,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31821,ML035466,Tiahuanaco,cuenco con diseños geometricos de lineas verti...,"[-0.15830764, -0.5200171, -0.5023318, -0.00599...",6,Costa Sur,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
31822,ML035498,Tiahuanaco,botella gollete asa lateral con diseños geomet...,"[-0.3365426, -0.81007195, -0.11956785, -0.1223...",6,Costa Sur,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0
31823,ML038832,Tiahuanaco,plato con diseños geometricos de lineas horizo...,"[-0.21575417, -0.6014442, -0.5351323, 0.295663...",6,Costa Sur,0,0,0,1,...,0,0,0,1,1,1,1,0,1,0
31824,ML038833,Tiahuanaco,plato con diseños geometricos de eses ( s) y l...,"[-0.59130156, -1.0794916, -0.2997423, -0.03917...",6,Costa Sur,0,0,0,1,...,0,0,0,0,1,1,1,0,1,0


In [20]:
def str_to_list(string):
    string_list = string.strip('][').split(', ')
    float_list = [float(i) for i in string_list]
    return float_list

df[embedding_column] = df[embedding_column].apply(str_to_list)
df_spread = df[[embedding_column]].apply(lambda x: pd.Series(x[embedding_column]), axis=1).rename(columns=lambda i: f"c{i+1}")

Definimos funciones para graficar

In [None]:
def sample_df(original_df, n_samples):
    cultures = original_df.culture_cl.unique().tolist()
    sampled_dfs = list()

    for c in cultures:
        culture_df = original_df[original_df.culture_cl == c]
        if n_samples > len(culture_df):
            sampled_dfs.append(culture_df)
        else:
            sampled_df = culture_df.sample(n=n_samples, random_state=100)
            sampled_dfs.append(sampled_df)
    
    df = pd.concat(sampled_dfs, axis=0)
    return df


def plot_embeddings_2d(df, 
                       attribute, 
                       model_name, 
                       path_to_save = '', 
                       original_df=None,
                       get_samples=False, 
                       n_samples=100, 
                       preprocess_fn = None,
                       filter_by_trait: str=None):
    
    new_df = df.copy()
    
    if filter_by_trait is not None and original_df is not None:
        new_df[filter_by_trait] = original_df[filter_by_trait]
        new_df = new_df[new_df[filter_by_trait] > 0]

    if get_samples:
        new_df = sample_df(new_df, n_samples)

    if preprocess_fn:
        new_df['dim1'] = new_df['dim1'].apply(preprocess_fn)
        new_df['dim2'] = new_df['dim2'].apply(preprocess_fn)

    sns.set(rc={"figure.figsize":(12, 10)})

    ax = sns.scatterplot(new_df, x='dim1', y='dim2', hue=attribute)

    # configurar el gráfico    
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(f"Dos dimensiones UMAP de distintas descripciones de imágenes según '{attribute}' | {model_name}")
    plt.xlabel('Dimensión 1')
    plt.ylabel('Dimensión 2')
    plt.show()

    if path_to_save:
        fig = ax.get_figure()
        fig.savefig(path_to_save) 

---
## KMeans

Ahora procedemos a utilizar kmeans como técnica de aprendizaje no supervisado. El objetivo de esto es comparar los clusters obtenidos contra los cluster que ya tenemos etiquetados, a modo de revisar el grado de separación y de ajuste entre ambos casos.