### Cargamos nuestro dataset

In [1]:
import pandas as pd

In [3]:
df = pd.read_excel('../../data/dataset_v4.xlsx')
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,catalogation_id,cronology,cronology_time,culture_cl,morfofunctional_category,description,principal_scene,decoration_tecnique_external_body_section1,color_external_body_section1,color_internal_body_section1,...,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105,file_path,image_path
0,ML020107,Horizonte Medio,7,Sican,botella doble cuerpo asa puente cintada silbadora,botella doble cuerpo asa puente cintada silbad...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020107a.jpg
1,ML020108,Horizonte Medio,7,Sican,botella doble pico asa puente cintada escultorica,botella doble pico asa puente cintada escultor...,,pintado escultorico,rojo y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020108a.jpg
2,ML020109,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020109a.jpg
3,ML020110,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020110a.jpg
4,ML020111,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y marron,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020111a.jpg


### Calculamos la distancia

In [4]:
text1 = df[df.culture_cl == 'Chanca']['description'].iloc[0]
text1

'botella gollete asa lateral con representacion de cabezas antropomorfas con tocado de plumas.'

In [5]:
text2 = df[df.culture_cl == 'Salinar']['description'].iloc[0]
text2

'cantaro gollete asa lateral con diseños geometricos de lineas, triangulos y circulo.'

In [6]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

Tokenizamos las descripciones y obtenemos los embeddings

In [7]:
def get_BERT_embedding(tokenizer, model, description):

    if description is np.nan:
        return []
    
    tokens = tokenizer([description], padding=True, truncation=True, return_tensors="pt", max_length=128)
    with torch.no_grad():
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state[0].mean(dim=0).reshape(1, -1)

    return embedding


In [8]:
embed_df = df[['catalogation_id', 'culture_cl', 'description', 'cronology_time']]
embed_df

Unnamed: 0,catalogation_id,culture_cl,description,cronology_time
0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,7
1,ML020108,Sican,botella doble pico asa puente cintada escultor...,7
2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,7
3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,7
4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,7
...,...,...,...,...
33577,ML038832,Tiahuanaco,plato con diseños geometricos de lineas horizo...,6
33578,ML038833,Tiahuanaco,plato con diseños geometricos de eses ( s) y l...,6
33579,ML015075,Cajamarca,cuenco escultorico que representa a un felino ...,5
33580,ML015241,Cajamarca,cuenco con representacion de cabeza estilizada...,5


### Seteamos el tokenizador y modelo para texto en español

In [9]:
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Calculamos los embeddings y los guardamos en un csv

In [106]:
embed_df = embed_df[~embed_df.description.isnull()]

In [107]:
embed_df['bert_embedding'] = embed_df.description.apply(lambda x: get_BERT_embedding(tokenizer, model, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embed_df['bert_embedding'] = embed_df.description.apply(lambda x: get_BERT_embedding(tokenizer, model, x))


In [108]:
def tensor_to_numpy(embedding):
    return embedding.numpy()[0]

embed_df.bert_embedding = embed_df.bert_embedding.apply(tensor_to_numpy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embed_df.bert_embedding = embed_df.bert_embedding.apply(tensor_to_numpy)


In [111]:
embed_df.bert_embedding[0]

array([ 1.73167259e-01, -6.27487183e-01,  4.07278985e-02, -8.87306407e-02,
        6.17066383e-01, -3.37456018e-01,  1.35650739e-01, -1.49836630e-01,
        4.91086215e-01, -7.23881900e-01,  2.30144411e-01,  1.06237583e-01,
        3.14143538e-01,  1.91321909e-01, -8.02977085e-02,  1.02470064e+00,
       -1.06598914e-01, -6.57810509e-01, -2.17211880e-02, -2.85578817e-01,
       -5.75031281e-01, -2.11278543e-01, -2.45187268e-01, -2.55521417e-01,
       -2.15129197e-01,  2.18140204e-02,  2.81334557e-02, -4.54642415e-01,
       -5.49621284e-01, -3.94926481e-02,  3.41715403e-02,  1.34244099e-01,
       -5.11013925e-01,  3.30857217e-01, -6.59677982e-01, -2.92174190e-01,
        2.57504642e-01,  1.69662029e-01,  3.19388926e-01,  5.10906935e-01,
       -4.28253412e-01,  3.68706435e-02, -1.13036263e+00, -1.87272117e-01,
       -1.00036286e-01,  1.13334864e-01,  3.33571702e-01,  3.48164707e-01,
       -3.07757854e-01,  2.25729361e-01, -3.90123218e-01, -4.74586993e-01,
       -5.44732511e-01,  

In [112]:
def array_to_list(array):
    return list(array)

embed_df.bert_embedding = embed_df.bert_embedding.apply(array_to_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embed_df.bert_embedding = embed_df.bert_embedding.apply(array_to_list)


In [113]:
display(type(embed_df.bert_embedding[0]))
display(embed_df.bert_embedding[0])

list

[0.17316726,
 -0.6274872,
 0.0407279,
 -0.08873064,
 0.6170664,
 -0.33745602,
 0.13565074,
 -0.14983663,
 0.4910862,
 -0.7238819,
 0.23014441,
 0.10623758,
 0.31414354,
 0.19132191,
 -0.08029771,
 1.0247006,
 -0.10659891,
 -0.6578105,
 -0.021721188,
 -0.28557882,
 -0.5750313,
 -0.21127854,
 -0.24518727,
 -0.25552142,
 -0.2151292,
 0.02181402,
 0.028133456,
 -0.45464242,
 -0.5496213,
 -0.039492648,
 0.03417154,
 0.1342441,
 -0.5110139,
 0.33085722,
 -0.659678,
 -0.2921742,
 0.25750464,
 0.16966203,
 0.31938893,
 0.51090693,
 -0.4282534,
 0.036870643,
 -1.1303626,
 -0.18727212,
 -0.100036286,
 0.113334864,
 0.3335717,
 0.3481647,
 -0.30775785,
 0.22572936,
 -0.39012322,
 -0.474587,
 -0.5447325,
 0.6497924,
 -0.36503905,
 0.40434918,
 -0.05008657,
 0.093666755,
 -0.14553383,
 -0.51341516,
 0.74688727,
 0.74546146,
 -0.2437545,
 0.06746982,
 -0.3694078,
 -0.016336882,
 0.07407888,
 0.29003033,
 -0.119390175,
 0.34554747,
 0.40465996,
 -0.41130495,
 -0.58150107,
 -0.59824437,
 0.10529467,
 

In [114]:
embed_df.to_csv('../data/bert_embeddings.csv', sep=';')

### Calculamos las distancias entre embeddings de descripciones

In [28]:
df = pd.read_csv('../../data/bert_embeddings.csv', sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,catalogation_id,culture_cl,description,embedding
0,0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306..."
1,1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387..."
2,2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677..."
3,3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069..."
4,4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455..."


### Preprocesamos los embeddings en el DF

In [29]:
def str_to_list(string):
    string_list = string.strip('][').split(', ')
    float_list = [float(i) if len(i) > 0 else 0.0 for i in string_list]
    return float_list

In [30]:
df.embedding = df.embedding.apply(str_to_list)

In [39]:
from openai.embeddings_utils import cosine_similarity
import time
import numpy as np

In [37]:
cultures = embed_df.culture_cl.unique().tolist()
culture_pairs = [(a, b) for idx, a in enumerate(cultures) for b in cultures[idx + 1:]]
len(culture_pairs)

171

In [97]:
df.groupby('culture_cl')['culture_cl'].count().sort_values(ascending=False)

culture_cl
Moche         14250
Chimu          4834
Wari           4749
Nasca          3098
Sican          1367
Chancay         931
Cajamarca       889
Tiahuanaco      710
Salinar         671
Inca            622
Cupisnique      528
Vicus           374
Recuay          327
Paracas         136
Chincha          39
Pukara           31
Gallinazo        20
Chanca            4
Lima              2
Name: culture_cl, dtype: int64

In [99]:
import time
import numpy as np

In [101]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,catalogation_id,culture_cl,description,bert_embedding
0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306..."
1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387..."
2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677..."
3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069..."
4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455..."


In [127]:
seconds_per_data = 0.000082850766389
distances = {p: list() for p in culture_pairs}

for i in range(32, len(culture_pairs)):
    start_time = time.time()
    c1 = culture_pairs[i][0]
    c2 = culture_pairs[i][1]
    
    print(f'{i} -> Cultures: {c1}, {c2}')
    df_c1 = embed_df[embed_df.culture_cl == c1]
    df_c2 = embed_df[embed_df.culture_cl == c2]

    # Time estimation
    len_c1 = len(df_c1)
    len_c2 = len(df_c2)
    total_data = len_c1 * len_c2
    estimated_s = total_data * seconds_per_data
    estimated_minute = '0' + str(round(estimated_s / 60)) if round(estimated_s / 60) < 10 else str(round(estimated_s / 60))
    estimated_second = '0' + str(np.round(estimated_s % 60, 2)) if np.round(estimated_s % 60, 2) < 10 else str(np.round(estimated_s % 60, 2))
    print(f'Estimated time in seconds: {np.round(estimated_s, 2)}')
    print(f'Estimated time in minutes: {estimated_minute}:{estimated_second}')

    embed_c1 = df_c1.bert_embedding
    embed_c2 = df_c2.bert_embedding

    for e1 in embed_c1:
        for e2 in embed_c2:
            dist = cosine_similarity(e1, e2)
            distances[(c1, c2)].append(dist)
    
    end_time = time.time()

    # Save distances in file
    mean = np.mean(distances[(c1, c2)])
    file = open(f'../data/distances/mean_cosine_distances_bert_ctime{ctime}.txt', 'a')
    file.write(f'{i}: {c1}-{c2} -> {mean}\n')
    file.close()

    print(f'Time distances between: {end_time - start_time}s')
    print(f'Mean distance: {mean}')
    print()

32 -> Cultures: Recuay, Inca
Estimated time in seconds: 16.85
Estimated time in minutes: 00:16.85
Time distances between: 16.70394992828369s
Mean distance: 0.6501529812812805

33 -> Cultures: Recuay, Tiahuanaco
Estimated time in seconds: 19.24
Estimated time in minutes: 00:19.24
Time distances between: 18.99306082725525s
Mean distance: 0.6567280292510986

34 -> Cultures: Recuay, Chincha
Estimated time in seconds: 1.06
Estimated time in minutes: 00:01.06
Time distances between: 1.0805580615997314s
Mean distance: 0.6457753777503967

35 -> Cultures: Vicus, Lima
Estimated time in seconds: 0.06
Estimated time in minutes: 00:00.06
Time distances between: 0.0668489933013916s
Mean distance: 0.6732009053230286

36 -> Cultures: Vicus, Gallinazo
Estimated time in seconds: 0.62
Estimated time in minutes: 00:00.62
Time distances between: 0.659064769744873s
Mean distance: 0.7099899053573608

37 -> Cultures: Vicus, Moche
Estimated time in seconds: 441.55
Estimated time in minutes: 07:21.55
Time dista

## Calculamos la matriz por periodo

In [31]:
dataset_df = pd.read_excel('../../data/dataset_v4.xlsx')
dataset_df = dataset_df.drop('Unnamed: 0', axis=1)
dataset_df.head()

Unnamed: 0,catalogation_id,cronology,cronology_time,culture_cl,morfofunctional_category,description,principal_scene,decoration_tecnique_external_body_section1,color_external_body_section1,color_internal_body_section1,...,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105,file_path,image_path
0,ML020107,Horizonte Medio,7,Sican,botella doble cuerpo asa puente cintada silbadora,botella doble cuerpo asa puente cintada silbad...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020107a.jpg
1,ML020108,Horizonte Medio,7,Sican,botella doble pico asa puente cintada escultorica,botella doble pico asa puente cintada escultor...,,pintado escultorico,rojo y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020108a.jpg
2,ML020109,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020109a.jpg
3,ML020110,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020110a.jpg
4,ML020111,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y marron,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020111a.jpg


In [32]:
df['cronology_time'] = dataset_df['cronology_time']

In [33]:
df

Unnamed: 0.1,Unnamed: 0,catalogation_id,culture_cl,description,embedding,cronology_time
0,0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.17316726, -0.6274872, 0.0407279, -0.0887306...",7
1,1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.17790586, -0.68238825, -0.40351287, -0.2387...",7
2,2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.07478304, -0.5415242, -0.34691423, 0.007677...",7
3,3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[-0.06786743, -0.45162752, -0.20201102, -0.069...",7
4,4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.095533825, -0.518077, -0.30639765, -0.22455...",7
...,...,...,...,...,...,...
33576,33577,ML038832,Tiahuanaco,plato con diseños geometricos de lineas horizo...,"[-0.21575417, -0.6014442, -0.5351323, 0.295663...",6
33577,33578,ML038833,Tiahuanaco,plato con diseños geometricos de eses ( s) y l...,"[-0.59130156, -1.0794916, -0.2997423, -0.03917...",6
33578,33579,ML015075,Cajamarca,cuenco escultorico que representa a un felino ...,"[0.10485042, -0.74276745, -0.27508265, -0.3266...",6
33579,33580,ML015241,Cajamarca,cuenco con representacion de cabeza estilizada...,"[0.7909232, -0.19150409, -0.6404954, -0.548680...",5


In [35]:
ctimes = df.cronology_time.unique().tolist()
ctimes

[7, 6, 8, 5, 9]

In [None]:
ctimes.remove(9)

In [72]:
from numpy.linalg import norm

def get_positive_positions(v1, v2, vector_length):
    positions = [i for i in range(vector_length) if v1[i] != 0.0 and v2[i] != 0.0]
    return positions

def _cosine_similarity(v1, v2):
    positions = get_positive_positions(v1, v2, len(v1))
    if len(positions) == 0:
        return 0.0
    
    reshaped_v1 = [v1[i] for i in positions]
    reshaped_v2 = [v2[j] for j in positions]

    distance = np.dot(reshaped_v1, reshaped_v2) / (norm(v1) * norm(v2))
    print(np.dot(reshaped_v1, reshaped_v2))
    print((norm(v1) / norm(v2)))
    return 1 - distance

In [50]:
ctimes

[7, 6, 8, 5]

In [73]:
for ctime in ctimes:
    if ctime == 7:
        continue
    filtered_df = df[df['cronology_time'] == ctime]
    print(f'Cronology_time: {ctime}')
    display(filtered_df.groupby('culture_cl')['culture_cl'].count().sort_values(ascending=False))

    cultures = filtered_df.culture_cl.unique().tolist()
    culture_pairs = [(a, b) for idx, a in enumerate(cultures) for b in cultures[idx + 1:]]
    print(f'Culture pairs: {len(culture_pairs)}')

    distances = {p: list() for p in culture_pairs}

    for i in range(1, len(culture_pairs)):
        start_time = time.time()
        c1 = culture_pairs[i][0]
        c2 = culture_pairs[i][1]
        
        print(f'{i} -> Cultures: {c1}, {c2}')
        df_c1 = filtered_df[filtered_df.culture_cl == c1]
        df_c2 = filtered_df[filtered_df.culture_cl == c2]

        embed_c1 = df_c1.embedding
        embed_c2 = df_c2.embedding

        for e1 in embed_c1:
            for e2 in embed_c2:
                dist = _cosine_similarity(e1, e2)
                distances[(c1, c2)].append(dist)
        
        end_time = time.time()

        # Save distances in file
        mean = np.mean(distances[(c1, c2)])
        file = open(f'../../data/distances/mean_cosine_distances_bert_ctime_{ctime}.txt', 'a')
        file.write(f'{i}: {c1}-{c2} -> {mean}\n')
        file.close()

        print(f'Time distances between: {end_time - start_time}s')
        print(f'Mean distance: {mean}')
        print()


Cronology_time: 6


culture_cl
Moche         14250
Nasca          3027
Cajamarca       883
Recuay          327
Gallinazo        20
Tiahuanaco       10
Lima              2
Pukara            2
Salinar           2
Chimu             1
Vicus             1
Name: culture_cl, dtype: int64

Culture pairs: 55
1 -> Cultures: Recuay, Lima
188.28082650354781
1.01131494628298
123.32476030471858
0.9715637055834659
211.21807295856775
1.0008307477536489
171.54789393332658
0.9614916040975067
204.8532399688201
0.9758832095762071
152.27271272581427
0.9375246660769168
198.79380100068812
0.9947269300955001
164.92585721853297
0.9556277060863548
205.05818558020343
0.9997367598096862
151.13282440573852
0.9604406169795899
216.2674366989803
1.001371096318816
151.23027758429902
0.9620107134572664
191.39690563114507
1.0200958201214874
131.5783213805701
0.9799994340933202
118.58837581084944
1.3218516057447205
111.89068189115021
1.269894258983334
196.7785081381378
1.1132725104054841
206.52699119362734
1.0695136757438062
152.3132368844961
1.1771485403617732
137.08051239958917
1.1308789631751732
188.64953703258436
1.01608458639807
141.08450378984395
0.9761458678876506
105.33951506363505
1.249191258021143
124.8682520478367
1.2000899344820835
195.71417819455283
1.0162373542254133
148.1500037854128

KeyboardInterrupt: 