## Uso de TF-IDF para calcular distancia entre vectores de descripción

En este notebook se utiliza el algoritmo TF-IDF como enfoque alternativo para calcular la similitud entre imágenes de distintas culturas a partir de sus descripciones textuales.

### Cargamos el dataset

In [1]:
import pandas as pd

In [2]:
# Read the catalogue spreadsheet and discard the 'Unnamed: 0' column
# (presumably an index that was serialised with the file — confirm).
df = pd.read_excel('../../data/dataset_v3.xlsx').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,catalogation_id,cronology,cronology_time,culture_cl,morfofunctional_category,description,principal_scene,decoration_tecnique_external_body_section1,color_external_body_section1,color_internal_body_section1,...,trait_n89,trait_n90,trait_n100,trait_n101,trait_n102,trait_n103,trait_n104,trait_n105,file_path,image_path
0,ML020107,Horizonte Medio,7,Sican,botella doble cuerpo asa puente cintada silbadora,botella doble cuerpo asa puente cintada silbad...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020107a.jpg
1,ML020108,Horizonte Medio,7,Sican,botella doble pico asa puente cintada escultorica,botella doble pico asa puente cintada escultor...,,pintado escultorico,rojo y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020108a.jpg
2,ML020109,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020109a.jpg
3,ML020110,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y naranja,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020110a.jpg
4,ML020111,Horizonte Medio,7,Sican,botella gollete asa puente cintada protoma sil...,botella gollete asa puente cintada protoma sil...,,pintado escultorico,crema y marron,,...,0,0,1,1,1,0,1,0,data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTU...,data/images/ML020111a.jpg


### Utilizamos la librería NLTK para preprocesar el texto y scikit-learn para implementar el algoritmo TF-IDF

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

Definimos el lematizador y removemos las stopwords

Descargamos los recursos de NLTK necesarios: la lista de stopwords, WordNet (para el lematizador) y el tokenizador Punkt

In [22]:
import nltk

# Resources required further down:
#   stopwords -> stopwords.words("spanish")
#   wordnet   -> WordNetLemmatizer
#   punkt     -> word_tokenize on older NLTK releases
#   punkt_tab -> word_tokenize on newer NLTK releases, which no longer load
#                the pickled 'punkt' models; without it a fresh-kernel run
#                fails with LookupError at the first word_tokenize call.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ldavico/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ldavico/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ldavico/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
# NOTE(review): WordNet is an English lexical database, so this lemmatizer
# likely leaves most Spanish tokens unchanged — confirm whether a Spanish
# stemmer/lemmatizer was intended.
lemmatizer = WordNetLemmatizer()
# Spanish stop words kept in a set so membership checks are cheap.
stop_words = set(stopwords.words("spanish"))


In [43]:
# Two toy sentences sharing a common prefix, used to sanity-check the
# tokenize -> TF-IDF -> distance pipeline before running it on the dataset.
texto1, texto2 = (
    'Hola soy Luciano',
    'Hola soy Luciano y me gusta jugar futbol',
)

In [44]:
# Normalise both demo sentences with a single expression instead of two
# copy-pasted comprehensions: tokenize with the Spanish Punkt model, keep
# only alphanumeric tokens that are not stop words, lower-case and lemmatize
# them, then re-join with spaces so TfidfVectorizer can consume the result.
text1_tokenized, text2_tokenized = (
    ' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(texto, language='spanish')
        if word.isalnum() and word.lower() not in stop_words
    )
    for texto in (texto1, texto2)
)

### Vectorizamos por medio de `TfidfVectorizer` y calculamos la distancia (1 − similitud coseno)

In [49]:
# Fit a default TF-IDF model on the two demo sentences; the result has one
# row per sentence.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    (text1_tokenized, text2_tokenized)
)

In [46]:
# TfidfVectorizer L2-normalises each row by default, so the dot product of
# two rows is their cosine similarity.
similitud = (tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1]
# The quantity of interest is the cosine *distance*: 0 for identical term
# profiles, 1 when the two texts share no terms. It was previously stored
# under the misleading name `similitud`.
distancia = 1 - similitud
distancia

0.4976712217743283

### Ahora lo aplicamos a nuestro dataset y guardamos nuestras features

In [62]:
# Keep only rows that have a description, and only the columns needed for
# the export. A single .loc selection followed by .copy() yields an
# independent frame, so adding the 'tfidf_vector' column later does not
# trigger pandas' SettingWithCopyWarning (the original chained
# df[mask][cols] indexing could).
df = df.loc[
    df.description.notna(),
    ['catalogation_id', 'culture_cl', 'description'],
].copy()
descriptions = df.description.tolist()

In [67]:
def tokenize(description, lemmatizer, stop_words, language='spanish') -> str:
    """Normalise one description into a space-separated string of tokens.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphanumeric (punctuation, tokens with attached symbols) or whose
    lower-cased form is in ``stop_words`` are dropped; the remaining tokens
    are lower-cased and passed through ``lemmatizer.lemmatize``.

    Args:
        description: Raw description text.
        lemmatizer: Object exposing ``lemmatize(str) -> str``.
        stop_words: Container of lower-cased words to discard.
        language: Punkt model used by ``word_tokenize``. Defaults to
            ``'spanish'`` because the stop-word list used in this notebook
            is Spanish; the previous implementation silently used the
            English model.

    Returns:
        The normalised tokens joined by single spaces ('' if none survive).
    """
    kept = []
    for token in word_tokenize(description, language=language):
        lowered = token.lower()  # computed once, used for filter and lemma
        if token.isalnum() and lowered not in stop_words:
            kept.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept)

def tokenize_descriptions(descriptions, lemmatizer, stop_words) -> list:
    """Apply ``tokenize`` to every description, preserving input order.

    Args:
        descriptions: Iterable of raw description texts.
        lemmatizer: Forwarded unchanged to ``tokenize``.
        stop_words: Forwarded unchanged to ``tokenize``.

    Returns:
        A list with one normalised string per input description.
    """
    cleaned = []
    for text in descriptions:
        cleaned.append(tokenize(text, lemmatizer, stop_words))
    return cleaned

In [68]:
# Normalise every dataset description with the shared lemmatizer and
# stop-word set.
tokenized_descriptions = tokenize_descriptions(
    descriptions=descriptions,
    lemmatizer=lemmatizer,
    stop_words=stop_words,
)

In [92]:
def get_vectors_tf_idf(descriptions, vectorizer):
    """Fit ``vectorizer`` on ``descriptions`` and return the dense result.

    Args:
        descriptions: Iterable of (already normalised) texts.
        vectorizer: Object exposing ``fit_transform``; it is (re)fitted on
            the given texts by this call.

    Returns:
        Whatever ``.toarray()`` yields on the ``fit_transform`` result, i.e.
        the dense form of the fitted matrix.
    """
    return vectorizer.fit_transform(descriptions).toarray()

In [93]:
# Refit the vectorizer on the dataset descriptions and convert the dense
# result to nested Python lists (one list per description).
description_vectors_tfidf = (
    get_vectors_tf_idf(tokenized_descriptions, vectorizer)
    .tolist()
)


In [94]:
# Attach each description's TF-IDF vector (a plain Python list) as a new column.
df['tfidf_vector'] = description_vectors_tfidf

In [95]:
# Display the frame to check the new 'tfidf_vector' column.
df

Unnamed: 0,catalogation_id,culture_cl,description,tfidf_vector
0,ML020107,Sican,botella doble cuerpo asa puente cintada silbad...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ML020108,Sican,botella doble pico asa puente cintada escultor...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ML020109,Sican,botella gollete asa puente cintada protoma sil...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,ML020110,Sican,botella gollete asa puente cintada protoma sil...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ML020111,Sican,botella gollete asa puente cintada protoma sil...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
33577,ML038832,Tiahuanaco,plato con diseños geometricos de lineas horizo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
33578,ML038833,Tiahuanaco,plato con diseños geometricos de eses ( s) y l...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
33579,ML015075,Cajamarca,cuenco escultorico que representa a un felino ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
33580,ML015241,Cajamarca,cuenco con representacion de cabeza estilizada...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [97]:
# Write the frame to a ';'-separated CSV without the index.
# NOTE(review): 'tfidf_vector' holds Python lists, which CSV stores as their
# text representation; readers will presumably need to parse them back.
df.to_csv(
    '../../data/tfidf_vectors.csv',
    sep=';',
    index=False,
)