In [15]:
from scipy.spatial.distance import cosine

def find_most_similar(embedding, embeddings_dict,incName):
    # Encuentra el vector más cercano usando similitud coseno
    most_similar_name = None
    similarities=[]
    highest_similarity = -1  # Menor posible para similitud coseno (coseno más grande implica más cercano)
    
    for name, candidate_embedding in embeddings_dict.items():
        similarity = 1 - cosine(embedding, candidate_embedding)  # Similitud coseno
        similarities.append((incName,name, similarity))
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_name = name

    return most_similar_name, highest_similarity,similarities



In [16]:

import os
from dotenv import load_dotenv
import google.generativeai as genai
from pathlib import Path
import json

In [17]:

ROOT_DIR = "/home/nahuel/Documents/tesis" + "/fine_tunning"
DATA_FOLDER = ROOT_DIR  + "/data/sedici/jsons/"

filename = DATA_FOLDER + "final_metadata_Chekced2.json"


def flatten_comprehension(matrix):  
    new_matrix = [] 
    for elem in matrix:
        if(isinstance(elem,list)):
            matrix.extend(elem)
        else:
            new_matrix.append(elem)
    return new_matrix

with open(filename) as f:
    data = json.load(f)
    nombres_con_errores = ([value["sedici.creator.person"] for key,value in list(data.items())[30:40]])
    nombres_con_errores = flatten_comprehension(nombres_con_errores)

nombres_reales = ['Santucho, Veronica','Sanchez Veronica' ,'Panigo, Nahuel','Villanueva, Magali Jimena', 'González, Ernesto', 'Rodríguez, Fernando', 'Hoffmann, Karen', 'García Lambas, Dolores', 'Gaztañaga, Emanuel' , 'Garcia, Damian', 'Balghzal, Ahmed','Barberá, José Andres Miguel', 'Vela Gonzáles, Marta', 'Gisbert Caudeli, Vicenta', 'Elisondo, Romina Cecilia', 'Herreras Carrera, Alex', 'Farina, María Andrea', 'De la Ossa Martínez, Marco Antonio', 'Partida-Valdivia, José Marcos', 'Weik, Christian','Pelizza, Fabricio Nahuel', 'Pastor, Sebastián Oscar', 'Sario, Gisela', 'Medina, Matías Eduardo']


In [18]:
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
model_name = "gemini-1.5-pro"

genai.configure(api_key=GOOGLE_API_KEY)


from google.api_core import retry
from tqdm.rich import tqdm


tqdm.pandas()

@retry.Retry(timeout=300.0)
def embed_fn(text: str) -> list[float]:
    # You will be performing classification, so set task_type accordingly.
    response = genai.embed_content(
        model="models/embedding-001", content=text, task_type="SEMANTIC_SIMILARITY"
    )

    return response["embedding"]


def create_embeddings(nombres_reales , nombres_con_errores):
    dict_nombres_con_errores = {elem : embed_fn(elem) for elem in nombres_con_errores}  
    dict_nombres_reales = {elem : embed_fn(elem) for elem in nombres_reales}  
    return dict_nombres_con_errores , dict_nombres_reales

dict_nombres_con_errores , dict_nombres_reales = create_embeddings(nombres_reales , nombres_con_errores)



In [22]:
for elem in  dict_nombres_con_errores.items():
    mostSim,name,sim = find_most_similar(elem[1],dict_nombres_reales,incName=elem[0])
    print(f"{elem[0]} : {name} , {name} , {mostSim}")


Balghzal, Ahmed : 1.0 , 1.0 , Balghzal, Ahmed
Elisondo, Romina Cecilia : 1.0 , 1.0 , Elisondo, Romina Cecilia
Herreras Carrera, Aleix : 0.9351311554355846 , 0.9351311554355846 , Herreras Carrera, Alex
Farina, María Andrea : 1.0 , 1.0 , Farina, María Andrea
De la Ossa Martínez, Marco Antonio : 1.0 , 1.0 , De la Ossa Martínez, Marco Antonio
Partida-Valdivia, José Marcos : 1.0 , 1.0 , Partida-Valdivia, José Marcos
Weik, Christian : 1.0 , 1.0 , Weik, Christian
Santucho, V. : 0.93031908136773 , 0.93031908136773 , Santucho, Veronica
González, E. : 0.9347545252238895 , 0.9347545252238895 , González, Ernesto
Rodríguez, F. : 0.9349529057658487 , 0.9349529057658487 , Rodríguez, Fernando
Hoffmann, K. : 0.927495773826017 , 0.927495773826017 , Hoffmann, Karen
García Lambas, D. : 0.8822028809593124 , 0.8822028809593124 , García Lambas, Dolores
Gaztañaga, E. : 0.9552999936631263 , 0.9552999936631263 , Gaztañaga, Emanuel
Barberá, José Miguel : 0.979438377568586 , 0.979438377568586 , Barberá, José Andr

In [4]:
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Filtrar modelos por criterios específicos
filtered_models = [
    model for model in list(genai.list_models()) 
    if model["capabilities"].get("text-generation")  # Filtra por tarea
    and "en" in model["languages"]                   # Filtra por idioma soportado
]

for model in filtered_models:
    print(f"Modelo: {model['name']}, Idiomas: {model['languages']}")

TypeError: 'Model' object is not subscriptable

In [10]:
embedding_models = [
    model for model in genai.list_models()
    if ("embedText"  in model.supported_generation_methods) or "embedContent" in (model.supported_generation_methods)
]


In [14]:
def test_language_support(model, text, language):
    try:
        response = genai.embed_content(model=model.name, content=text,task_type="SEMANTIC_SIMILARITY")
        print(f"Modelo {model.name} soporta el idioma {language}: {response}")
    except Exception as e:
        print(f"Modelo {model.name} no soporta el idioma {language}: {e}")

# Probar el idioma inglés
for model in embedding_models:
    test_language_support(model, "This is a test", "English")

Modelo models/embedding-gecko-001 no soporta el idioma English: 404 models/embedding-gecko-001 is not found for API version v1beta, or is not supported for embedContent. Call ListModels to see the list of available models and their supported methods.
Modelo models/embedding-001 soporta el idioma English: {'embedding': [0.039482996, -0.036107507, -0.014346839, -0.054782055, 0.013545979, 0.014014631, 0.004637926, -0.033478897, -0.009356555, 0.03269703, 0.062836744, 0.012499185, 0.010228307, -0.0414974, 0.06257874, 0.01770084, -0.026797827, -0.01703698, -0.0016644177, -0.0026069242, -2.7632253e-05, 0.018724337, -0.011727795, 0.0031417853, 0.010977595, 0.0069761383, 0.028832309, -0.05181897, -0.033819493, -0.0011543913, -0.09650505, 0.0015438764, -0.072139904, 0.011156209, 0.01780935, -0.06897748, 0.011993268, 0.017032115, 0.0022064508, 0.005291169, 0.006076264, -0.01544491, -0.026038127, -0.0013885648, 0.067053474, 0.0072148214, 0.038998786, 0.016313823, 0.01902624, -0.069474906, 0.051678

In [None]:
    response = genai.embed_content(
        model="models/text-multilingual-embedding-002", content=text, task_type="SEMANTIC_SIMILARITY"
    )