In [1]:
!pip install -U sentence-transformers flagembedding

Collecting flagembedding
  Downloading FlagEmbedding-1.3.4.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.19.0 (from flagembedding)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting ir-datasets (from flagembedding)
  Downloading ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=2.19.0->flagembedding)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux201

In [2]:
# Mount Google Drive at /content/drive so the notebook can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

import pandas as pd

# Switch the working directory to the Drive folder that holds the .csv file.
os.chdir('/content/drive/MyDrive/Colab Notebooks/')

# Load the alliances dataset from the CSV file into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset_alianzas_agrosavia.csv')

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from FlagEmbedding import BGEM3FlagModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# -----------------------------
# 1. Prepare the data
# -----------------------------
# Drop rows whose alliance title ('ali_nombre') is missing.
df = df.dropna(subset=['ali_nombre'])

# -----------------------------
# 2. Load the embedding model
# -----------------------------
model = SentenceTransformer('BAAI/bge-m3')

# -----------------------------
# 3. Precompute the title embeddings
# -----------------------------
# Null titles were already removed above, so the first 5000 titles can be
# read straight from the column.
titulos = df['ali_nombre'].head(5000).tolist()
titulo_embeddings = model.encode(
    titulos,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

# =============================
# 4. Función de recomendación
# =============================
def recomendar_aliados_por_texto(texto_idea, top_k=10):
    """Recommend the allies whose alliance titles best match a free-text idea.

    The idea is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the precomputed ``titulo_embeddings`` of
    ``titulos``. Each title is then joined back to ``df`` to recover the
    organisation it belongs to.

    Args:
        texto_idea: Free-text description of the alliance idea.
        top_k: Maximum number of distinct allies (``uniorg_id``) to return.

    Returns:
        A DataFrame with columns ``uniorg_id``, ``uniorg_nombre``,
        ``ali_nombre`` and ``similitud``, ordered from most to least similar,
        with at most one row per ``uniorg_id``. An empty DataFrame with the
        same columns is returned when ``texto_idea`` is empty or blank.
    """
    columnas = ['uniorg_id', 'uniorg_nombre', 'ali_nombre', 'similitud']

    # A blank idea carries no signal to rank against.
    if texto_idea is None or not str(texto_idea).strip():
        return pd.DataFrame(columns=columnas)

    def _a_numpy(emb):
        # scikit-learn works on host-side arrays; a tensor that lives on a GPU
        # must be detached and copied to the CPU before it can be converted.
        if hasattr(emb, 'detach'):
            emb = emb.detach().cpu().numpy()
        return np.asarray(emb)

    idea_emb = _a_numpy(
        model.encode(texto_idea, convert_to_tensor=True, normalize_embeddings=True)
    )
    similitudes = cosine_similarity(
        idea_emb.reshape(1, -1), _a_numpy(titulo_embeddings)
    )[0]

    # Rank every title; keep a single row per distinct title so that repeated
    # titles do not multiply rows in the join below.
    df_similitudes = (
        pd.DataFrame({'ali_nombre': titulos, 'similitud': similitudes})
        .sort_values(by='similitud', ascending=False)
        .drop_duplicates(subset=['ali_nombre'])
    )

    # A left join keeps the ranking order of df_similitudes.
    resultado = pd.merge(
        df_similitudes,
        df[['uniorg_id', 'uniorg_nombre', 'ali_nombre']],
        on='ali_nombre',
        how='left',
    )
    resultado = resultado[columnas]

    # De-duplicate allies BEFORE truncating, so that up to `top_k` distinct
    # allies are returned instead of whatever survives from the top_k titles.
    resultado = (
        resultado.drop_duplicates(subset=['uniorg_id'])
        .head(top_k)
        .reset_index(drop=True)
    )
    return resultado

# =============================
# 5. Interfaz interactiva
# =============================
def lanzar_interfaz():
    """Build and display the interactive recommendation form.

    The form stacks a text area for the idea, a slider for the number of
    allies, a button and an output area. Clicking the button clears the output
    area and shows the result of ``recomendar_aliados_por_texto`` for the
    current text and slider value.
    """
    caja_idea = widgets.Textarea(
        value='Desarrollo sostenible de sistemas agroforestales en zonas rurales',
        placeholder='Escribe aquí la idea de la alianza...',
        description='Idea:',
        layout=widgets.Layout(width='100%', height='80px'),
        style={'description_width': 'initial'},
    )
    selector_k = widgets.IntSlider(
        value=10,
        min=1,
        max=20,
        step=1,
        description='Top K aliados',
        style={'description_width': 'initial'},
    )
    boton_recomendar = widgets.Button(description="Recomendar Aliados", button_style='success')
    panel_salida = widgets.Output()

    def _mostrar_recomendaciones(_boton):
        # Render inside the output widget, replacing any previous result.
        with panel_salida:
            clear_output()
            display(recomendar_aliados_por_texto(caja_idea.value, top_k=selector_k.value))

    boton_recomendar.on_click(_mostrar_recomendaciones)

    controles = [caja_idea, selector_k, boton_recomendar, panel_salida]
    display(widgets.VBox(controles))

# Launch the interactive interface
lanzar_interfaz()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

VBox(children=(Textarea(value='Desarrollo sostenible de sistemas agroforestales en zonas rurales', description…


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
from matplotlib import pyplot as plt

# NOTE(review): `_df_0` is not defined in any visible cell and this cell has
# never been executed (`In [None]`) — presumably a Colab-generated chart
# snippet; confirm which DataFrame it should plot before relying on it.
# Histogram of the 'uniorg_id' column, with the top and right borders hidden.
_df_0['uniorg_id'].plot(kind='hist', bins=20, title='uniorg_id')
ejes_actuales = plt.gca()
ejes_actuales.spines[['top', 'right']].set_visible(False)