# Buscador de Documentos públicos Markdown com Embeddings no GitHub

## Primeiro construir um dataframe com as colunas url, titulo e conteudo dos documentos Markdown encontrados dentro do repositório informado 

In [16]:
import requests
import base64
import pandas as pd

def get_markdown_files(owner, repo):
    # Obter último commit da branch master
    branch_url = f"https://api.github.com/repos/{owner}/{repo}/branches/master"
    branch_response = requests.get(branch_url)
    
    # Verificar se a branch master existe
    if branch_response.status_code == 404:
        print("A branch 'master' não existe neste repositório.")
        return pd.DataFrame() # retorna um DataFrame vazio se a branch não existir
    
    commit_sha = branch_response.json()["commit"]["sha"]

    # Obter árvore do commit
    commit_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}"
    commit_response = requests.get(commit_url)
    tree_sha = commit_response.json()["commit"]["tree"]["sha"]

    # Obter conteúdo da árvore
    tree_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree_sha}?recursive=1"
    tree_response = requests.get(tree_url)
    tree_content = tree_response.json()

    # Verificar se a chave 'tree' existe na resposta
    if "tree" in tree_content:
        tree_content = tree_content["tree"]
    else:
        print("A resposta da API não contém a chave 'tree'.")
        return pd.DataFrame()

    # Filtrar arquivos .md e obter seu conteúdo
    data = []  # Lista para armazenar os dados
    for file in tree_content:
        if file["path"].endswith(".md"):
            blob_sha = file["sha"]
            blob_url = f"https://api.github.com/repos/{owner}/{repo}/git/blobs/{blob_sha}"
            blob_response = requests.get(blob_url)
            content = base64.b64decode(blob_response.json()["content"]).decode("utf-8")

            # Construir link completo do arquivo no GitHub
            file_url = f"https://github.com/{owner}/{repo}/blob/master/{file['path']}"

              # Encontrar o título
            lines = content.split('\n')
            title = ''
            found_title = False
            for line in lines:
                if not found_title and line.strip():
                    title += line.strip() + ' '
                    found_title = True
                elif found_title and line.strip():
                    title += line.strip()
                    break

            data.append({"url": file_url, "conteudo": content, "titulo": title})

    # Criar DataFrame
    df = pd.DataFrame(data)
    return df

# Exemplo de uso
df_markdown = get_markdown_files("marcosab10", "ICMC-USP")
print(df_markdown)

                                                  url  \
0   https://github.com/marcosab10/ICMC-USP/blob/ma...   
1   https://github.com/marcosab10/ICMC-USP/blob/ma...   
2   https://github.com/marcosab10/ICMC-USP/blob/ma...   
3   https://github.com/marcosab10/ICMC-USP/blob/ma...   
4   https://github.com/marcosab10/ICMC-USP/blob/ma...   
5   https://github.com/marcosab10/ICMC-USP/blob/ma...   
6   https://github.com/marcosab10/ICMC-USP/blob/ma...   
7   https://github.com/marcosab10/ICMC-USP/blob/ma...   
8   https://github.com/marcosab10/ICMC-USP/blob/ma...   
9   https://github.com/marcosab10/ICMC-USP/blob/ma...   
10  https://github.com/marcosab10/ICMC-USP/blob/ma...   
11  https://github.com/marcosab10/ICMC-USP/blob/ma...   
12  https://github.com/marcosab10/ICMC-USP/blob/ma...   
13  https://github.com/marcosab10/ICMC-USP/blob/ma...   
14  https://github.com/marcosab10/ICMC-USP/blob/ma...   
15  https://github.com/marcosab10/ICMC-USP/blob/ma...   
16  https://github.com/marcosab

## Configuração de API Key e importação das libs numpy e google.generativeai 

In [17]:
import numpy as np
import google.generativeai as genai


genai.configure(api_key="AIzaSyB2Oc0kaKu3BrC8xM4KgjHrWtU8IgrRSRo")

model = "models/embedding-001"

## Criação da função de embed

In [18]:
def embed_fn(title, text):
    return genai.embed_content(model=model,
                               content=text,
                               title=title,
                               task_type="RETRIEVAL_DOCUMENT")["embedding"]

## Criação da coluna de Embeddings no dataframe

In [21]:
df_markdown["Embeddings"] = df_markdown.apply(lambda row: embed_fn(row["url"], row["conteudo"][:9000]), axis=1)
df_markdown

Unnamed: 0,url,conteudo,titulo,Embeddings
0,https://github.com/marcosab10/ICMC-USP/blob/ma...,## Windows CMD Commands\n\n - A maioria dos co...,## Windows CMD Commands - A maioria dos comand...,"[-0.00028735085, -0.011052267, -0.07824952, -0..."
1,https://github.com/marcosab10/ICMC-USP/blob/ma...,### Learning Git\n\n - References\n - [Magi...,### Learning Git - References,"[0.032851085, -0.013950162, -0.027780956, -0.0..."
2,https://github.com/marcosab10/ICMC-USP/blob/ma...,## Linux Commands\n\n#### Source/Credits: http...,## Linux Commands #### Source/Credits: https:/...,"[0.0019569362, -0.008534544, -0.022349399, -0...."
3,https://github.com/marcosab10/ICMC-USP/blob/ma...,## Structured Query Language\n\n - Sites\n ...,## Structured Query Language - Sites,"[0.0134508675, -0.036066815, -0.055087853, -0...."
4,https://github.com/marcosab10/ICMC-USP/blob/ma...,# Título 1\n\n## Título 2\n\n### Título 3\n\n#...,# Título 1 ## Título 2,"[0.023799047, -0.036072277, -0.039160665, 0.01..."
5,https://github.com/marcosab10/ICMC-USP/blob/ma...,"# Parágrafos\n\nLorem ipsum dolor sit amet, co...","# Parágrafos Lorem ipsum dolor sit amet, conse...","[0.055545863, -0.017898032, -0.025671465, 0.01..."
6,https://github.com/marcosab10/ICMC-USP/blob/ma...,# Ênfase\n\n## Negrito 1\n\n**Lorem** ipsum do...,# Ênfase ## Negrito 1,"[0.03740417, -0.01904494, -0.05094322, 0.01915..."
7,https://github.com/marcosab10/ICMC-USP/blob/ma...,# Linhas horizontais\n\n## Exemplo 1\n\n***\n-...,# Linhas horizontais ## Exemplo 1,"[0.0037895082, -0.037596505, -0.038943943, 0.0..."
8,https://github.com/marcosab10/ICMC-USP/blob/ma...,# Listas não-ordenadas 1\n\n* Item 01\n* Item ...,# Listas não-ordenadas 1 * Item 01,"[-0.0036699108, -0.012538876, -0.04924564, -0...."
9,https://github.com/marcosab10/ICMC-USP/blob/ma...,# Listas ordenadas 1\n\n1. Item 01\n2. Item 02...,# Listas ordenadas 1 1. Item 01,"[0.0017971874, -0.024890495, -0.07298121, -0.0..."


## Criação da função gera_e_buscar_consulta

In [22]:
def gera_e_buscar_consulta(consulta, base, model):
    embedding_da_consulta =  genai.embed_content(model=model,
                               content=consulta,
                               task_type="RETRIEVAL_QUERY")["embedding"]
    
    produtos_escalares = np.dot(np.stack(base["Embeddings"]), embedding_da_consulta)

    indice = np.argmax(produtos_escalares)
    return df_markdown.iloc[indice]["url"]

## Finalmente executar a busca do documento que mais se aproxima da consulta

In [24]:
consulta = "Ganhar Dinheiro explicado por Kiyosaki"

url_encontrada = gera_e_buscar_consulta(consulta, df_markdown, model)
print(url_encontrada)

https://github.com/marcosab10/ICMC-USP/blob/master/USP - Economia & Investimentos/README.md
