<a href="https://colab.research.google.com/github/palis-dev/jupyter-notebooks/blob/main/ai_rag_wikipedia_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies


In [None]:
!pip install faiss-cpu sentence_transformers wikipedia

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... done
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4

# OpenAI Client

In [None]:
from openai import OpenAI
from threading import Lock

class OpenAIClient:
    """
    Singleton that manages the single OpenAI client instance.

    The first successful ``OpenAIClient(api_key)`` call builds the wrapped
    client. Every later call returns that same wrapper and ignores the
    ``api_key`` it is given.
    """
    _instance = None  # the shared wrapper; stays None until a construction succeeds
    _lock = Lock()    # guards first-time construction

    def __new__(cls, api_key):
        """
        Return the shared wrapper, creating it on first use.

        Args:
            api_key: key forwarded to ``OpenAI(api_key=...)``; only used by
                the call that actually creates the instance.

        Raises:
            Whatever ``OpenAI(...)`` raises. Nothing is cached in that case,
            so a later call can retry with a valid key.
        """
        with cls._lock:
            if cls._instance is None:
                # Build the wrapper completely before publishing it. If it
                # were stored on the class first and OpenAI() then raised,
                # every later call would return an object with no `_client`.
                instance = super().__new__(cls)
                instance._client = OpenAI(api_key=api_key)
                cls._instance = instance
            return cls._instance

    def get_client(self):
        """
        Return the wrapped OpenAI client instance.
        """
        return self._client

import os
from getpass import getpass

# Never store the key in the notebook source: read it from the
# OPENAI_API_KEY environment variable and fall back to an interactive,
# non-echoing prompt when the variable is unset or empty.
API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

openai_client = OpenAIClient(API_KEY)


# Crawler

In [None]:
import wikipedia

# Point the wikipedia client at the Portuguese-language edition ("pt") for the
# searches and page fetches that follow.
wikipedia.set_lang("pt")

def collect_articles(keywords, max_articles=1000, results_per_keyword=10):
    """
    Search Wikipedia for each keyword and download the matching articles.

    Args:
        keywords: iterable of search strings.
        max_articles: upper bound on the number of articles returned. Zero or
            a negative value returns an empty dict before any lookup is made.
        results_per_keyword: number of search hits requested per keyword.

    Returns:
        dict mapping each article title (as returned by the search) to the
        page's content, in the order the pages were fetched.

    Errors raised while searching a keyword or fetching a page are printed
    and skipped, so one bad title does not abort the whole crawl.
    """
    articles = {}
    if max_articles <= 0:
        # Without this guard the limit was only checked after the first
        # insert, so a non-positive limit still returned one article.
        return articles

    for keyword in keywords:
        try:
            results = wikipedia.search(keyword, results=results_per_keyword)
        except Exception as e:
            print(f"Error searching for '{keyword}': {e}")
            continue

        for title in results:
            if title in articles:
                continue
            try:
                # Titles come straight from the search results, so look them
                # up verbatim. With auto_suggest enabled the library may
                # replace the title with a "suggested" one and return a
                # different page than the one requested.
                page = wikipedia.page(title, auto_suggest=False)
                articles[title] = page.content
            except Exception as e:
                print(f"Error retrieving '{title}': {e}")
                continue
            if len(articles) >= max_articles:
                return articles
    return articles

# Seed search terms for the crawl; every hit found for them becomes one
# document of the corpus.
keywords = [
    "história",
    "civilizações antigas",
    "história Brasil",
    "Primeira Revolução Industrial",
    "Império Romano",
]
documents = collect_articles(keywords)
print(f"Collected {len(documents)} documents.")




  lis = BeautifulSoup(html).find_all('li')


Error retrieving 'Revolução Industrial (desambiguação)': "Revolução Industrial (desambiguação)" may refer to: 
Primeira Revolução Industrial
Segunda Revolução Industrial
Terceira Revolução Industrial
Quarta Revolução Industrial
Error retrieving 'Era industrial': "Era industrial" may refer to: 
Revolução Industrial
Segunda Revolução Industrial
Terceira Revolução Industrial
Indústria 4.0
Capitalismo industrial
Indústria
Industrialização
Todas as páginas cujo título começa por "Era industrial"
Todas as páginas que tenham "Era industrial" no título
Busca por "era industrial"
Collected 44 documents.


# Embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for the documents here.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Keep titles and contents index-aligned: row i of `embeddings` belongs to
# titles[i].
titles = list(documents)
contents = [documents[title] for title in titles]

# NOTE(review): each whole article is encoded as a single text; the model
# presumably truncates long inputs, so only the start of an article may be
# represented — confirm against the model's max sequence length.
embeddings = model.encode(contents, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

# Database

In [None]:
import faiss
import numpy as np
import pickle
# Cast the embeddings to a float32 matrix before handing them to FAISS.
emb_array = np.asarray(embeddings, dtype="float32")
dim = emb_array.shape[1]

# Flat L2 index over all document vectors.
index = faiss.IndexFlatL2(dim)
index.add(emb_array)
print(f"Indexed {index.ntotal} vectors.")

# Persist the index, plus the row -> title mapping needed to interpret the
# row numbers it returns.
faiss.write_index(index, "faiss_index_history.idx")

with open("titles_mapping.pkl", "wb") as fh:
    pickle.dump(titles, fh)


Indexed 44 vectors.


# Vector similarity - top_k

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# Reload the encoder, the saved index and the title mapping from disk.
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("faiss_index_history.idx")

# NOTE: unpickling can execute arbitrary code; only load a mapping file that
# this notebook wrote itself.
with open("titles_mapping.pkl", "rb") as fh:
    titles = pickle.load(fh)

def vector_search(prompt, k=5):
    """
    Return the titles of the indexed documents closest to `prompt`.

    Args:
        prompt: query text, embedded with the module-level `model`.
        k: number of neighbours requested from the module-level `index`.

    Returns:
        List of (title, distance) tuples in the order the index returned
        them. The list is shorter than k when the index yields fewer valid
        neighbours.
    """
    prompt_embedding = np.array(model.encode([prompt])).astype('float32')

    distances, indices = index.search(prompt_embedding, k)

    # FAISS fills missing neighbours with the label -1 (e.g. when k exceeds
    # the number of indexed vectors). Checking only `idx < len(titles)` lets
    # -1 through, and titles[-1] would then report the last title as a hit,
    # so the lower bound is checked as well.
    return [
        (titles[idx], dist)
        for dist, idx in zip(distances[0], indices[0])
        if 0 <= idx < len(titles)
    ]

# Demonstration only: this just shows what vector_search returns. The prompt
# used further below is a different one.
prompt_text = "Influence of the Industrial Revolution on modern society"
results = vector_search(prompt_text, k=5)

print("Top search results:")
for hit_title, hit_distance in results:
    print(f"Title: {hit_title}, Distance: {hit_distance}")


Top search results:
Title: Indústria, Distance: 1.118760585784912
Title: Revolução Industrial, Distance: 1.2894843816757202
Title: Segunda Revolução Industrial, Distance: 1.2894843816757202
Title: Quarta Revolução Industrial, Distance: 1.3306961059570312
Title: Indústria química, Distance: 1.4562702178955078


In [None]:
import openai
import numpy as np

def check_fake_news(prompt, top_k=3, corpus=None, max_chars_per_doc=1500):
    """
    Ask the chat model whether a news claim is consistent with retrieved articles.

    Args:
        prompt: the news claim to check.
        top_k: number of documents to retrieve through `vector_search`.
        corpus: optional dict mapping title -> article text used to fill the
            context. When omitted, the module-level `documents` dict is used
            if it exists; otherwise only titles are sent.
        max_chars_per_doc: each article excerpt is cut to this many
            characters to keep the request size bounded.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    if corpus is None:
        # Fall back to the crawled articles when that cell has been run.
        corpus = globals().get("documents", {})

    context_results = vector_search(prompt, k=top_k)

    # Send the retrieved text, not just the titles: with titles alone the
    # model has no facts from the corpus to compare the claim against.
    sections = []
    for title, _distance in context_results:
        excerpt = corpus.get(title, "")[:max_chars_per_doc]
        sections.append(f"Document: {title}\n{excerpt}".rstrip())
    context_str = "\n\n".join(sections)

    # Instructions stay in the system message; the retrieved context and the
    # user-supplied claim go in a separate user message so the claim is not
    # interpolated into the instructions.
    messages = [
        {"role": "system", "content": (
            "You will receive a set of content context documents and a news prompt.\n"
            "Based on the context, determine whether the news prompt is likely fake or not.\n"
            "Respond with 'Fake' if the news appears inconsistent with given facts "
            "or 'Not Fake' if it seems plausible."
        )},
        {"role": "user", "content": f"Context:\n{context_str}\n\nPrompt:\n{prompt}"},
    ]

    response = openai_client.get_client().chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=50,
        temperature=0.3,
    )

    answer = response.choices[0].message.content
    # `content` can be None; always hand a string back to the caller.
    return answer.strip() if answer else ""

# Example claim run through the checker, using the three nearest documents.
news_prompt = "who discovered Brazil was Dom Pedro 1"
result = check_fake_news(prompt=news_prompt, top_k=3)
print("Generative model output:", result)


Generative model output: Fake
