### Importar las librerías

In [76]:

import requests
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from transformers import AutoTokenizer
from langchain_huggingface import HuggingFaceEmbeddings 
import chromadb
from langchain_chroma import Chroma
import ollama
from langchain_ollama.llms import OllamaLLM
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


### Scraping de la página web
Usamos BeautifulSoup para hacer scraping y extraer los datos de la web.

In [77]:
url = "https://en.wikipedia.org/wiki/2024_Formula_One_World_Championship"

# realizar solicitud http
respuesta = requests.get(url)
pagina = BeautifulSoup(respuesta.text, 'html.parser')

# tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# extraer textos de la página
textos = [p.text for p in pagina.find_all('p')]

# previsualizar algunos textos
for texto in textos[:5]: 
    print(texto)






The 2024 FIA Formula One World Championship was a motor racing championship for Formula One cars and was the 75th running of the Formula One World Championship. It was recognised by the Fédération Internationale de l'Automobile (FIA), the governing body of international motorsport, as the highest class of competition for open-wheel racing cars. The championship was contested over a record twenty-four Grands Prix held around the world. It began in March and ended in December.

Drivers and teams competed for the titles of World Drivers' Champion and World Constructors' Champion, respectively. Defending Drivers' Champion Max Verstappen started off the season with five wins in the first seven races, but was pressured by McLaren-Mercedes driver Lando Norris for the rest of the season after his RB20 fell behind Norris' MCL38 in terms of performance. Despite this, Verstappen performed consistently at the front of the field and won his fourth consecutive Drivers' Championship title at the

### Procesar el texto
Vamos a limpiar y normalizar los textos con algunas reglas como hacer todo el texto minúsculas, no añadir caracteres especiales, eliminar las stop words (el, la, y, etc.) y tokenizar el texto en palabras para facilitar el proceso.

In [78]:
textos_limpios = [texto.strip() for texto in textos]
textos_minusculas = [texto.lower() for texto in textos_limpios]
textos_sin_caracteres_especiales = [re.sub(r'[^A-Za-z0-9\s]', '', texto) for texto in textos_minusculas]
textos_sin_stopwords = [' '.join([word for word in texto.split() if word not in ENGLISH_STOP_WORDS]) for texto in textos_sin_caracteres_especiales]
textos_tokenizados = [tokenizer.tokenize(texto) for texto in textos_sin_stopwords]

# previsualizar algunos textos
for texto in textos_tokenizados[:5]:
    print(texto)

[]
[]
['202', '##4', 'fia', 'formula', 'world', 'championship', 'motor', 'racing', 'championship', 'formula', 'cars', '75th', 'running', 'formula', 'world', 'championship', 'recognised', 'f', '##dra', '##tion', 'internationale', 'lau', '##tom', '##ob', '##ile', 'fia', 'governing', 'body', 'international', 'motorsport', 'highest', 'class', 'competition', 'open', '##wheel', 'racing', 'cars', 'championship', 'contested', 'record', 'twenty', '##fo', '##ur', 'grand', '##s', 'prix', 'held', 'world', 'began', 'march', 'ended', 'december']
['drivers', 'teams', 'competed', 'titles', 'world', 'drivers', 'champion', 'world', 'construct', '##ors', 'champion', 'respectively', 'defending', 'drivers', 'champion', 'max', 've', '##rst', '##app', '##en', 'started', 'season', 'wins', 'seven', 'races', 'pressured', 'mclaren', '##mer', '##cede', '##s', 'driver', 'land', '##o', 'norris', 'rest', 'season', 'rb', '##20', 'fell', 'norris', 'mc', '##l', '##38', 'terms', 'performance', 'despite', 've', '##rst', 

### Crear Base de Datos Vectorial
Usamos HuggingFace para generar los embeddings y almacenamos los vectores en una base de datos Chroma.

In [79]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# base de datos vectorial
vector_store = Chroma.from_texts(
    texts=textos_limpios, 
    collection_name="web_data", 
    embedding=embeddings, 
    persist_directory="./chroma_web_data"
)

# crea el retriever
recuperador = vector_store.as_retriever()

### Implementación del LLM
Asegurarse que el docker de Ollama está funcionando con 'docker ps'

Para integrar el LLM usaremos LangChain y Ollama

In [83]:
# definir la plantilla del prompt 
plantilla = """Answer the question based only on the following context: 
{context} 

Question: {question} 
""" 
prompt = ChatPromptTemplate.from_template(plantilla) 

# seleccionar el modelo 
modelo_local = OllamaLLM(model="gemma:2b", base_url='http://localhost:11434')

# crear la cadena RAG
cadena = ( 
    {"context": recuperador, "question": RunnablePassthrough()} 
    | prompt 
    | modelo_local 
    | StrOutputParser() 
)

# hacer una pregunta de prueba 
respuesta = cadena.invoke("How many Grand Prix were on the Championship?") 
print(respuesta)

The 2024 FIA Formula One World Championship consisted of 24 Grands Prix held around the world.
