## Mistral AI search engine

In [81]:
import session_info
session_info.show()

In [None]:
!pip install aiohttp bs4 faiss-cpu lxml mistralai nest_asyncio nest_asyncio pandas requests python-dotenv

![image info](./images/mistral-search-graph.png)

In [80]:
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file, if one is present.
load_dotenv()  # load environment variables from .env file
# os.getenv returns None when MISTRAL_API_KEY is not set.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

## Scraper Definitions

In [46]:
import aiohttp
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import pandas as pd
import faiss
import numpy as np
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Apply the nest_asyncio patch
nest_asyncio.apply()

# Browser-like User-Agent sent with every HTTP request issued by the scraper functions.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

total_results_to_fetch = 10  # total number of search results to collect per query
chunk_size = 1000  # chunk size limit, in characters, passed to split_text_into_chunks

# Output files: the chunk table is written as CSV and the FAISS index next to it;
# query_vector_store reads both back.
dataframe_out_path = 'temp.csv'
faiss_index_path = 'faiss_index.index'

mistral_api_key = MISTRAL_API_KEY  # taken from the MISTRAL_API_KEY environment variable

async def fetch(session, url, params=None):
    """Send a GET request and return the response body as text.

    :param session: Open ``aiohttp.ClientSession`` used to send the request.
    :param url: Address to request.
    :param params: Optional query-string parameters.
    :return: The decoded response body.
    """
    request = session.get(url, params=params, headers=headers, timeout=30)
    async with request as reply:
        body = await reply.text()
    return body

async def fetch_page(session, params, page_num, results):
    """Fetch one Google results page and append its hits to ``results``.

    :param session: Open ``aiohttp.ClientSession`` used for the request.
    :param params: Base query parameters; must contain ``"num"`` (results per
        page). The dict is not modified.
    :param page_num: 1-based page number to fetch.
    :param results: List extended in place with ``{"title", "links"}`` dicts.
        Appending stops once it holds ``total_results_to_fetch`` entries.
    """
    print(f"Fetching page: {page_num}")
    # Build a per-page copy so the caller's dict is left untouched.
    page_params = {**params, "start": (page_num - 1) * params["num"]}
    html = await fetch(session, "https://www.google.com/search", page_params)
    soup = BeautifulSoup(html, 'html.parser')

    for result in soup.select(".tF2Cxc"):
        if len(results) >= total_results_to_fetch:
            break
        title_tag = result.select_one(".DKV0Md")
        link_tag = result.select_one(".yuRUbf a")
        # select_one returns None when the selector does not match; skip such
        # entries instead of crashing on `.text` / `["href"]`.
        if title_tag is None or link_tag is None or not link_tag.get("href"):
            continue

        results.append({
            "title": title_tag.text,
            "links": link_tag["href"],
        })

async def fetch_content(session, url):
    """Download ``url`` with the shared request headers and return its body as text.

    :param session: Open ``aiohttp.ClientSession`` used to send the request.
    :param url: Address of the page to download.
    :return: The decoded response body.
    """
    async with session.get(url, headers=headers, timeout=30) as page:
        html = await page.text()
    return html

async def fetch_all_content(urls):
    """Download every URL concurrently over a single HTTP session.

    :param urls: Iterable of addresses to download.
    :return: List of response bodies, in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as http:
        downloads = [fetch_content(http, link) for link in urls]
        bodies = await asyncio.gather(*downloads)
        return bodies

def get_all_text_from_url(url):
    """Download a page synchronously and return its visible text.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each line is stripped and split on double spaces, and the
    non-empty pieces are joined with newlines.

    :param url: Address of the page to download.
    :return: Cleaned text of the page, one fragment per line.
    """
    page = requests.get(url, headers=headers, timeout=30)
    document = BeautifulSoup(page.text, 'html.parser')
    for tag in document(["script", "style"]):
        tag.extract()

    fragments = []
    for raw_line in document.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return '\n'.join(fragments)

def split_text_into_chunks(text, chunk_size):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by spaces and are
    re-joined with a single space. A sentence is never cut: one that is longer
    than ``chunk_size`` becomes a chunk of its own and exceeds the limit.

    :param text: Text to split.
    :param chunk_size: Maximum chunk length in characters, counting the
        spaces that join sentences.
    :return: List of non-empty chunks; empty for empty input.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk), kept up to date incrementally

    for sentence in sentences:
        if not sentence:
            # re.split yields '' for empty input or trailing separators.
            continue
        if current_chunk:
            projected = current_length + 1 + len(sentence)  # +1 for the joining space
        else:
            projected = len(sentence)

        # Only flush a non-empty chunk, so an oversized first sentence does not
        # produce a leading empty chunk.
        if current_chunk and projected > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length = projected

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

async def process_text_content(texts, chunk_size):
    """Split every text into chunks using the default thread pool.

    :param texts: Iterable of page texts.
    :param chunk_size: Chunk size limit forwarded to ``split_text_into_chunks``.
    :return: List with one list of chunks per input text, in input order.
    """
    # Inside a coroutine the running loop is the one to use; get_running_loop()
    # is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    tasks = [loop.run_in_executor(None, split_text_into_chunks, text, chunk_size) for text in texts]
    return await asyncio.gather(*tasks)

async def get_embeddings_from_mistral(client, text_chunks):
    """Embed ``text_chunks`` with the ``mistral-embed`` model.

    :param client: Object exposing a synchronous
        ``embeddings(model=..., input=...)`` method whose result has a
        ``data`` list of items with an ``embedding`` attribute.
    :param text_chunks: List of strings to embed.
    :return: List of embedding vectors, one per item in the response;
        empty when ``text_chunks`` is empty.
    """
    if not text_chunks:
        # Nothing to embed: avoid a pointless request with empty input.
        return []

    # client.embeddings is a blocking call; run it in the default executor so
    # the event loop stays responsive while the request is in flight.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: client.embeddings(model="mistral-embed", input=text_chunks)
    )
    return [embedding.embedding for embedding in response.data]

async def fetch_and_process_data(search_query):
    """Search Google for ``search_query`` and build a vector index of the result pages.

    Steps: scrape up to ``total_results_to_fetch`` results, download each
    result page, split its text into chunks, embed the chunks with Mistral,
    then write the chunk table to ``dataframe_out_path`` (CSV) and the FAISS
    index to ``faiss_index_path``.

    :param search_query: Query string sent to Google search.
    :raises RuntimeError: If no search result could be scraped, or if no
        chunk could be embedded, so there is nothing to index.
    """
    client = MistralClient(api_key=mistral_api_key)

    params = {
        "q": search_query,  # query example
        "hl": "en",         # language
        "gl": "uk",         # country of the search, UK -> United Kingdom
        "start": 0,         # offset of the first result; recomputed for every page
        "num": 10           # parameter defines the maximum number of results to return per page.
    }

    results = []
    async with aiohttp.ClientSession() as session:
        page_num = 0
        while len(results) < total_results_to_fetch:
            page_num += 1
            found_before = len(results)
            await fetch_page(session, params, page_num, results)
            if len(results) == found_before:
                # The page added nothing (no more results, changed markup, or a
                # blocked request). Stop instead of requesting pages forever.
                break

    if not results:
        raise RuntimeError(f"No search results could be scraped for query: {search_query!r}")

    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=10) as executor:
        # return_exceptions=True keeps one unreachable page from aborting the run.
        pages = await asyncio.gather(
            *[loop.run_in_executor(executor, get_all_text_from_url, result['links']) for result in results],
            return_exceptions=True,
        )

    fetched = []
    for result, page in zip(results, pages):
        if isinstance(page, BaseException):
            print(f"Error: could not download {result['links']}: {page!r}")
            continue
        fetched.append((result, page))

    chunks_list = await process_text_content([text for _, text in fetched], chunk_size)

    data = []
    for i, ((result, _), chunks) in enumerate(zip(fetched, chunks_list)):
        # Drop blank chunks so they are neither embedded nor indexed.
        chunks = [chunk for chunk in chunks if chunk.strip()]
        if not chunks:
            continue
        embeddings = await get_embeddings_from_mistral(client, chunks)
        if len(embeddings) < len(chunks):
            print(f"Error: No embedding returned for chunk {len(embeddings)} of result {i}")
        # zip stops at the shorter list, so every stored chunk has an embedding.
        for chunk, embedding in zip(chunks, embeddings):
            data.append({
                'title': result['title'],
                'url': result['links'],
                'chunk': chunk,
                'embedding': embedding
            })

    if not data:
        raise RuntimeError(f"No text could be embedded for query: {search_query!r}")

    df = pd.DataFrame(data)
    df.to_csv(dataframe_out_path, index=False)

    # FAISS indexing: row i of the index corresponds to row i of the CSV.
    dimension = len(data[0]['embedding'])  # assuming all embeddings have the same dimension
    index = faiss.IndexFlatL2(dimension)

    embeddings = np.array([entry['embedding'] for entry in data], dtype=np.float32)
    index.add(embeddings)

    faiss.write_index(index, faiss_index_path)

await fetch_and_process_data("What is the latest news about apple and openai?")



Fetching page: 1
Fetching page: 2


Exception ignored in: <function MistralClient.__del__ at 0x11efd1ab0>
Traceback (most recent call last):
  File "/Users/hugolebelzic/miniconda3/envs/mistral-cookbook-contrib/lib/python3.10/site-packages/mistralai/client.py", line 49, in __del__
    self._client.close()
AttributeError: 'MistralClient' object has no attribute '_client'


## Query embeddings and vector store lookup

In [47]:
def query_vector_store(query_embedding, k=5):
    """
    Query the FAISS vector store and return the text results along with metadata.

    The index is read from ``faiss_index_path`` and the matching rows from
    ``dataframe_out_path`` on every call.

    :param query_embedding: The embedding to query with (sequence or numpy array,
        1-D or 2-D with a single row).
    :param k: Number of nearest neighbors to retrieve.
    :return: List of dictionaries containing text results and metadata of the
        nearest neighbors; fewer than ``k`` when the index holds fewer vectors.
    """
    # Load the index
    index = faiss.read_index(faiss_index_path)

    # FAISS needs a contiguous 2-D float32 array; convert lists and arrays of
    # any other dtype alike.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)

    # Query the index
    _distances, indices = index.search(query_embedding, k)

    # Load the dataframe
    df = pd.read_csv(dataframe_out_path)

    # Retrieve the text results and metadata
    results = []
    for idx in indices[0]:
        if idx < 0:
            # FAISS pads missing neighbors with -1; df.iloc[-1] would silently
            # return the last row instead.
            continue
        row = df.iloc[idx]
        results.append({
            'title': row['title'],
            'url': row['url'],
            'chunk': row['chunk']
        })

    return results

def query_embeddings(texts):
    """
    Convert text to embeddings using Mistral AI API.

    The client is created with the module-level ``MISTRAL_API_KEY``.

    :param texts: A single string, or a list of strings, to convert to embeddings.
    :return: List of embeddings, one per input text.
    """
    # A bare string is wrapped in a list; a list is passed through as-is
    # instead of being nested into a list of lists.
    inputs = [texts] if isinstance(texts, str) else list(texts)
    client = MistralClient(api_key=MISTRAL_API_KEY)
    response = client.embeddings(model="mistral-embed", input=inputs)
    return [embedding.embedding for embedding in response.data]


embeddings = query_embeddings("AGI")
results = query_vector_store(embeddings[0], k=5)
results


[{'title': 'Apple announces deal with OpenAI. Will it be a game- ...',
  'url': 'https://www.latimes.com/entertainment-arts/business/story/2024-06-10/apple-announces-leap-into-ai-will-it-be-a-game-changer',
  'chunk': '“This is a method to allow Apple to make up for the fact that they haven’t been focused on AI like they should have done over the last decade or so.” Apple Intelligence was one of many announcements and updates from Apple\non Monday, including a feature that lets AirPods Pro users nod yes or shake their heads no to Siri’s questions when they are in crowded spaces. Additionally, the company announced that the\nVision Pro headset will also be available in additional countries starting later this month, including mainland China, Hong Kong, Japan and Singapore.The company also unveiled a new feature called InSight for its tvOS18 that is similar to Amazon’s X-Ray and shows the names of actors or a song playing on an Apple TV+ program.\nCompany Town\nScarlett Johansson also th

## tools definition

In [48]:

nest_asyncio.apply()

# Tool schema passed to client.chat(..., tools=tools). It describes a single
# function, "mistral_web_search", taking one required string argument
# "search_query" — the same signature as the mistral_web_search function in this notebook.
tools = [
    {
        "type": "function",
        "function": {
            "name": "mistral_web_search",
            "description": "Fetch and process data from Google search based on a query, store results in FAISS vector store, and retrieve results.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The search query to use for fetching data from Google search."
                    }
                },
                "required": ["search_query"]
            },
        },
    },
]




def mistral_web_search(search_query: str):
    """Run the whole search pipeline synchronously for one query.

    Scrapes and indexes the search results for ``search_query``, embeds the
    query itself, and looks up the five nearest chunks in the vector store.

    :param search_query: Query used both for the web search and for the
        vector-store lookup.
    :return: List of dicts as returned by ``query_vector_store``.
    """
    async def _search_and_retrieve():
        await fetch_and_process_data(search_query)
        query_vector = query_embeddings(search_query)[0]
        return query_vector_store(query_vector, k=5)

    return asyncio.run(_search_and_retrieve())

search_query = "mistral and openai"
results = mistral_web_search(search_query)
print(results)



Fetching page: 1
Fetching page: 2
[{'title': 'Microsoft partners with Mistral in second AI deal beyond ...', 'url': 'https://www.theverge.com/2024/2/26/24083510/microsoft-mistral-partnership-deal-azure-ai', 'chunk': '“Mistral Large achieves strong results on commonly used benchmarks, making it the world’s second-ranked model generally available through an API (next to GPT-4),” says the Mistral AI team.Mistral also has a new AI chatbotMistral Large is available on Mistral’s own infrastructure, hosted in Europe, or through Azure AI Studio and Azure Machine Learning. Mistral Small will also be available today, offering improved latency over Mistral’s 8x7B model. Mistral is also releasing a new conversational chatbot, Le Chat, that’s based on various models from Mistral AI.Mistral’s models have typically been open source, but the partnership with Microsoft means the French AI company can now explore more commercial opportunities. Neither Microsoft nor Mistral are disclosing details of the 

In [49]:
def tools_to_str(tools_output: list) -> str:
    """Join the ``'chunk'`` text of every tool result with newlines.

    :param tools_output: List of dicts that each contain a ``'chunk'`` key.
    :return: The chunk texts separated by ``'\\n'``; empty string for an empty list.
    """
    return '\n'.join(tool['chunk'] for tool in tools_output)


tools_to_str(mistral_web_search(search_query))

Fetching page: 1
Fetching page: 2


'“Mistral Large achieves strong results on commonly used benchmarks, making it the world’s second-ranked model generally available through an API (next to GPT-4),” says the Mistral AI team.Mistral also has a new AI chatbotMistral Large is available on Mistral’s own infrastructure, hosted in Europe, or through Azure AI Studio and Azure Machine Learning. Mistral Small will also be available today, offering improved latency over Mistral’s 8x7B model. Mistral is also releasing a new conversational chatbot, Le Chat, that’s based on various models from Mistral AI.Mistral’s models have typically been open source, but the partnership with Microsoft means the French AI company can now explore more commercial opportunities. Neither Microsoft nor Mistral are disclosing details of the investment, though.Microsoft’s investment comes months after a rocky period for its main AI partner, OpenAI.\nSomething went wrong while submitting the form.PricingLog inSign upCase studies\nHow Mistral AI, an OpenAI

In [50]:
import functools

# Dispatch table: maps the tool name returned by the model to the callable that implements it.
names_to_functions = dict(
    mistral_web_search=functools.partial(mistral_web_search),
)

## chat

In [6]:
messages = [
    ChatMessage(role="user", content="What happend during apple WWDC 2024?"),
]



In [7]:
# Model used for every chat request in the cells that follow.
model = "mistral-large-latest"

client = MistralClient(api_key=MISTRAL_API_KEY)
# First request: the tool schema is sent along with the user question.
# In the recorded run the model answered with a tool call, not with text.
response = client.chat(model=model, messages=messages, tools=tools, tool_choice="auto")
response

ChatCompletionResponse(id='fb0585bb35c449899283753361aebc7a', object='chat.completion', created=1718068657, model='mistral-large-latest', choices=[ChatCompletionResponseChoice(index=0, message=ChatMessage(role='assistant', content='', name=None, tool_calls=[ToolCall(id='SMqSja1Y4', type=<ToolType.function: 'function'>, function=FunctionCall(name='mistral_web_search', arguments='{"search_query": "apple WWDC 2024"}'))], tool_call_id=None), finish_reason=<FinishReason.tool_calls: 'tool_calls'>)], usage=UsageInfo(prompt_tokens=121, total_tokens=156, completion_tokens=35))

In [8]:
messages.append(response.choices[0].message)

In [9]:
import json

# Take the first tool call of the assistant message; this raises if the
# message has no tool calls.
tool_call = response.choices[0].message.tool_calls[0]
function_name = tool_call.function.name
# The arguments arrive as a JSON string and are decoded into a dict of keyword arguments.
function_params = json.loads(tool_call.function.arguments)


print("\nfunction_name: ", function_name, "\nfunction_params: ", function_params)


function_name:  mistral_web_search 
function_params:  {'search_query': 'apple WWDC 2024'}


In [21]:
function_result = tools_to_str(names_to_functions[function_name](**function_params))
function_result

"Apple Intelligence Revealed at WWDC 2024 as Company Jumps Into AI Race - The New York Times\nSkip to contentSkip to site indexTechnology\xa0Today’s PaperArtificial IntelligenceApple Enters A.I. FrayMeta’s A.I. ScrapingHumane’s A.I. Device FlopOpenAI’s ‘Reckless’ CultureThe New ChatGPTFaces QuizAdvertisementSKIP ADVERTISEMENTYou have a preview view of this article while we are checking your access. When we have confirmed access, the full article content will load.Artificial IntelligenceApple Enters A.I. FrayMeta’s A.I. ScrapingHumane’s A.I. Device FlopOpenAI’s ‘Reckless’ CultureThe New ChatGPTFaces QuizSupported bySKIP ADVERTISEMENTApple Jumps Into A.I.\nSigal Samuel, a senior tech reporter for Vox, unpacks what's going on with the company.Apple uses the developer conference at its Cupertino, Calif., headquarters each year to showcase updates to its own apps and operating systems, as well as to show developers new tools they will be able to use in their apps.The company\xa0has been usi

In [22]:
# Feed the tool output back to the model, tagged with the id of the tool call it answers.
messages.append(ChatMessage(role="tool", name=function_name, content=function_result, tool_call_id=tool_call.id))

# Second request, without tools: the model answers from the conversation,
# which now includes the search results.
response = client.chat(model=model, messages=messages)
response.choices[0].message.content



"At the Apple Worldwide Developers Conference (WWDC) in 2024, Apple made several significant announcements related to artificial intelligence (AI). The tech giant unveiled a suite of new AI capabilities to be integrated into its newest operating system. A notable development was the collaboration with OpenAI's ChatGPT, which aimed to enhance the functionality of Siri, Apple's interactive voice feature. This integration would allow Siri to provide more relevant and contextual information across various apps on Apple devices.\n\nThe partnership with OpenAI was seen as a major move to stay competitive with other industry leaders like Google and Microsoft, who had been emphasizing AI integration in their products. Additionally, it was announced that ChatGPT would be built into Apple's phones, computers, and iPads later in the year. This marked a significant step for Apple in the AI race, as they had been utilizing AI behind the scenes for years to power features on their devices.\n\nDuring

## Chat in a chain (cleaner user experience)

In [None]:
# Interactive chat: each question is sent to the model together with the tool
# schema. Requested tools are executed and their output is sent back for a
# final answer. Type "exit" or "quit", or submit an empty line, to stop.
messages = []

while True:
    input_ = input("Ask: ")
    if input_.strip().lower() in {"", "exit", "quit"}:
        break
    messages.append(ChatMessage(role="user", content=input_))
    response = client.chat(model=model, messages=messages, tools=tools, tool_choice="auto")
    assistant_message = response.choices[0].message
    messages.append(assistant_message)

    if not assistant_message.tool_calls:
        # The model answered directly; there is no tool call to execute.
        print(assistant_message.content)
        continue

    if assistant_message.content:
        print(assistant_message.content)

    # Run every requested tool call, not just the first one.
    for tool_call in assistant_message.tool_calls:
        function_name = tool_call.function.name
        function_params = json.loads(tool_call.function.arguments)

        function_result_raw = names_to_functions[function_name](**function_params)
        print("sources: ", [f"{source['title']} - {source['url']}" for source in function_result_raw])
        function_result_text = tools_to_str(function_result_raw)
        messages.append(ChatMessage(role="tool", name=function_name, content=function_result_text, tool_call_id=tool_call.id))

    response = client.chat(model=model, messages=messages)
    # Keep the final answer in the history so the next turn does not follow a
    # dangling tool message.
    messages.append(response.choices[0].message)
    final_response = response.choices[0].message.content
    print(final_response)
