# Answer questions with citations from Reddit's SEC filing using Opper and Mistral

This cookbook is an appendix to: https://opper.ai/blog/simple-rag-with-citations

In [325]:
# Install the datasets library from huggingface
!pip install opperai -U
!pip install pydantic


Collecting opperai
  Downloading opperai-0.5.3-py2.py3-none-any.whl.metadata (4.3 kB)
Downloading opperai-0.5.3-py2.py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opperai
  Attempting uninstall: opperai
    Found existing installation: opperai 0.5.2
    Uninstalling opperai-0.5.2:
      Successfully uninstalled opperai-0.5.2
Successfully installed opperai-0.5.3


## Imports

In [335]:
import os
from opperai import Opper, fn, Client, AsyncClient
from opperai.types.indexes import RetrievalResponse
from pydantic import BaseModel, Field
from typing import List, Literal

# Never hardcode the API key in the notebook: read it from the environment and
# only prompt for it (without echoing the input) when it is not already set.
# NOTE: a key that was ever committed in this cell should be treated as leaked
# and rotated.
if not os.environ.get("OPPER_API_KEY"):
    from getpass import getpass

    os.environ["OPPER_API_KEY"] = getpass("Opper API key: ")

# The client is created without arguments, exactly as before, after
# OPPER_API_KEY has been placed in the environment.
opper = Opper()


## Index PDF

In [344]:

# Name of the Opper index and the PDF that seeds it, defined once so the
# lookup and the creation below cannot drift apart.
INDEX_NAME = "mistral-rag4"
PDF_PATH = "./reddit-sec.pdf"

# Reuse the index when it already exists; otherwise create it and upload the
# PDF. The upload only runs on creation, so re-running this cell against an
# existing index does not upload the file again.
index = opper.indexes.get(name=INDEX_NAME)
if not index:
    index = opper.indexes.create(name=INDEX_NAME)
    index.upload_file(file_path=PDF_PATH)
print(index)


Index(_client=<opperai._client.Client object at 0x107739b90>, _index=Index(id=1816, name='mistral-rag4', created_at=datetime.datetime(2024, 5, 8, 10, 52, 50, 911931, tzinfo=TzInfo(UTC))))


## Retrieve relevant content for answering the question

In [345]:
# The question to answer. Uncomment one of the alternatives below (and comment
# out the active line) to try a different question.
question = "What are the key financial and growth numbers for Reddit?"
#question = "What is Reddits strategy around AI and future value from AI?"
#question = "Who are the biggest shareholders of Reddit?" 
#question = "Is the company making money and how much?"
#question = "Who are the main competitors of Reddit?"


# Query the index with the question, asking for k=3 results.
results = index.query(query=question, k=3)

print(results)


[RetrievalResponse(content='04/05/2024, 23:15 Document\nhttps://www.sec.gov/Archives/edgar/data/1713445/000162828024006294/reddits-1q423.htm 19/281Table of Contents\nexcluding China and Russia, is expected to grow at a CAGR of 20% to $1.0 trillion in 2027. We believe the importance of data to all\ntypes of analytics and AI, from training to testing and refining models, positions us well to tap into this strong market.\nUser Economy\nCommerce is already at the core of many communities today. As we introduce new ways to enable developers to add additional\nfunctionality to their communities, we believe there will be further development of economic features on Reddit (e.g., games). We\nsee informal exchanges today of digital goods, services, and even physical goods. We recognize the opportunity that commerce\npresents and we have continued to invest in the future of Reddit’s user economy. Using estimates from IDC’s Consumer Market\nModel and focusing on six core geographies (United States

In [346]:
# Extract file_name and page number from results
# Source models one retrieved chunk: its text plus the file name and page
# number it came from. Instances are built from the query results further down
# in this cell and passed to extract_citations as the `sources` argument.
class Source(BaseModel):
    file_name: str  # filled from the result metadata key "file_name"
    content: str  # text content of the retrieved result
    page_number: int  # filled from the result metadata key "page"

# Wrapper model holding a list of Source entries.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm
# whether it is still needed.
class Sources(BaseModel):
    sources: List[Source]

# Convert every retrieval hit into a Source, pulling the file name and page
# number out of the hit's metadata and keeping the hit's text as the content.
processed_results = [
    Source(
        file_name=hit.metadata.get("file_name"),
        page_number=hit.metadata.get("page"),
        content=hit.content,
    )
    for hit in results
]

print(processed_results)


[Source(file_name='reddit-sec.pdf', content='04/05/2024, 23:15 Document\nhttps://www.sec.gov/Archives/edgar/data/1713445/000162828024006294/reddits-1q423.htm 19/281Table of Contents\nexcluding China and Russia, is expected to grow at a CAGR of 20% to $1.0 trillion in 2027. We believe the importance of data to all\ntypes of analytics and AI, from training to testing and refining models, positions us well to tap into this strong market.\nUser Economy\nCommerce is already at the core of many communities today. As we introduce new ways to enable developers to add additional\nfunctionality to their communities, we believe there will be further development of economic features on Reddit (e.g., games). We\nsee informal exchanges today of digital goods, services, and even physical goods. We recognize the opportunity that commerce\npresents and we have continued to invest in the future of Reddit’s user economy. Using estimates from IDC’s Consumer Market\nModel and focusing on six core geographi

## Create response with citations

In [347]:
# Citation models one quoted passage together with the file name and page
# number it is attributed to. It is the element type returned by
# extract_citations and carried in Response.citations.
class Citation(BaseModel):
    # name of the file the quote is attributed to
    file_name: str 
    # page number the quote is attributed to
    page_number: int 
    # the quoted text itself (printed in the final cell)
    citation: str 

# Response is the return type of produce_response: the answer text plus the
# list of citations that accompany it.
class Response(BaseModel):
    # answer text; the produce_response prompt asks for inline [1], [2] markers
    answer: str 
    # citations returned alongside the answer
    citations: List[Citation]

# Declares a function whose body is only a docstring; the implementation is
# supplied by the @fn decorator, configured here with a path and the
# "azure/mistral-large-eu" model. Takes the question and the retrieved sources
# and is annotated to return a list of Citation objects.
# NOTE(review): the docstring presumably serves as the instruction sent to the
# model — treat its text as runtime behavior and confirm before rewording it.
@fn(path="test/mistral-rag/citations", model="azure/mistral-large-eu")
def extract_citations(question: str, sources: List[Source]) -> List[Citation]:
    """ Build a list of citations for the question from the sources"""

# Declares a function whose body is only a docstring; the implementation is
# supplied by the @fn decorator, configured here with a path and the
# "azure/mistral-large-eu" model. Takes the question and a list of citations
# and is annotated to return a Response (answer text plus citations).
# NOTE(review): the docstring presumably serves as the instruction sent to the
# model — treat its text as runtime behavior and confirm before rewording it.
@fn(path="test/mistral-rag/response", model="azure/mistral-large-eu")
def produce_response(question: str, citations: List[Citation]) -> Response:
    """ Produce an answer to the question using the possible citations. Refer to any statements or facts from citations inline in the answer with [1], [2] etc. """

# Step 1: build citations for the question from the retrieved sources.
citations = extract_citations(question, processed_results)

# Step 2: produce the answer from the question and those citations only; the
# raw sources are not passed to this second call.
response = produce_response(question, citations)

print(response)


answer="The key financial and growth numbers for Reddit include Daily Active Unique (DAUq) and Weekly Active Unique (WAUq). In the three months ended December 31, 2023, global DAUq grew 27% compared to the prior year period, driven by 34% growth in DAUq in the United States and 21% growth in DAUq in the rest of the world. Global WAUq grew 29% compared to the prior year period, driven by 39% growth in WAUq in the United States and 20% growth in WAUq in the rest of the world. For the three months ended December 31, 2023, the proportion of DAUq to WAUq was 27%. Reddit's monetization strategies include expanding ad platform capabilities, adding contextual and interest-based signals, optimizing marketplace for ROI, and increasing advertiser diversity and depth. [1] [2] [3]" citations=[Citation(file_name='reddit-sec.pdf', page_number=19, citation='The global market for data, excluding China and Russia, is expected to grow at a CAGR of 20% to $1.0 trillion in 2027. The user economy market siz

## Print it! 

In [348]:
# Print the answer followed by a blank line, then every citation on its own
# line, numbered from 1 in the same [1], [2], ... style the answer prompt asks
# the model to use for its inline references.
print(response.answer)
print()
# enumerate() keeps the counter in its own name. A manual `index = 1` counter
# here would overwrite the Opper `index` object created in the indexing cell,
# so re-running the query cell afterwards would fail.
for number, citation in enumerate(response.citations, start=1):
    print(f"[{number}]", f'"{citation.citation}"')
    


The key financial and growth numbers for Reddit include Daily Active Unique (DAUq) and Weekly Active Unique (WAUq). In the three months ended December 31, 2023, global DAUq grew 27% compared to the prior year period, driven by 34% growth in DAUq in the United States and 21% growth in DAUq in the rest of the world. Global WAUq grew 29% compared to the prior year period, driven by 39% growth in WAUq in the United States and 20% growth in WAUq in the rest of the world. For the three months ended December 31, 2023, the proportion of DAUq to WAUq was 27%. Reddit's monetization strategies include expanding ad platform capabilities, adding contextual and interest-based signals, optimizing marketplace for ROI, and increasing advertiser diversity and depth. [1] [2] [3]

[1] "The global market for data, excluding China and Russia, is expected to grow at a CAGR of 20% to $1.0 trillion in 2027. The user economy market size is $1.3 trillion today, and it is expected to grow at a CAGR of 12% to $2.1