In [7]:
import os
import itertools
import nltk
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import ArxivLoader, PyPDFLoader

# Fetch NLTK's "punkt" package up front.
# NOTE(review): presumably required by the NLTKTextSplitter used further down — confirm.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/mag7273/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# arXiv identifiers keyed by a short nickname for each paper.
arxiv_papers = dict(AIM="2105.13345")

# Load the arXiv paper first, then the CV PDF, and pool both into one list.
papers = ArxivLoader(arxiv_papers["AIM"]).load()
cv = PyPDFLoader("https://mauriciogtec.com/_static/cv.pdf").load()
docs = [*papers, *cv]


In [9]:
from langchain.text_splitter import NLTKTextSplitter

# Target chunk size of 500 with an overlap of 100. The notices in the recorded
# output show that some chunks still come out longer than the target.
splitter = NLTKTextSplitter(chunk_size=500, chunk_overlap=100)
splits = splitter.split_documents(docs)

# Preview the first ten chunks.
splits[:10]

Created a chunk of size 739, which is longer than the specified 500
Created a chunk of size 730, which is longer than the specified 500
Created a chunk of size 558, which is longer than the specified 500
Created a chunk of size 644, which is longer than the specified 500
Created a chunk of size 1154, which is longer than the specified 500
Created a chunk of size 819, which is longer than the specified 500


[Document(page_content='Adversarial Intrinsic Motivation for Reinforcement\nLearning\nIshan Durugkar\nDepartment of Computer Science\nThe University of Texas at Austin\nAustin, TX, USA 78703\nishand@cs.utexas.edu\nMauricio Tec\nDepartment of Statistics and Data Sciences\nThe University of Texas at Austin\nAustin, TX, USA 78703\nmauriciogtec@utexas.edu\nScott Niekum\nDepartment of Computer Science\nThe University of Texas at Austin\nAustin, TX, USA 78703\nsniekum@cs.utexas.edu\nPeter Stone\nDepartment of Computer Science\nThe University of Texas at Austin\nAustin, TX, USA 78703 and\nSony AI\npstone@cs.utexas.edu\nAbstract\nLearning with an objective to minimize the mismatch with a reference distribution\nhas been shown to be useful for generative modeling and imitation learning.', metadata={'Published': '2021-10-28', 'Title': 'Adversarial Intrinsic Motivation for Reinforcement Learning', 'Authors': 'Ishan Durugkar, Mauricio Tec, Scott Niekum, Peter Stone', 'Summary': "Learning with an o

In [10]:
# Inspect the metadata carried by the first chunk; in the recorded run it holds
# the paper's Published/Title/Authors/Summary fields.
splits[0].metadata

{'Published': '2021-10-28',
 'Title': 'Adversarial Intrinsic Motivation for Reinforcement Learning',
 'Authors': 'Ishan Durugkar, Mauricio Tec, Scott Niekum, Peter Stone',
 'Summary': "Learning with an objective to minimize the mismatch with a reference\ndistribution has been shown to be useful for generative modeling and imitation\nlearning. In this paper, we investigate whether one such objective, the\nWasserstein-1 distance between a policy's state visitation distribution and a\ntarget distribution, can be utilized effectively for reinforcement learning\n(RL) tasks. Specifically, this paper focuses on goal-conditioned reinforcement\nlearning where the idealized (unachievable) target distribution has full\nmeasure at the goal. This paper introduces a quasimetric specific to Markov\nDecision Processes (MDPs) and uses this quasimetric to estimate the above\nWasserstein-1 distance. It further shows that the policy that minimizes this\nWasserstein-1 distance is the policy that reaches th

In [11]:
# Inspect the metadata of the last chunk for comparison; in the recorded run it
# carries only `source` and `page`.
splits[-1].metadata

{'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 2}

In [12]:
# Embedding model handed to the Chroma vector store further down.
# NOTE(review): the recorded output shows a deprecation warning for this
# construction — check the installed LangChain version for the recommended import.
embedding = OpenAIEmbeddings()

  warn_deprecated(


In [13]:
# Smoke-test the embedding model on a single text.
# `embed_documents` takes a list of texts: a bare string would be iterated
# character by character and every character embedded as its own document,
# so the text is wrapped in a one-element list.
# The trailing `;` suppresses the (large) vector output.
embedding.embed_documents(
    ["Adversarial Intrinsic Motivation for Reinforcement\nLearning"]
);

In [14]:
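# Uncomment to delete the persisted Chroma store (same path as `persist_directory`
# in the next cell) before rebuilding it from scratch.
# !rm -rf ./docs/chroma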

In [35]:
persist_directory = "./docs/chroma"

# Give every chunk a deterministic id. Without explicit ids each run of this
# cell stores the chunks under freshly generated ids, so re-running it (the
# out-of-order execution counts suggest that happened) keeps appending
# duplicates to the persisted collection, inflating `count()` and crowding
# search results with repeated chunks.
# NOTE(review): ids are positional, so entries written by earlier runs without
# ids, or surplus entries left after `splits` shrinks, stay in the store —
# delete the directory to rebuild from scratch.
split_ids = [f"split-{index}" for index in range(len(splits))]

vectordb = Chroma.from_documents(
    splits,
    embedding=embedding,
    ids=split_ids,
    persist_directory=persist_directory,
)
vectordb.persist()

In [34]:
# Count the entries stored in the collection.
# Note: this reaches into the private `_collection` attribute of the vector store.
vectordb._collection.count()

238

In [18]:
# Free-text question passed to the vector-store search in the next cell.
query = "Summarize Mauricio's skills"

In [19]:
# Plain similarity search: the ten chunks closest to `query`.
search = vectordb.similarity_search(query, k=10)
search

[Document(page_content='Mauricio Tec\nGoogle Scholar ὑ7mauriciogtec.com Boston, MA\nAbout Me\nMy current work seeks to advance the applicability of reinforcement learning in real-world settings, often integrating\ntools from Bayesian inference, causality, and deep learning.\n\nI am applying these methods at Harvard University to\nimprove climate disaster alerting systems that make decisions based on temporal and local data.', metadata={'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 0}),
 Document(page_content='B.Sc.\n\nin Applied Mathematics, Instituto Tecnologico Autonomo de Mexico (ITAM), 2007–2012\nWork Experience\nHarvard University, Postdoctoral Research Fellow/Research Associate , 2022–date\n•Designing critical deep reinforcement learning applications to optimize the issuance of US heat alerts that utilize\ndaily timeseries and forecasts to make smart decisions about when and how to take action.\n\n•Writing and publish-\ning papers and software in top ML conferences

In [24]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo


# Metadata schema handed to the self-query retriever: one AttributeInfo per
# metadata key, declared as (name, type, description) rows.
metadata_field_info = [
    AttributeInfo(name=field_name, type=field_type, description=field_description)
    for field_name, field_type, field_description in (
        ("Published", "date", "Date of paper/publication"),
        ("Title", "string", "Title of the paper/publication"),
        ("Summary", "string", "Summary of the paper/publication"),
        ("Authors", "string", "Authors of the paper/publication"),
        ("source", "string", "URL or source of the file in case of a pdf"),
        ("page", "integer", "Page number in case of a pdf"),
    )
]

In [25]:
# Natural-language description of the stored documents, passed to the retriever below.
document_content_description = "CV information and research publications"
# LLM handed to the self-query retriever; temperature=0 is requested.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
# Build the retriever from the LLM, the vector store, the content description
# and the per-field metadata schema declared in `metadata_field_info`.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

In [27]:
# Ask the self-query retriever for the CV skills.
# NOTE: this rebinds `search`, which previously held the plain vector-store search results.
search = retriever.get_relevant_documents("Skills from CV")
search

[Document(page_content='Skills\n•Programming Languages : Python (preferred); Julia, R (proficient); C++ (intermediate);\n•High-performance Computing : Slurm (advanced); AWS/Azure Cloud (intermediate);\n•Data Science : SQL (advanced); tidyverse, ggplot, pandas, ggplot, networkx (proficient); NLP (advanced);\n•Development and Pipelines : Git, Docker, SnakeFlow, Linux (advanced);\n•Deep Learning : PyTorch, Tensorflow, Image segmentation/Object detection, Spatial methods, (proficient); GNNs,\nAttention-based models (advanced); Multi-GPU, Distributed training (intermediate);\n•Experiment Design/ Sequential Decision-making : Active learning, Reinforcement learning, Tree search (proficient);\n•Statistics: Bayesian inference, Causal inference, A/B testing (proficient);\nScholarships, Grants, and Awards\n-NIH Supplement 3RF1AG080948-01S1 (2023–2025).', metadata={'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 1}),
 Document(page_content='B.Sc.\n\nin Applied Mathematics, Instituto T

In [28]:
# Same question via max-marginal-relevance search directly on the vector store.
# NOTE(review): k=2 results, with fetch_k=5 presumably the size of the candidate
# pool they are picked from — confirm against the library documentation.
vectordb.max_marginal_relevance_search("Skills from CV", k=2, fetch_k=5)

[Document(page_content='Skills\n•Programming Languages : Python (preferred); Julia, R (proficient); C++ (intermediate);\n•High-performance Computing : Slurm (advanced); AWS/Azure Cloud (intermediate);\n•Data Science : SQL (advanced); tidyverse, ggplot, pandas, ggplot, networkx (proficient); NLP (advanced);\n•Development and Pipelines : Git, Docker, SnakeFlow, Linux (advanced);\n•Deep Learning : PyTorch, Tensorflow, Image segmentation/Object detection, Spatial methods, (proficient); GNNs,\nAttention-based models (advanced); Multi-GPU, Distributed training (intermediate);\n•Experiment Design/ Sequential Decision-making : Active learning, Reinforcement learning, Tree search (proficient);\n•Statistics: Bayesian inference, Causal inference, A/B testing (proficient);\nScholarships, Grants, and Awards\n-NIH Supplement 3RF1AG080948-01S1 (2023–2025).', metadata={'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 1}),
 Document(page_content='ITAM,Lecturer, Applied Mathematics Departmen

In [29]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Build the LLM-backed compressor; the vector store itself is wrapped in the next cell.
# NOTE: this rebinds `llm` (previously the gpt-3.5-turbo-instruct instance) to an
# OpenAI() created without an explicit model argument.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [30]:
# Wrap the vector store (queried in "mmr" mode) so that every retrieved
# document is passed through `compressor` before being returned.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [31]:
def pretty_print_docs(docs):
    """Print documents as numbered sections separated by a dashed rule.

    Each document is rendered as a ``Document N:`` header (numbering starts
    at 1), a blank line, and the document's ``page_content``. Consecutive
    sections are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

# Run the compression retriever on the CV-skills question and print what it returns.
question = "Skills from CV"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

•Programming Languages : Python (preferred); Julia, R (proficient); C++ (intermediate);
•High-performance Computing : Slurm (advanced); AWS/Azure Cloud (intermediate);
•Data Science : SQL (advanced); tidyverse, ggplot, pandas, ggplot, networkx (proficient); NLP (advanced);
•Development and Pipelines : Git, Docker, SnakeFlow, Linux (advanced);
•Deep Learning : PyTorch, Tensorflow, Image segmentation/Object detection, Spatial methods, (proficient); GNNs,
Attention-based models (advanced); Multi-GPU, Distributed training (intermediate);
•Experiment Design/ Sequential Decision-making : Active learning, Reinforcement learning, Tree search (proficient);
•Statistics: Bayesian inference, Causal inference, A/B testing (proficient);
----------------------------------------------------------------------------------------------------
Document 2:

ITAM,Lecturer, Applied Mathematics Department , 2015–2017
Taught courses in computational statistics and stochastic processes to undergradua

In [None]:
# Scratch expression: the condition 1 > 0 is constant, so this always evaluates to 1.
1 if 1 > 0 else 0

1