In [12]:
import os
import sys

In [13]:
llm_name = "gpt-3.5-turbo"

In [14]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

persist_directory = "docs/chroma/"
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [15]:
question = "What are Mauricio's skills?"
docs = vectordb.search(question, k=3, search_type="mmr", fetch_k=6)
docs

[Document(page_content='Mauricio Tec\nGoogle Scholar ὑ7mauriciogtec.com Boston, MA\nAbout Me\nMy current work seeks to advance the applicability of reinforcement learning in real-world settings, often integrating\ntools from Bayesian inference, causality, and deep learning.\n\nI am applying these methods at Harvard University to\nimprove climate disaster alerting systems that make decisions based on temporal and local data.', metadata={'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 0}),
 Document(page_content='Skills\n•Programming Languages : Python (preferred); Julia, R (proficient); C++ (intermediate);\n•High-performance Computing : Slurm (advanced); AWS/Azure Cloud (intermediate);\n•Data Science : SQL (advanced); tidyverse, ggplot, pandas, ggplot, networkx (proficient); NLP (advanced);\n•Development and Pipelines : Git, Docker, SnakeFlow, Linux (advanced);\n•Deep Learning : PyTorch, Tensorflow, Image segmentation/Object detection, Spatial methods, (proficient); GNNs,\n

In [16]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)

  warn_deprecated(


### Retrieval QA

In [17]:
from langchain.chains import RetrievalQA

In [18]:
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=vectordb.as_retriever(search_type="mmr", fetch_k=6, k=3)
)

In [19]:
result = qa_chain({"query": question})
result

  warn_deprecated(


{'query': "What are Mauricio's skills?",
 'result': "Mauricio's skills include programming languages such as Python, Julia, R, and C++, high-performance computing using Slurm and AWS/Azure Cloud, data science with SQL, tidyverse, ggplot, pandas, ggplot, and networkx, NLP, development and pipelines using Git, Docker, SnakeFlow, and Linux, deep learning with PyTorch, Tensorflow, image segmentation/object detection, spatial methods, GNNs, attention-based models, multi-GPU, and distributed training, experiment design/sequential decision-making including active learning, reinforcement learning, and tree search, and statistics including Bayesian inference, causal inference, and A/B testing."}

### Prompt

In [20]:
from langchain.prompts import PromptTemplate

template = """
Intructions:

You are a chatbot named 'Mauricio Tec's Live CV' designed to provide specific information about Mauricio's professional and academic background. You will encounter questions about Mauricio's key projects, his work on deep learning, his most cited works, and his research on spatial causal inference. To answer these inquiries, you will reference and analyze the content of his papers and data from Google Scholar.

You must emphasize accuracy and detail in discussing his work, always maintaining a professional tone. If a query is about a topic not covered by the available material, you should politely state that the information is not within your provided resources. You're expected to guide users to understand Mauricio's research contributions and academic impact, facilitating a comprehensive insight into his scholarly achievements.

If someone asks about you, respond in first person as if you were Mauricio. If they ask about Mauricio, respond in the third person about him. You should always verify the information in cv.pdf as the main source.

If a questions is not about Mauricio (or you), refuse to answer and politely say that "You are an application with exclusive purpose of being a live CV for Mauricio"

Below are links to Mauricio's paper, that you may use for retrieval:
- Covid-19 model, published at PNAS: https://www.pnas.org/doi/full/10.1073/pnas.2113561119
- The Spatial Confonfounding Environment, published at CLeAR: https://www.cclear.cc/2023/AcceptedDatasets/tec23a.pdf
- Adversarial Intrinsic Motivation, published at NeurIPS: https://proceedings.neurips.cc/paper/2021/file/486c0401c56bf7ec2daa9eba58907da9-Paper.pdf
- Bayesian Non-Parametric Adjustment for Confounding, published at Biometrics: https://arxiv.org/abs/2203.11798#:~:text=Analysis%20of%20observational%20studies%20increasingly,for%20estimation%20of%20causal%20effects.
- Weather2vec, published at AAAI:
https://ojs.aaai.org/index.php/AAAI/article/view/26696
- Towards a Real-Time, Low-Resource, End-to-end Object Detection Pipeline for Robot Soccer, published at Robocup 2022: https://www.cs.utexas.edu/~pstone/Papers/bib2html-links/RoboCup2022-nskiran.pdf
- Watch Where You’re Going! Gaze and Head Orientation as Predictors for Social Robot Navigation, published at IEEE ICRA conference:
https://ieeexplore.ieee.org/document/9561286

You may also answer questions that you can analyze from my Github repository, such as the (good) 
quality and diversity of my code: https://github.com/mauriciogtec

Below you will be given a context and a question you must answer based on the above and the context.

Context:
{context}

Answer the following question(s):
{question}

Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
QA_CHAIN_PROMPT

PromptTemplate(input_variables=['context', 'question'], template='\nIntructions:\n\nYou are a chatbot named \'Mauricio Tec\'s Live CV\' designed to provide specific information about Mauricio\'s professional and academic background. You will encounter questions about Mauricio\'s key projects, his work on deep learning, his most cited works, and his research on spatial causal inference. To answer these inquiries, you will reference and analyze the content of his papers and data from Google Scholar.\n\nYou must emphasize accuracy and detail in discussing his work, always maintaining a professional tone. If a query is about a topic not covered by the available material, you should politely state that the information is not within your provided resources. You\'re expected to guide users to understand Mauricio\'s research contributions and academic impact, facilitating a comprehensive insight into his scholarly achievements.\n\nIf someone asks about you, respond in first person as if you we

In [21]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(search_type="mmr", fetch_k=10, k=3),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [22]:
result = qa_chain({"query": question})

question = "What are Mauricio's skills in reinforcement learning?"
print(result["result"])

Mauricio possesses a diverse range of skills that contribute to his expertise in various domains. His programming languages proficiency includes Python (preferred), Julia, R (proficient), and intermediate knowledge of C++. He is well-versed in high-performance computing tools such as Slurm (advanced) and has intermediate experience with AWS/Azure Cloud. In the field of data science, Mauricio demonstrates advanced skills in SQL and proficiency in tidyverse, ggplot, pandas, ggplot, and networkx. He also has advanced knowledge in NLP. Mauricio is experienced in development and pipelines, utilizing tools like Git, Docker, SnakeFlow, and Linux (advanced). In the realm of deep learning, he is proficient in PyTorch, Tensorflow, image segmentation/object detection, and spatial methods. He has advanced expertise in GNNs and attention-based models, as well as intermediate knowledge in multi-GPU and distributed training. Mauricio's skill set extends to experiment design and sequential decision-ma

In [23]:
result['source_documents']

[Document(page_content='Mauricio Tec\nGoogle Scholar ὑ7mauriciogtec.com Boston, MA\nAbout Me\nMy current work seeks to advance the applicability of reinforcement learning in real-world settings, often integrating\ntools from Bayesian inference, causality, and deep learning.\n\nI am applying these methods at Harvard University to\nimprove climate disaster alerting systems that make decisions based on temporal and local data.', metadata={'source': 'https://mauriciogtec.com/_static/cv.pdf', 'page': 0}),
 Document(page_content='[60] Vieri Giuliano Santucci, Gianluca Baldassarre, and Marco Mirolli.\n\nWhich is the best intrinsic\nmotivation signal for learning multiple skills?\n\nFrontiers in neurorobotics, 7:22, 2013.\n\n[61] T. Schaul, Daniel Horgan, K. Gregor, and D. Silver.\n\nUniversal value function approximators.\n\nIn\nICML, 2015.\n\n[62] Massimiliano Schembri, Marco Mirolli, and Gianluca Baldassarre.\n\nEvolving internal reinforcers\nfor an intrinsically motivated reinforcement-lea