
Commit

RAG llama LLM
manufy committed Jun 10, 2024
1 parent 90807ad commit c61ad9f
Showing 11 changed files with 655 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -20,6 +20,7 @@ old
*.mp4
5-VectorDB/chroma
onnxruntime*
storage*



25 changes: 15 additions & 10 deletions 4-LangChain-0.2.3/RAG QA/OpeinAI_RAG.ipynb
@@ -16,32 +16,37 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'langchain_chroma'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m hub\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_community\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WebBaseLoader\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_chroma\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chroma\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moutput_parsers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m StrOutputParser\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RunnablePassthrough\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_chroma'"
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:USER_AGENT environment variable not set, consider setting it to identify your requests.\n"
]
}
],
"source": [
"import bs4\n",
"from langchain import hub\n",
"from langchain_community.document_loaders import WebBaseLoader\n",
"from langchain_chroma import Chroma\n",
"#from langchain_chroma import Chroma\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import chromadb"
]
},
{
"cell_type": "code",
"execution_count": null,
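In the previous revision this notebook failed with ModuleNotFoundError: No module named 'langchain_chroma'; the updated cell comments that import out and imports chromadb directly instead. If the LangChain integration is still wanted, a minimal sketch along the following lines should work once langchain-chroma is installed in the langchain-0.2.3 environment (the USER_AGENT value, the build_vectorstore helper, and the splits variable are illustrative assumptions, not part of the commit):

# Hedged sketch, not part of the commit: re-enable the Chroma integration after
# running `pip install langchain-chroma` in the langchain-0.2.3 environment.
import os

# Setting USER_AGENT avoids the stderr warning shown in the cell output above;
# the value here is illustrative.
os.environ.setdefault("USER_AGENT", "rag-qa-notebook/0.1")

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

def build_vectorstore(splits):
    # `splits` is assumed to be the list of chunks produced by
    # RecursiveCharacterTextSplitter elsewhere in the notebook.
    return Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())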
91 changes: 91 additions & 0 deletions 5-VectorDB/ChromaDB/chroma-test.ipynb
@@ -0,0 +1,91 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using embedded DuckDB without persistence: data will be transient\n",
"Using embedded DuckDB without persistence: data will be transient\n",
"No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from tqdm.autonotebook import tqdm, trange\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n",
"Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n",
"the same time. Both libraries are known to be incompatible and this\n",
"can cause random crashes or deadlocks on Linux when loaded in the\n",
"same Python program.\n",
"Using threadpoolctl may cause crashes or deadlocks. For more\n",
"information and possible workarounds, please see\n",
" https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n",
"\n",
" warnings.warn(msg, RuntimeWarning)\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mEl kernel se bloqueó al ejecutar código en la celda actual o en una celda anterior. \n",
"\u001b[1;31mRevise el código de las celdas para identificar una posible causa del error. \n",
"\u001b[1;31mHaga clic <a href='https://aka.ms/vscodeJupyterKernelCrash'>aquí</a> para obtener más información. \n",
"\u001b[1;31mVea Jupyter <a href='command:jupyter.viewOutput'>log</a> para obtener más detalles."
]
}
],
"source": [
"import chromadb\n",
"# setup Chroma in-memory, for easy prototyping. Can add persistence easily!\n",
"client = chromadb.Client()\n",
"\n",
"chromadb.Client() # connect to a running Chroma server\n",
"\n",
"# Create collection. get_collection, get_or_create_collection, delete_collection also available!\n",
"collection = client.create_collection(\"all-my-documents\")\n",
"\n",
"# Add docs to the collection. Can also update and delete. Row-based API coming soon!\n",
"collection.add(\n",
" documents=[\"This is document1\", \"This is document2\"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
" metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}], # filter on these!\n",
" ids=[\"doc1\", \"doc2\"], # unique for each doc\n",
")\n",
"\n",
"# Query/search 2 most similar results. You can also .get by id\n",
"results = collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2,\n",
" # where={\"metadata_field\": \"is_equal_to_this\"}, # optional filter\n",
" # where_document={\"$contains\":\"search_string\"} # optional filter\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain-0.2.3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
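The crash reported above follows a warning that two OpenMP runtimes (Intel libiomp and LLVM libomp) are loaded in the same process, which the linked threadpoolctl page flags as a known source of crashes and deadlocks. A hedged, environment-specific workaround (unsafe by the library's own description, and only a guess at the actual cause of this crash) is to allow the duplicate runtime before the heavy imports:

# Hedged sketch, not part of the commit: allow the duplicate OpenMP runtime
# before chromadb / sentence_transformers pull it in. This mirrors a workaround
# discussed on the threadpoolctl page linked in the warning; it can mask real
# problems, so treat it as a prototyping convenience only.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import chromadb  # import only after the environment variable is set
client = chromadb.Client()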
21 changes: 21 additions & 0 deletions 5-VectorDB/ChromaDB/client-test.py
@@ -0,0 +1,21 @@
import chromadb
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.create_collection("all-my-documents")

# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
ids=["doc1", "doc2"], # unique for each doc
)

# Query/search 2 most similar results. You can also .get by id
results = collection.query(
query_texts=["This is a query document"],
n_results=2,
# where={"metadata_field": "is_equal_to_this"}, # optional filter
# where_document={"$contains":"search_string"} # optional filter
)
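Both the notebook and this script use the in-memory client, and the inline comment notes that persistence can be added easily; the storage* pattern added to .gitignore in this same commit points the same way. A minimal sketch of a persistent client (chromadb.PersistentClient is the 0.4+ API, which is an assumption about the installed version; the path and collection reuse via get_or_create_collection are illustrative) might look like:

# Hedged sketch, not part of the commit: keep the collection on disk so it
# survives restarts. Requires the full `chromadb` package (the `chromadb-client`
# wheel only ships the HTTP client). Path and collection name are illustrative.
import chromadb

client = chromadb.PersistentClient(path="./storage/chroma")
collection = client.get_or_create_collection("all-my-documents")

collection.add(
    documents=["This is document1", "This is document2"],
    metadatas=[{"source": "notion"}, {"source": "google-docs"}],
    ids=["doc1", "doc2"],
)

results = collection.query(query_texts=["This is a query document"], n_results=2)
print(results["ids"])  # e.g. [['doc1', 'doc2']]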
File renamed without changes.
1 change: 1 addition & 0 deletions 5-VectorDB/ChromaDB/requirements.txt
@@ -0,0 +1 @@
chromadb-client
1 change: 1 addition & 0 deletions 6-LlamaIndex/data/mio.txt
@@ -0,0 +1 @@
hola
94 changes: 94 additions & 0 deletions 6-LlamaIndex/dataingestion.ipynb
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"\n",
"doc = Document(text=\"text\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"vector_index = VectorStoreIndex.from_documents(doc)\n",
"vector_index.as_query_engine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"from llama_index.core.ingestion import IngestionPipeline\n",
"from llama_index.core.node_parser import TokenTextSplitter\n",
"\n",
"documents = SimpleDirectoryReader(\"./data\").load_data()\n",
"\n",
"pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])\n",
"\n",
"nodes = pipeline.run(documents=documents)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"from llama_index.core import VectorStoreIndex\n",
"document = Document(\n",
" text=\"text\",\n",
" metadata={\"filename\": \"<doc_file_name>\", \"category\": \"<category>\"},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.schema import TextNode\n",
"from llama_index.core import VectorStoreIndex\n",
"\n",
"node1 = TextNode(text=\"<text_chunk>\", id_=\"<node_id>\")\n",
"node2 = TextNode(text=\"<text_chunk>\", id_=\"<node_id>\")\n",
"\n",
"index = VectorStoreIndex([node1, node2])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain-0.2.3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
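The indexes built in this notebook live only in memory; together with the storage* entry added to .gitignore, persisting them to disk looks like the intended next step. A minimal sketch using llama_index's storage context (standard llama-index-core APIs; the ./storage directory is an assumption read off the .gitignore pattern) would be:

# Hedged sketch, not part of the commit: persist a VectorStoreIndex and reload
# it later instead of re-embedding ./data on every run.
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Writes the docstore, index store and vector store under ./storage,
# which the new `storage*` .gitignore pattern keeps out of version control.
index.storage_context.persist(persist_dir="./storage")

# Later run: rebuild the index object from disk without recomputing embeddings.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()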
11 changes: 11 additions & 0 deletions 6-LlamaIndex/requirements.txt
@@ -0,0 +1,11 @@
llama-index-core
llama-index-llms-openai
llama-index-embeddings-openai
llama-index-program-openai
llama-index-question-gen-openai
llama-index-agent-openai
llama-index-readers-file
llama-index-multi-modal-llms-openai

llama-index-llms-ollama
llama-index-embeddings-huggingface
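Alongside the OpenAI integrations, the list pulls in llama-index-llms-ollama and llama-index-embeddings-huggingface, which fits the commit title "RAG llama LLM". A hedged sketch of pointing llama_index at a locally served Llama model (the model names are assumptions, and an Ollama server has to be running):

# Hedged sketch, not part of the commit: run the RAG pipeline against a local
# Ollama-served Llama model with HuggingFace embeddings instead of OpenAI.
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Assumed model names; `ollama pull llama3` must have been run and the
# Ollama server must be reachable on its default port.
Settings.llm = Ollama(model="llama3", request_timeout=120.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

print(index.as_query_engine().query("What does the document say?"))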