
Commit

RAG llama LLM
manufy committed Jun 10, 2024
1 parent 90807ad commit c61ad9f
Showing 11 changed files with 655 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -20,6 +20,7 @@ old
*.mp4
5-VectorDB/chroma
onnxruntime*
storage*



25 changes: 15 additions & 10 deletions 4-LangChain-0.2.3/RAG QA/OpeinAI_RAG.ipynb
@@ -16,32 +16,37 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'langchain_chroma'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m hub\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_community\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WebBaseLoader\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_chroma\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chroma\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moutput_parsers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m StrOutputParser\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RunnablePassthrough\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_chroma'"
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:USER_AGENT environment variable not set, consider setting it to identify your requests.\n"
]
}
],
"source": [
"import bs4\n",
"from langchain import hub\n",
"from langchain_community.document_loaders import WebBaseLoader\n",
"from langchain_chroma import Chroma\n",
"#from langchain_chroma import Chroma\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import chromadb"
]
},
{
"cell_type": "code",
"execution_count": null,
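In the previous revision this notebook failed with ModuleNotFoundError: No module named 'langchain_chroma'; the updated cell comments that import out and imports chromadb directly instead. If the LangChain integration is still wanted, a minimal sketch along the following lines should work once langchain-chroma is installed in the langchain-0.2.3 environment (the USER_AGENT value, the build_vectorstore helper, and the splits variable are illustrative assumptions, not part of the commit):

# Hedged sketch, not part of the commit: re-enable the Chroma integration after
# running `pip install langchain-chroma` in the langchain-0.2.3 environment.
import os

# Setting USER_AGENT avoids the stderr warning shown in the cell output above;
# the value here is illustrative.
os.environ.setdefault("USER_AGENT", "rag-qa-notebook/0.1")

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

def build_vectorstore(splits):
    # `splits` is assumed to be the list of chunks produced by
    # RecursiveCharacterTextSplitter elsewhere in the notebook.
    return Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())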
91 changes: 91 additions & 0 deletions 5-VectorDB/ChromaDB/chroma-test.ipynb
@@ -0,0 +1,91 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using embedded DuckDB without persistence: data will be transient\n",
"Using embedded DuckDB without persistence: data will be transient\n",
"No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from tqdm.autonotebook import tqdm, trange\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n",
"Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n",
"the same time. Both libraries are known to be incompatible and this\n",
"can cause random crashes or deadlocks on Linux when loaded in the\n",
"same Python program.\n",
"Using threadpoolctl may cause crashes or deadlocks. For more\n",
"information and possible workarounds, please see\n",
" https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n",
"\n",
" warnings.warn(msg, RuntimeWarning)\n",
"/opt/anaconda3/envs/langchain-0.2.3/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mEl kernel se bloqueó al ejecutar código en la celda actual o en una celda anterior. \n",
"\u001b[1;31mRevise el código de las celdas para identificar una posible causa del error. \n",
"\u001b[1;31mHaga clic <a href='https://aka.ms/vscodeJupyterKernelCrash'>aquí</a> para obtener más información. \n",
"\u001b[1;31mVea Jupyter <a href='command:jupyter.viewOutput'>log</a> para obtener más detalles."
]
}
],
"source": [
"import chromadb\n",
"# setup Chroma in-memory, for easy prototyping. Can add persistence easily!\n",
"client = chromadb.Client()\n",
"\n",
"chromadb.Client() # connect to a running Chroma server\n",
"\n",
"# Create collection. get_collection, get_or_create_collection, delete_collection also available!\n",
"collection = client.create_collection(\"all-my-documents\")\n",
"\n",
"# Add docs to the collection. Can also update and delete. Row-based API coming soon!\n",
"collection.add(\n",
" documents=[\"This is document1\", \"This is document2\"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well\n",
" metadatas=[{\"source\": \"notion\"}, {\"source\": \"google-docs\"}], # filter on these!\n",
" ids=[\"doc1\", \"doc2\"], # unique for each doc\n",
")\n",
"\n",
"# Query/search 2 most similar results. You can also .get by id\n",
"results = collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2,\n",
" # where={\"metadata_field\": \"is_equal_to_this\"}, # optional filter\n",
" # where_document={\"$contains\":\"search_string\"} # optional filter\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain-0.2.3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
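The crash reported above follows a warning that two OpenMP runtimes (Intel libiomp and LLVM libomp) are loaded in the same process, which the linked threadpoolctl page flags as a known source of crashes and deadlocks. A hedged, environment-specific workaround (unsafe by the library's own description, and only a guess at the actual cause of this crash) is to allow the duplicate runtime before the heavy imports:

# Hedged sketch, not part of the commit: allow the duplicate OpenMP runtime
# before chromadb / sentence_transformers pull it in. This mirrors a workaround
# discussed on the threadpoolctl page linked in the warning; it can mask real
# problems, so treat it as a prototyping convenience only.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import chromadb  # import only after the environment variable is set
client = chromadb.Client()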
21 changes: 21 additions & 0 deletions 5-VectorDB/ChromaDB/client-test.py
@@ -0,0 +1,21 @@
import chromadb
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.create_collection("all-my-documents")

# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
ids=["doc1", "doc2"], # unique for each doc
)

# Query/search 2 most similar results. You can also .get by id
results = collection.query(
query_texts=["This is a query document"],
n_results=2,
# where={"metadata_field": "is_equal_to_this"}, # optional filter
# where_document={"$contains":"search_string"} # optional filter
)
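Both the notebook and this script use the in-memory client, and the inline comment notes that persistence can be added easily; the storage* pattern added to .gitignore in this same commit points the same way. A minimal sketch of a persistent client (chromadb.PersistentClient is the 0.4+ API, which is an assumption about the installed version; the path and collection reuse via get_or_create_collection are illustrative) might look like:

# Hedged sketch, not part of the commit: keep the collection on disk so it
# survives restarts. Requires the full `chromadb` package (the `chromadb-client`
# wheel only ships the HTTP client). Path and collection name are illustrative.
import chromadb

client = chromadb.PersistentClient(path="./storage/chroma")
collection = client.get_or_create_collection("all-my-documents")

collection.add(
    documents=["This is document1", "This is document2"],
    metadatas=[{"source": "notion"}, {"source": "google-docs"}],
    ids=["doc1", "doc2"],
)

results = collection.query(query_texts=["This is a query document"], n_results=2)
print(results["ids"])  # e.g. [['doc1', 'doc2']]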
File renamed without changes.
1 change: 1 addition & 0 deletions 5-VectorDB/ChromaDB/requirements.txt
@@ -0,0 +1 @@
chromadb-client
1 change: 1 addition & 0 deletions 6-LlamaIndex/data/mio.txt
@@ -0,0 +1 @@
hola
94 changes: 94 additions & 0 deletions 6-LlamaIndex/dataingestion.ipynb
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"\n",
"doc = Document(text=\"text\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"vector_index = VectorStoreIndex.from_documents(doc)\n",
"vector_index.as_query_engine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"from llama_index.core.ingestion import IngestionPipeline\n",
"from llama_index.core.node_parser import TokenTextSplitter\n",
"\n",
"documents = SimpleDirectoryReader(\"./data\").load_data()\n",
"\n",
"pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])\n",
"\n",
"nodes = pipeline.run(documents=documents)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"from llama_index.core import VectorStoreIndex\n",
"document = Document(\n",
" text=\"text\",\n",
" metadata={\"filename\": \"<doc_file_name>\", \"category\": \"<category>\"},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.schema import TextNode\n",
"from llama_index.core import VectorStoreIndex\n",
"\n",
"node1 = TextNode(text=\"<text_chunk>\", id_=\"<node_id>\")\n",
"node2 = TextNode(text=\"<text_chunk>\", id_=\"<node_id>\")\n",
"\n",
"index = VectorStoreIndex([node1, node2])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain-0.2.3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
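The indexes built in this notebook live only in memory; together with the storage* entry added to .gitignore, persisting them to disk looks like the intended next step. A minimal sketch using llama_index's storage context (standard llama-index-core APIs; the ./storage directory is an assumption read off the .gitignore pattern) would be:

# Hedged sketch, not part of the commit: persist a VectorStoreIndex and reload
# it later instead of re-embedding ./data on every run.
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Writes the docstore, index store and vector store under ./storage,
# which the new `storage*` .gitignore pattern keeps out of version control.
index.storage_context.persist(persist_dir="./storage")

# Later run: rebuild the index object from disk without recomputing embeddings.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()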
11 changes: 11 additions & 0 deletions 6-LlamaIndex/requirements.txt
@@ -0,0 +1,11 @@
llama-index-core
llama-index-llms-openai
llama-index-embeddings-openai
llama-index-program-openai
llama-index-question-gen-openai
llama-index-agent-openai
llama-index-readers-file
llama-index-multi-modal-llms-openai

llama-index-llms-ollama
llama-index-embeddings-huggingface
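Alongside the OpenAI integrations, the list pulls in llama-index-llms-ollama and llama-index-embeddings-huggingface, which fits the commit title "RAG llama LLM". A hedged sketch of pointing llama_index at a locally served Llama model (the model names are assumptions, and an Ollama server has to be running):

# Hedged sketch, not part of the commit: run the RAG pipeline against a local
# Ollama-served Llama model with HuggingFace embeddings instead of OpenAI.
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Assumed model names; `ollama pull llama3` must have been run and the
# Ollama server must be reachable on its default port.
Settings.llm = Ollama(model="llama3", request_timeout=120.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

print(index.as_query_engine().query("What does the document say?"))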