In [1]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
TEXT = ["Python is a versatile and widely used programming language known for its clean and readable syntax, which relies on indentation for code structure",
        "It is a general-purpose language suitable for web development, data analysis, AI, machine learning, and automation. Python offers an extensive standard library with modules covering a broad range of tasks, making it efficient for developers.",
        "It is cross-platform, running on Windows, macOS, Linux, and more, allowing for broad application compatibility."
        "Python has a large and active community that develops libraries, provides documentation, and offers support to newcomers.",
        "It has particularly gained popularity in data science and machine learning due to its ease of use and the availability of powerful libraries and frameworks."]

meta_data = [{"source": "document 1", "page": 1},
             {"source": "document 2", "page": 2},
             {"source": "document 3", "page": 3},
             {"source": "document 4", "page": 4}]

In [3]:
embedding_func = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

In [5]:
vector_db = Chroma.from_texts(
    texts=TEXT,
    embedding=embedding_func,
    metadatas=meta_data
)

In [6]:
response = vector_db.similarity_search(
    query="Tell me about a programming language used for data science", k=2)

In [7]:
print(response)

[Document(metadata={'page': 2, 'source': 'document 2'}, page_content='It is a general-purpose language suitable for web development, data analysis, AI, machine learning, and automation. Python offers an extensive standard library with modules covering a broad range of tasks, making it efficient for developers.'), Document(metadata={'page': 1, 'source': 'document 1'}, page_content='Python is a versatile and widely used programming language known for its clean and readable syntax, which relies on indentation for code structure')]


In [8]:
response = vector_db.max_marginal_relevance_search(
    query="Tell me about a programming language used for data science", k=2, fetch_k=3)

In [9]:
print(response)

[Document(metadata={'page': 2, 'source': 'document 2'}, page_content='It is a general-purpose language suitable for web development, data analysis, AI, machine learning, and automation. Python offers an extensive standard library with modules covering a broad range of tasks, making it efficient for developers.'), Document(metadata={'page': 4, 'source': 'document 4'}, page_content='It has particularly gained popularity in data science and machine learning due to its ease of use and the availability of powerful libraries and frameworks.')]


In [10]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

In [11]:
model = pipeline(
    "text-generation",
    model="facebook/opt-125m",  # This is a smaller, free model
    device="cpu"
)

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyboardInterrupt: 

In [None]:
llm = HuggingFacePipeline(pipeline=model)