In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from uuid import uuid4


In [5]:
client = QdrantClient(path="./tmp/langchain_qdrant")

client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embeddings,
)

In [11]:
import json

# Load JSON file
with open("sources.json", "r", encoding="utf-8") as f:
    data = json.load(f)

data

{'WHO_STD.txt': 'https://www.who.int/en/news-room/fact-sheets/detail/sexually-transmitted-infections-(stis)',
 'CDC_Syphilis.txt': 'https://www.cdc.gov/syphilis/about/index.html',
 'CDC_Gonorrhea.txt': 'https://www.cdc.gov/gonorrhea/about/index.html',
 'CDC_HPV.txt': 'https://www.cdc.gov/sti/about/about-genital-hpv-infection.html',
 'CDC_OralSex.txt': 'https://www.cdc.gov/sti/about/about-sti-risk-and-oral-sex.html',
 'CDC_HIV1.txt': 'https://www.cdc.gov/hiv/about/index.html',
 'CDC_HIV2.txt': 'https://www.cdc.gov/hiv/causes/index.html',
 'CDC_HIV3.txt': 'https://www.cdc.gov/hiv/prevention/index.html',
 'CleavelandClinic_HIV.txt': 'https://my.clevelandclinic.org/health/diseases/4251-hiv-aids',
 'CleavelandClinic_HSV.txt': 'https://my.clevelandclinic.org/health/diseases/22855-herpes-simplex',
 'CleavelandClinic_HPV.txt': 'https://my.clevelandclinic.org/health/diseases/11901-hpv-human-papilloma-virus',
 'CleavelandClinic_Chlamydia.txt': 'https://my.clevelandclinic.org/health/diseases/4023

In [30]:
Documents = []

for file, url in data.items():
    source = file.split('.')[0].split('_')[0]
    topic = file.split('.')[0].split('_')[-1]
    
    with open(f"data/raw/{file}", "r", encoding="utf-8") as f:
        text = f.read()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    chunks = text_splitter.split_text(text)

    meta_data = {'source':source, 'url':url, 'topic':topic}
    for chunk in chunks:
        document = Document(page_content=chunk, metadata=meta_data)
        Documents.append(document)
        

    
    

In [33]:
uuids = [str(uuid4()) for _ in range(len(Documents))]

In [34]:
vector_store.add_documents(documents=Documents, ids=uuids)

['b57f4136-2d28-41a4-8f0a-5bc3f96ceff2',
 'a6141d6a-e61c-41d0-a753-633be5113306',
 'e0bb191a-982d-4dd6-84ed-4e449526f861',
 '0644acea-e1b5-4b96-8413-f5bb3815afc6',
 'bfc80ff7-f198-4f6f-af13-d5235fe557d3',
 'cb10f5c9-2fc9-4611-846c-49a5ecd28415',
 '2293420f-a654-4bbe-8aa0-e612cc99d0e9',
 '6daa5a45-799c-466f-84b0-7fa44731c790',
 'd507b19d-3c1e-44a4-88d7-7d19cc3a5029',
 '6cfa6923-97aa-4380-9182-e6527c7357f8',
 '6211d17a-75dc-41d9-b0b4-cb2a96841c91',
 'f1e786bb-024d-4e8f-85fd-f3798cf1bf8a',
 'a4d9c2e2-d493-403f-91a4-6a17709cdbfe',
 '32745185-23bc-46e7-a9bc-1d4340228d16',
 '61a5236a-cce7-48a7-958f-ba3432f00988',
 '705cf4cf-7c04-4060-b74a-a4d2d4304efe',
 '0f2f6b26-906f-42ec-ae16-c26ec8d9293c',
 '55ccea04-b3df-45c6-84f8-28cb018f307c',
 'd8bbf8bc-e47f-4452-a8b8-74d962e8780b',
 '7117852e-ea2c-4a95-88c5-ec0dae87e9e3',
 '021a0e0e-0952-49e7-b457-d1a2bfe6538b',
 '0104b1f1-5862-4015-ba8c-73fa827e64fe',
 '9c878071-ffd5-4dc1-9aac-1daa0ff140df',
 'c3f1aea7-f6c3-4c5f-81d1-e11764b6d14c',
 '95099bc2-a6d2-