In [1]:
import os
from langchain_community.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Milvus as m

In [None]:
os.environ["APIFY_API_TOKEN"] = "apify_api_qUQEtSba0coNeeNI8awoz8HnDUph7k2CHSGO"

#### Load Dataset

##### If from already created dataset

In [None]:
from langchain.document_loaders import ApifyDatasetLoader

loader = ApifyDatasetLoader(
    dataset_id = "NAezyund9gB9RCNUb",
    dataset_mapping_function = lambda item: Document(
        page_content = item["text"] or "", metadata = {"source": item["url"]}
    ), 
)

##### If dataset needs to be created

In [None]:
from langchain_community.utilities import ApifyWrapper

apify = ApifyWrapper()
urls = [{"url": "https://bids-specification.readthedocs.io/en/stable/"}]

loader = apify.call_actor(
    actor_id = "apify/website-content-crawler",
    run_input = {"startUrls": urls},
    dataset_mapping_function=lambda item: Document(
        page_content = item["text"] or "", metadata = {"source": item["url"]}
    ), 

)

##### Load and split dataset

In [None]:
data = loader.load()

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)
all_splits = text_splitter.split_documents(data)

#### JINA embedding NO LONGER USING

In [None]:
from langchain_community.embeddings.jina import JinaEmbeddings

embeddings = JinaEmbeddings(
   jina_api_key="jina_83b2617068f34370aaf8e29793afa2ddU5NjqSHZ7kAiAhrziXr5VX0rOKoB", model_name="jina-embeddings-v2-small-en"
)


#### Vector Database

In [2]:
COLLECTION_NAME = 'bids_db'

URI = 'http://localhost:19530'

connection_args = {'uri':URI}

In [None]:
vector_store = m(
    embedding_function = embeddings,
    connection_args = connection_args,
    collection_name = COLLECTION_NAME,
    drop_old=True,
    ).from_documents(all_splits,
                     embedding= embeddings,
                     collection_name = COLLECTION_NAME,
                     connection_args=connection_args,
                     )

##### Testing Vector Database

In [None]:
querry = "How can I contribute to BIDS?"

docs = vector_store.similarity_search(querry)

#### Example Selector

In [3]:
import json

In [4]:
from langchain_community.embeddings import AlephAlphaSymmetricSemanticEmbedding
from langchain_chroma import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_community.embeddings import HuggingFaceHubEmbeddings

In [5]:
with open("examples.json","r") as f:
    examples = json.load(f)

In [8]:
examples

[{'index': 'bold.json',
  'SeriesDescription': 'BOLD IPAT2 BART',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopmanual1',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopmanual2',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopvocal1',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopvocal2',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopword1',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - Stopword2',
  'ProtocolName': 'NA'},
 {'index': 'inplaneT2.json',
  'SeriesDescription': 'Matched Bandwidth Hi-Res',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - StopManual1',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  'SeriesDescription': 'BOLD EPI - StopManual2',
  'ProtocolName': 'NA'},
 {'index': 'bold.json',
  

In [7]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [8]:
vector_store = m(
    embedding_function = hf,
    connection_args = connection_args,
    collection_name = COLLECTION_NAME,
    drop_old=True,
    )

In [9]:
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    hf,
    vector_store,
    k=5,
)


In [26]:
example_prompt = PromptTemplate(
    input_variables = ["SeriesDescription", "ProtocolName", "index"], 
    template = """
    SeriesDescription: "{SeriesDescription}" \n ProtocolName: "{ProtocolName}"\n Suffix: "{index}"
    """
)

In [15]:
example_prompt

PromptTemplate(input_variables=['ProtocolName', 'SeriesDescription', 'Suffix'], template='\n    SeriesDescription: "{SeriesDescription}" \n ProtocolName: "{ProtocolName}"\n Suffix: "{Suffix}"\n    ')

In [29]:
prompt = FewShotPromptTemplate(
    example_selector = example_selector,
    example_prompt = example_prompt,
    suffix = "SeriesDescription: {SeriesDescription} \n ProtocolName: {ProtocolName} \n what is the suffix?",
    input_variables = ["SeriesDescription", "ProtocalName"],
)


In [30]:
prompt

FewShotPromptTemplate(input_variables=['ProtocolName', 'SeriesDescription'], example_selector=SemanticSimilarityExampleSelector(vectorstore=<langchain_community.vectorstores.milvus.Milvus object at 0x7f5fa8a150d0>, k=5, example_keys=None, input_keys=None, vectorstore_kwargs=None), example_prompt=PromptTemplate(input_variables=['ProtocolName', 'SeriesDescription', 'index'], template='\n    SeriesDescription: "{SeriesDescription}" \n ProtocolName: "{ProtocolName}"\n Suffix: "{index}"\n    '), suffix='SeriesDescription: {SeriesDescription} \n ProtocolName: {ProtocolName} \n what is the suffix?')

In [31]:
query = prompt.format(SeriesDescription = "dots_motion", ProtocolName = "dots_motion")

In [33]:
print(query)



    SeriesDescription: "dots_motion" 
 ProtocolName: "dots_motion"
 Suffix: "sbref.json"
    


    SeriesDescription: "dots_motion" 
 ProtocolName: "dots_motion"
 Suffix: "sbref.json"
    


    SeriesDescription: "dots_motion" 
 ProtocolName: "dots_motion"
 Suffix: "bold.json"
    


    SeriesDescription: "dots_motion" 
 ProtocolName: "dots_motion"
 Suffix: "bold.json"
    


    SeriesDescription: "dot_motion" 
 ProtocolName: "dot_motion"
 Suffix: "sbref.json"
    

SeriesDescription: dots_motion 
 ProtocolName: dots_motion 
 what is the suffix?


#### LLM part

In [None]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [None]:
llm = Ollama(
                    model="gemma:2b",
                    callback_manager=CallbackManager(
                                [StreamingStdOutCallbackHandler()]
                    ),
                    stop=["<|eot_id|>"],
                )
retriever = vector_store.as_retriever()

template = """Use the following pieces of context to answer the question at the end. 
Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

rag_prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)

#### Test LLM

In [None]:
print(rag_chain.invoke("How can I contribute to BIDS?"))

In [None]:
print(rag_chain.invoke("Explain what BIDS is."))

In [1]:
import subprocess
import os

def rec(path,  dest: list):
    for file in path[2]:
        print(f"Found file: {path}")
        
    for dir in path[1]:
        for smt in os.walk(dir):
            rec(smt, dest)
    return dest

In [3]:
def helper(path):
    dest = []
    for smt in os.walk(path):
        rec(smt, dest)
    return dest

In [4]:
paths = helper("ds000001")

Found file: ('ds000001', ['.datalad', '.git', 'sub-01', 'sub-02', 'sub-03', 'sub-04', 'sub-05', 'sub-06', 'sub-07', 'sub-08', 'sub-09', 'sub-10', 'sub-11', 'sub-12', 'sub-13', 'sub-14', 'sub-15', 'sub-16'], ['.gitattributes', 'CHANGES', 'dataset_description.json', 'participants.tsv', 'README', 'task-balloonanalogrisktask_bold.json'])
Found file: ('ds000001/.datalad', [], ['.gitattributes', 'config'])
Found file: ('ds000001/.git', ['annex', 'branches', 'hooks', 'info', 'logs', 'objects', 'refs'], ['config', 'description', 'HEAD', 'index', 'packed-refs'])
Found file: ('ds000001/.git/annex', ['journal'], ['index', 'index.lck', 'journal.lck', 'mergedrefs', 'othertmp.lck', 'sentinal', 'sentinal.cache', 'smudge.lck'])
Found file: ('ds000001/.git/hooks', [], ['applypatch-msg.sample', 'commit-msg.sample', 'fsmonitor-watchman.sample', 'post-checkout', 'post-merge', 'post-receive', 'post-update.sample', 'pre-applypatch.sample', 'pre-commit', 'pre-commit.sample', 'pre-merge-commit.sample', 'pre-p

In [5]:
paths

[]