[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neoaman/diagnoflow/blob/main/notebooks/01_create_vector_store_FAISS_med.ipynb)

# Set up the Ollama server

In [None]:
# Download ollama
! curl https://ollama.ai/install.sh | sh
# Serve ollama
%env OLLAMA_HOST=0.0.0.0
!ollama serve &> /dev/null &
!ollama pull gemma:2b &> /dev/null

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Downloading ollama...
100  8575    0  8575    0     0  15871      0 --:--:-- --:--:-- --:--:-- 15879
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
env: OLLAMA_HOST=0.0.0.0


In [None]:
# Write the notebook's Python dependencies to requirements.txt (installed by the next cell)
%%writefile requirements.txt
faiss-gpu
langchain
jq
langchainhub
icecream
minio

Writing requirements.txt


In [None]:
# Download required python package
! pip install -r requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.0/656.0 kB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.8/258.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

# Test Ollama server

In [None]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from icecream import ic
# Smoke-test the local Ollama server with the gemma:2b model.
# The callback manager needs a handler *instance*; the original passed the class itself,
# and the captured output shows no tokens were streamed.
llm = Ollama(
    model="gemma:2b",
    base_url="http://0.0.0.0:11434",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)
# `invoke` replaces the deprecated `llm(...)` call form.
ic(llm.invoke("What is molecular formula of zinc di oxide ?"))

  warn_deprecated(
ic| llm("What is molecular formula of zinc di oxide ?"): ('Sure, the molecular formula for zinc di oxide is ZnO2. It is a chemical '
                                                          'compound composed of zinc and oxygen.')


'Sure, the molecular formula for zinc di oxide is ZnO2. It is a chemical compound composed of zinc and oxygen.'

# Download data for vector store

In [None]:
! wget https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json

--2024-03-14 14:16:25--  https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/project-baize/baize-chatbot/main/data/medical_chat_data.json [following]
--2024-03-14 14:16:25--  https://raw.githubusercontent.com/project-baize/baize-chatbot/main/data/medical_chat_data.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61608619 (59M) [text/plain]
Saving to: ‘medical_chat_data.json’


2024-03-14 14:16:26 (291 MB/s) - ‘medical_chat_data.json’ saved [61608619/61608619]



# Store data in vector store

In [None]:
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from minio import Minio

# Location of the dataset downloaded by the wget cell.
file_path = '/content/medical_chat_data.json'
# Parse the raw dataset as plain JSON. Decode explicitly as UTF-8 instead of relying on
# the platform's default text encoding.
# NOTE: `data` is rebound to the JSONLoader result (`loader.load()`) in a later cell.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

In [None]:
from getpass import getpass

# Object-storage credentials are requested interactively rather than hard-coded.
ACCESS_KEY = input("Enter access key:")
# Read the secret without echoing it, so it does not end up in the saved cell output.
SECRET_KEY = getpass("Enter secret key:")
# NOTE(review): secure=False is kept from the original; it presumably means the credentials
# travel over plain HTTP. Confirm whether the endpoint supports TLS and switch to secure=True if so.
MINIO_CLIENT = Minio("s3.mlhub.in", access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)

In [None]:
# Build LangChain documents from the dataset: the jq expression `.[]` emits each
# element of the parsed JSON value as its own document.
loader = JSONLoader(
    file_path=file_path,  # reuse the path defined alongside the imports instead of repeating the literal
    jq_schema='.[]',
    text_content=False,  # NOTE(review): presumably needed because the selected elements are not plain strings; confirm
    json_lines=True
)

# Rebinds `data` (previously the raw parsed JSON) to the list of loaded documents.
data = loader.load()

In [None]:
# Number of documents returned by `loader.load()`
ic(len(data))

ic| len(data): 46867


46867

In [None]:
# Break the loaded documents into chunks: target size 1000, no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
# `documents` is the chunk list that the indexing cells slice into batches.
documents = text_splitter.split_documents(data)

In [None]:
# Start the Ollama server in the background again (same command as in the setup cell);
# its output is discarded. Presumably a safeguard in case the earlier server process has stopped.
%env OLLAMA_HOST=0.0.0.0
!ollama serve &> /dev/null &

env: OLLAMA_HOST=0.0.0.0


## Run the section below to initialize the vector database

In [None]:
# NOTE: Run this only if you need to delete the local vector database
# !rm -r medical_index

In [None]:
# NOTE: Run ONCE to initialize the vector DB from the first batch of 100 documents

# START = 0
# db = FAISS.from_documents(documents[(START*100):(START+1)*100], OllamaEmbeddings(model="gemma:2b",base_url="http://0.0.0.0:11434"))
# db.save_local("medical_index")

In [None]:
# NOTE: Run ONCE to create the meta-info file that records the batch index

# with open("medical_index/meta_info.txt","w") as mt:
#     mt.write(str(START))

In [None]:
# NOTE For initial upload only (testing purpose)
# MINIO_CLIENT.fput_object("public", "medical_index/index.faiss","/content/medical_index/index.faiss")
# MINIO_CLIENT.fput_object("public", "medical_index/index.pkl","/content/medical_index/index.pkl")
# MINIO_CLIENT.fput_object("public", "medical_index/meta_info.txt","/content/medical_index/meta_info.txt")

### Build the index batch by batch and store it in the bucket

In [None]:
# START = 1
# for i in range(START,round(len(data)/100)):
#     db.add_documents(documents[i*100:(i+1)*100])
#     db.save_local("medical_index")
#     with open("medical_index/meta_info.txt","w") as mt:
#         mt.write(f"i:{i} -> from {i*100} to {(i+1)*100}")
#     MINIO_CLIENT.fput_object("public", "medical_index/index.faiss","/content/medical_index/index.faiss")
#     MINIO_CLIENT.fput_object("public", "medical_index/index.pkl","/content/medical_index/index.pkl")
#     MINIO_CLIENT.fput_object("public", "medical_index/meta_info.txt","/content/medical_index/meta_info.txt")
#     print(i*100,(i+1)*100)

## Resume work: execute this section only if the vector DB already exists in the bucket

In [None]:
# Download the previously saved index files from the "public" bucket.
# All three files go to the same relative "medical_index" folder that
# `FAISS.load_local("medical_index", ...)` reads from. The original mixed one relative
# target with two absolute "/content/..." targets, which only agree when the working
# directory is /content.
for _name in ("index.faiss", "index.pkl", "meta_info.txt"):
    MINIO_CLIENT.fget_object(
        "public",
        object_name=f"medical_index/{_name}",
        file_path=f"medical_index/{_name}",
    )

<minio.datatypes.Object at 0x7a49ea3a32e0>

In [None]:
# Start the Ollama server in the background (same command as in the setup cell) so the
# embedding calls made while resuming can reach it; its output is discarded.
%env OLLAMA_HOST=0.0.0.0
!ollama serve &> /dev/null &

env: OLLAMA_HOST=0.0.0.0


In [None]:
# Re-open the saved FAISS index from the local "medical_index" folder, using the same
# Ollama embedding model/endpoint as elsewhere in this notebook.
# SECURITY NOTE: allow_dangerous_deserialization=True permits unpickling index.pkl;
# only load index files that come from a source you trust.
db = FAISS.load_local(
    "medical_index",
    embeddings=OllamaEmbeddings(
        model="gemma:2b",
        base_url="http://0.0.0.0:11434",
    ),
    allow_dangerous_deserialization=True,
)

In [None]:
START = 363        # batch index to resume from
BATCH_SIZE = 100   # documents embedded, saved and uploaded per iteration
INDEX_DIR = "medical_index"  # local folder used by save_local / load_local

# Count batches over `documents` (the list that is actually sliced below), using ceiling
# division. The original used round(len(data)/100): `data` is the pre-split list, so any
# extra chunks produced by the splitter were never indexed, and round() can drop the
# final partial batch.
n_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE

for i in range(START, n_batches):
    lo, hi = i * BATCH_SIZE, (i + 1) * BATCH_SIZE
    # Embed this batch and add it to the in-memory index.
    db.add_documents(documents[lo:hi])
    # Persist after every batch so an interrupted run can be resumed.
    db.save_local(INDEX_DIR)
    with open(f"{INDEX_DIR}/meta_info.txt", "w") as mt:
        mt.write(f"i:{i} -> from {lo} to {hi}")
    # Upload the files from the same folder save_local just wrote to.
    for _name in ("index.faiss", "index.pkl", "meta_info.txt"):
        MINIO_CLIENT.fput_object("public", f"medical_index/{_name}", f"{INDEX_DIR}/{_name}")
    print(lo, hi)

36300 36400
36400 36500
36500 36600
36600 36700
36700 36800
36800 36900
36900 37000
37000 37100
37100 37200
37200 37300
37300 37400
37400 37500
37500 37600
37600 37700
37700 37800
37800 37900
37900 38000
38000 38100
38100 38200
38200 38300
38300 38400
38400 38500
38500 38600
38600 38700
38700 38800
38800 38900
38900 39000
39000 39100
39100 39200
39200 39300
39300 39400
39400 39500
39500 39600
39600 39700
39700 39800
39800 39900
39900 40000
40000 40100
40100 40200
40200 40300
40300 40400
40400 40500
40500 40600
40600 40700
40700 40800
40800 40900
40900 41000
41000 41100
41100 41200
41200 41300
41300 41400
41400 41500
41500 41600
41600 41700
41700 41800
41800 41900
41900 42000
42000 42100
42100 42200
42200 42300
42300 42400
42400 42500
42500 42600
42600 42700
42700 42800
42800 42900
42900 43000
43000 43100
43100 43200
43200 43300
43300 43400
43400 43500
43500 43600
43600 43700
43700 43800
43800 43900
43900 44000
44000 44100
44100 44200
44200 44300
44300 44400
44400 44500
44500 44600
4460

In [None]:
# Number of loaded documents in `data`; the chunk list `documents` may have a different length
len(data)

46867