In [166]:
import pickle
import os

# For downloading the embeddings model
from huggingface_hub import snapshot_download

# For embeddings and vector stores
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Settings

In [150]:
# Hugging Face Hub repository id of the embeddings model.
embeddings_model_name = "intfloat/multilingual-e5-large-instruct"
# Local directory (relative to the notebook) that holds the downloaded model.
embeddings_path = "./embedding_model/" + embeddings_model_name

# Extract text from the website


In [151]:
import bs4
import requests

url = 'https://home.cern'
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Every href found on the landing page (anchors without an href are skipped).
links = [anchor.get('href') for anchor in soup.find_all('a')]
links = [href for href in links if href is not None]

# Keep only site-relative links under /science and make them absolute.
# De-duplication has to compare the absolute URLs: checking the relative href
# against a list of absolute URLs never matches, so repeated links would be
# kept. dict.fromkeys() drops repeats while preserving first-seen order.
urls = list(
    dict.fromkeys(url + href for href in links if href.startswith('/science'))
)

In [None]:
import langchain_community.document_loaders

# Build a loader over the scraped /science URLs.
loader = langchain_community.document_loaders.UnstructuredURLLoader(urls=urls)

# Save the configured loader to loader.pkl so it can be restored later
# without re-running the scraping cell.
with open("loader.pkl", "wb") as loader_file:
    pickle.dump(loader, loader_file)

In [152]:
# Restore the URL loader from loader.pkl (written by the cell that builds the
# UnstructuredURLLoader).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a loader.pkl that was produced by this notebook.
with open("loader.pkl", "rb") as f:
    loader = pickle.load(f)

# DB establishment

## Download the embeddings model

In [None]:
# Download the embeddings model from the Hugging Face Hub unless a previous
# run already placed it in `embeddings_path`. The presence of config.json is
# used as the "already downloaded" marker.
# The path comes from `embeddings_path` (settings cell) so that the download
# location and the location HuggingFaceEmbeddings reads from cannot drift apart.
if os.path.isfile(os.path.join(embeddings_path, "config.json")):
    print("Model already exists.")
else:
    # `local_dir_use_symlinks` is intentionally not passed: recent
    # huggingface_hub releases deprecate and ignore it (passing it only
    # triggers a warning); files are written directly into `local_dir`.
    download_path = snapshot_download(
        repo_id=embeddings_model_name,
        local_dir=embeddings_path,
    )

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/763 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/686k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

## Make retriever

In [162]:
# Run the restored loader and bind the resulting documents to `docs`.
# NOTE(review): presumably this fetches every URL in the loader over the
# network, so it may be slow — confirm before re-running.
docs = loader.load()

In [164]:
# Character-based splitter: 512-character target chunks with a 32-character
# overlap, breaking on newlines only. The cell output reports several chunks
# longer than the 512 target.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=512,
    chunk_overlap=32,
    length_function=len,
)

# NOTE: this rebinds `docs` from whole documents to chunks, so re-running this
# cell without re-running the loading cell splits the chunks again.
docs = text_splitter.split_documents(docs)

# Embeddings backed by the locally stored model directory.
embeddings = HuggingFaceEmbeddings(model_name=embeddings_path)

Created a chunk of size 550, which is longer than the specified 512
Created a chunk of size 565, which is longer than the specified 512
Created a chunk of size 771, which is longer than the specified 512
Created a chunk of size 552, which is longer than the specified 512
Created a chunk of size 586, which is longer than the specified 512
Created a chunk of size 617, which is longer than the specified 512
Created a chunk of size 586, which is longer than the specified 512
Created a chunk of size 584, which is longer than the specified 512
Created a chunk of size 826, which is longer than the specified 512
Created a chunk of size 805, which is longer than the specified 512
Created a chunk of size 575, which is longer than the specified 512
Created a chunk of size 926, which is longer than the specified 512
Created a chunk of size 999, which is longer than the specified 512
Created a chunk of size 713, which is longer than the specified 512
Created a chunk of size 1203, which is longer th

In [167]:
# Build a FAISS vector store from the chunked documents, using `embeddings`
# to compute the vectors.
db = FAISS.from_documents(docs, embeddings)

In [178]:
# Expose the FAISS store through the retriever interface (default settings).
retriever = db.as_retriever()

In [None]:
# Persist the retriever to retriever.pkl.
# NOTE(review): the pickle presumably carries the vector store and the
# embeddings object along with it, so the file may be large — confirm.
with open("retriever.pkl", "wb") as retriever_file:
    pickle.dump(retriever, retriever_file)

# RAG chain

In [182]:
# LLM is a project-local class from the `helpers` module (not shown in this
# notebook). Constructing it prints the prompt template, as seen in the cell
# output.
from helpers import LLM
llm = LLM()

Prompt Template Created
input_variables=['context', 'question', 'source'] input_types={} partial_variables={} template="\n        <|system|>\n        Use the following pieces of context to answer the question at the end. : {context}\n        And you need to answer with following engagements;\n            - If you don't know the answer, just say that you don't know, don't try to make up an answer.\n            - Use markdown formatting when displaying code.\n            - Emphasis should be used to terminologies.\n            - Give sources you used at the end.\n            - Answer in Japanese.\n        </s>\n        <|user|>\n        {question}\n        </s>\n        {source}\n    "


In [184]:
# Ask a question in Japanese ("What is happening at the LHC?") and pass the
# FAISS-backed retriever to llm.chat.
# NOTE(review): how chat() uses the retriever and what it returns is defined
# in helpers.LLM — confirm there.
question = "LHCで何が起こっているのですか？"
response = llm.chat(question, retriever=retriever)

# Trained RAG Answer
LHCは、素粒子を衝突させることで、素粒子の性質を研究するための装置です。LHCは、プロトンを高速で衝突させることで、素粒子の生成と相互作用を研究します。この衝突で生成される多くのquarkは、すぐに他の形態に崩壊します。LHCbは、b quarkを捉えるために、LHCのビームの軌道近くに高度な可動式トラッキングデテクターを開発しました。

LHCは、ATLAS、CMS、LHCb、TOTEM、LHCf、MoEDAL-MAPP、FASER、SND@LHC、Fixed-target experimentsなどの多くの実験装置が設置されています。各実験装置は、異なる研究対象に焦点を当てています。

LHCは、素粒子の研究のための世界的な協力プロジェクトです。LHCは、CERN（欧州核研究機構）が運営しています。

Sources:
- https://home.cern/science/experiments
- https://home.cern/science/experiments
- https://home.cern/science/experiments
- https://home.cern/science/experiments/lhcb