In [None]:
import pickle
import os

# For download embeddings model
from huggingface_hub import snapshot_download

# For embeddings and vector stores
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [143]:
import pipreqs

In [146]:
pipreqs.init()

AttributeError: module 'pipreqs' has no attribute 'init'

# Settings

In [None]:
embeddings_model_name = "intfloat/multilingual-e5-base"
embeddings_path = "./embedding_model/intfloat/multilingual-e5-base"

# Extract txts from Website


In [4]:
import bs4
import requests

url = 'https://home.cern'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
links = [link.get('href') for link in links if link.get('href') is not None]
urls = []
for l in links:
    if l.startswith('/science') :
        if l not in urls:
            urls.append(url + l)

In [None]:
import langchain_community.document_loaders

loader = langchain_community.document_loaders.UnstructuredURLLoader(
    urls=urls
)
with open("loader.pkl", "wb") as f:
    pickle.dump(loader, f)

In [3]:
with open("loader.pkl", "rb") as f:
    loader = pickle.load(f)

# DB establishment

## Download the embeddigns model

In [4]:
if os.path.isfile(f"embedding_model/{embeddings_model_name}/config.json"):
    print("Model already exists.")
else:
    download_path = snapshot_download(
        repo_id=embeddings_model_name,
        local_dir = f"path_to_model/{embeddings_model_name}",
        local_dir_use_symlinks=False # If you want to use symlinks, set this to True
        )

Model already exists.


## Make retriever

In [110]:
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 512,
    chunk_overlap = 32,
    length_function = len,
)

embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_path
)

index = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=embeddings,
    text_splitter=text_splitter,
).from_loaders([loader])

retriever = index.vectorstore.as_retriever()

Created a chunk of size 550, which is longer than the specified 512
Created a chunk of size 565, which is longer than the specified 512
Created a chunk of size 771, which is longer than the specified 512
Created a chunk of size 552, which is longer than the specified 512
Created a chunk of size 586, which is longer than the specified 512
Created a chunk of size 617, which is longer than the specified 512
Created a chunk of size 586, which is longer than the specified 512
Created a chunk of size 584, which is longer than the specified 512
Created a chunk of size 826, which is longer than the specified 512
Created a chunk of size 805, which is longer than the specified 512
Created a chunk of size 575, which is longer than the specified 512
Created a chunk of size 926, which is longer than the specified 512
Created a chunk of size 999, which is longer than the specified 512
Created a chunk of size 713, which is longer than the specified 512
Created a chunk of size 1203, which is longer th

# RAG chain

In [137]:
from helpers import LLM
llm = LLM()

Prompt Template Created
input_variables=['context', 'question', 'source'] input_types={} partial_variables={} template="\n        <|system|>\n        Use the following pieces of context to answer the question at the end. : {context}\n        And you need to answer with following engagements;\n            - If you don't know the answer, just say that you don't know, don't try to make up an answer.\n            - Use markdown formatting when displaying code.\n            - Emphasis should be used to terminologies.\n            - Give sources you used at the end.\n            - Answer in Japanese.\n        </s>\n        <|user|>\n        {question}\n        </s>\n        {source}\n    "


In [142]:
question = "LHCにある実験施設は何ですか？"
response = llm.chat(question, retriever=retriever)

# Trained RAG Answer
LHCにある実験施設は、9つの異なる実験によって構成されています。 これらの実験は、世界中の研究所の科学者が協力して運営されています。 各実験は、独特の検出器で特徴づけられています。

- **ATLAS** (A Toroidal LHC Apparatus): ATLASは、LHCの衝突点で生成される粒子の性質を調べるために設計された検出器です。
- **CMS** (Compact Muon Solenoid): CMSは、ATLASと同じく、LHCの衝突点で生成される粒子の性質を調べるために設計された検出器です。
- **ALICE** (A Large Ion Collider Experiment): ALICEは、LHCで衝突するイオンの相互作用を調べるために設計された検出器です。
- **LHCb** (Large Hadron Collider beauty): LHCbは、LHCの衝突点で生成される美しさ粒子の性質を調べるために設計された検出器です。
- **TOTEM** (TOTal cross section, Elastic scattering and diffraction dissociation Measurement at the LHC): TOTEMは、LHCの衝突点でのプロトン-プロトンの弾性散乱とdiffractive dissociationの測定を目的としています。
- **LHCf** (Large Hadron Collider forward): LHCfは、LHCの衝突点での粒子の前方散乱の測定を目的としています。
- **MoEDAL** (Monopole and Exotics Detector at the LHC): MoEDALは、LHCの衝突点で生成される磁気単極子やexotic particlesの検出を目的としています。
- **FASER** (ForwArd Search ExpeRiment): FASERは、LHCの衝突点での粒子の前方散乱の測定を目的としています。
- **SHiP** (Search for Hidden Particles): SHiPは、LHCの衝突点で生成される隠れた粒子の検出を目的としています。

これらの実験施設は、LHCの衝突点で生成される粒子の性質を調べるために設計されています。

Sources:
- [CERN - Accelerator Complex](https://home.cern/science/accelerators/accelerator-complex)
- [CERN - Experiments](https://home.cern/science/experiments)