<a href="https://colab.research.google.com/github/orekhovsky/GenAI-mini-projects/blob/main/RAG_on_gigachat_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG без векторной БД

In [None]:
pip install gigachat, transformers, rouge-score

In [None]:
pip install -U langchain-community

In [None]:
import pandas as pd
import ast
from langchain.chat_models.gigachat import GigaChat
from google.colab import userdata

# Passage corpus (df) and test questions (df_test) of rag-mini-bioasq, read from the hf:// paths below
df = pd.read_parquet(path="hf://datasets/rag-datasets/rag-mini-bioasq/data/passages.parquet/part.0.parquet")
df_test = pd.read_parquet(path="hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")

# relevant_passage_ids arrives as a string holding a list of passage ids;
# parse every cell with ast.literal_eval so later code can iterate over the ids
df_test = df_test.assign(
    relevant_passage_ids=df_test['relevant_passage_ids'].apply(ast.literal_eval)
)

# Collect the passages listed for a question and merge them into one context string for the LLM
def perform_retrieval(question, df, df_test):
    """Build the context string for ``question`` from its listed passages.

    Args:
        question: Question text; compared with ``==`` against ``df_test['question']``.
        df: Passage frame; every id is looked up as ``df.loc[id, 'passage']``.
        df_test: Question frame with ``question`` and ``relevant_passage_ids``
            columns; the latter is iterated as a sequence of passage ids.

    Returns:
        The passages of the first matching row, joined with single spaces.

    Raises:
        ValueError: If no row of ``df_test`` holds this question.
    """
    hits = df_test.loc[df_test['question'] == question]
    if len(hits) == 0:
        raise ValueError(f"Question '{question}' not found in df_test.")

    # Only the first matching row is used, even when the question occurs several times.
    first_label = hits.index[0]
    passage_ids = df_test.loc[first_label, 'relevant_passage_ids']

    # Look up each listed passage and glue them together into the context.
    return " ".join(df.loc[passage_id, 'passage'] for passage_id in passage_ids)

# Token-based authorization: read the value stored under 'SBER_AUTH' via Colab's userdata
auth = userdata.get('SBER_AUTH')

# Create the GigaChat model client
llm = GigaChat(
    model='GigaChat:latest',
    credentials=auth,
    profanity_check=False,
    # NOTE(review): certificate verification is switched off here — confirm this is intended
    verify_ssl_certs=False,
)

# RAG pipeline: retrieve the context for a question, build the prompt and query the LLM
def perform_rag_pipeline(question, df, df_test, llm):
    """Answer ``question`` with ``llm`` using context from ``perform_retrieval``.

    Args:
        question: Question text, forwarded to ``perform_retrieval`` and placed in the prompt.
        df: Passage frame, forwarded to ``perform_retrieval``.
        df_test: Question frame, forwarded to ``perform_retrieval``.
        llm: Model object exposing ``invoke(prompt)`` or, failing that, ``predict(prompt)``.

    Returns:
        The model's answer: the ``content`` attribute of the ``invoke`` result when it
        has one, otherwise the raw result of ``invoke`` / ``predict``.

    Raises:
        ValueError: Propagated from ``perform_retrieval`` when the question is unknown.
    """
    # Step 1: pull the context for this question out of the corpus
    context = perform_retrieval(question, df, df_test)

    # Step 2: assemble the model input
    input_text = f"""
    Answer the user's question.
    Use only the information from the context. If the context does not contain enough information to answer the question, let the user know.
    Context: {context}
    Question: {question}
    Answer:"""

    # Step 3: generate the answer. Prefer `invoke` over the deprecated `predict`;
    # `predict` stays as a fallback for model objects that do not offer `invoke`.
    invoke = getattr(llm, "invoke", None)
    if callable(invoke):
        reply = invoke(input_text)
        # Chat models return a message object; plain-text models return the string itself.
        return getattr(reply, "content", reply)

    return llm.predict(input_text)

# Example question
q1 = 'Is Hirschsprung disease a mendelian or a multifactorial disorder?'

# Run the whole pipeline for it and print the question followed by the model's answer
response = perform_rag_pipeline(question=q1, df=df, df_test=df_test, llm=llm)
print(f"Question: {q1}", f"Answer: {response}", sep="\n")


Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?
Answer: Hirschsprung disease (HSCR) is considered a **multifactorial** disorder rather than a **mendelian** disorder. This means that it results from the combined effects of multiple genetic factors and environmental influences, rather than being caused by mutations in a single gene following a clear Mendelian inheritance pattern. 

While some specific genes play key roles, such as RET and other loci identified, the full range of phenotypic variability and incomplete penetrance observed in HSCR suggest that additional genetic and environmental factors influence the disease's manifestation.


In [None]:
from rouge_score import rouge_scorer
# Reference answer for the question that produced `response`.
# It is looked up by question text rather than by a fixed row label 0, so the
# reference always belongs to q1 regardless of how df_test is indexed or ordered.
true_answer = df_test.loc[df_test['question'] == q1, 'answer'].iloc[0]

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# The reference text goes first, the generated answer second
scores = scorer.score(true_answer, response)

# Metrics (the scores are modest, but the answer is essentially correct)
print("ROUGE-1:", scores['rouge1'])
print("ROUGE-2:", scores['rouge2'])
print("ROUGE-L:", scores['rougeL'])


ROUGE-1: Score(precision=0.26506024096385544, recall=0.36065573770491804, fmeasure=0.3055555555555556)
ROUGE-2: Score(precision=0.06097560975609756, recall=0.08333333333333333, fmeasure=0.07042253521126761)
ROUGE-L: Score(precision=0.10843373493975904, recall=0.14754098360655737, fmeasure=0.125)


In [None]:
question = 'Is Hirschsprung disease a mendelian or a multifactorial disorder?'

# Build the context through perform_retrieval instead of repeating its body inline.
# The helper also raises a ValueError naming the question when it is missing from
# df_test, where the inline copy would fail with an IndexError on `.index[0]`.
context = perform_retrieval(question, df, df_test)

In [None]:
# Echo the `context` string as this cell's output so it can be inspected
context

'The major gene for Hirschsprung disease (HSCR) encodes the receptor tyrosine \nkinase RET. In a study of 690 European- and 192 Chinese-descent probands and \ntheir parents or controls, we demonstrate the ubiquity of a >4-fold \nsusceptibility from a C-->T allele (rs2435357: p = 3.9 x 10(-43) in European \nancestry; p = 1.1 x 10(-21) in Chinese samples) that probably arose once within \nthe intronic RET enhancer MCS+9.7. With in vitro assays, we now show that the T \nvariant disrupts a SOX10 binding site within MCS+9.7 that compromises RET \ntransactivation. The T allele, with a control frequency of 20%-30%/47% and case \nfrequency of 54%-62%/88% in European/Chinese-ancestry individuals, is involved \nin all forms of HSCR. It is marginally associated with proband gender (p = 0.13) \nand significantly so with length of aganglionosis (p = 7.6 x 10(-5)) and \nfamiliality (p = 6.2 x 10(-4)). The enhancer variant is more frequent in the \ncommon forms of male, short-segment, and simplex fam

Для сравнения: ответ gpt-4o-mini на тот же вопрос, оценённый теми же метриками ROUGE.

In [None]:
# Hard-coded answer text used for the gpt-4o-mini comparison.
# NOTE(review): this rebinds `response`, so the GigaChat answer assigned in the
# earlier cell is no longer available once this cell has run.
response ="Hirschsprung disease (HSCR) is a complex disorder with both Mendelian and multifactorial aspects. While some forms of the disease, particularly the syndromic and familial types, follow Mendelian inheritance patterns (autosomal dominant, recessive, and polygenic), the more common non-syndromic forms exhibit non-Mendelian inheritance with variable expression and incomplete penetrance. Genetic studies suggest that multiple loci and genetic interactions contribute to the disease, making it a multifactorial disorder."

In [None]:
from rouge_score import rouge_scorer
# Reference answer for the example question q1, looked up by question text
# rather than by a fixed row label 0, so it does not depend on df_test's ordering.
true_answer = df_test.loc[df_test['question'] == q1, 'answer'].iloc[0]

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# The reference text goes first, the answer under evaluation second
scores = scorer.score(true_answer, response)

# Metrics (the scores are modest, but the answer is essentially correct)
print("ROUGE-1:", scores['rouge1'])
print("ROUGE-2:", scores['rouge2'])
print("ROUGE-L:", scores['rougeL'])

ROUGE-1: Score(precision=0.3235294117647059, recall=0.36065573770491804, fmeasure=0.3410852713178295)
ROUGE-2: Score(precision=0.1044776119402985, recall=0.11666666666666667, fmeasure=0.11023622047244094)
ROUGE-L: Score(precision=0.20588235294117646, recall=0.22950819672131148, fmeasure=0.2170542635658915)
