In [116]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [117]:
!pip install -q langchain langchain-community langchain-core faiss-cpu transformers sentencepiece PyPDF2
!pip install pypdf
!pip install fastapi uvicorn pyngrok nest-asyncio



In [118]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [119]:
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


In [120]:
pdf_path = "/content/drive/MyDrive/LLM/pandas.pdf"
loader = PyPDFLoader(pdf_path)
documents=loader.load()

In [121]:
splitter=RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50
)
pages=splitter.split_documents(documents)

In [122]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(pages, embedding_model)

In [123]:
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Bạn là một trợ lý AI chuyên về Python và thư viện pandas.

Dựa vào nội dung sau:
---------------------
{context}
---------------------

Trả lời ngắn gọn và chính xác cho câu hỏi: {question}

Nếu không có đủ thông tin, hãy nói: "Tôi không chắc câu trả lời."
"""
)

In [124]:
memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [125]:
retriever = vectorstore.as_retriever(search_kwargs={'k': 2})
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"  # Thêm dòng này vào để tránh lỗi!
)


qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    output_key="answer"
)

In [126]:
query = "Liệt kê các chức năng của hàm groupby"
chat_history = []

result = qa_chain.invoke({
    "question": query,
    "chat_history": chat_history
})

print("Trả lời:")
print(result["answer"])  # result.get("result", "Không có câu trả lời")

# In tài liệu tham khảo
for doc in result.get("source_documents", []):
    print("\nTrích từ:")
    print(doc.page_content[:300])

Trả lời:
groupby.transform

Trích từ:
'C'      :  [np.NaN, 1, np.NaN, np.NaN, 
                               np.NaN, np.NaN, np.NaN, 4]}) 
 
 df 
Out[34]: 
     B    C group1 group2 
0  one  NaN      A      C 
1  NaN  1.0      A      C 
2  NaN  NaN      A      C 
3  NaN  NaN      A      D 
4  NaN  NaN      B      E

Trích từ:
4  NaN  NaN      B      E 
5  two  NaN      B      E 
6  NaN  NaN      B      F 
7  NaN  4.0      B      F
I want to get the count of non-missing observations of B for each combination of group1 and group2. 
groupby.transform is a very powerful function that does exactly that.
