In [1]:
# import libs
import torch
import json
import glob
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_core.documents import Document
from langchain.vectorstores import Chroma as Vectorstore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from BCEmbedding import EmbeddingModel
from pathlib import Path
from pprint import pprint
from LLM import InternLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the data
# Parse the training corpus from disk. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
file_path = Path('./data/train.json')
json_data = json.loads(file_path.read_text(encoding='utf-8'))

In [3]:
# Split data
# Cut the parsed JSON into small pieces and collect them as documents.
chunk_size = 50  # handed to the splitter as its max_chunk_size
splitter = RecursiveJsonSplitter(max_chunk_size=chunk_size)
docs = splitter.create_documents(texts=json_data)

In [4]:
# Embedding model
# The checkpoint is spelled out here; it is the same one the bare constructor
# resolves to according to the load log printed by this cell.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

04/15/2024 11:05:30 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
04/15/2024 11:05:35 - [INFO] -sentence_transformers.SentenceTransformer->>>    Use pytorch device_name: cuda


In [5]:
# Create the vector database (build of the persisted Chroma index).
# Every statement below is commented out, so running this cell does nothing.
# When enabled, it embeds `docs` with `embeddings`, stores the vectors under
# `persist_directory`, and flushes them to disk with `persist()`.
# NOTE(review): presumably disabled to avoid re-embedding the whole corpus on
# every run -- confirm, and uncomment to (re)build the index.
# persist_directory = './data_base/vector_db/chroma'
# vectordb = Vectorstore.from_documents(
#     documents=docs, 
#     embedding=embeddings,
#     persist_directory=persist_directory)
# vectordb.persist()

04/15/2024 11:05:36 - [INFO] -chromadb.telemetry.product.posthog->>>    Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


KeyboardInterrupt: 

In [6]:
# Open the persisted Chroma store, building it only when it is still empty.
# Without this guard a fresh run opens an empty collection (no build step is
# active in this notebook) and retrieval silently returns no context.
persist_directory = './data_base/vector_db/chroma'
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
if not vectordb.get(limit=1)["ids"]:
    # Empty collection: embed the split documents once and write them to disk.
    # This can take a while; later runs reuse the stored index.
    vectordb = Vectorstore.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()

In [None]:
# Retriever (disabled)
# While the line below stays commented out, no standalone `retriever` variable
# is defined by this cell.
# retriever = vectordb.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 documents

In [7]:
# Load model
# Instantiate the InternLM wrapper, then send one short prompt as a smoke test.
llm = InternLM()
# `predict` is deprecated in LangChain (it is what emits the warn_deprecated
# notice); `invoke` is the supported call and returns the completion text.
llm.invoke("你是谁")

正在从本地加载模型...


Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.31it/s]


完成本地模型的加载


  warn_deprecated(


'我是书生·浦语，由上海人工智能实验室开发的人工智能助手。我能够理解并流利地使用汉语和英语进行交流。我的设计理念是有用、诚实并且无害，旨在通过执行常见的基于语言的任务和提供建议来帮助人类。'

In [9]:
# Prompt template
# The text carries two placeholders, {question} and {context}; at call time they
# are filled with the user's query and the retrieved document chunks.
template = """你是一个雅思作文小助手，需要帮用户按照雅思官方标准批改他们的作文。使用以下上下文来批改用户的问题。如果你不知道答案，就说你不知道。总是使用中文回答。
问题: {question}
可参考的上下文：
···
{context}
···
如果给定的上下文无法让你做出回答，请回答你不知道。
有用的回答:"""

# Wrap the raw text in a LangChain PromptTemplate that declares both variables.
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [10]:
# Chat
# Assemble the retrieval QA chain: the vector store's default retriever supplies
# context, the custom prompt formats it, and source documents are returned too.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [11]:
# Answer quality of the retrieval QA chain
# Test input: an essay prompt ("Question: ...") followed by a candidate essay
# ("Essay: ...") and a closing Chinese instruction asking for the essay to be scored.
question = """Question: Interviews form the basic criteria for most large companies. However, some people think that the interview is not a reliable method of choosing whom to employ and there are other better methods. To what extent do you agree or disagree? Essay: It is believed by some experts that the traditional approach of recruiting candidates which is interviewing is the best way, whereas others think different methods such as exams writing, CVs, cover letters or application letters and many more are good. I strongly agree with the statement, "interview is the most reliable approach to recruit workers" because this method assists the recruiters to know the person and his ability to do the work and their problem-solving abilities. 

To begin with, an interview enables the  recruiter to know the kind of person he or she is recruiting. It helps the employer to see the personality traits of the employee such as how he answers questions, his facial mannerisms and also his communication skills, that is, whether introvert or extrovert, also his teamwork skill is measured during the dialogue. For instance, jobs like sales personnel require good communication skills to be able to do the work effectively and efficiently. So interviews allow the manager to assess whether or not the applicant qualifies for the job. 

Furthermore, recruiters also assess the applicant's ability to solve problems when they arise. A good idea generated or how one handles situations can bring great development to the company. For instance, pressure can put fear into an employee which can make him make a wrong decision that can bring loss to the company, while some too can take pressure in a calm action and make a good decision. 

On the other hand, other methods such as CVs, cover letters, the use of only certificates and many more are not a suitable step to recruit an applicant due to the fact that it does not allow the recruiter to see the full potential of the candidate. Information found in the CV or cover letter may not be true because people lie to obtain what they desire. In the same way, a candidate can also lie to acquire the position. 

To sum up, I think an interview is still the most reliable practice of hiring employees rather than using other methods. So I suggest managers use only interviews as a means of sourcing workers for their companies. 请帮我的这一片作文打分"""
# Answer with retrieval: run the question through the QA chain.
# `invoke` replaces the deprecated direct call on the chain (the source of the
# warn_deprecated notice); the returned dict still carries the "result" key.
result = qa_chain.invoke({"query": question})
print("检索问答链回答 question 的结果：")
print(result["result"])

# Answer with the LLM alone, for comparison against the retrieval-backed reply.
# `invoke` likewise replaces the deprecated direct call on the model.
result_2 = llm.invoke(question)
print("大模型回答 question 的结果：")
print(result_2)

  warn_deprecated(


检索问答链回答 question 的结果：
你的作文很好地表达了你的观点，即面试是招聘员工的可靠方法。你提供了充分的理由来支持你的立场，包括面试可以帮助招聘者了解应聘者的个性和解决问题的能力。你同时提到了其他招聘方法的不足之处，如简历和求职信可能不真实，而且可能存在作弊行为。你的结论明确有力，强调了面试仍然是招聘员工的最佳方法。

总体而言，你的作文结构清晰，论点明确，语言流畅。但是，有一些地方可以改进。首先，你的作文有一些语法错误，如“facial mannerisms”应该写成“facial expressions”，“the use of only certificates”应该写成“the use of only certificates”等等。其次，你的作文有些地方可以更加具体和详细。例如，你可以提供更多的例子来说明面试如何帮助招聘者了解应聘者的能力和特质，或者面试如何帮助招聘者判断应聘者的压力承受能力和创新思维能力。最后，你可以更加深入地讨论其他招聘方法的优缺点，以使你的论点更加全面和有力。

总的来说，你的作文表现出了对主题的深入理解和清晰的表达能力。通过一些修改和改进，你的作文将会更加出色。


  warn_deprecated(


大模型回答 question 的结果：
在这篇文章中，你提出了自己的观点，即面试是招聘员工最可靠的方法。你的论点主要集中在以下几点：

1. **面试帮助招聘者了解候选人的性格和能力**：你提到了面试可以揭示候选人的沟通技能、团队合作能力和解决问题的能力，这对于某些职位（如销售人员）非常重要。

2. **面试可以评估候选人的应变能力**：在面试中，候选人可能会面临压力情况，而他们的反应可以显示出他们是否能做出正确的决策，这对公司的发展至关重要。

3. **其他方法的不足**：你指出了简历、求职信等书面材料可能不真实，因为人们可能会为了获得职位而撒谎。

4. **结论和建议**：你总结了自己的观点，并建议招聘者应只使用面试作为招聘员工的方法。

总体而言，你的文章结构清晰，论点明确，并且提供了合理的理由来支持你的观点。然而，为了进一步提高你的文章质量，以下是一些建议：

- **举例支持观点**：你可以在文章中提供更多实际的例子来支持你的观点，这将使你的论点更加有说服力。
- **讨论其他方法的优点**：虽然你提到了其他方法的缺点，但你也可以简要讨论它们在某些情况下的优点，以展现你的全面思考。
- **解决可能的反驳**：预见并解决可能的反驳观点可以增加文章的说服力。例如，你可以提到虽然书面材料可能不真实，但它们仍然可以作为面试前的筛选工具。
- **结尾强化观点**：在文章的结尾，你可以再次强调面试作为招聘最可靠方法的重要性，并鼓励招聘者坚持这一方法。

通过这些改进，你的文章将更加全面、有说服力。继续练习写作，相信你的文章评分会更高。


In [None]:
# Ask PyTorch to release the GPU memory it is caching but no longer using.
# NOTE(review): memory still referenced by live objects (e.g. `llm`) is presumably
# not released by this call -- delete those references first if that is the goal.
torch.cuda.empty_cache()
