In [1]:
# import libs
import torch
import json
import glob
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_core.documents import Document
from langchain.vectorstores import Chroma as Vectorstore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from BCEmbedding import EmbeddingModel
from pathlib import Path
from pprint import pprint
from LLM import InternLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the data
# Load the pre-processed training set from disk and parse it as JSON.
# The encoding is pinned to UTF-8 so the load does not depend on the platform's
# default locale encoding (e.g. GBK/cp1252 on Windows).
file_path = Path('./data/train_processed.json')
json_data = json.loads(file_path.read_text(encoding='utf-8'))

In [3]:
# Split data
# Break the parsed JSON into small chunks and wrap each chunk in a Document.
chunk_size = 50  # passed to the splitter as max_chunk_size
splitter = RecursiveJsonSplitter(
    max_chunk_size=chunk_size,
)
docs = splitter.create_documents(texts=json_data)

In [4]:
# Embedding model
# Name the model explicitly instead of relying on the library default: the persisted
# vector store is only valid for the model that produced its vectors, so a silent
# change of default would break retrieval. This is the model the load log reports
# for the default configuration.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

04/21/2024 16:13:39 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
04/21/2024 16:13:45 - [INFO] -sentence_transformers.SentenceTransformer->>>    Use pytorch device_name: cuda


In [5]:
# Create the vector database
# One-time build step, kept commented out so the (potentially slow) embedding of every
# chunk in `docs` is not repeated on each run. Uncomment and run it when the directory
# below does not exist yet or the source data / embedding model has changed.
# persist_directory = './data_base/vector_db/chroma'
# vectordb = Vectorstore.from_documents(
#     documents=docs, 
#     embedding=embeddings,
#     persist_directory=persist_directory)
# vectordb.persist()

In [6]:
# Open the persisted Chroma store, building it on first use.
persist_directory = './data_base/vector_db/chroma'
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

# The build cell is commented out, so on a fresh checkout the store opened above is
# empty and the retriever would silently return no context. In that case index the
# split documents once, using the same call as the commented-out build cell.
# NOTE(review): no explicit persist() here — assumes a chromadb version that writes to
# persist_directory automatically (0.4+); confirm against the installed version.
if not vectordb.get(limit=1)["ids"]:
    vectordb = Vectorstore.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

04/21/2024 16:13:47 - [INFO] -chromadb.telemetry.product.posthog->>>    Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [7]:
# Retriever
# Currently unused: the QA chain is constructed with vectordb.as_retriever() directly.
# Kept as a reference for overriding the number of retrieved documents.
# retriever = vectordb.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 documents

In [8]:
# Load model
# InternLM is the project-local LLM wrapper imported from LLM.py.
llm = InternLM()

# Smoke test: ask the model "who are you" and display the reply as the cell output.
# `invoke` is used instead of the deprecated `predict` helper.
llm.invoke("你是谁")

正在从本地加载模型...


Loading checkpoint shards: 100%|██████████| 8/8 [00:15<00:00,  1.89s/it]


完成本地模型的加载


  warn_deprecated(


'你好！我是一个名叫书生·浦语的AI助手，由上海人工智能实验室开发。我致力于通过语言交流提供帮助，无论是解答问题、提供建议，还是进行简单的对话。我能够理解并使用中文和英文进行交流。如果你有任何问题或需要帮助，请随时告诉我，我会尽力为你提供帮助。'

In [18]:
# Prompt template
# The text below is sent to the model verbatim. It contains two placeholders,
# {question} (the user's request) and {context} (the retrieved reference text).
template = """你是一个雅思作文小助手，需要帮用户按照雅思官方标准（打分+评价），满分1-9分批改他们的作文。
评分的准则包含（
    1.任务完成度
    2.连贯与衔接
    3.词汇丰富度
    4.语法广度和准确性
）
参考以下上下文为模板来批改用户的问题。如果你不知道答案，就说你不知道。总是使用中文回答。
问题: {question}
可参考的上下文：
···
{context}
···
如果给定的上下文无法让你做出回答，请回答你不知道。
有用的回答:"""

# Build a LangChain PromptTemplate with two variables, `context` and `question`.
# At call time they are filled with the retrieved document snippets and the user's
# question respectively.
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [20]:
# Chat
# Retrieval QA chain: uses the vector store's default retriever and the custom
# prompt above, and asks for the retrieved source documents to be returned too.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

In [21]:
# Answer quality of the retrieval QA chain
# Sample input: an essay prompt ("Question: ...") and a candidate essay ("Essay: ..."),
# followed by a request in Chinese asking for the essay to be graded.
question = """Question: Interviews form the basic criteria for most large companies. However, some people think that the interview is not a reliable method of choosing whom to employ and there are other better methods. To what extent do you agree or disagree? Essay: It is believed by some experts that the traditional approach of recruiting candidates which is interviewing is the best way, whereas others think different methods such as exams writing, CVs, cover letters or application letters and many more are good. I strongly agree with the statement, "interview is the most reliable approach to recruit workers" because this method assists the recruiters to know the person and his ability to do the work and their problem-solving abilities. 

To begin with, an interview enables the  recruiter to know the kind of person he or she is recruiting. It helps the employer to see the personality traits of the employee such as how he answers questions, his facial mannerisms and also his communication skills, that is, whether introvert or extrovert, also his teamwork skill is measured during the dialogue. For instance, jobs like sales personnel require good communication skills to be able to do the work effectively and efficiently. So interviews allow the manager to assess whether or not the applicant qualifies for the job. 

Furthermore, recruiters also assess the applicant's ability to solve problems when they arise. A good idea generated or how one handles situations can bring great development to the company. For instance, pressure can put fear into an employee which can make him make a wrong decision that can bring loss to the company, while some too can take pressure in a calm action and make a good decision. 

On the other hand, other methods such as CVs, cover letters, the use of only certificates and many more are not a suitable step to recruit an applicant due to the fact that it does not allow the recruiter to see the full potential of the candidate. Information found in the CV or cover letter may not be true because people lie to obtain what they desire. In the same way, a candidate can also lie to acquire the position. 

To sum up, I think an interview is still the most reliable practice of hiring employees rather than using other methods. So I suggest managers use only interviews as a means of sourcing workers for their companies. 请帮我的这一片作文打分"""
# Answer from the retrieval QA chain (retrieved context + LLM).
# `invoke` replaces the deprecated direct call `qa_chain({...})`; the returned dict
# still exposes the answer under the "result" key.
result = qa_chain.invoke({"query": question})
print("检索问答链回答 question 的结果：")
print(result["result"])

# Answer from the bare LLM, for comparison
# `invoke` replaces the deprecated direct call `llm(question)` and returns the text.
result_2 = llm.invoke(question)
print("大模型回答 question 的结果：")
print(result_2)

检索问答链回答 question 的结果：
根据雅思作文评分标准，我会从以下几个方面来对这篇作文进行评分：

1. **任务完成度**：
   - 作文题目要求对“面试是否是最可靠的选择员工的途径”这一观点进行讨论，作者明确表达了同意这一观点的态度，并提供了相关的理由和例子，符合题目要求。

2. **连贯与衔接**：
   - 作者通过逻辑清晰的段落结构展开论述，每个段落都有一个明确的主题句，并使用过渡词和短语（如“To begin with”，“Furthermore”，“On the other hand”等）来连接段落，保持了文章的连贯性。

3. **词汇丰富度**：
   - 作者使用了多种词汇来表达观点，包括一些高级词汇（如“personality traits”，“trait”，“confidence”等），这表明作者具有足够的词汇量来表达自己的思想。

4. **语法广度和准确性**：
   - 作者在句子结构和语法方面表现良好，没有明显的语法错误，这表明作者对英语语法有较好的掌握。

综合以上四个方面的评估，我会给这篇作文打分：**8.0**。作者清晰地表达了自己的观点，并以逻辑清晰的方式提供了支持观点的理由和例子。然而，有一些可以改进的地方，比如可以增加更多的实例来支持观点，或者更深入地探讨其他选择方法的优缺点。此外，在文章中也可以更广泛地使用同义词和更复杂的句子结构，以进一步提高词汇和语法的多样性。

希望这个评分和反馈对您有所帮助。如果您有其他问题或需要进一步的帮助，请随时告诉我。
大模型回答 question 的结果：
Your essay presents a clear argument in favor of interviews as the most reliable method for recruiting employees. You've effectively highlighted the advantages of this approach, such as assessing a candidate's personality traits, problem-solving abilities, and communication skills. However, there are a few areas w

In [12]:
# Ask PyTorch to release cached GPU memory it is no longer using.
# NOTE(review): memory held by live objects (e.g. the loaded `llm`) is not freed by this call.
torch.cuda.empty_cache()
