-
Notifications
You must be signed in to change notification settings - Fork 95
/
prompts.py
340 lines (279 loc) · 12 KB
/
prompts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import re
import pathway as pw
@pw.udf
def prompt_short_qa(
    query: str, docs: list[pw.Json] | list[str], additional_rules: str = ""
) -> str:
    """
    Generate a RAG prompt with given context.

    Specifically for getting short and concise answers.
    Given a question, and list of context documents, generates prompt to be sent to the LLM.
    Suggests specific formatting for yes/no questions and dates.

    Args:
        query: Question or prompt to be answered.
        docs: List of documents to be passed to the LLM as context. pw.Json can be wrapped around
            dict, string or any other type as document.
        additional_rules: Optional parameter for rest of the string args that may include
            additional instructions or information.

    Returns:
        Prompt containing question and relevant docs.
    """
    # Each document is followed by an empty piece so that the joined context
    # has a blank line after every doc. (Fixed: the previous loop used
    # enumerate() but never used the index.)
    context_pieces = []
    for doc in docs:
        context_pieces.append(str(doc))
        context_pieces.append("")
    context_str = "\n".join(context_pieces)

    prompt = (
        "Please provide an answer based solely on the provided sources. "
        "Keep your answer concise and accurate. Make sure that it starts with an expression in standardized format. "
        "Only respond without any explanation, for example questions asking for date should be answered in strictly date format: `05 January 2011`. "  # noqa: E501
        "Yes or No questions should be responded with simple `Yes` or `No` and so on. "
        "If question cannot be inferred from documents SAY `No information found`. "
    )

    prompt += additional_rules + " "

    prompt += (
        "Now it's your turn. Below are several sources of information:"
        "\n------\n"
        f"{context_str}"
        "\n------\n"
        f"Query: {query}\n"
        "Answer:"
    )
    return prompt
@pw.udf
def prompt_qa(
    query: str,
    docs: list[pw.Json] | list[str],
    information_not_found_response: str = "No information found.",
    additional_rules: str = "",
) -> str:
    """
    Generate RAG prompt with given context.

    Given a question and list of context documents, generates prompt to be sent to the LLM.

    Args:
        query: Question or prompt to be answered.
        docs: List of documents to be passed to the LLM as context. pw.Json can be wrapped around
            dict, string or any other type as document.
        information_not_found_response: Response LLM should generate in case answer cannot
            be inferred from the given documents.
        additional_rules: Optional parameter for rest of the string args that may include
            additional instructions or information.

    Returns:
        Prompt containing question and relevant docs.

    >>> import pandas as pd
    >>> import pathway as pw
    >>> from pathway.xpacks.llm import prompts
    >>> t = pw.debug.table_from_pandas(pd.DataFrame([{"question": "What is rag?"}]))
    >>> docs = [{"text": "Pathway is a high-throughput, low-latency data processing framework that handles live data & streaming for you."},
    ... {"text": "RAG stands for Retrieval Augmented Generation."}]
    >>> t_with_docs = t.select(*pw.this, docs=docs)
    >>> r = t_with_docs.select(prompt=prompts.prompt_qa(pw.this.question, pw.this.docs))
    """  # noqa: E501
    # Fixed: the previous loop used enumerate() but never used the index —
    # the documents are simply stringified and joined with blank lines.
    context_str = "\n\n".join(str(doc) for doc in docs)

    prompt = (
        "Please provide an answer based solely on the provided sources. "
        "Keep your answer concise and accurate. "
    )

    prompt += additional_rules + " "

    prompt += (
        f"If question cannot be inferred from documents SAY `{information_not_found_response}`. "
        "Now it's your turn. Below are several sources of information:"
        "\n------\n"
        f"{context_str}"
        "\n------\n"
        f"Query: {query}\n"
        "Answer:"
    )
    return prompt
# prompt for `answer_with_geometric_rag_strategy`, it is the same as in the research project
@pw.udf
def prompt_qa_geometric_rag(
    query: str,
    docs: list[pw.Json] | list[str],
    information_not_found_response: str = "No information found.",
    additional_rules: str = "",
    strict_prompt: bool = False,  # instruct LLM to return json for local models, improves performance
) -> str:
    """
    Generate a RAG prompt with numbered sources and citation instructions.

    Args:
        query: Question or prompt to be answered.
        docs: Context documents. Plain strings are used as-is; otherwise the
            document content is read from the ``'text'`` key.
        information_not_found_response: Response the LLM should give when the
            answer cannot be inferred from the documents.
        additional_rules: Optional additional instructions appended to the prompt.
        strict_prompt: If ``True``, instruct the LLM to respond in parsable JSON
            with the single key ``answer`` (helps smaller/local models).

    Returns:
        Prompt containing the numbered sources and the query.
    """
    # Number each source so the model can cite it as [i] in the answer.
    context_pieces = []
    for i, doc in enumerate(docs, 1):
        if isinstance(doc, str):
            context_pieces.append(f"Source {i}: {doc}")
        else:
            context_pieces.append(f"Source {i}: {doc['text']}")  # type: ignore
    context_str = "\n".join(context_pieces)

    if strict_prompt:
        # JSON-only variant with few-shot examples. Doubled braces ({{ }})
        # render as literal braces in the f-string.
        prompt = f"""
Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "{information_not_found_response}" Do not explain.
ONLY RESPOND IN PARSABLE JSON WITH THE ONLY KEY `answer`.
When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation.
Only cite a source when you are explicitly referencing it.
For example:
Given following sources and query
Example 1: "Source 1: The sky is red in the evening and blue in the morning.\nSource 2: Water is wet when the sky is red.
Query: When is water wet?
Response: {{"answer": "When the sky is red [2], which occurs in the evening [1]."}}
Example 2: "Source 1: LLM stands for Large language models.
Query: Who is the current pope?
Response: {{"answer": "{information_not_found_response}"}}
"""  # noqa
    else:
        prompt = f"""
Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "{information_not_found_response}" Do not answer in full sentences.
When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation.
Only cite a source when you are explicitly referencing it. For example:
"Source 1:
The sky is red in the evening and blue in the morning.
Source 2:
Water is wet when the sky is red.\n
Query: When is water wet?
Answer: When the sky is red [2], which occurs in the evening [1]."
"""  # noqa

    prompt += additional_rules + " "

    if strict_prompt:  # further instruction is needed for smaller models
        prompt += (
            "\n------\n"
            f"{context_str}"
            f"Query: {query}\n"
            "ONLY RESPOND IN PARSABLE JSON WITH THE ONLY KEY `answer` containing your response. "
        )
        response_str = "Response"
    else:
        prompt += (
            "Now it's your turn. "
            "\n------\n"
            f"{context_str}"
            "\n------\n"
            f"Query: {query}\n"
        )
        response_str = "Answer"

    prompt += f"{response_str}:"
    return prompt
@pw.udf
def prompt_summarize(text_list: list[str]) -> str:
    """
    Generate a summarization prompt with the list of texts.

    Args:
        text_list: List of text documents.

    Returns:
        Summarized text.
    """
    # Documents are concatenated line by line before being embedded in the prompt.
    joined_docs = "\n".join(text_list)
    return f"""Given a list of documents, summarize them in few sentences \
while preserving important points and entities.
Documents: {joined_docs}
Summary:"""
@pw.udf
def prompt_query_rewrite_hyde(query: str) -> str:
    """
    Generate prompt for query rewriting using the HyDE technique.

    Args:
        query: Original search query or user prompt.

    Returns:
        Transformed query.
    """
    # HyDE: ask for hypothetical answers whose embeddings match real documents
    # better than the raw question would.
    return f"""Write 4 responses to answer the given question with hypothetical data.
Try to include as many key details as possible.
Question: `{query}`.
Responses:"""
@pw.udf
def prompt_query_rewrite(query: str, *additional_args: str) -> str:
    """
    Generate prompt for query rewriting.

    Prompt function to generate and augment index search queries using important names,
    entities and information from the given input. Generates three transformed queries
    concatenated with comma to improve the search performance.

    Args:
        query: Original search query or user prompt.
        additional_args: Additional information that may help LLM in generating the query.

    Returns:
        Transformed query.
    """
    # Assemble the prompt from parts and join once at the end.
    parts: list[str] = [
        f"""Given a question that will be used to retrieve similar documents for RAG application.
Rewrite question to be better usable in retrieval search.
Use important entities, words that may be related to query and other entity names.
Your response should be three queries based on the question provided, separated by comma.
Question: `{query}`
"""
    ]
    if additional_args:
        parts.append(
            """If any of the provided sections are related to question, write section name in the query as well.
Here is additional info that you can include in search: """
        )
        parts.extend(f" `{arg}`\n" for arg in additional_args)
    parts.append("Rewritten query:")
    return "".join(parts)
@pw.udf
def prompt_citing_qa(
    query: str, docs: list[pw.Json], additional_rules: str = ""
) -> str:
    """
    Generate a RAG prompt that asks for an answer with numbered source citations.

    Args:
        query: Question or prompt to be answered.
        docs: Context documents; each document's content is read from the
            ``'text'`` key.
        additional_rules: Optional additional instructions appended to the prompt.

    Returns:
        Prompt containing the query and the numbered sources.
    """
    # Render every document as a "# Source i" section followed by a blank line.
    sections: list[str] = []
    for idx, doc in enumerate(docs, 1):
        sections.extend((f"# Source {idx}", doc["text"], ""))  # type: ignore
    context_str = "\n".join(sections)

    prompt = (
        "Please provide an answer based solely on the provided sources. "
        "When referencing information from a source, "
        "cite the appropriate source(s) using their corresponding numbers. "
        "Every answer should include at least one source citation. "
        "Only cite a source when you are explicitly referencing it. "
        "If exists, mention specific article/section header you use at the beginning of answer, such as '4.a Client has rights to...'. "  # noqa: E501
        "Article headers may or may not be in docs, dont mention it if there is none. "
        # NOTE: a worked few-shot example was drafted but is disabled for now:
        # "If none of the sources are helpful, you should indicate that. "
        # "For example:\n"
        # "# Source 1:\n"
        # "4.a The sky is red in the evening and blue in the morning.\n"
        # "# Source 2:\n"
        # "5.c Water is wet when the sky is red.\n"
        # "Query: When is water wet?\n"
        # "Answer: *5.c* Water will be wet when the sky is red [2], "
        # "which occurs in the evening [1].\n"
        # "If several citations are used, separate them with comma such as, '*5.c,4.a*'\n"
        "If question cannot be inferred from documents SAY `No information found`. "
    )

    prompt += additional_rules + " "

    prompt += (
        "Now it's your turn. Below are several numbered sources of information:"
        "\n------\n"
        f"{context_str}"
        "\n------\n"
        f"Query: {query}\n"
        "Answer:"
    )
    return prompt
@pw.udf
def parse_cited_response(response_text, docs):
    """
    Split an LLM response into the answer text and the cited documents.

    Citations are 1-based ``[n]`` markers in the response; an optional leading
    ``*header*`` span (e.g. ``*5.c*``) is stripped from the answer and prepended
    in bold to the first cited document's text.

    Args:
        response_text: Raw LLM answer, possibly containing ``[n]`` citations
            and a ``*...*`` header span.
        docs: Documents that were given to the LLM as sources; each is expected
            to expose its content under the ``'text'`` key.

    Returns:
        Tuple of (cleaned response text, list of cited documents).
    """
    # Collect cited 1-based indices, convert to 0-based. sorted() makes the
    # order deterministic (set iteration order is not), and the raw string
    # replaces the previous non-raw pattern that needed a noqa: W605.
    cited_indices = sorted(
        int(cite[1:-1]) - 1 for cite in set(re.findall(r"\[\d+\]", response_text))
    )
    # Fixed: the previous filter `if i in cited_docs` was tautological; a bounds
    # check is needed so hallucinated citation numbers don't crash (or wrap
    # around via negative indexing).
    citations = [docs[i] for i in cited_indices if 0 <= i < len(docs)]

    first_star = response_text.find("*")
    start_index = first_star + 1
    end_index = response_text.find("*", start_index)

    cleaned_citations = []
    if (
        # Fixed: `start_index != -1` was always true because of the `+ 1`;
        # check the find() result itself. Also guard against an empty
        # `citations` list before touching citations[0].
        first_star != -1
        and end_index != -1
        and citations
    ):  # doing this for the GIF, we need a better way to do this, TODO: redo
        cited = response_text[start_index:end_index]
        response_text = response_text[end_index:].strip()

        # Normalize the cited-header span: drop spaces, collapse duplicate
        # commas, then put each citation on its own line.
        # NOTE(review): the final replace targets a space-like character that
        # should already be gone — possibly a non-breaking space originally;
        # kept for safety.
        cited = (
            cited.replace(" ", "")
            .replace(",,", ",")
            .replace(",", ",\n")
            .replace(" ", "\n")
        )

        text_body = citations[0]["text"]
        new_text = f"<b>{cited}</b>\n\n".replace("\n\n\n", "\n") + text_body
        citations[0]["text"] = new_text
        cleaned_citations.append(citations[0])

        for doc in citations[1:]:
            text_body = doc["text"]  # TODO: unformat and clean the text
            doc["text"] = text_body
            cleaned_citations.append(doc)

    return response_text, cleaned_citations