Skip to content

Commit

Permalink
Merge pull request #66 from michaelthwan/refactor
Browse files Browse the repository at this point in the history
Restructure classes and refactoring
  • Loading branch information
michaelthwan committed Mar 11, 2023
2 parents 54dbfc5 + f9b3fe3 commit f810f77
Show file tree
Hide file tree
Showing 15 changed files with 207 additions and 202 deletions.
Binary file modified img/architecture_roadmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 8 additions & 8 deletions src/BingService.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
class BingService:
def __init__(self, config):
self.config = config
extract_svc = self.config.get('bing_search').get('text_extract')
extract_svc = self.config.get('source_service').get('bing_search').get('text_extract')
if extract_svc == 'trafilatura':
self.txt_extract_svc = TrafilaturaSvc()
elif extract_svc == 'beautifulsoup':
self.txt_extract_svc = BeautifulSoupSvc()

@storage_cached('bing_search_website', 'query')
def call_bing_search_api(self, query: str) -> pd.DataFrame:
logger.info("BingService.call_bing_search_api. query: " + query)
subscription_key = self.config.get('bing_search').get('subscription_key')
endpoint = self.config.get('bing_search').get('end_point') + "/v7.0/search"
@storage_cached('bing_search_website', 'search_text')
def call_bing_search_api(self, search_text: str) -> pd.DataFrame:
logger.info("BingService.call_bing_search_api. query: " + search_text)
subscription_key = self.config.get('source_service').get('bing_search').get('subscription_key')
endpoint = self.config.get('source_service').get('bing_search').get('end_point') + "/v7.0/search"
mkt = 'en-US'
params = {'q': query, 'mkt': mkt}
params = {'q': search_text, 'mkt': mkt}
headers = {'Ocp-Apim-Subscription-Key': subscription_key}

try:
Expand All @@ -37,7 +37,7 @@ def call_bing_search_api(self, query: str) -> pd.DataFrame:
columns = ['name', 'url', 'snippet']
website_df = pd.DataFrame(response.json()['webPages']['value'])[columns]
website_df['url_id'] = website_df.index + 1
website_df = website_df[:self.config.get('bing_search').get('result_count')]
website_df = website_df[:self.config.get('source_service').get('bing_search').get('result_count')]
except Exception as ex:
raise ex
return website_df
Expand Down
65 changes: 0 additions & 65 deletions src/FrontendService.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,27 +94,6 @@ def get_explain_json(text, word_color_dict):
source_explain_json = get_explain_json(source_text, word_color_dict)
return response_explain_json, source_explain_json

in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
in_scope_source_df.sort_values('docno', inplace=True)
source_text_list = []
source_json = []
source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True)
for index, row in source_url_df.iterrows():
url_text = ''
url_text += f"[{row['url_id']}] {row['url']}\n"

for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows():
url_text += f" {row['text']}\n"

source_text_list.append(url_text)

domain_name = urlparse(row['url']).netloc.replace('www.', '')
source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet']))
source_text = ''.join(sorted(source_text_list))

source_json = sorted(source_json, key=lambda x: x['footnote'])
return source_json, source_text

response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
response_json = get_response_json(response_text)
source_json, source_text = get_source_json(in_scope_source_df)
Expand All @@ -125,47 +104,3 @@ def get_explain_json(text, word_color_dict):
'response_explain_json': response_explain_json,
'source_explain_json': source_explain_json
}


if __name__ == '__main__':
paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
paragraph2 = """
Source (1)
ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
Today's research release of ChatGPT is the latest step in OpenAI's iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
Source (3)
ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data.
But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware.
ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn.
But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader.
ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on.
ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way".
ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now" messages during down times.
ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
"""

# common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
# # print(common_stems)
# for common_stem in common_stems:
# print(common_stem)

# text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
# "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
# "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
# "is fine-tuned from a model in the gpt-3.5 series, which finished training",
# "sensitive to tweaks to the input phrasing or attempting the same prompt",
# "is fine-tuned from a model in the gpt-3.5 series, which finished",
# "sensitive to tweaks to the input phrasing or attempting the same",
# "sensitive to tweaks to the input phrasing or attempting the",
# "is fine-tuned from a model in the gpt-3.5 series, which"]
# text_list = FrontendService.remove_substrings(text_list)
# for text in text_list:
# print(text)

response_text = "is fine-tuned from a gpt-3.5 series"
split_list = FrontendService.split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
for sentence in split_list:
print(sentence)
14 changes: 7 additions & 7 deletions src/LLMService.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def clean_response_text(self, response_text: str):

def get_prompt(self, search_text: str, gpt_input_text_df: pd.DataFrame):
logger.info(f"OpenAIService.get_prompt. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
is_use_source = self.config.get('search_option').get('is_use_source')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
is_use_source = self.config.get('source_service').get('is_use_source')
if is_use_source:
prompt_engineering = f"\n\nAnswer the question '{search_text}' using above information with about 100 words:"
prompt = ""
Expand All @@ -43,7 +43,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows():
context_str += f"{row['text']}\n"
context_str += "\n"
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
context_str = context_str[:prompt_length_limit]
prompt = \
f"""
Expand All @@ -58,7 +58,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
return prompt

def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
if not self.config.get('search_option').get('is_use_source'):
if not self.config.get('source_service').get('is_use_source'):
prompt = \
f"""
Instructions: Write a comprehensive reply to the given query.
Expand All @@ -75,7 +75,7 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == row_url['url_id']) & gpt_input_text_df['in_scope']].iterrows():
context_str += f"{row['text']}\n"
context_str += "\n\n"
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
context_str = context_str[:prompt_length_limit]
prompt = \
f"""
Expand All @@ -98,14 +98,14 @@ def call_api(self, prompt):
class OpenAIService(LLMService):
def __init__(self, config):
super().__init__(config)
open_api_key = config.get('openai_api').get('api_key')
open_api_key = config.get('llm_service').get('openai_api').get('api_key')
if open_api_key is None:
raise Exception("OpenAI API key is not set.")
openai.api_key = open_api_key

@storage_cached('openai', 'prompt')
def call_api(self, prompt: str):
openai_api_config = self.config.get('openai_api')
openai_api_config = self.config.get('llm_service').get('openai_api')
model = openai_api_config.get('model')
logger.info(f"OpenAIService.call_api. model: {model}, len(prompt): {len(prompt)}")

Expand Down
44 changes: 44 additions & 0 deletions src/NLPUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,47 @@ def split_with_delimiters(string, delimiter_list):
if start < len(string):
result.append(string[start:])
return result


if __name__ == '__main__':
    # Ad-hoc manual driver for this module's text utilities — not part of the
    # library API; run the file directly to eyeball the output.
    # paragraph1/paragraph2 are sample inputs retained for the commented-out
    # experiments below (longest_common_word_sequences / remove_substrings).
    paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
    # NOTE: the string content below is sample data scraped from web sources;
    # it intentionally keeps its original bytes (including encoding artifacts).
    paragraph2 = """
Source (1)
ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
Todayâs research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
Source (3)
ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data.
But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware.
ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn.
But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader.
ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on.
ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way".
ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times.
ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
"""

    # Earlier experiments, kept for reference (they reference FrontendService,
    # which is not imported here — enable only after adding that import):
    # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
    # # print(common_stems)
    # for common_stem in common_stems:
    #     print(common_stem)

    # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
    #              "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished training",
    #              "sensitive to tweaks to the input phrasing or attempting the same prompt",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished",
    #              "sensitive to tweaks to the input phrasing or attempting the same",
    #              "sensitive to tweaks to the input phrasing or attempting the",
    #              "is fine-tuned from a model in the gpt-3.5 series, which"]
    # text_list = FrontendService.remove_substrings(text_list)
    # for text in text_list:
    #     print(text)

    # Quick manual check of split_with_delimiters: presumably splits the text
    # while keeping each delimiter substring as its own token — confirm against
    # the function definition above.
    response_text = "is fine-tuned from a gpt-3.5 series"
    split_list = split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
    for sentence in split_list:
        print(sentence)
Loading

0 comments on commit f810f77

Please sign in to comment.