Skip to content

Commit

Permalink
Merge pull request #66 from michaelthwan/refactor
Browse files Browse the repository at this point in the history
Restructure classes and refactoring
  • Loading branch information
michaelthwan committed Mar 11, 2023
2 parents 54dbfc5 + f9b3fe3 commit f810f77
Show file tree
Hide file tree
Showing 15 changed files with 207 additions and 202 deletions.
Binary file modified img/architecture_roadmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 8 additions & 8 deletions src/BingService.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
class BingService:
def __init__(self, config):
self.config = config
extract_svc = self.config.get('bing_search').get('text_extract')
extract_svc = self.config.get('source_service').get('bing_search').get('text_extract')
if extract_svc == 'trafilatura':
self.txt_extract_svc = TrafilaturaSvc()
elif extract_svc == 'beautifulsoup':
self.txt_extract_svc = BeautifulSoupSvc()

@storage_cached('bing_search_website', 'query')
def call_bing_search_api(self, query: str) -> pd.DataFrame:
logger.info("BingService.call_bing_search_api. query: " + query)
subscription_key = self.config.get('bing_search').get('subscription_key')
endpoint = self.config.get('bing_search').get('end_point') + "/v7.0/search"
@storage_cached('bing_search_website', 'search_text')
def call_bing_search_api(self, search_text: str) -> pd.DataFrame:
logger.info("BingService.call_bing_search_api. query: " + search_text)
subscription_key = self.config.get('source_service').get('bing_search').get('subscription_key')
endpoint = self.config.get('source_service').get('bing_search').get('end_point') + "/v7.0/search"
mkt = 'en-US'
params = {'q': query, 'mkt': mkt}
params = {'q': search_text, 'mkt': mkt}
headers = {'Ocp-Apim-Subscription-Key': subscription_key}

try:
Expand All @@ -37,7 +37,7 @@ def call_bing_search_api(self, query: str) -> pd.DataFrame:
columns = ['name', 'url', 'snippet']
website_df = pd.DataFrame(response.json()['webPages']['value'])[columns]
website_df['url_id'] = website_df.index + 1
website_df = website_df[:self.config.get('bing_search').get('result_count')]
website_df = website_df[:self.config.get('source_service').get('bing_search').get('result_count')]
except Exception as ex:
raise ex
return website_df
Expand Down
65 changes: 0 additions & 65 deletions src/FrontendService.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,27 +94,6 @@ def get_explain_json(text, word_color_dict):
source_explain_json = get_explain_json(source_text, word_color_dict)
return response_explain_json, source_explain_json

in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
in_scope_source_df.sort_values('docno', inplace=True)
source_text_list = []
source_json = []
source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True)
for index, row in source_url_df.iterrows():
url_text = ''
url_text += f"[{row['url_id']}] {row['url']}\n"

for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows():
url_text += f" {row['text']}\n"

source_text_list.append(url_text)

domain_name = urlparse(row['url']).netloc.replace('www.', '')
source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet']))
source_text = ''.join(sorted(source_text_list))

source_json = sorted(source_json, key=lambda x: x['footnote'])
return source_json, source_text

response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
response_json = get_response_json(response_text)
source_json, source_text = get_source_json(in_scope_source_df)
Expand All @@ -125,47 +104,3 @@ def get_explain_json(text, word_color_dict):
'response_explain_json': response_explain_json,
'source_explain_json': source_explain_json
}


if __name__ == '__main__':
paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
paragraph2 = """
Source (1)
ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
Today's research release of ChatGPT is the latest step in OpenAI's iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
Source (3)
ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data.
But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware.
ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn.
But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader.
ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on.
ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way".
ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now" messages during down times.
ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
"""

# common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
# # print(common_stems)
# for common_stem in common_stems:
# print(common_stem)

# text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
# "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
# "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
# "is fine-tuned from a model in the gpt-3.5 series, which finished training",
# "sensitive to tweaks to the input phrasing or attempting the same prompt",
# "is fine-tuned from a model in the gpt-3.5 series, which finished",
# "sensitive to tweaks to the input phrasing or attempting the same",
# "sensitive to tweaks to the input phrasing or attempting the",
# "is fine-tuned from a model in the gpt-3.5 series, which"]
# text_list = FrontendService.remove_substrings(text_list)
# for text in text_list:
# print(text)

response_text = "is fine-tuned from a gpt-3.5 series"
split_list = FrontendService.split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
for sentence in split_list:
print(sentence)
14 changes: 7 additions & 7 deletions src/LLMService.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def clean_response_text(self, response_text: str):

def get_prompt(self, search_text: str, gpt_input_text_df: pd.DataFrame):
logger.info(f"OpenAIService.get_prompt. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
is_use_source = self.config.get('search_option').get('is_use_source')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
is_use_source = self.config.get('source_service').get('is_use_source')
if is_use_source:
prompt_engineering = f"\n\nAnswer the question '{search_text}' using above information with about 100 words:"
prompt = ""
Expand All @@ -43,7 +43,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows():
context_str += f"{row['text']}\n"
context_str += "\n"
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
context_str = context_str[:prompt_length_limit]
prompt = \
f"""
Expand All @@ -58,7 +58,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
return prompt

def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
if not self.config.get('search_option').get('is_use_source'):
if not self.config.get('source_service').get('is_use_source'):
prompt = \
f"""
Instructions: Write a comprehensive reply to the given query.
Expand All @@ -75,7 +75,7 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == row_url['url_id']) & gpt_input_text_df['in_scope']].iterrows():
context_str += f"{row['text']}\n"
context_str += "\n\n"
prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
context_str = context_str[:prompt_length_limit]
prompt = \
f"""
Expand All @@ -98,14 +98,14 @@ def call_api(self, prompt):
class OpenAIService(LLMService):
def __init__(self, config):
super().__init__(config)
open_api_key = config.get('openai_api').get('api_key')
open_api_key = config.get('llm_service').get('openai_api').get('api_key')
if open_api_key is None:
raise Exception("OpenAI API key is not set.")
openai.api_key = open_api_key

@storage_cached('openai', 'prompt')
def call_api(self, prompt: str):
openai_api_config = self.config.get('openai_api')
openai_api_config = self.config.get('llm_service').get('openai_api')
model = openai_api_config.get('model')
logger.info(f"OpenAIService.call_api. model: {model}, len(prompt): {len(prompt)}")

Expand Down
44 changes: 44 additions & 0 deletions src/NLPUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,47 @@ def split_with_delimiters(string, delimiter_list):
if start < len(string):
result.append(string[start:])
return result


if __name__ == '__main__':
    # Ad-hoc manual driver for this module's text utilities — not part of the
    # library API; run the file directly to eyeball the output.
    # paragraph1/paragraph2 are sample inputs retained for the commented-out
    # experiments below (longest_common_word_sequences / remove_substrings).
    paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
    # NOTE: the string content below is sample data scraped from web sources;
    # it intentionally keeps its original bytes (including encoding artifacts).
    paragraph2 = """
Source (1)
ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
Todayâs research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
Source (3)
ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data.
But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware.
ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn.
But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader.
ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on.
ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way".
ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times.
ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
"""

    # Earlier experiments, kept for reference (they reference FrontendService,
    # which is not imported here — enable only after adding that import):
    # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
    # # print(common_stems)
    # for common_stem in common_stems:
    #     print(common_stem)

    # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
    #              "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished training",
    #              "sensitive to tweaks to the input phrasing or attempting the same prompt",
    #              "is fine-tuned from a model in the gpt-3.5 series, which finished",
    #              "sensitive to tweaks to the input phrasing or attempting the same",
    #              "sensitive to tweaks to the input phrasing or attempting the",
    #              "is fine-tuned from a model in the gpt-3.5 series, which"]
    # text_list = FrontendService.remove_substrings(text_list)
    # for text in text_list:
    #     print(text)

    # Quick manual check of split_with_delimiters: presumably splits the text
    # while keeping each delimiter substring as its own token — confirm against
    # the function definition above.
    response_text = "is fine-tuned from a gpt-3.5 series"
    split_list = split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
    for sentence in split_list:
        print(sentence)
Loading

0 comments on commit f810f77

Please sign in to comment.