Skip to content

Commit 84a6a6e

Browse files
MSCetin37okhleif-10pre-commit-ci[bot]lkk12014402chensuyue
authored
Adding URL summary option to DocSum Gradio-UI (#1248)
Signed-off-by: okhleif-IL <omar.khleif@intel.com> Co-authored-by: okhleif-IL <omar.khleif@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com> Co-authored-by: chen, suyue <suyue.chen@intel.com> Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
1 parent 89a7f9e commit 84a6a6e

File tree

2 files changed

+60
-1
lines changed

2 files changed

+60
-1
lines changed

DocSum/ui/gradio/docsum_ui_gradio.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
import json
77
import logging
88
import os
9+
from urllib.parse import urlparse
910

1011
import gradio as gr
1112
import requests
1213
import uvicorn
1314
from fastapi import FastAPI
14-
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
15+
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredURLLoader
1516

1617
# Configure logging
1718
logging.basicConfig(level=logging.INFO)
@@ -91,6 +92,42 @@ def read_video_file(self, file):
9192
base64_str = self.encode_file_to_base64(file)
9293
return self.generate_summary(base64_str, document_type="video")
9394

95+
def is_valid_url(self, url):
    """Check whether ``url`` is an absolute http(s) URL.

    Args:
        url: The candidate url. Anything other than a string is rejected.

    Returns:
        bool: True if the url parses with an ``http`` or ``https`` scheme and
        a non-empty network location (host), False otherwise.
    """
    # urlparse raises AttributeError/TypeError (not ValueError) for inputs
    # such as integers, so reject non-strings before parsing.
    if not isinstance(url, str):
        return False

    try:
        result = urlparse(url)
    except ValueError:
        # Raised for some malformed inputs, e.g. an unbalanced IPv6 bracket.
        return False

    # urlparse lower-cases the scheme, so a plain membership test is enough.
    return result.scheme in ("http", "https") and bool(result.netloc)
101+
102+
def read_url(self, url):
    """Read and process the content of a url.

    Args:
        url: The url to be read as a document.

    Returns:
        str: The text of the first document loaded from the url, or an error
        message if the url is invalid, cannot be loaded, or yields no content.
    """
    self.page_content = ""

    # URLs pasted into a text box often carry stray whitespace or a newline.
    if isinstance(url, str):
        url = url.strip()

    logger.info(">>> Reading url: %s", url)
    if not self.is_valid_url(url=url):
        msg = f"Invalid URL '{url}'. Make sure the link provided is a valid URL"
        logger.error(msg)
        return msg

    # Append the url to no_proxy only once, so repeated requests do not grow
    # this process-global variable without bound.
    # NOTE(review): no_proxy matching is normally host-based, so a full url
    # entry may have no effect -- confirm whether the hostname should be
    # added instead.
    no_proxy_entries = [entry for entry in os.environ.get("no_proxy", "").split(",") if entry]
    if url not in no_proxy_entries:
        no_proxy_entries.append(url)
        os.environ["no_proxy"] = ",".join(no_proxy_entries)

    try:
        documents = UnstructuredURLLoader([url]).load()
    except Exception as e:
        msg = f"There was an error trying to read '{url}' --> '{e}'\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
        logger.error(msg)
        # Report the failure to the caller instead of returning empty content.
        return msg

    # The loader may return no documents (for example when it skips a url it
    # failed to fetch); report that explicitly rather than via an IndexError.
    if not documents:
        msg = f"No content could be loaded from '{url}'.\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
        logger.error(msg)
        return msg

    # A single url was requested, so the first document holds the page text.
    self.page_content = documents[0].page_content
    return self.page_content
130+
94131
def generate_summary(self, doc_content, document_type="text"):
95132
"""Generate a summary for the given document content.
96133
@@ -201,6 +238,25 @@ def render(self):
201238
)
202239
submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])
203240

241+
with gr.Blocks() as url_ui:
242+
# URL text UI
243+
with gr.Row():
244+
with gr.Column():
245+
input_text = gr.TextArea(
246+
label="Please paste a URL for summarization",
247+
placeholder="Paste a URL for the information you need to summarize",
248+
)
249+
submit_btn = gr.Button("Generate Summary")
250+
with gr.Column():
251+
generated_text = gr.TextArea(
252+
label="Text Summary", placeholder="Summarized text will be displayed here"
253+
)
254+
submit_btn.click(
255+
lambda input_text: self.generate_summary(self.read_url(input_text)),
256+
inputs=input_text,
257+
outputs=generated_text,
258+
)
259+
204260
# File Upload UI
205261
file_ui = self.create_upload_ui(
206262
label="Please upload a document (.pdf, .doc, .docx)",
@@ -232,6 +288,8 @@ def render(self):
232288
audio_ui.render()
233289
with gr.TabItem("Upload Video"):
234290
video_ui.render()
291+
with gr.TabItem("Enter URL"):
292+
url_ui.render()
235293

236294
return self.demo
237295

DocSum/ui/gradio/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ numpy==1.26.4
66
opencv-python==4.10.0.82
77
Pillow==10.3.0
88
pypdf
9+
unstructured

0 commit comments

Comments
 (0)