In [1]:
import os
from io import BytesIO
import markdown2
import requests
from langgraph.constants import Send
from langgraph.graph import END, StateGraph
from LLM_get_folder import get_local_folder
from PIL import Image
from utilities.wordpress_tools import (
    get_news_urls,
    insert_keyword_url,
    post_wordpress_file,
    post_wordpress_post,
    set_news_url_flag,
    tags_to_IDs,
    tags_to_IDs_en,
    update_summary_qa,
    image_insert_fuc,
)
from schemas.schemas_publish import (
    OutlinesList,
    GraphState,
    SummaryOutput,
    MorePoints,
    MetaFormat,
)
from utilities.llm_wrapper import llm_wrapper_raw, llm_image_wrapper
from utilities.web_search_wrapper import web_search_wrapper
from utilities.web_loader_wrapper import web_loader_wrapper
from langfuse.decorators import observe

# Cap on how many generated topics topics_to_search fans out to the web_search node.
MAX_WEB_URL = 5
# Cap on how many successfully summarized search results web_search keeps per query.
MAX_QUERY_RESULT = 3

In [2]:
@observe
def summary_from_url(url):
    """
    Load a web page and ask the LLM for an English title and ~500-word summary.

    Args:
        url: Address of the page, handed to ``web_loader_wrapper``.

    Returns:
        The ``.parsed`` result of the LLM call (requested as ``SummaryOutput``)
        on success, the sentinel string ``"404"`` when an HTTP 403/404 error is
        raised, or ``None`` for any other failure.
    """
    try:
        content = web_loader_wrapper(url)
        summary_sys_prompt = """
        Task Description: You are a professional news summarizer. Based on the content of the webpage provided, create a news summary of \
            approximately 500 English words.The summary must be written in English, ensuring comprehensive coverage of the information.
        Specific Requirements:
        1. News Summary: Extract the core content of the news, ensuring the information is complete and coherent. \
            The length should be around 500 English words.
        2. Title Extraction: If the webpage already contains a title, extract it. If there is no title, summarize an appropriate title based on the content. \
            The title must be in English.
        3. Date Information: If the webpage includes a publication date, make sure to include this date in the news summary, \
            using a format that includes the year.
        4. Content Related to U.S. Universities: If the webpage mentions U.S. universities (such as Harvard University, Yale University, etc.), \
        ensure that any related information (e.g., connection to the event or the author) is included in the summary.
        """
        summary_user_prompt = f"""content: {content}"""
        response = llm_wrapper_raw(summary_sys_prompt, summary_user_prompt, SummaryOutput).parsed
    except requests.exceptions.HTTPError as e:
        # An HTTPError can be raised without an attached response object, so
        # read the status defensively instead of dereferencing e.response.
        status_code = getattr(e.response, "status_code", None)
        response = "404" if status_code in (403, 404) else None
    except Exception as e:
        # Best-effort: callers treat None as "skip this URL". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop the run.
        print(url, " --- summary failed: ", repr(e))
        response = None
    return response

In [3]:
@observe
def summary(state):
    """
    Graph entry node: summarize the article found at ``state["url"]``.

    When ``summary_from_url`` reports a failure (``None`` or ``"404"``), the URL
    is passed to ``set_news_url_flag`` and an ``Exception`` is raised.

    Returns:
        A dict with the article ``summary`` and a one-element ``documents`` list
        describing the original article (URL stored without its query string).
    """
    source_url = state["url"]
    result = summary_from_url(source_url)

    # Map the two failure sentinels onto their error messages.
    if result is None:
        failure = "Orginal url None error"
    elif result == "404":
        failure = "Orginal url 403/404 error"
    else:
        failure = None
    if failure is not None:
        set_news_url_flag(source_url)
        raise Exception(failure)

    article_title = result.title
    article_summary = result.summary
    print("Finish Initial Summary: ", source_url)

    # Keep only the part of the URL before the first "?".
    bare_url = source_url.partition("?")[0]
    original_document = {
        "topic": "This is the original article summary.",
        "url": bare_url,
        "title": article_title,
        "summary": article_summary,
    }
    return {"summary": article_summary, "documents": [original_document]}

In [4]:
@observe
def more_topics(state):
    """
    Ask the LLM for follow-up discussion topics derived from ``state["summary"]``.

    Returns:
        ``{"topics": ...}`` where the value is the ``more`` attribute of the
        parsed ``MorePoints`` response.
    """
    original_summary = state["summary"]
    system_prompt = """From the original summary provided by the user, identify five topics that are \
        closely related to the content and can stimulate further discussion. For each topic, generate a concise \
            and relevant search query in English to represent the discussion point."""
    user_prompt = f"""summary: {original_summary}"""
    parsed = llm_wrapper_raw(system_prompt, user_prompt, MorePoints).parsed
    return {"topics": parsed.more}

In [5]:
def topics_to_search(state):
    """
    Build one ``Send`` to the ``web_search`` node per topic.

    Only the first ``MAX_WEB_URL`` entries of ``state["topics"]`` are used; each
    becomes the ``query`` of its own ``web_search`` invocation.
    """
    dispatches = []
    for topic in state["topics"][:MAX_WEB_URL]:
        dispatches.append(Send("web_search", {"query": topic}))
    return dispatches

In [6]:
@observe
def web_search(state):
    """
    Search for ``state["query"]`` and summarize the news results.

    Results under the ``"news"`` key are visited in order; each link is
    summarized with ``summary_from_url``. Failed links are logged and skipped.
    Collection stops once ``MAX_QUERY_RESULT`` summaries have been gathered.

    Returns:
        ``{"documents": [...]}`` with one dict (topic, url, title, summary) per
        successfully summarized link.
    """
    query = state["query"]
    hits = web_search_wrapper(query)
    collected = []
    for hit in hits["news"]:
        # The number of collected documents doubles as the success counter.
        if len(collected) >= MAX_QUERY_RESULT:
            break
        link = hit["link"]
        outcome = summary_from_url(link)
        if outcome is None:
            print(link, " --- none response!")
        elif outcome == "404":
            print(link, " --- 403/404 response")
        else:
            print(link, " --- done!")
            page_summary = outcome.summary
            page_title = outcome.title
            collected.append(
                {"topic": query, "url": link, "title": page_title, "summary": page_summary}
            )
    return {"documents": collected}

In [7]:
@observe
def write_outline_cn(state):
    """
    Ask the LLM for a Chinese-language outline built from ``state["documents"]``.

    Returns:
        ``{"sections": ...}`` holding the ``sections`` attribute of the parsed
        ``OutlinesList`` response.
    """
    source_documents = state["documents"]
    system_prompt = """
    你是一名专注于美国大学新闻的专业评论员。你将收到新闻标题、链接和新闻摘要，以及相关讨论主题和相关文章链接及其摘要。基于这些信息，\
        你的任务是为一篇3000字的评论文章创建详细的写作大纲。你的任务包括以下要求：
    1. 你不需要撰写完整的评论文章，而是提供一个全面的写作规划大纲。
    2. 将文章分为3到5个部分，每个部分应有一个明确的标题，标题放入`title`字段中。标题只包含标题内容，不要包含字数信息。
    3. 对于每个部分，详细描述该部分要涵盖的具体内容，讨论如何展开，以及信息应如何结构化。尽可能提供清晰且具体的指导。描述放入`description`字段中。
    4. 根据3000字的总目标，估算每个部分的合适字数分配。字数放入`words`字段中。
    5. 在撰写大纲时，确保你仅基于提供的信息，并据此规划文章结构。
    6. 每个部分的顺序应与最终文章中的部分顺序一致。
    7. 所有输出内容都应该是中文。
    """
    user_prompt = f"原始新闻及相关讨论主题的材料：{source_documents}"
    outline = llm_wrapper_raw(system_prompt, user_prompt, OutlinesList).parsed
    return {"sections": outline.sections}

In [8]:
@observe
def write_outline_en(state):
    """
    Ask the LLM for an English-language outline built from ``state["documents"]``.

    Returns:
        ``{"sections_en": ...}`` holding the ``sections`` attribute of the parsed
        ``OutlinesList`` response.
    """
    source_documents = state["documents"]
    system_prompt = """
    You are a professional commentator specializing in news about U.S. universities. You will be provided with a headline, link, \
        and summary of a piece of original news, along with related discussion topics and links to relevant articles and their summaries. \
            Based on this information, your task is to create a detailed outline for a 3,000-word commentary article.
    Please follow these instructions:
    1. You do not need to write the full article; instead, provide a comprehensive outline.
    2. Divide the article into 3 to 5 sections, each with a clear title (use the `title` field).
    3. For each section, describe the specific content to be covered, explain how the discussion should develop, and outline the \
        structure of the information (use the `description` field).
    4. Distribute the 3,000-word target across the sections, estimating a word count for each section (use the `words` field).
    5. Base the outline solely on the provided materials, and ensure the sections appear in the order they would in the final article.
    6. All outputs should be in English.
    """
    user_prompt = f"Original summary and topics and summaries for expanded discussion：{source_documents}"
    outline = llm_wrapper_raw(system_prompt, user_prompt, OutlinesList).parsed
    return {"sections_en": outline.sections}

In [9]:
@observe
def write_article_cn(state):
    """
    Write the Chinese article one outline section at a time.

    For every entry in ``state["sections"]`` the LLM receives the source
    documents, the full outline and the text written so far, and returns the
    next section's text.

    Returns:
        ``{"write_sections": [...]}`` with one ``{"title", "content"}`` dict per
        outline section, in outline order.
    """
    documents = state["documents"]
    sections = state["sections"]
    article_so_far = ""
    finished = []
    for outline_item in sections:
        system_prompt = f"""
你是一名美国续航教育公司专注于美国大学新闻的年轻评论员，25岁左右，女性，来自中国并在美国大学研究生毕业。你的评论风格年轻化、轻松化，\
    习惯称呼自己为“小编”，并在评论文章中加入独到的见解。你的用词丰富多样，避免平淡。

你将收到新闻标题、链接和新闻摘要，以及相关讨论主题和相关文章的链接及摘要。请使用这些材料，按照提供的写作大纲，**接续已完成的部分，撰写评论文章的下一部分**。

请确保你的写作符合以下要求：
    - **内容要求**：{outline_item.description}
    - **字数限制**：约{outline_item.words}字
    - **连贯性**：你的内容应紧接已完成的部分，保持上下文的连贯，衔接自然
    - **避免总结性陈述**：除非你要写作的部分是文章的结论部分，否则不要做出任何总结性或概括性的陈述，尤其不要对这一部分进行总结
    - **不含标题**：内容中不应包含任何标题
    - **语言要求**：最终内容请用中文撰写
    - **结构一致性**：参考全文的写作大纲和已完成的部分，确保你的内容与整体结构一致
    - **语言自然性**：注意用词多样性，使文章自然流畅，更贴近人类的表达方式
    - **输出格式**： 输出内容仅包含文字部分，不要包含任何其他信息或格式，例如不要使用代码块（不要使用```）、markdown等
"""
        user_prompt = f"原始新闻及相关讨论主题的材料：{documents}；全文的写作大纲：{sections}；文章已完成的部分：{article_so_far}"

        section_text = llm_wrapper_raw(system_prompt, user_prompt).text
        finished.append({"title": outline_item.title, "content": section_text})
        # The running text is fed back into the next prompt for continuity.
        article_so_far += "\n\n" + section_text
        print(outline_item.title, "--------section done!")
    return {"write_sections": finished}

In [25]:
@observe
def write_article_en(state):
    """
    Write the English article one outline section at a time.

    For every entry in ``state["sections_en"]`` the LLM receives the source
    documents, the full outline and the text written so far, and returns the
    next section's text.

    Returns:
        ``{"write_sections_en": [...]}`` with one ``{"title", "content"}`` dict
        per outline section, in outline order.
    """
    documents = state["documents"]
    sections = state["sections_en"]
    article_so_far = ""
    finished = []
    for outline_item in sections:
        system_prompt = f"""
You are a young commentator specializing in U.S. university news for Nuhom Education Company, approximately 25 years old, female, originally from China, and a graduate of a U.S. university with a master's degree. Your commentary style is youthful and casual; you habitually refer to yourself as "the editor" and include your unique insights in your articles. Your vocabulary is rich and varied, avoiding blandness.

You will receive the news title, link, and summary, as well as related discussion topics and links to related articles with their summaries. Please use these materials to **continue from the completed parts and write the next section of the commentary article** according to the provided writing outline.

Please ensure your writing meets the following requirements:
- **Content Requirements**: {outline_item.description}
- **Word Limit**: Approximately {outline_item.words} words
- **Coherence**: Your content should directly follow the completed parts, maintaining contextual coherence and natural transitions
- **Avoid Summative Statements**: Unless the part you're writing is the conclusion of the article, do not make any summarizing or generalizing statements, especially do not summarize this section
- **No Titles**: The content should not include any titles
- **Language Requirements**: The final content should be written in English
- **Structural Consistency**: Refer to the overall writing outline and the completed parts of the article to ensure your content is consistent with the overall structure
- **Natural Language**: Pay attention to the diversity of word choice to make the article flow naturally and be closer to human expression
- **Output Format**: Please ensure the output contains only the main text, without any additional information or formatting, such as code blocks, markdown, etc.
"""
        user_prompt = f"Materials of the original news and related discussion topics: {documents}; Full writing outline: {sections}; Completed parts of the article: {article_so_far}"

        section_text = llm_wrapper_raw(system_prompt, user_prompt).text
        finished.append({"title": outline_item.title, "content": section_text})
        # The running text is fed back into the next prompt for continuity.
        article_so_far += "\n\n" + section_text
        print(outline_item.title, "--------section done!")
    return {"write_sections_en": finished}

In [26]:
@observe
def add_reference(state):
    """
    Assemble both articles as markdown and append an LLM-generated reference list.

    Written sections are emitted in outline order, each under a ``###`` heading.
    One section per language (index ``len(written) // 2 - 1``) has its text
    passed through ``image_insert_fuc`` first (presumably to mark where the
    image goes; confirm against that helper).

    Returns:
        ``{"content": <Chinese markdown>, "content_en": <English markdown>}``.
    """
    documents = state["documents"]
    outline_cn = state["sections"]
    outline_en = state["sections_en"]
    written_cn = state["write_sections"]
    written_en = state["write_sections_en"]

    def assemble(outline, written):
        # Index (within the outline) of the section that receives the image hook.
        image_slot = len(written) // 2 - 1
        pieces = []
        for position, planned in enumerate(outline):
            # First written section whose title matches this outline entry, if any.
            match = next((w for w in written if w["title"] == planned.title), None)
            if match is None:
                continue
            if position == image_slot:
                body = image_insert_fuc(match["content"])
            else:
                body = match["content"]
            pieces.append("###" + match["title"] + "\n\n" + body + "\n\n")
        return "".join(pieces)

    content_cn = assemble(outline_cn, written_cn)
    content_en = assemble(outline_en, written_en)

    system_message_cn = """你是一名专注于美国大学新闻的评论员。你将收到新闻标题、链接和新闻摘要，以及相关讨论的主题、相关文章链接及其摘要。\
        用户将提供一篇基于这些材料写好的文章。你的任务是根据提供的材料在文章末尾添加参考文献。确保只包含文章中直接引用过的来源，跳过没有直接引用过的资料来源。\
            最终输出结果的格式为引用的文章标题数字列表，并在标题上加上url超链接。输出内容仅包含参考文献的内容，不包含任何标题（比如`参考资料`）或其他额外内容。"""
    user_prompt_cn = f"Original summary and topics and summaries for expanded discussion：{documents} \n\n 用户写的文章: {content_cn}"
    reference_cn = llm_wrapper_raw(system_message_cn, user_prompt_cn).text

    system_message_en = """You are a commentator specializing in news about American universities. You will receive news titles, \
        links, and summaries, as well as related discussion topics, links to related articles, and their summaries. The user will provide \
            an article written based on these materials. Your task is to add references at the end of the article based on the provided materials. \
                Ensure that only sources directly cited in the article are included. Skip any sources that are not directly cited. \
                    The final output should be a numeric list of the titles of the cited articles, with each title hyperlinked to its URL. \
                        The output should only contain the references, without any headings or additional content."""
    user_prompt_en = f"Original summary and topics and summaries for expanded discussion：{documents} \n\n The written article: {content_en}"
    reference_en = llm_wrapper_raw(system_message_en, user_prompt_en).text

    return {
        "content": content_cn + "###参考资料：\n\n" + reference_cn,
        "content_en": content_en + "###Reference: \n\n" + reference_en,
    }

In [None]:
@observe
def article_metas(state):
    """
    Ask the LLM for publishing metadata derived from ``state["content"]``.

    The parsed ``MetaFormat`` response supplies titles, an image prompt and
    filename, alt texts and tag names; tag names are converted with
    ``tags_to_IDs`` / ``tags_to_IDs_en`` before being returned.

    Returns:
        Dict with ``title``, ``title_en``, ``image_query``, ``image_filename``,
        ``tags``, ``tags_en``, ``image_alt`` and ``image_alt_en``.
    """
    article_text = state["content"]
    system_prompt = """请完成以下任务：
                1. 根据下面给出的文章内容，为文章取一个合适的标题。标题需要有中文和英文两个版本，中文版标题长度在20到30个中文字，英文标题长度在10到20个英文单词，分别放入title和title_en。
                2. 根据文章内容，生成一份详细的图像生成提示（image generation prompt），提示词应为英文，并注意使用不会违反“安全系统”的安全词汇。图像风格应基于文章内容，放入image_query。
                3. 为上述图像生成提示生成一个英文的图像文件名，但不包含文件类型扩展名，放入image_filename。
                4. 为上述图像生成alt text，中文和英文分别放出image_alt_text和image_alt_text_en。
                5. 生成一些与文章内容相关的标签，标签同样需要有中文和英文两个版本，分别放入tags和tags_en。"""
    user_prompt = f"下面是需要处理的文章内容：\n\n{article_text}"
    metas = llm_wrapper_raw(system_prompt, user_prompt, MetaFormat).parsed

    # Convert tag names with the per-language helpers (Chinese first, then English).
    tag_ids = tags_to_IDs(metas.tags)
    tag_ids_en = tags_to_IDs_en(metas.tags_en)

    return {
        "title": metas.title,
        "title_en": metas.title_en,
        "image_query": metas.image_query,
        "image_filename": metas.image_filename,
        "tags": tag_ids,
        "tags_en": tag_ids_en,
        "image_alt": metas.image_alt_text,
        "image_alt_en": metas.image_alt_text_en,
    }

In [28]:
@observe
def generate_image(state):
    """
    Generate the article image, save it as a JPEG and upload it for both languages.

    Reads ``state["image_query"]`` (prompt) and ``state["image_filename"]``
    (file name without extension). The file is written to the ``images``
    folder under ``get_local_folder()`` and uploaded twice through
    ``post_wordpress_file`` (``lang_type`` ``"cn"`` then ``"en"``).

    Returns:
        Dict with ``image_ID``, ``image_url``, ``image_ID_en`` and
        ``image_url_en``. On any failure the error is printed and the IDs are
        ``-1`` with empty URLs, which the publish steps check for.
    """
    image_filename = state["image_filename"]
    image_folder = os.path.join(get_local_folder(), "images")
    jpg_image = os.path.join(image_folder, image_filename + ".jpg")
    image_query = state["image_query"]
    try:
        # A missing target folder should not cost us the generated image.
        os.makedirs(image_folder, exist_ok=True)
        generated_image = llm_image_wrapper(image_query)
        with Image.open(BytesIO(generated_image.image.image_bytes)) as image:
            # JPEG cannot store alpha or palette data, so convert such modes
            # to RGB instead of letting save() raise.
            if image.mode in ("RGB", "L", "CMYK"):
                jpeg_ready = image
            else:
                jpeg_ready = image.convert("RGB")
            # Pillow's keyword is `optimize`; the former `optimized=True` was
            # silently ignored.
            jpeg_ready.save(jpg_image, optimize=True, quality=20)
        response = post_wordpress_file(jpg_image, lang_type="cn")
        response = response.json()
        image_ID = int(response.get("id"))
        image_url = response.get("guid").get("rendered")
        response_en = post_wordpress_file(jpg_image, lang_type="en")
        response_en = response_en.json()
        image_ID_en = int(response_en.get("id"))
        image_url_en = response_en.get("guid").get("rendered")
        return {
            "image_ID": image_ID,
            "image_url": image_url,
            "image_ID_en": image_ID_en,
            "image_url_en": image_url_en,
        }
    except Exception as e:
        # Best-effort: a failed image must not block publishing.
        print(e)
        return {"image_ID": -1, "image_url": "", "image_ID_en": -1, "image_url_en": ""}

@observe
def publish_post(state):
    """
    Render the Chinese article to HTML and publish it.

    The image tag replaces every ``[image_placeholder]`` in the markdown; when
    there is no placeholder the tag is put on its own line before the article.
    When ``state["image_ID"]`` is ``-1`` the hard-coded default image is used.
    After posting, ``update_summary_qa`` is called with the new post id and
    the unmodified markdown.

    Returns:
        None.
    """
    from html import escape  # local import keeps this block self-contained

    title = state["title"]
    image_alt = state["image_alt"]
    image_ID = state["image_ID"]
    image_url = state["image_url"]
    tags = state["tags"]
    if image_ID == -1:
        # Image generation failed upstream: fall back to the default image.
        image_url = "https://www.forwardpathway.com/wp-content/uploads/2024/06/fp_college_news_default.jpg"
        image_ID = 107009
    content = state["content"]
    raw_content = content
    # The alt text is LLM-generated; escape it so quotes cannot break the attribute.
    image_tag = """<img src="{}" alt="{}">""".format(
        image_url, escape(str(image_alt), quote=True)
    )
    # Membership test instead of `find(...) > 0`, which missed a placeholder at index 0.
    if "[image_placeholder]" in content:
        content = content.replace("[image_placeholder]", image_tag)
    else:
        # The newline keeps the first "###" heading at the start of a line so
        # markdown still renders it as a heading (same as publish_post_en).
        content = image_tag + "\n" + content
    content = markdown2.markdown(
        content,
        extras=["tables", "footnotes"],
    )
    (content, new_tags) = insert_keyword_url(content)
    tags = tags | new_tags
    response = post_wordpress_post(
        post_title=title,
        post_body=content,
        featured_media_id=image_ID,
        tags=tags,
        categories=[3627],
        comment_status="closed",
        lang_type="cn",
    )
    response = response.json()
    post_ID = response.get("id")
    update_summary_qa(post_ID, raw_content)
    return

@observe
def publish_post_en(state):
    """
    Render the English article to HTML and publish it.

    The image tag replaces every ``[image_placeholder]`` in the markdown; when
    there is no placeholder the tag is put on its own line before the article.
    When ``state["image_ID_en"]`` is ``-1`` the hard-coded default image is used.

    Returns:
        None.
    """
    from html import escape  # local import keeps this block self-contained

    title = state["title_en"]
    image_alt = state["image_alt_en"]
    image_ID = state["image_ID_en"]
    image_url = state["image_url_en"]
    tags = state["tags_en"]
    if image_ID == -1:
        # Image generation failed upstream: fall back to the default image.
        image_url = "https://www.forwardpathway.us/wp-content/uploads/2024/07/fp_college_news_default.jpg"
        image_ID = 15899
    content = state["content_en"]
    # The alt text is LLM-generated; escape it so quotes cannot break the attribute.
    image_tag = """<img src="{}" alt="{}">""".format(
        image_url, escape(str(image_alt), quote=True)
    )
    # Membership test instead of `find(...) > 0`, which missed a placeholder at index 0.
    if "[image_placeholder]" in content:
        content = content.replace("[image_placeholder]", image_tag)
    else:
        content = image_tag + "\n" + content
    content = markdown2.markdown(
        content,
        extras=["tables", "footnotes"],
    )
    (content, new_tags) = insert_keyword_url(content, lang_type="en")
    tags = tags | new_tags
    response = post_wordpress_post(
        post_title=title,
        post_body=content,
        featured_media_id=image_ID,
        tags=tags,
        categories=[9],
        comment_status="closed",
        lang_type="en",
    )
    # The decoded body is not used; the call is kept so a non-JSON reply still raises.
    response = response.json()
    return

In [29]:
# ---------------------------------------------------------------------------
# Build the LangGraph workflow: register nodes, wire edges, compile, and
# render the graph diagram (inline and to a PNG file).
# ---------------------------------------------------------------------------
workflow = StateGraph(GraphState)

# (node name, callable) pairs, registered in pipeline order.
for _node_name, _node_fn in (
    ("summary_node", summary),
    ("more_topics", more_topics),
    ("web_search", web_search),
    ("write_outline_cn", write_outline_cn),
    ("write_outline_en", write_outline_en),
    ("write_article_cn", write_article_cn),
    ("write_article_en", write_article_en),
    ("add_reference", add_reference),
    ("article_metas", article_metas),
    ("generate_image", generate_image),
    ("publish_post", publish_post),
    ("publish_post_en", publish_post_en),
):
    workflow.add_node(_node_name, _node_fn)

workflow.set_entry_point("summary_node")
workflow.add_edge("summary_node", "more_topics")
# Fan out: one web_search invocation per topic returned by topics_to_search.
workflow.add_conditional_edges("more_topics", topics_to_search, ["web_search"])

# Remaining static edges; the Chinese and English branches run side by side.
for _edge_from, _edge_to in (
    ("web_search", "write_outline_cn"),
    ("web_search", "write_outline_en"),
    ("write_outline_cn", "write_article_cn"),
    ("write_outline_en", "write_article_en"),
    ("write_article_cn", "add_reference"),
    ("write_article_en", "add_reference"),
    ("add_reference", "article_metas"),
    ("article_metas", "generate_image"),
    ("generate_image", "publish_post"),
    ("generate_image", "publish_post_en"),
    ("publish_post", END),
    ("publish_post_en", END),
):
    workflow.add_edge(_edge_from, _edge_to)
app = workflow.compile()

from IPython.display import Image as IPImage
from IPython.display import display
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles


def _render_mermaid_png(graph):
    """Render ``graph`` to PNG bytes with the notebook's colour scheme."""
    return graph.draw_mermaid_png(
        curve_style=CurveStyle.BASIS,
        node_colors=NodeStyles(
            first="fill:#FDFFB6",
            last="fill:#FFADAD",
            default="fill:#CAFFBF,line-height:1",
        ),
        draw_method=MermaidDrawMethod.API,
    )


# Inline preview of the graph.
display(IPImage(_render_mermaid_png(app.get_graph(xray=1)), width=300))

# Persist the diagram next to the notebook.
img = _render_mermaid_png(app.get_graph())
with open("post_publish_flow_new.png", "wb") as png:
    png.write(img)

In [30]:
@observe
def run_post_publish():
    """
    Push every URL returned by ``get_news_urls()`` through the compiled workflow.

    After a successful run the URL is passed to ``set_news_url_flag``. Any
    exception raised for one URL is printed and the loop moves on to the next.
    """
    for news_url in get_news_urls():
        try:
            app.invoke({"url": news_url})
            set_news_url_flag(news_url)
            print(news_url, "finished")
        except Exception as err:
            print("error for url: ", news_url)
            print(err)

# Executing this cell runs the whole pipeline over every URL from get_news_urls().
run_post_publish()

Finish Initial Summary:  https://news.ufl.edu/2024/10/cseveryone/
https://news.ufl.edu/2024/10/cseveryone/  --- done!
https://today.marquette.edu/2024/10/expanding-access-to-computer-science-education-marquette-universitys-role-in-leading-change/  --- done!
https://news.uoregon.edu/content/uo-prof-makes-computer-science-more-inclusive-and-equitable  --- done!
https://www.researchgate.net/publication/322627817_Computer_Science_in_the_School_Curriculum_Issues_and_Challenges  --- 403/404 response
https://eng.auburn.edu/news/2024/06/csse-faculty-appointed-by-alabama-governor  --- done!
https://www.newamerica.org/new-practice-lab/blog/what-does-a-federal-transition-mean-for-local-early-childhood-initiatives/  --- done!
https://www.sciencedirect.com/science/article/pii/S2405844024156617  --- 403/404 response
https://www.govtech.com/education/k-12/opinion-compsci-must-go-hand-in-hand-with-accessibility  --- done!
https://today.marquette.edu/2024/04/marquettes-strengths-in-computer-science-edu