In [1]:
import base64
import json
import os
import urllib.request
from typing import List, Literal, TypedDict

import markdown
import pandas as pd
import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from LLM_get_folder import get_local_folder
from PIL import Image
from wordpress_tools import (
    get_news_urls,
    insert_keyword_url,
    post_wordpress_file,
    post_wordpress_post,
    set_news_url_flag,
    tags_to_IDs,
    tags_to_IDs_en,
    update_summary_qa,
)

# from langchain.docstore.document import Document
# Model names used for the "small" and "large" LLM roles. Both currently point
# at the same model, so the token-length switch in summary_from_url() has no
# practical effect until one of them is changed.
model_small = "gpt-4o-mini"

model_large = "gpt-4o-mini"

# temperature=0 for repeatable output.
# NOTE(review): timeout=40000 is presumably in seconds (about 11 hours) - confirm this is intended.
llm_small = ChatOpenAI(model=model_small, temperature=0, timeout=40000)
llm_large = ChatOpenAI(model=model_large, temperature=0, timeout=40000)
# llm = ChatOpenAI()
# Tracing settings read by LangChain from the environment (presumably LangSmith).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "college-news-auto-post"

from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Pages are fetched with verify=False below; silence the warning that would
# otherwise be emitted for every such request.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# web_search() stops starting new queries once this many pages were summarized.
MAX_WEB_URL = 5
# web_search() keeps at most this many summarized pages per search query.
MAX_QUERY_RESULT = 1

In [2]:
class GraphState(TypedDict):
    """State shared by the nodes of the news-posting graph.

    Each node returns a dict holding the subset of these keys it produced.
    """

    url: str  # source news page, supplied when the graph is invoked
    summary: str  # summary text of the source page, set by summary()
    # NOTE(review): more_topics() stores the whole more_points object here and
    # later nodes read state["topics"].more, so this is not a plain List[str].
    topics: List[str]
    documents: str  # query + page-summary text concatenated by web_search()
    image_query: str  # image generation prompt, set by article_metas()
    title: str  # Chinese post title, set by article_metas()
    title_en: str  # English post title, set by article_metas()
    content: str  # Chinese article body (markdown), set by rewrite()
    content_en: str  # English article body (markdown), set by rewrite_en()
    tags: set  # result of tags_to_IDs(); combined with `|` in publish_post()
    tags_en: set  # result of tags_to_IDs_en(); combined with `|` in publish_post_en()
    image_url: str  # uploaded image URL for the "cn" upload; "" on failure
    image_url_en: str  # uploaded image URL for the "en" upload; "" on failure
    image_ID: int  # media id for the "cn" upload; -1 when generation failed
    image_ID_en: int  # media id for the "en" upload; -1 when generation failed
    image_filename: str  # image file name without extension, set by article_metas()
    reference: dict  # maps page URL (query string stripped) to its title

# Ad-hoc check that a sample page can be fetched and parsed. The names it
# defines (url, loader, docs, content) are not needed by the pipeline below.
url = "https://news.wm.edu/2024/07/18/plankton-researchers-urge-their-colleagues-to-mix-it-up/"
# Pass timeout and verify in one dict: assigning `loader.requests_kwargs`
# afterwards replaces the whole dict and silently drops the timeout.
loader = WebBaseLoader(
    url,
    requests_kwargs={"timeout": 10, "verify": False},
    raise_for_status=True,
)
docs = loader.load()
content = docs[0].page_content

In [3]:
# Structured-output schema used by summary_from_url(): the Field descriptions
# ask the model for an English title and a Chinese summary.
# (Plain comments are used instead of a class docstring so the schema handed
# to the LLM stays unchanged.)
class summary_output(BaseModel):
    title: str = Field(description="网页内容总结标题，标题的语言必须是英文")
    summary: str = Field(description="网页内容总结内容，总结内容的语言必须是中文")


def summary_from_url(url):
    """Fetch a web page and summarize it with the LLM.

    The page text is loaded with ``WebBaseLoader`` (10 second request timeout,
    TLS verification disabled) and sent through a structured-output chain
    built on ``summary_output``.

    Args:
        url: Address of the page to summarize.

    Returns:
        The chain's structured result (``title`` and ``summary`` attributes)
        on success, the string ``"404"`` when the server answered 403 or 404,
        or ``None`` for any other failure.
    """
    summary_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """任务说明：你是一名专业的新闻总结员。请根据下面给出的网页内容，总结出一段约800个汉字的新闻摘要。摘要必须使用中文，内容需全面，字数大约在800个汉字左右。
                具体要求：
                1. 新闻摘要：提炼新闻的核心内容，确保信息全面且连贯。字数大约在800个汉字左右。
                2. 标题提取：如果网页内容中已有标题，请提取该标题。如果没有标题，请根据内容总结一个合适的标题。标题必须是英文。
                3. 日期信息：如果网页内容中包含新闻发布日期，请在新闻摘要中包含该日期，日期格式需包含年份。
                4. 美国大学相关内容：如果网页内容中提到美国大学（比如哈佛大学、耶鲁大学等等），只要提到了就请在新闻摘要中包含该大学的相关信息（比如与网页内容事件相关或者与作者相关等等）。""",
            ),
            ("human", "网页内容: {content}"),
        ]
    )

    try:
        # Pass timeout and verify in one dict: assigning
        # `loader.requests_kwargs` after construction replaces the whole dict
        # and silently drops the timeout.
        loader = WebBaseLoader(
            url,
            requests_kwargs={"timeout": 10, "verify": False},
            raise_for_status=True,
        )
        docs = loader.load()
        content = docs[0].page_content
        # Long pages go to the "large" model, short ones to the "small" model.
        encoding = tiktoken.encoding_for_model(model_small)
        token_length = len(encoding.encode(content))
        if token_length <= 15000:
            structured_llm = llm_small.with_structured_output(summary_output)
        else:
            structured_llm = llm_large.with_structured_output(summary_output)
        summary_chain = summary_prompt | structured_llm
        response = summary_chain.invoke({"content": content})
    except requests.exceptions.HTTPError as e:
        status_code = e.response.status_code if e.response is not None else None
        # 403/404 are reported with a sentinel so callers can flag the URL.
        response = "404" if status_code in (403, 404) else None
    except Exception:
        # Best effort: any other failure (network, parsing, LLM) yields None.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate.
        response = None
    return response

In [4]:
def summary(state):
    """Graph node: summarize the source news page.

    Args:
        state: Graph state; only ``state["url"]`` is read.

    Returns:
        A dict with ``summary`` (the summary text) and ``reference`` (a dict
        mapping the URL without its query string to the page title).

    Raises:
        Exception: If the page answered 403/404 (the URL is flagged through
            ``set_news_url_flag`` first) or if no summary could be produced.
    """
    url = state["url"]
    response = summary_from_url(url)
    if response == "404":
        set_news_url_flag(url)
        raise Exception("Orginal url 403/404 error")
    if response is None:
        # summary_from_url() returns None for every failure other than
        # 403/404; fail with a clear message instead of an AttributeError on
        # `response.title` below.
        raise Exception("Failed to summarize original url: " + url)
    title = response.title
    summary_text = response.summary
    print("Finish Initial Summary: ", url)
    url_base = url.split("?")[0]
    return {"summary": summary_text, "reference": {url_base: title}}

In [5]:
# Structured-output schema used by more_topics(): `more` holds the search
# queries (the Field description requires them to be in English).
# (Plain comments are used instead of a class docstring so the schema handed
# to the LLM stays unchanged.)
class more_points(BaseModel):
    more: list[str] = Field(
        description="可以展开讨论的点的搜索查询，必须是英文搜索查询"
    )


def more_topics(state):
    """Graph node: derive follow-up search queries from the summary.

    Reads ``state["summary"]`` and asks the large model, through a
    structured-output chain built on ``more_points``, for related points
    expressed as English search queries.

    Returns:
        ``{"topics": result}`` where ``result`` is whatever the chain returns;
        later nodes read the queries from its ``more`` attribute.
    """
    original_summary = state["summary"]
    topic_llm = llm_large.with_structured_output(more_points)
    system_text = """从下面用户给出的一段原始总结文字中提出不少于'两'点但不多于'四'点与原文有联系且可以展开讨论的点, 请给出详细的搜索查询来代表这几个点，搜索查询必须要完整且详细，
                搜索查询必须为英文，不管输入内容是什么语言，输出的搜索查询必须是英文查询。"""
    topic_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_text),
            ("human", "原始总结文字：{summary}"),
        ]
    )
    topic_chain = topic_prompt | topic_llm
    return {"topics": topic_chain.invoke({"summary": original_summary})}

In [6]:
def web_search(state):
    """Graph node: collect supporting material for each follow-up query.

    Every query in ``state["topics"].more`` is run through a news search. Each
    hit that is not already referenced is summarized with
    ``summary_from_url``; successful summaries are appended to the supporting
    text and recorded in the reference map.

    Limits: at most ``MAX_QUERY_RESULT`` summarized pages per query and at most
    ``MAX_WEB_URL`` summarized pages overall.

    Returns:
        A dict with ``documents`` (concatenated "query + summary" text) and
        ``reference`` (the incoming map extended with the pages used).
    """
    search = GoogleSerperAPIWrapper(type="news")
    queries = state["topics"].more
    # Work on a copy so the dict held by the incoming state is not mutated.
    reference = dict(state["reference"])
    documents = ""
    n_urls = 0
    for query in queries:
        if n_urls >= MAX_WEB_URL:
            break
        results = search.results(query)
        n_results = 0
        # Guard against a response that carries no "news" entry at all.
        for result in results.get("news", []):
            url = result["link"]
            url_base = url.split("?")[0]
            if url_base in reference:
                continue
            print("------ Topics Summary: ", url,end=" ")
            response = summary_from_url(url)
            if response is None:
                print ('none response!')
                continue
            if response == "404":
                print ('403/404 response')
                continue
            print("done!")
            reference[url_base] = response.title
            documents = documents + query + "\n\n" + response.summary + "\n\n"
            n_urls = n_urls + 1
            n_results = n_results + 1
            # Enforce the global cap here as well, so it holds even when
            # MAX_QUERY_RESULT is raised above 1.
            if n_results >= MAX_QUERY_RESULT or n_urls >= MAX_WEB_URL:
                break
    return {"documents": documents, "reference": reference}

In [7]:
def rewrite(state):
    """Graph node: write the Chinese commentary article.

    The large model is prompted with the original summary, the follow-up
    queries and the supporting material. The output is accepted when, after
    ignoring paragraphs of 20 characters or fewer, at least one paragraph is
    left and no paragraph occurs twice. Otherwise the prompt is extended with
    an extra "no duplicate paragraphs" instruction and the call is retried,
    for at most 4 attempts in total.

    Returns:
        ``{"content": article}`` with the accepted article text.

    Raises:
        Exception: If all 4 attempts were rejected; the source URL is appended
            to ``repeat.txt`` in the local folder first.
    """
    original_summary = state["summary"]
    documents = state["documents"]
    topics = state["topics"].more
    max_attempts = 4
    system_message = """任务说明：你的角色是一名专业的美国大学新闻评论员，下面给出一段原始总结内容，同时提供了一些可以扩展讨论的点和支持这些讨论点的材料。\
    请重新撰写一篇2000-4000字的评论文章。字数可以根据内容的多少适当选择，但不能少于2000字且不能超过4000字。
    文章要求：
    1. 内容围绕原始总结：评论文章内容应主要围绕原始总结展开，对与美国大学相关的内容要额外关注。
    2. 扩展讨论：结合提供的讨论点进行扩展讨论。扩展讨论时需要前后呼应，提及与原始总结相关联的内容。如果扩展讨论点及其支持材料与原始总结内容无关或无法一起讨论，\
    可以舍弃这些扩展讨论点和内容，并将重心放在与原始总结有关的讨论上。如果扩展讨论点及支持材料有重复内容的请只使用一次该内容。
    3. 连贯的逻辑关系：文章前后的逻辑关系需要连贯。如果有需要，可以重写全文，使其更加流畅。
    4. 美国大学名称使用规范：
    - 第一次提及美国大学名称时，必须使用该大学的中文全名，之后可以适当使用缩写。
    - 美国大学全名需使用最普遍的中文翻译版本。
    - 加利福尼亚大学各个分校一律使用“加州大学+分校名称”，例如加州大学尔湾分校。
    - 加州大学欧文分校请使用加州大学尔湾分校。
    - 布兰戴斯大学请使用布兰迪斯大学。
    - 威廉与玛丽学院和威廉与玛丽大学请使用威廉玛丽学院。
    - 利哈伊大学请使用里海大学。
    - 密歇根大学安娜堡分校需使用全称，不能缩写成密歇根大学。
    - 伊利诺伊大学香槟分校需使用全称，不能缩写成伊利诺伊大学。
    - 乔治亚大学请使用佐治亚大学。
    - 华盛顿大学圣路易斯分校请用圣路易斯华盛顿大学。
    5. 输出只包含正文：输出的文章评论不要包含文章标题。
    6. 图片空位：在文章中合适的位置，尽量在文章中上部位且在两个段落之间，需放置一个且必须有一个图片空位，用'[image_placeholder]'代表这个图片空位。"""
    for _ in range(max_attempts):
        rewrite_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_message),
                (
                    "human",
                    "原始总结内容：\n\n{summary},\n\n扩展讨论点：\n\n{topics},\n\n支持扩展讨论点的讨论：\n\n{documents}",
                ),
            ]
        )
        rewrite_chain = rewrite_prompt | llm_large | StrOutputParser()
        content = rewrite_chain.invoke(
            {"summary": original_summary, "topics": topics, "documents": documents}
        )
        # Build a filtered list instead of calling remove() while iterating,
        # which skips the element that follows every removed one.
        paragraphs = [part for part in content.split("\n\n") if len(part) > 20]
        if paragraphs and len(paragraphs) == len(set(paragraphs)):
            return {"content": content}
        # Rejected: tighten the instructions before the next attempt.
        system_message = (
            system_message
            + "\n\n新的文章不要有重复的段落，如果遇到原始总结内容和扩展讨论点有重复内容的可以忽略重复内容，在最后的输出文章中不可以有重复段落。"
        )

    # Every attempt was rejected: record the URL, then fail.
    file = os.path.join(get_local_folder(), "repeat.txt")
    with open(file, "a") as f:
        f.write(
            state["url"]
            + "\n\n---------------------------------------------------------------------\n\n"
        )
    raise Exception("rewrite CN version 4 times still get duplicates")

In [8]:
def rewrite_en(state):
    """Graph node: write the English commentary article.

    The large model is prompted with the original summary, the follow-up
    queries and the supporting material. The output is accepted when, after
    ignoring paragraphs of 20 characters or fewer, at least one paragraph is
    left and no paragraph occurs twice. Otherwise the prompt is extended with
    an extra "no duplicate paragraphs" instruction and the call is retried,
    for at most 4 attempts in total.

    Returns:
        ``{"content_en": article}`` with the accepted article text.

    Raises:
        Exception: If all 4 attempts were rejected; the source URL is appended
            to ``repeat.txt`` in the local folder first.
    """
    original_summary = state["summary"]
    documents = state["documents"]
    topics = state["topics"].more
    max_attempts = 4
    system_message = """Your role is a professional American university news commentator. Below is an original summary along with some discussion points and supporting arguments to expand upon.\
    Please rewrite a 2000-4000 word commentary article, with the word count adjusted according to the content, but not less than 2000 words and not more than 4000 words.\
    The commentary should center around the original summary, with extra focus on content related to U.S. colleges. Appropriately integrate the discussion points and their \
    supporting arguments into your expanded discussion. Ensure to consistently refer back to the original summary when expanding your discussion. \
    Conclude with a comprehensive summary that ties together the original summary and the expanded discussion. The logical flow of the article must be coherent, \
    and the entire text may be rewritten if necessary. When mentioning the names of U.S. colleges for the first time, always use the full English name. \
    Subsequent mentions can use abbreviations where appropriate. Use the most commonly recognized English version of the full names. \
    For campuses, use a hyphen to connect, such as University of California-Berkeley.
    For headings, start from h2 headings, the output should only contain content without title, and there should be no h1 heading in the output.
    In a suitable position within the article, preferably towards the upper-middle and between two paragraphs, include one image placeholder represented by '[image_placeholder]'.
    The final output must be in English, regardless of any other considerations."""
    for _ in range(max_attempts):
        rewrite_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_message),
                (
                    "human",
                    "Original summary：\n\n{summary},\n\nDiscussion Points：\n\n{topics},\n\nsupporting arguments：\n\n{documents}",
                ),
            ]
        )
        rewrite_chain = rewrite_prompt | llm_large | StrOutputParser()
        content = rewrite_chain.invoke(
            {"summary": original_summary, "topics": topics, "documents": documents}
        )
        # Build a filtered list instead of calling remove() while iterating,
        # which skips the element that follows every removed one.
        paragraphs = [part for part in content.split("\n\n") if len(part) > 20]
        # Return as soon as an attempt is clean. The failure branch below is
        # reached only when every attempt was rejected, so a clean final
        # attempt is no longer discarded.
        if paragraphs and len(paragraphs) == len(set(paragraphs)):
            return {"content_en": content}
        # NOTE(review): this retry instruction is in Chinese although the rest
        # of the prompt is English; kept as-is to avoid changing the prompt.
        system_message = (
            system_message
            + "\n\n新的文章不要有重复的段落，如果遇到原始总结内容和扩展讨论点有重复内容的可以忽略重复内容，在最后的输出文章中不可以有重复段落。"
        )

    # Every attempt was rejected: record the URL, then fail.
    file = os.path.join(get_local_folder(), "repeat.txt")
    with open(file, "a") as f:
        f.write(
            state["url"]
            + "\n\n---------------------------------------------------------------------\n\n"
        )
    raise Exception("rewrite EN version 4 times still get duplicates")

In [9]:
# Structured-output schema used by article_metas(): titles and tags in both
# Chinese and English, plus an English image prompt and image file name.
# (Plain comments are used instead of a class docstring so the schema handed
# to the LLM stays unchanged.)
class meta_format(BaseModel):
    title: str = Field(description="与文章内容相关的文章中文标题，用中文输出")
    title_en: str = Field(description="与文章内容相关的文章英文标题，用英文输出")
    image_query: str = Field(
        description="detailed prompt to generate an image that based on the article content, should be in English"
    )
    # generate_image() appends ".png" / ".jpg" to this name.
    image_filename: str = Field(
        description="a good name for the image file without file extension, should be in English"
    )
    # Tag names; article_metas() converts them with tags_to_IDs() / tags_to_IDs_en().
    tags: List[str] = Field(description="与文章内容相关的中文标签，用中文输出")
    tags_en: List[str] = Field(description="与文章内容相关的英文标签，用英文输出")


def article_metas(state):
    """Graph node: derive titles, tags and image metadata from the article.

    Sends ``state["content"]`` through a structured-output chain built on
    ``meta_format`` and converts the returned tag names with ``tags_to_IDs`` /
    ``tags_to_IDs_en``.

    Returns:
        A dict with ``title``, ``title_en``, ``image_query``,
        ``image_filename``, ``tags`` and ``tags_en``.

    Raises:
        Exception: If the chain fails on all 3 trials; the last underlying
            error is attached as the cause.
    """
    content = state["content"]
    meta_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """请完成以下任务：
                1. 根据下面给出的文章内容，为文章取一个合适的标题。标题需要有中文和英文两个版本。
                2. 根据文章内容，生成一个详细的图像生成提示（image generation prompt），提示词应为英文，并注意使用不会违反“安全系统”的安全词汇。图像风格应基于文章内容。
                3. 为上述图像生成提示生成一个英文的图像文件名，但不包含文件类型扩展名。
                4. 生成一些与文章内容相关的标签，标签同样需要有中文和英文两个版本。""",
            ),
            ("human", "下面是需要处理的文章内容：\n\n{content}"),
        ]
    )
    structured_llm = llm_large.with_structured_output(meta_format)
    meta_chain = meta_prompt | structured_llm

    # Exactly 3 trials, matching the error message (the previous counter
    # allowed a 4th one).
    max_trials = 3
    last_error = None
    for _ in range(max_trials):
        try:
            response = meta_chain.invoke({"content": content})
            break
        except Exception as exc:
            # `Exception` rather than a bare `except` so that Ctrl-C still
            # interrupts the run.
            last_error = exc
    else:
        raise Exception("get article metas error with 3 trials") from last_error

    tags = tags_to_IDs(response.tags)
    tags_en = tags_to_IDs_en(response.tags_en)
    return {
        "title": response.title,
        "title_en": response.title_en,
        "image_query": response.image_query,
        "image_filename": response.image_filename,
        "tags": tags,
        "tags_en": tags_en,
    }

In [10]:
def generate_image(state):
    """Graph node: generate the article image and upload it to WordPress.

    Generates an image from ``state["image_query"]`` with DALL-E 3, downloads
    it as PNG, re-encodes it as a compressed JPEG named after
    ``state["image_filename"]`` and uploads that file twice through
    ``post_wordpress_file`` (``lang_type`` "cn" and "en").

    Returns:
        A dict with ``image_ID``, ``image_url``, ``image_ID_en`` and
        ``image_url_en``. On any failure the error is printed and the IDs are
        ``-1`` with empty URLs, which the publish nodes treat as "use the
        default image".
    """
    image_filename = state["image_filename"]
    image_folder = os.path.join(get_local_folder(), "images")
    png_image = os.path.join(image_folder, image_filename + ".png")
    jpg_image = os.path.join(image_folder, image_filename + ".jpg")
    image_query = state["image_query"]
    try:
        # A missing folder would otherwise fail the download and silently fall
        # back to the default image.
        os.makedirs(image_folder, exist_ok=True)
        generated_url = DallEAPIWrapper(
            model="dall-e-3", size="1792x1024", quality="standard"
        ).run(image_query)
        urllib.request.urlretrieve(generated_url, png_image)
        try:
            with Image.open(png_image) as image:
                # Pillow's save option is `optimize`; the previous
                # `optimized=True` was silently ignored. JPEG has no alpha
                # channel, so convert to RGB before saving.
                image.convert("RGB").save(jpg_image, optimize=True, quality=20)
        finally:
            # Remove the temporary PNG even when the conversion fails.
            os.remove(png_image)
        response = post_wordpress_file(jpg_image, lang_type="cn")
        response = response.json()
        image_ID = int(response.get("id"))
        image_url = response.get("guid").get("rendered")
        response_en = post_wordpress_file(jpg_image, lang_type="en")
        response_en = response_en.json()
        image_ID_en = int(response_en.get("id"))
        image_url_en = response_en.get("guid").get("rendered")
        return {
            "image_ID": image_ID,
            "image_url": image_url,
            "image_ID_en": image_ID_en,
            "image_url_en": image_url_en,
        }
    except Exception as e:
        # Deliberate best effort: publishing continues with the default image.
        print(e)
        return {"image_ID": -1, "image_url": "", "image_ID_en": -1, "image_url_en": ""}


def publish_post(state):
    """Graph node: publish the Chinese article to WordPress.

    Appends a numbered reference list to the article, substitutes the image
    placeholder with an ``<img>`` tag, renders the markdown to HTML, lets
    ``insert_keyword_url`` add keyword links and extra tags, creates the post,
    and finally calls ``update_summary_qa`` with the new post id and the
    original (unmodified) article text.

    Returns:
        None. The node contributes nothing to the graph state.
    """
    post_title = state["title"]
    media_id = state["image_ID"]
    media_url = state["image_url"]
    tag_ids = state["tags"]
    sources = state["reference"]
    if media_id == -1:
        # Image generation failed: fall back to the site's default image.
        media_url = "https://www.forwardpathway.com/wp-content/uploads/2024/06/fp_college_news_default.jpg"
        media_id = 107009
    article_md = state["content"]

    # Reference section: one markdown list item per source page.
    reference_items = "".join(
        """\n1. [{}]({})""".format(headline, link) for link, headline in sources.items()
    )
    body_md = article_md + """\n### 参考新闻资料:""" + reference_items
    body_md = body_md.replace(
        "[image_placeholder]", """<img src="{}">""".format(media_url)
    )
    body_html = markdown.markdown(body_md, extensions=["tables", "footnotes"])
    body_html, keyword_tags = insert_keyword_url(body_html)

    wp_response = post_wordpress_post(
        post_title=post_title,
        post_body=body_html,
        featured_media_id=media_id,
        tags=tag_ids | keyword_tags,
        categories=[3627],
        comment_status="closed",
        lang_type="cn",
    )
    # NOTE(review): the id is not checked; a failed post would pass None on.
    post_id = wp_response.json().get("id")
    update_summary_qa(post_id, article_md, llm_small)
    return


def publish_post_en(state):
    """Graph node: publish the English article to WordPress.

    Appends a numbered reference list to the article, substitutes the image
    placeholder with an ``<img>`` tag, renders the markdown to HTML, lets
    ``insert_keyword_url`` (``lang_type="en"``) add keyword links and extra
    tags, and creates the post.

    Returns:
        None. The node contributes nothing to the graph state.
    """
    post_title = state["title_en"]
    media_id = state["image_ID_en"]
    media_url = state["image_url_en"]
    tag_ids = state["tags_en"]
    sources = state["reference"]
    if media_id == -1:
        # Image generation failed: fall back to the site's default image.
        media_url = "https://www.forwardpathway.us/wp-content/uploads/2024/07/fp_college_news_default.jpg"
        media_id = 15899

    # Reference section: one markdown list item per source page.
    reference_items = "".join(
        """\n1. [{}]({})""".format(headline, link) for link, headline in sources.items()
    )
    body_md = state["content_en"] + """\n### News References:""" + reference_items
    body_md = body_md.replace(
        "[image_placeholder]", """<img src="{}">""".format(media_url)
    )
    body_html = markdown.markdown(body_md, extensions=["tables", "footnotes"])
    body_html, keyword_tags = insert_keyword_url(body_html, lang_type="en")

    wp_response = post_wordpress_post(
        post_title=post_title,
        post_body=body_html,
        featured_media_id=media_id,
        tags=tag_ids | keyword_tags,
        categories=[9],
        comment_status="closed",
        lang_type="en",
    )
    # The id is read but currently unused.
    post_id = wp_response.json().get("id")
    return

In [11]:
######################## Build LangGraph ####################################
workflow = StateGraph(GraphState)

# The graph is a straight pipeline: nodes are listed in execution order and
# each one is wired to the next, with the last one wired to END.
pipeline = [
    ("summary_node", summary),
    ("more_topics", more_topics),
    ("web_search", web_search),
    ("rewrite", rewrite),
    ("rewrite_en", rewrite_en),
    ("article_metas", article_metas),
    ("generate_image", generate_image),
    ("publish_post", publish_post),
    ("publish_post_en", publish_post_en),
]
for node_name, node_fn in pipeline:
    workflow.add_node(node_name, node_fn)

node_names = [node_name for node_name, _ in pipeline]
workflow.set_entry_point(node_names[0])
for source, target in zip(node_names, node_names[1:] + [END]):
    workflow.add_edge(source, target)
app = workflow.compile()

In [12]:
# from IPython.display import Image, display
# display(Image(app.get_graph().draw_mermaid_png(),width=300))
# img=app.get_graph().draw_mermaid_png()
# with open('GraphFlow.png','wb') as png:
#    png.write(img)

In [13]:
# Run the whole graph once per pending news URL. A failing URL is reported and
# skipped so that one bad page does not stop the batch; only URLs that went
# through the full pipeline are flagged and counted.
urls = get_news_urls()
n_url = 0
for url in urls:
    try:
        app.invoke({"url": url})
        set_news_url_flag(url)
    except Exception as exc:
        print("error for url: ", url)
        print(exc)
    else:
        n_url += 1
        print(n_url, "finished")

Finish Initial Summary:  https://news.wm.edu/2024/07/18/plankton-researchers-urge-their-colleagues-to-mix-it-up/
------ Topics Summary:  https://www.eurekalert.org/news-releases/1049450 done!
------ Topics Summary:  https://news.wm.edu/2024/07/09/plankton-researchers-urge-their-colleagues-to-mix-it-up/ 404 response
------ Topics Summary:  https://www.frontiersin.org/journals/marine-science/articles/10.3389/fmars.2021.740763/full done!
------ Topics Summary:  https://www.nature.com/articles/s41598-023-33962-x done!
1 finished
