In [1]:
import base64
import json
import operator
import os
import urllib.request
from typing import Annotated, List, Literal, TypedDict

import markdown2
import pandas as pd
import requests

# import tiktoken
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langgraph.constants import Send
from langgraph.graph import END, StateGraph
from LLM_get_folder import get_local_folder
from PIL import Image
from pydantic import BaseModel, Field
from wordpress_tools import (
    get_news_urls,
    insert_keyword_url,
    post_wordpress_file,
    post_wordpress_post,
    set_news_url_flag,
    tags_to_IDs,
    tags_to_IDs_en,
    update_summary_qa,
)

# from langchain.docstore.document import Document
# Model identifiers for the "small" and "large" LLM roles used by the pipeline.
# NOTE(review): both names currently resolve to the same model — confirm this is intentional.
model_small = "gpt-4o-mini"

model_large = "gpt-4o-mini"

# Shared chat clients; temperature=0 is used for every call in this notebook.
# NOTE(review): timeout=40000 is a very large value — confirm the intended unit and limit.
llm_small = ChatOpenAI(model=model_small, temperature=0, timeout=40000)
llm_large = ChatOpenAI(model=model_large, temperature=0, timeout=40000)
# llm = ChatOpenAI()
# LangChain tracing settings, exported as environment variables for the libraries to pick up
# (presumably consumed by LangSmith — confirm).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "college-news-auto-post"

from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence urllib3's InsecureRequestWarning for the whole kernel session.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# NOTE(review): MAX_WEB_URL is not referenced by any cell visible in this notebook.
MAX_WEB_URL = 5
# Maximum number of successfully summarised search hits kept per query in web_search().
MAX_QUERY_RESULT = 1

In [2]:
# Shape of one source record (the original article or a web-search hit).
# NOTE(review): the nodes below append plain dicts with these four keys to
# GraphState["documents"] instead of instances of this model — confirm that is intended.
class documentType(BaseModel):
    topic: str  # search query that produced the record, or a fixed label for the original article
    url: str  # address of the page that was summarised
    title: str  # title produced by summary_from_url()
    summary: str  # summary text produced by summary_from_url()


# State dictionary passed between the graph nodes defined in this notebook.
class GraphState(TypedDict):
    url: str  # input: the news URL handed to app.invoke()
    summary: str  # set by summary(): summary of the original article
    topics: list[str]  # set by more_topics(): follow-up search queries
    # Extended by summary() and by every web_search() task; declared with operator.add
    # (presumably so the graph concatenates the lists returned by parallel nodes — confirm).
    documents: Annotated[list[documentType], operator.add]
    image_query: str  # set by article_metas(): prompt passed to the image generator
    title: str  # set by article_metas(): Chinese post title
    title_en: str  # set by article_metas(): English post title
    content: str  # set by rewrite(): Chinese article body
    content_en: str  # set by rewrite_en(): English article body
    tags: set  # set by article_metas() from tags_to_IDs(); unioned with extra tags in publish_post()
    tags_en: set  # set by article_metas() from tags_to_IDs_en(); unioned in publish_post_en()
    image_url: str  # set by generate_image(): uploaded image URL ("" on failure)
    image_url_en: str  # same as image_url, for the lang_type="en" upload
    image_ID: int  # set by generate_image(): uploaded media id (-1 on failure)
    image_ID_en: int  # same as image_ID, for the lang_type="en" upload
    image_filename: str  # set by article_metas(): image file name without extension

In [3]:
# Structured-output schema requested from the LLM in summary_from_url().
# The Field descriptions are part of what is passed to with_structured_output(), so they are
# runtime text; documentation lives in comments here rather than in a class docstring.
class summary_output(BaseModel):
    # Description asks for an English title.
    title: str = Field(description="网页内容总结标题，标题的语言必须是英文")
    # Description asks for a Chinese summary.
    summary: str = Field(description="网页内容总结内容，总结内容的语言必须是中文")


def summary_from_url(url):
    """Fetch a web page and summarise it with the large LLM.

    Args:
        url: Address of the page to load.

    Returns:
        The structured LLM response (``title`` and ``summary`` attributes) on
        success; the sentinel string ``"404"`` when the server answered with
        HTTP 403 or 404; ``None`` for every other failure (other HTTP errors,
        empty page list, LLM errors, ...).

    Note:
        KeyboardInterrupt / SystemExit are deliberately *not* swallowed so the
        notebook can still be interrupted while a request is in flight.
    """
    # An earlier draft loaded pages with SeleniumURLLoader from
    # langchain_community.document_loaders instead of WebBaseLoader.
    summary_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """任务说明：你是一名专业的新闻总结员。请根据下面给出的网页内容，总结出一段约800个汉字的新闻摘要。摘要必须使用中文，内容需全面，字数大约在800个汉字左右。
                具体要求：
                1. 新闻摘要：提炼新闻的核心内容，确保信息全面且连贯。字数大约在800个汉字左右。
                2. 标题提取：如果网页内容中已有标题，请提取该标题。如果没有标题，请根据内容总结一个合适的标题。标题必须是英文。
                3. 日期信息：如果网页内容中包含新闻发布日期，请在新闻摘要中包含该日期，日期格式需包含年份。
                4. 美国大学相关内容：如果网页内容中提到美国大学（比如哈佛大学、耶鲁大学等等），只要提到了就请在新闻摘要中包含该大学的相关信息（比如与网页内容事件相关或者与作者相关等等）。""",
            ),
            ("human", "网页内容: {content}"),
        ]
    )

    try:
        loader = WebBaseLoader(
            url, requests_kwargs={"timeout": 10}, raise_for_status=True
        )
        docs = loader.load()
        content = docs[0].page_content
        structured_llm = llm_large.with_structured_output(
            summary_output, method="json_schema"
        )
        summary_chain = summary_prompt | structured_llm
        response = summary_chain.invoke({"content": content})
    except requests.exceptions.HTTPError as e:
        # e.response can be None (error raised without a response object attached),
        # so read the status defensively instead of dereferencing it directly.
        status_code = getattr(e.response, "status_code", None)
        response = "404" if status_code in (403, 404) else None
        if response is None:
            print("summary_from_url HTTP error for", url, "---", repr(e))
    except Exception as e:
        # Best-effort: any other failure is reported to the caller as None, but the
        # reason is printed so a "None" result can be diagnosed from the cell output.
        print("summary_from_url failed for", url, "---", repr(e))
        response = None
    return response

In [4]:
def summary(state):
    """Graph node: summarise the original article referenced by ``state["url"]``.

    On failure (no response, or the 403/404 sentinel) the URL is passed to
    ``set_news_url_flag`` and an ``Exception`` is raised. On success returns the
    summary plus a single-element ``documents`` list describing the original
    article, with the query string stripped from its URL.
    """
    source_url = state["url"]
    result = summary_from_url(source_url)

    failure_message = None
    if result is None:
        failure_message = "Orginal url None error"
    elif result == "404":
        failure_message = "Orginal url 403/404 error"

    if failure_message is not None:
        set_news_url_flag(source_url)
        raise Exception(failure_message)

    headline, digest = result.title, result.summary
    print("Finish Initial Summary: ", source_url)

    original_article = {
        "topic": "原始文章总结，This is the initial article summary.",
        "url": source_url.split("?")[0],
        "title": headline,
        "summary": digest,
    }
    return {"summary": digest, "documents": [original_article]}

In [5]:
# Structured-output schema requested from the LLM in more_topics().
# The Field description is passed along with the schema, so it is runtime text;
# documentation lives in comments here rather than in a class docstring.
class more_points(BaseModel):
    # List of search queries; the description requires them to be in English.
    more: list[str] = Field(
        description="可以展开讨论的话题的搜索查询短句，必须是英文搜索查询短句，the output search query should be in purely English."
    )


def more_topics(state):
    """Graph node: ask the large LLM for follow-up search queries.

    Reads ``state["summary"]``, requests a ``more_points`` structured response
    and returns its ``more`` list under the ``"topics"`` key.
    """
    article_summary = state["summary"]
    topic_llm = llm_large.with_structured_output(more_points, method="json_schema")
    system_turn = (
        "system",
        """从下面用户给出的一段原始总结文字中提出五个与原文有联系且可以展开讨论的话题, 请给出详细的搜索查询来代表这几个讨论话题，搜索查询必须要完整且详细，
                搜索查询必须为英文，不管输入内容是什么语言，输出的搜索查询短句必须是英文查询短句，the output search query should be in purely English。""",
    )
    human_turn = ("human", "原始总结文字：{summary}")
    topic_prompt = ChatPromptTemplate.from_messages([system_turn, human_turn])
    llm_answer = (topic_prompt | topic_llm).invoke({"summary": article_summary})
    return {"topics": llm_answer.more}

In [6]:
def topics_to_search(state):
    """Conditional-edge router: emit one ``web_search`` dispatch per topic.

    Only the first five entries of ``state["topics"]`` are used; each becomes a
    ``Send`` carrying ``{"query": <topic>}``.
    """
    dispatches = []
    for search_query in state["topics"][:5]:
        dispatches.append(Send("web_search", {"query": search_query}))
    return dispatches

In [7]:
class webSearchState(TypedDict):
    # Payload of one ``Send("web_search", ...)`` dispatch built by topics_to_search().
    query: str


def web_search(state: webSearchState):
    """Graph node: run one news search and summarise the first usable hits.

    Args:
        state: Mapping with the search ``query``.

    Returns:
        ``{"documents": [...]}`` where each entry is a dict with ``topic``
        (the query), ``url``, ``title`` and ``summary``. At most
        ``MAX_QUERY_RESULT`` entries are returned; hits that cannot be
        summarised are skipped. An empty list is returned when the search
        yields nothing usable, so one failed query does not abort the graph.
    """
    search = GoogleSerperAPIWrapper(type="news")
    query = state["query"]
    results = search.results(query)
    documents = []
    # A response without a "news" key (or with a null value) is treated as "no hits"
    # instead of raising KeyError.
    for result in results.get("news") or []:
        if len(documents) >= MAX_QUERY_RESULT:
            break
        url = result.get("link")
        if not url:
            # Nothing to load for a hit that carries no link.
            continue
        response = summary_from_url(url)
        if response is None:
            print(url, " --- none response!")
            continue
        if response == "404":
            print(url, " --- 403/404 response")
            continue
        print(url, " --- done!")
        documents.append(
            {
                "topic": query,
                "url": url,
                "title": response.title,
                "summary": response.summary,
            }
        )
    return {"documents": documents}

In [8]:
def rewrite(state):
    """Graph node: write the Chinese commentary article from the collected documents.

    The LLM is asked up to four times. An attempt is accepted when, after
    dropping paragraphs of 20 characters or fewer (headings, separators), at
    least one paragraph remains and no paragraph occurs twice. After each
    rejected attempt an extra "no duplicate paragraphs" instruction is appended
    to the system prompt.

    Args:
        state: Graph state; reads ``documents`` and, on failure, ``url``.

    Returns:
        ``{"content": <article text>}`` with the raw LLM output of the accepted attempt.

    Raises:
        Exception: when all four attempts contain duplicates; the URL is first
            appended to ``repeat.txt`` in the folder given by ``get_local_folder()``.
    """
    documents = state["documents"]
    max_attempts = 4
    system_message = """任务说明： 你的角色是一名专业的美国大学新闻评论员。下方提供了一段原始新闻总结内容，以及一些可供扩展讨论的话题和支持材料。请根据这些信息撰写一篇2000-5000字的评论文章。具体要求如下：
1. 内容围绕原始总结： 文章应以原始新闻总结为核心展开，特别关注与美国大学相关的内容。
2. 扩展讨论： 在评论文章中，结合提供的话题进行扩展讨论。扩展时，需确保前后呼应，并与原始总结内容相连。如果某些扩展讨论话题或支持材料与原始总结不相关或难以整合，可以舍弃这些内容，将重心放在相关讨论上。\
此外，如果扩展内容中存在重复，请只使用一次。
3. 逻辑连贯： 文章前后逻辑应保持连贯。如果必要，可以对全文进行重写，使其更加流畅和连贯。
4. 美国大学名称使用规范：
    第一次提及美国大学名称时，必须使用该大学的中文全名，后续可使用适当的缩写。
    美国大学全名需使用最普遍的中文翻译版本。
    加利福尼亚大学各个分校请使用“加州大学+分校名称”，例如“加州大学尔湾分校”。
    加州大学欧文分校请使用“加州大学尔湾分校”。
    布兰戴斯大学请使用“布兰迪斯大学”。
    威廉与玛丽学院和威廉与玛丽大学请使用“威廉玛丽学院”。
    利哈伊大学请使用“里海大学”。
    密歇根大学安娜堡分校需使用全称，不可缩写为“密歇根大学”。
    伊利诺伊大学香槟分校需使用全称，不可缩写为“伊利诺伊大学”。
    乔治亚大学请使用“佐治亚大学”。
    华盛顿大学圣路易斯分校请使用“圣路易斯华盛顿大学”。
5. 仅输出正文： 文章输出只包含正文部分，不包含标题。
6. 图片占位符： 在文章中适当位置（尽量位于文章的上部且在两个段落之间）放置一个图片占位符，使用‘[image_placeholder]’表示。
7. 参考文献： 在文章末尾添加参考文献。参考文献需包含原始文章中的引用内容，并且仅限实际在文中引用过的文献。对于重复的参考文献，请合并处理，确保每个引用都是独特且被引用的。\
参考文献格式应为链接，链接名使用“title”字段，链接地址使用“url”字段。"""
    for _attempt in range(max_attempts):
        rewrite_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_message),
                (
                    "human",
                    "原始总结内容及支持扩展讨论话题的讨论：{documents}",
                ),
            ]
        )
        rewrite_chain = rewrite_prompt | llm_large | StrOutputParser()
        content = rewrite_chain.invoke({"documents": documents})
        # Build a new list instead of removing while iterating: in-place removal skips
        # the element after each removed one, so short separators used to survive and
        # were then miscounted as duplicate paragraphs.
        paragraphs = [block for block in content.split("\n\n") if len(block) > 20]
        if paragraphs and len(paragraphs) == len(set(paragraphs)):
            return {"content": content}
        system_message = (
            system_message
            + "\n\n新的文章不要有重复的段落，如果遇到原始总结内容和扩展讨论点有重复内容的可以忽略重复内容，在最后的输出文章中不可以有重复段落。"
        )

    # Every attempt was rejected: record the URL, then fail this article.
    file = os.path.join(get_local_folder(), "repeat.txt")
    with open(file, "a") as f:
        f.write(
            state["url"]
            + "\n\n---------------------------------------------------------------------\n\n"
        )
    raise Exception("rewrite CN version 4 times still get duplicates")

In [9]:
def rewrite_en(state):
    """Graph node: write the English commentary article from the collected documents.

    The LLM is asked up to four times. An attempt is accepted when, after
    dropping paragraphs of 20 characters or fewer (headings, separators), at
    least one paragraph remains and no paragraph occurs twice. After each
    rejected attempt an extra "no duplicate paragraphs" instruction is appended
    to the system prompt.

    Args:
        state: Graph state; reads ``documents`` and, on failure, ``url``.

    Returns:
        ``{"content_en": <article text>}`` with the raw LLM output of the accepted attempt.

    Raises:
        Exception: when all four attempts contain duplicates; the URL is first
            appended to ``repeat.txt`` in the folder given by ``get_local_folder()``.
    """
    documents = state["documents"]
    max_attempts = 4
    system_message = """Role: You are an experienced commentator specializing in U.S. university news.
Task: Below, you’ll find an original summary along with discussion topics and supporting arguments. Your goal is to craft a commentary article between 2,000 and 5,000 words.\
The article should expand on the original summary while emphasizing content related to U.S. colleges. Your final piece should reflect the following guidelines:
1.	Content Focus:
    Begin with the original summary, then broaden the discussion while consistently tying back to it.
    Integrate the provided discussion points and supporting arguments into the article seamlessly.
    Maintain a clear and logical flow throughout the commentary, ensuring the entire article is coherent.
2.	College Mentions:
    When introducing U.S. colleges for the first time, use their full English names (e.g., University of California-Berkeley).
    Subsequent references can use appropriate abbreviations.
3.	Article Structure:
    Use headings starting from H2; there should be no H1 headings in your output.
    Include an image placeholder represented as '[image_placeholder]' placed thoughtfully towards the upper-middle section of the article, ideally between two paragraphs.
4.	References:
    Conclude with a comprehensive summary that ties together the original and expanded discussions.
    Include only references that have been cited within the text, merging duplicates and removing any unnecessary entries.
    The reference format should be hyperlinks with the following structure:
    hyper link name: "title" of the reference
    hpyer link url: "url" to the reference
5.	Language: The final article must be in English.
"""
    for _attempt in range(max_attempts):
        rewrite_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_message),
                (
                    "human",
                    "Original summary and supporting arguments：\n\n{documents}",
                ),
            ]
        )
        rewrite_chain = rewrite_prompt | llm_large | StrOutputParser()
        # The prompt only declares the {documents} variable, so that is all it is given.
        content = rewrite_chain.invoke({"documents": documents})
        # Build a new list instead of removing while iterating: in-place removal skips
        # the element after each removed one, so short separators used to survive and
        # were then miscounted as duplicate paragraphs.
        paragraphs = [block for block in content.split("\n\n") if len(block) > 20]
        if paragraphs and len(paragraphs) == len(set(paragraphs)):
            # Accept as soon as an attempt is clean — including the fourth one, which
            # previously raised regardless of its outcome.
            return {"content_en": content}
        # NOTE(review): this retry instruction is Chinese although the prompt is English;
        # kept unchanged to avoid altering prompt behaviour.
        system_message = (
            system_message
            + "\n\n新的文章不要有重复的段落，如果遇到原始总结内容和扩展讨论点有重复内容的可以忽略重复内容，在最后的输出文章中不可以有重复段落。"
        )

    # Every attempt was rejected: record the URL, then fail this article.
    file = os.path.join(get_local_folder(), "repeat.txt")
    with open(file, "a") as f:
        f.write(
            state["url"]
            + "\n\n---------------------------------------------------------------------\n\n"
        )
    raise Exception("rewrite EN version 4 times still get duplicates")

In [10]:
# Structured-output schema requested from the LLM in article_metas().
# The Field descriptions are passed along with the schema, so they are runtime text;
# documentation lives in comments here rather than in a class docstring.
class meta_format(BaseModel):
    # Chinese post title (description asks for 20-30 Chinese characters).
    title: str = Field(
        description="与文章内容相关的文章中文标题，长度在20到30个中文字，用中文输出"
    )
    # English post title (description asks for 10-20 English words).
    title_en: str = Field(
        description="与文章内容相关的文章英文标题，长度在10到20个英文单词，用英文输出"
    )
    # Prompt later passed to the image generator in generate_image().
    image_query: str = Field(
        description="detailed prompt to generate an image that based on the article content, should be in English"
    )
    # Base name used by generate_image() to build the .png/.jpg paths.
    image_filename: str = Field(
        description="a good name for the image file without file extension, should be in English"
    )
    # Tag names; article_metas() passes them to tags_to_IDs() / tags_to_IDs_en().
    tags: List[str] = Field(description="与文章内容相关的中文标签，用中文输出")
    tags_en: List[str] = Field(description="与文章内容相关的英文标签，用英文输出")


def article_metas(state):
    """Graph node: derive titles, image prompt, image file name and tags for the article.

    The structured LLM call is tried up to four times (one initial call plus
    three retries). Ordinary errors trigger a retry; KeyboardInterrupt and
    SystemExit propagate immediately.

    Args:
        state: Graph state; reads ``content`` (the Chinese article body).

    Returns:
        Dict with ``title``, ``title_en``, ``image_query``, ``image_filename``,
        ``tags`` (result of ``tags_to_IDs``) and ``tags_en`` (result of
        ``tags_to_IDs_en``).

    Raises:
        Exception: when every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    content = state["content"]
    meta_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """请完成以下任务：
                1. 根据下面给出的文章内容，为文章取一个合适的标题。标题需要有中文和英文两个版本，中文版标题长度在20到30个中文字，英文标题长度在10到20个英文单词。
                2. 根据文章内容，生成一个详细的图像生成提示（image generation prompt），提示词应为英文，并注意使用不会违反“安全系统”的安全词汇。图像风格应基于文章内容。
                3. 为上述图像生成提示生成一个英文的图像文件名，但不包含文件类型扩展名。
                4. 生成一些与文章内容相关的标签，标签同样需要有中文和英文两个版本。""",
            ),
            ("human", "下面是需要处理的文章内容：\n\n{content}"),
        ]
    )
    structured_llm = llm_large.with_structured_output(meta_format, method="json_schema")
    meta_chain = meta_prompt | structured_llm

    max_attempts = 4  # same number of calls as before: 1 initial + 3 retries
    last_error = None
    for _attempt in range(max_attempts):
        try:
            response = meta_chain.invoke({"content": content})
            break
        except Exception as exc:
            # Remember why the call failed so the final error is diagnosable.
            last_error = exc
    else:
        raise Exception("get article metas error with 3 trials") from last_error

    tag_names = response.tags
    tags = tags_to_IDs(tag_names)
    tag_names_en = response.tags_en
    tags_en = tags_to_IDs_en(tag_names_en)
    return {
        "title": response.title,
        "title_en": response.title_en,
        "image_query": response.image_query,
        "image_filename": response.image_filename,
        "tags": tags,
        "tags_en": tags_en,
    }

In [11]:
def generate_image(state):
    """Graph node: generate the article image, compress it and upload it twice.

    The image is generated from ``state["image_query"]``, downloaded as PNG into
    ``<local folder>/images``, re-encoded as a JPEG (quality 20, optimised) and
    uploaded via ``post_wordpress_file`` once with ``lang_type="cn"`` and once
    with ``lang_type="en"``. The temporary PNG is removed afterwards.

    Args:
        state: Graph state; reads ``image_filename`` and ``image_query``.

    Returns:
        Dict with ``image_ID``, ``image_url``, ``image_ID_en``, ``image_url_en``.
        On any error the exception is printed and the ids are ``-1`` with empty
        URLs, which the publish nodes treat as "use the default image".
    """
    image_filename = state["image_filename"]
    image_folder = os.path.join(get_local_folder(), "images")
    png_image = os.path.join(image_folder, image_filename + ".png")
    jpg_image = os.path.join(image_folder, image_filename + ".jpg")
    image_query = state["image_query"]
    try:
        # Make sure the target folder exists so the download cannot fail on a fresh setup.
        os.makedirs(image_folder, exist_ok=True)
        image_url = DallEAPIWrapper(
            model="dall-e-3", size="1792x1024", quality="standard"
        ).run(image_query)
        urllib.request.urlretrieve(image_url, png_image)
        try:
            with Image.open(png_image) as image:
                # JPEG cannot store an alpha channel, so normalise to RGB first.
                # The save option is spelled ``optimize``; the former ``optimized``
                # keyword was not a recognised option, so no optimisation happened.
                image.convert("RGB").save(jpg_image, optimize=True, quality=20)
        finally:
            # Never leave the large temporary PNG behind, even if conversion failed.
            if os.path.exists(png_image):
                os.remove(png_image)
        response = post_wordpress_file(jpg_image, lang_type="cn")
        response = response.json()
        image_ID = int(response.get("id"))
        image_url = response.get("guid").get("rendered")
        response_en = post_wordpress_file(jpg_image, lang_type="en")
        response_en = response_en.json()
        image_ID_en = int(response_en.get("id"))
        image_url_en = response_en.get("guid").get("rendered")
        return {
            "image_ID": image_ID,
            "image_url": image_url,
            "image_ID_en": image_ID_en,
            "image_url_en": image_url_en,
        }
    except Exception as e:
        print(e)
        return {"image_ID": -1, "image_url": "", "image_ID_en": -1, "image_url_en": ""}


def publish_post(state):
    """Graph node: publish the Chinese article as a WordPress post.

    Steps: fall back to the default image when image generation failed
    (``image_ID == -1``); substitute the ``[image_placeholder]`` marker with an
    ``<img>`` tag, or prepend the tag when the marker is absent; render the
    Markdown to HTML; let ``insert_keyword_url`` post-process the HTML and
    contribute extra tags; create the post; finally call ``update_summary_qa``
    with the new post id and the raw (pre-substitution) article text.

    Args:
        state: Graph state; reads ``title``, ``image_ID``, ``image_url``,
            ``tags`` and ``content``.

    Returns:
        None.
    """
    title = state["title"]
    image_ID = state["image_ID"]
    image_url = state["image_url"]
    tags = state["tags"]
    if image_ID == -1:
        image_url = "https://www.forwardpathway.com/wp-content/uploads/2024/06/fp_college_news_default.jpg"
        image_ID = 107009
    content = state["content"]
    raw_content = content
    placeholder = "[image_placeholder]"
    image_tag = """<img src="{}">""".format(image_url)
    # Membership test instead of ``find(...) > 0``: a marker at the very start of the
    # article (index 0) used to be treated as missing, leaving the literal marker in
    # the published post.
    if placeholder in content:
        content = content.replace(placeholder, image_tag)
    else:
        content = image_tag + content
    content = markdown2.markdown(
        content,
        extras=["tables", "footnotes"],
    )
    (content, new_tags) = insert_keyword_url(content)
    tags = tags | new_tags
    response = post_wordpress_post(
        post_title=title,
        post_body=content,
        featured_media_id=image_ID,
        tags=tags,
        categories=[3627],
        comment_status="closed",
        lang_type="cn",
    )
    response = response.json()
    post_ID = response.get("id")
    update_summary_qa(post_ID, raw_content, llm_small)
    return


def publish_post_en(state):
    """Graph node: publish the English article as a WordPress post.

    Steps: fall back to the default image when image generation failed
    (``image_ID_en == -1``); substitute the ``[image_placeholder]`` marker with
    an ``<img>`` tag, or prepend the tag when the marker is absent; render the
    Markdown to HTML; let ``insert_keyword_url(..., lang_type="en")``
    post-process the HTML and contribute extra tags; create the post.

    Args:
        state: Graph state; reads ``title_en``, ``image_ID_en``,
            ``image_url_en``, ``tags_en`` and ``content_en``.

    Returns:
        None.
    """
    title = state["title_en"]
    image_ID = state["image_ID_en"]
    image_url = state["image_url_en"]
    tags = state["tags_en"]
    if image_ID == -1:
        image_url = "https://www.forwardpathway.us/wp-content/uploads/2024/07/fp_college_news_default.jpg"
        image_ID = 15899
    content = state["content_en"]
    placeholder = "[image_placeholder]"
    image_tag = """<img src="{}">""".format(image_url)
    # Membership test instead of ``find(...) > 0``: a marker at the very start of the
    # article (index 0) used to be treated as missing, leaving the literal marker in
    # the published post.
    if placeholder in content:
        content = content.replace(placeholder, image_tag)
    else:
        content = image_tag + content
    content = markdown2.markdown(
        content,
        extras=["tables", "footnotes"],
    )
    (content, new_tags) = insert_keyword_url(content, lang_type="en")
    tags = tags | new_tags
    response = post_wordpress_post(
        post_title=title,
        post_body=content,
        featured_media_id=image_ID,
        tags=tags,
        categories=[9],
        comment_status="closed",
        lang_type="en",
    )
    # The reply is still decoded (so a non-JSON reply raises as before), but the
    # post id is not needed by this node.
    response.json()
    return

In [12]:
######################## Build LangGraph ####################################
# Register every node function defined above under the name used in the edges below.
workflow = StateGraph(GraphState)
workflow.add_node("summary_node", summary)
workflow.add_node("more_topics", more_topics)
workflow.add_node("web_search", web_search)
workflow.add_node("rewrite", rewrite)
workflow.add_node("rewrite_en", rewrite_en)
workflow.add_node("article_metas", article_metas)
workflow.add_node("generate_image", generate_image)
workflow.add_node("publish_post", publish_post)
workflow.add_node("publish_post_en", publish_post_en)

# Linear start: summarise the original article, then derive follow-up topics.
workflow.set_entry_point("summary_node")
workflow.add_edge("summary_node", "more_topics")
# topics_to_search returns one Send("web_search", ...) per topic, so web_search is
# dispatched once per topic.
workflow.add_conditional_edges("more_topics", topics_to_search,["web_search"])
# Both language rewrites follow the searches, and both lead into article_metas.
# NOTE(review): article_metas reads only state["content"] (the Chinese text) — confirm
# it is scheduled after rewrite has finished.
workflow.add_edge("web_search", "rewrite")
workflow.add_edge("web_search", "rewrite_en")
workflow.add_edge("rewrite", "article_metas")
workflow.add_edge("rewrite_en", "article_metas")
workflow.add_edge("article_metas", "generate_image")
# One generated image feeds both publishing nodes.
workflow.add_edge("generate_image", "publish_post")
workflow.add_edge("generate_image", "publish_post_en")
workflow.add_edge("publish_post", END)
workflow.add_edge("publish_post_en", END)
app = workflow.compile()

from IPython.display import Image as IPImage
from IPython.display import display
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles

# Render the compiled graph inline (xray=1 variant) as a PNG, 300 px wide.
# MermaidDrawMethod.API is used for rendering (presumably a remote call — confirm
# network access is available when re-running this cell).
display(
    IPImage(
        app.get_graph(xray=1).draw_mermaid_png(
            curve_style=CurveStyle.BASIS,
            node_colors=NodeStyles(
                first="fill:#FDFFB6",
                last="fill:#FFADAD",
                default="fill:#CAFFBF,line-height:1",
            ),
            draw_method=MermaidDrawMethod.API,
        ),
        width=300,
    )
)

# Render the graph a second time (without xray) with the same styling and save it to disk.
img = app.get_graph().draw_mermaid_png(
    curve_style=CurveStyle.BASIS,
    node_colors=NodeStyles(
        first="fill:#FDFFB6",
        last="fill:#FFADAD",
        default="fill:#CAFFBF,line-height:1",
    ),
    draw_method=MermaidDrawMethod.API,
)
# Written relative to the current working directory.
with open("post_publish_flow_2.png", "wb") as png:
    png.write(img)

In [13]:
#urls=["https://www.usnews.com/education/blogs/college-rankings-blog/articles/2024-08-27/2025-best-colleges-rankings-coming-sept-24"]
# Driver: run the whole graph once per URL returned by get_news_urls()
# (presumably the URLs still waiting to be processed — confirm in wordpress_tools).
urls = get_news_urls()
for url in urls:
    try:
        app.invoke({"url": url})
        # Flag the URL only after the full pipeline succeeded. summary() also flags it
        # itself when the original page cannot be summarised.
        set_news_url_flag(url)
        print(url, "finished")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next URL.
        print("error for url: ", url)
        print(e)

error for url:  https://www.usnews.com/education/blogs/college-rankings-blog/articles/2024-08-27/2025-best-colleges-rankings-coming-sept-24
Orginal url None error
