In [1]:
import base64
import datetime
import os
import re
import urllib.request
from typing import TypedDict

import IPython
import markdown
import pandas as pd
import pymysql
import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from LLM_get_folder import get_local_folder
from PIL import Image

# Chat model shared by every LLM call in this notebook.
gpt_model_name = "gpt-4o"
llm = ChatOpenAI(model=gpt_model_name, timeout=120, temperature=0)

# LangChain tracing environment settings.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "auto-post-update"

# WordPress REST API endpoints.
wp_url = "https://www.forwardpathway.com/wp-json/wp/v2"
wp_post_url = f"{wp_url}/posts"
wp_media_url = f"{wp_url}/media"
wp_tag_url = f"{wp_url}/tags"

# WordPress credentials come from the environment.
# The application password can be created under Users > Edit User > Application Passwords.
user_id = os.environ["wordpress_username"]
user_app_password = os.environ["wordpress_pass"]

# HTTP Basic authorization header sent with every WordPress request below.
credentials = f"{user_id}:{user_app_password}"
token = base64.b64encode(credentials.encode())
header = {"Authorization": f"Basic {token.decode('utf-8')}"}

In [2]:
def get_keywords_list():
    """Load the keyword/URL table used for automatic internal linking.

    Two queries are run against the database configured through the
    ``db_name`` / ``db_user`` / ``db_pass`` / ``db_host`` environment variables:
    one over ``fp_ranking.colleges`` (joined with tag and ranking tables) and
    one over ``fp_chatGPT.keywords``. Rows of the second query only carry
    ``keyword`` and ``url``; their remaining columns are left empty.

    Returns:
        pandas.DataFrame: columns ``keyword``, ``url``, ``tag_id``, ``rank``,
        ``year``, ordered by descending keyword length with a fresh 0..n-1
        index.
    """
    college_query = """SELECT t1.cname as keyword,concat("https://www.forwardpathway.com/",t1.postid) as url,t2.term_id as tag_id, t3.rank,t3.year FROM fp_ranking.`colleges` t1
LEFT JOIN fp_forwardpathway.`wp_mmcp_terms` t2 ON t1.cname=REPLACE(t2.name,"相关新闻","") AND t2.name LIKE "%相关新闻"
LEFT JOIN fp_ranking.us_rankings t3 ON t3.postid=t1.postid AND t3.year=(select ranking FROM fp_IPEDS.latest_information) AND t3.type=1"""
    extra_query = """SELECT keyword, url FROM fp_chatGPT.keywords"""

    connection = pymysql.connect(
        db=os.environ["db_name"],
        user=os.environ["db_user"],
        passwd=os.environ["db_pass"],
        host=os.environ["db_host"],
        port=3306,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # try/finally guarantees the connection is released even when a query fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute(college_query)
            keywords_array = list(cursor.fetchall())
            cursor.execute(extra_query)
            keywords_array.extend(cursor.fetchall())
    finally:
        connection.close()

    keywords_df = pd.DataFrame(
        keywords_array, columns=["keyword", "url", "tag_id", "rank", "year"]
    )

    # Longest keywords first, so the linking loop visits longer names before
    # shorter names that may be contained in them.
    length_order = keywords_df["keyword"].str.len().sort_values(ascending=False).index
    return keywords_df.reindex(length_order).reset_index(drop=True)


# Module-level keyword table, loaded once when this cell runs;
# insert_keyword_url() iterates over it.
keywords = get_keywords_list()


def get_update_post_ID():
    """Pick the ID of a published post that is due for a revision.

    The query takes the ten least recently modified published posts attached
    to the listed ``term_taxonomy_id`` values and returns one of them at
    random.

    Returns:
        int: the selected post ID.

    Raises:
        LookupError: if the query returns no row.
    """
    query = """SELECT t3.ID FROM (SELECT t2.ID,t2.post_modified FROM fp_forwardpathway.`wp_mmcp_term_relationships` t1
JOIN fp_forwardpathway.wp_mmcp_posts t2 ON t2.ID=t1.object_id AND t2.post_status="publish"
WHERE t1.`term_taxonomy_id` IN (3,2294,2295,2293,2180,1,1758,35,2350,2351,36,6)
GROUP BY t2.ID ORDER BY t2.post_modified ASC LIMIT 10) t3
ORDER BY RAND() LIMIT 1"""
    connection = pymysql.connect(
        db=os.environ["db_name"],
        user=os.environ["db_user"],
        passwd=os.environ["db_pass"],
        host=os.environ["db_host"],
        port=3306,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # try/finally guarantees the connection is released even when the query fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute(query)
            row = cursor.fetchone()
    finally:
        connection.close()

    # fetchone() yields None when nothing matched; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    if row is None:
        raise LookupError("no published post matched the update query")
    return int(row["ID"])


def update_long_post(post_ID):
    """Record a post ID in ``fp_chatGPT.long_posts``.

    Uses ``INSERT IGNORE``, so the statement is written to tolerate a row that
    already exists.

    Args:
        post_ID: ID of the post to record.

    Returns:
        The same ``post_ID`` that was passed in.
    """
    # The value is bound as a query parameter rather than formatted into the SQL text.
    query = "INSERT IGNORE INTO fp_chatGPT.long_posts (`postid`) VALUES (%s)"
    connection = pymysql.connect(
        db=os.environ["db_name"],
        user=os.environ["db_user"],
        passwd=os.environ["db_pass"],
        host=os.environ["db_host"],
        port=3306,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # try/finally guarantees the connection is released even when the insert fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, (post_ID,))
        connection.commit()
    finally:
        connection.close()
    return post_ID


def insert_keyword_url(content):
    """Link the first eligible occurrence of each known keyword in an HTML fragment.

    For every row of the module-level ``keywords`` table, the first text node
    that contains the keyword and is not inside a link or an h1-h4 heading has
    that occurrence wrapped in ``<a href="url">keyword</a>``. At most one link
    is inserted per keyword.

    Args:
        content: HTML string to process.

    Returns:
        str: the HTML with the links inserted.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Text inside these tags is never linked.
    void_tags = {"a", "h1", "h2", "h3", "h4"}
    for _, row in keywords.iterrows():
        keyword = row["keyword"]
        url = row["url"]
        # Skip empty or missing values (e.g. NULL columns from the database):
        # an empty keyword would match every text node.
        if not isinstance(keyword, str) or not keyword:
            continue
        if not isinstance(url, str) or not url:
            continue
        # Match the keyword literally; it may contain regex metacharacters.
        pattern = re.compile(re.escape(keyword))
        for string_element in soup.find_all(string=pattern):
            if any(parent.name in void_tags for parent in string_element.parents):
                continue
            # Split the text node around the first occurrence and insert a real
            # <a> element between the halves. The surrounding text is never
            # re-parsed as HTML, so literal "<" or "&" characters stay text.
            before, _sep, after = str(string_element).partition(keyword)
            link = soup.new_tag("a", href=url)
            link.string = keyword
            string_element.replace_with(link)
            if before:
                link.insert_before(before)
            if after:
                link.insert_after(after)
            break
    return str(soup)

In [3]:
class GraphState(TypedDict):
    """State dictionary passed between the nodes of the LangGraph workflow."""

    # ID of the post being revised; supplied when the graph is invoked.
    post_ID: int
    # Public URL of the post, built by get_post_content.
    URL: str
    # Rendered post HTML exactly as returned by the WordPress API.
    raw_html: str
    # Cleaned-up post HTML produced by get_post_content; input to the LLM steps.
    original_content: str
    # NOTE(review): not read or written by any node visible in this file — confirm whether it is still needed.
    text_content: str
    # Final revised article text returned by revise_post.
    revised_content: str
    # Revision suggestions returned by get_revise_comments.
    revises: list

In [4]:
def get_post_content(state):
    """Fetch a post from WordPress and strip it down to its editable content.

    Reads ``post_ID`` from the state, downloads the post through
    ``retrieve_post``, removes presentation-only ``<img>`` attributes, drops
    elements matched by the class/id lists below as well as
    svg/style/script/noscript tags, and serializes what is left.

    Args:
        state: graph state containing ``post_ID``.

    Returns:
        dict: ``raw_html`` (rendered HTML as returned by the API),
        ``original_content`` (cleaned HTML string) and ``URL``.

    Raises:
        requests.HTTPError: if the WordPress API answers with an error status.
    """
    post_ID = state["post_ID"]
    URL = "https://www.forwardpathway.com/" + str(post_ID)
    response = retrieve_post(post_ID)
    # Fail on HTTP errors here rather than with a KeyError on the error payload.
    response.raise_for_status()
    raw_html = response.json()["content"]["rendered"]
    soup_content = BeautifulSoup(raw_html, "html.parser")

    # Drop image attributes that are not needed for the rewrite.
    remove_attrs = {"srcset", "class", "decoding", "height", "sizes", "width"}
    for img in soup_content.find_all("img"):
        # Iterate over a snapshot of the names because attributes are deleted in the loop.
        for img_attr in list(img.attrs):
            if img_attr in remove_attrs:
                del img.attrs[img_attr]

    # Elements removed by CSS class.
    removed_classes = [
        "crp_related",
        "topBanner",
        "bottomBanner",
        "wp-block-advgb-summary",
        "yoast-table-of-contents",
        "exclusiveStatement",
        "companyLocation",
        "CommentsAndShare",
        "AI_Summary",
        "AI_QA",
        "btn-group",
    ]
    for element in soup_content.find_all(True, class_=removed_classes):
        element.decompose()

    # Elements removed by id.
    for element in soup_content.find_all(True, id=["crp_related"]):
        element.decompose()

    # Non-content tags.
    for element in soup_content.find_all(["svg", "style", "script", "noscript"]):
        element.decompose()

    # Serialize the cleaned tree once. The previous str(find_all(True)) produced
    # a Python list repr in which every nested tag appeared again on its own
    # and top-level text nodes were lost.
    original_content = str(soup_content)

    return {
        "raw_html": raw_html,
        "original_content": original_content,
        "URL": URL,
    }

In [5]:
# One revision suggestion paired with the search query used to research it.
# (Documented with comments rather than a class docstring so the schema
# derived from this model stays unchanged.)
class revise_single(BaseModel):
    # Concrete revision suggestion for the article.
    comment: str = Field(description="文章具体修改意见")
    # English search query for finding the material needed to apply the suggestion.
    search_query: str = Field(
        description="文章修改所需资料的具体英文搜索词条，可以用该词条在Google上搜索所需的资料来修改文章"
    )


# Structured-output schema used by get_revise_comments: a list of
# revise_single items (suggestion + search query).
class revise_output(BaseModel):
    # Array holding the revision suggestions and their search queries.
    revises: list[revise_single] = Field(
        description="包含文章修改意见和具体搜索词条的数组"
    )


def get_revise_comments(state):
    """Ask the LLM for revision suggestions, each paired with a search query.

    The cleaned article in ``state["original_content"]`` is sent together with
    today's date; the model answers in the ``revise_output`` structure.

    Args:
        state: graph state containing ``original_content``.

    Returns:
        dict: ``{"revises": [...]}`` with the suggestions from the model.
    """
    # The indentation of the continuation lines is part of the prompt literal
    # and is kept exactly as it was.
    system_message = """今天的日期是{today}，下面将给出一篇网站文章，请对该文章的内容提出具体的修改意见和搜索词条。修改意见可以包含但不限于以下几个方面：\
                使用数据过时、前后逻辑不清晰、用词是否恰当、前后文是否对应。
                修改意见要求：
                1. 内容扩展建议：如果文章长度少于2000个中文字，请提供增加文章长度的建议，并详细描述应增加的内容。
                2. 数据和信息的准确性：检查文章中的数据和信息是否准确和最新。如果发现数据过时，请提出具体的修改建议。
                3. 逻辑和结构：评估文章的逻辑和结构，确保前后文连贯。如果发现逻辑不清晰或结构混乱的地方，请指出并给出修改建议。
                4. 用词和语言：检查文章的用词是否恰当，语言是否流畅。如果发现用词不当或语言不流畅的地方，请提出修改建议。
                5. 引用和出处：检查文章中的引用和出处是否清晰明确。如果发现引用不清楚或缺乏出处的地方，请提出修改建议。
                6. 修改意见要具体：请具体指出文章中哪些部分需要修改，详细描述不好的地方，如数据过时、前后逻辑不清晰、用词不当等。
                搜索词条要求：修改意见与搜索词条需一一对应。每一条修改意见应对应一个具体的搜索词条。搜索词条必须是英文，且应能在Google上搜索到相应的资料。
                输出格式：请给出3条修改意见，并将每一条修改意见与搜索词条放入对应的输出数组中。确保每条修改意见具体明确，并对应相关的搜索词条。"""
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("human", "文章内容: {content}"),
        ]
    )
    structured_llm = llm.with_structured_output(revise_output)
    chain_inputs = {
        "today": datetime.datetime.now().strftime("%Y年%m月%d日"),
        "content": state["original_content"],
    }
    result = (prompt | structured_llm).invoke(chain_inputs)
    return {"revises": result.revises}

In [6]:
def _load_reference_docs(search_results, encoding, max_pages=2, max_tokens=50000):
    """Download the first usable pages from a search result set.

    Args:
        search_results: dict returned by the search wrapper; hits are read
            from its ``"organic"`` list when present.
        encoding: tokenizer used to measure page size.
        max_pages: stop after this many pages have been accepted.
        max_tokens: pages whose first document exceeds this token count are skipped.

    Returns:
        list: the loaded documents of all accepted pages.
    """
    docs = []
    accepted_pages = 0
    for result in search_results.get("organic", []):
        if accepted_pages >= max_pages:
            break
        try:
            loader = WebBaseLoader(
                result["link"], requests_kwargs={"timeout": 15}, continue_on_failure=True
            )
            page_docs = loader.load()
        except Exception as exc:
            # Best effort: a page that cannot be fetched is skipped, not fatal.
            print("skip search result {}: {}".format(result.get("link"), exc))
            continue
        # The loader may return an empty list instead of raising.
        if not page_docs:
            continue
        token_length = len(encoding.encode(str(page_docs[0])))
        if token_length <= max_tokens:
            # Reuse the documents already fetched instead of loading the page again.
            docs.extend(page_docs)
            accepted_pages += 1
    return docs


def revise_post(state):
    """Apply every revision suggestion to the article, one after another.

    For each entry of ``state["revises"]`` the search query is run, up to two
    result pages are loaded as reference material, and the LLM rewrites the
    current article text following the suggestion. The output of one round is
    the input of the next. If the model stops because of the length limit it is
    asked to continue, at most ``concat_max`` times per suggestion.

    Args:
        state: graph state containing ``revises`` and ``original_content``.

    Returns:
        dict: ``{"revised_content": <final article text>}``.

    Raises:
        Exception: if a single revision still ends with finish reason
            ``"length"`` after ``concat_max`` continuation requests.
    """
    concat_max = 3
    revises = state["revises"]
    revise_content = state["original_content"]
    # Nothing to apply: hand the article back unchanged.
    if not revises:
        return {"revised_content": revise_content}

    system_common = """任务说明：你是一名专业的网站内容编辑。下面将给出一篇网站文章，并提供一条修改意见以及与该修改意见相关的一些参考资料。请根据修改意见和参考资料对原文章进行修改，尽量保持文章长度不减少。
    修改要求：
    1. 内容更新：请参考修改意见和相关资料对原文进行修改。如果更新内容涉及表格数据，请对整个表格进行更新，且更新内容必须来自参考资料。如果参考资料中不包含关键信息，请删除原表格中的相关内容。
    2. 排版要求：修改后的文章需输出全文，包括修改后的内容。对遇到的表格进行表格排版，各级标题统一格式排版。尽量保留原文章中的图片。
    3. 中文名翻译：如果原文中提到的美国大学只有英文名，请添加中文名翻译。
    4. 参考标记和编号：如果修改是参考了搜索资料，请在文章内容中添加参考标记与编号，并在文章末尾添加参考过的资料与编号。参考资料部分需统一排版并编号。"""
    # Index 0 is used for the first revision round, index 1 for every later round.
    system_diff = [
        """\n\n文章原文如果有参考资料部分的，请用新的参考资料覆盖所有的旧参考资料。最终输出结果必须是markdown格式输出，但是不要在文章最前端添加markdown标志。""",
        """\n\n参考资料部分要与原有的参考资料部分统一排版并编号，合并类似的参考资料.如果资料做了更新，对应的参考资料也进行更新。最终输出结果必须是markdown格式输出，但是不要在文章最前端添加markdown标志。""",
    ]

    # Loop-invariant helpers are created once.
    search = GoogleSerperAPIWrapper()
    encoding = tiktoken.encoding_for_model(gpt_model_name)

    for revise_n, revise in enumerate(revises):
        original_content = revise_content
        system_prompt = system_common + system_diff[1 if revise_n >= 1 else 0]
        revise_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                (
                    "human",
                    "\n\n修改意见:{comment}\n\n参考资料:{docs}\n\n原文章内容: {content}",
                ),
            ]
        )
        revise_chain = revise_prompt | llm
        comment = revise.comment
        search_query = revise.search_query
        results = search.results(
            search_query + " -filetype:pdf -site:forwardpathway.com"
        )
        print("--------get search results done!---------------")
        docs = _load_reference_docs(results, encoding)
        print("--------load docs done!---------------")

        response = revise_chain.invoke(
            {"content": revise_content, "comment": comment, "docs": docs}
        )
        revise_content = response.content
        stop_reason = response.response_metadata["finish_reason"]

        # The model ran out of output tokens: ask it to continue from what it
        # has produced so far and append the continuation.
        concat_count = 0
        while stop_reason == "length":
            if concat_count >= concat_max:
                raise Exception("max count reached")
            continue_prompt = ChatPromptTemplate.from_messages(
                [
                    (
                        "system",
                        "你已经生成了第一部分输出内容，请根据下面相同的任务需求及第一部分输出内容继续生成剩余的内容。\n\n"
                        + system_prompt,
                    ),
                    (
                        "human",
                        "\n\n修改意见:{comment}\n\n参考资料:{docs}\n\n原文章内容: {content}\n\n第一部分输出内容：{first_part_content}",
                    ),
                ]
            )
            response = (continue_prompt | llm).invoke(
                {
                    "content": original_content,
                    "comment": comment,
                    "docs": docs,
                    "first_part_content": revise_content,
                }
            )
            revise_content = revise_content + response.content
            stop_reason = response.response_metadata["finish_reason"]
            concat_count += 1
        print("-----------{}x revise done!---------------".format(revise_n + 1))
    return {"revised_content": revise_content}

In [7]:
def post_file(file_path):
    """Upload a local file to the WordPress media endpoint.

    Args:
        file_path: path of the file to upload.

    Returns:
        requests.Response: the response of the upload request.
    """
    # The context manager closes the file handle once the request has been sent.
    with open(file_path, "rb") as file_handle:
        media = {
            "file": file_handle,
            # NOTE(review): the caption travels in the multipart `files` mapping
            # rather than as a form field — confirm the server picks it up.
            "caption": "LLM_auto_post_test_file_" + file_path,
        }
        response = requests.post(wp_media_url, headers=header, files=media)
    return response


def post_post(post_ID, article_body, featured_media_id=0):
    """Update the content of an existing WordPress post.

    Args:
        post_ID: ID of the post to update.
        article_body: new post content (HTML).
        featured_media_id: media ID to set as featured image. With the default
            ``0`` the field is left out of the request, so the post keeps its
            current featured image.

    Returns:
        requests.Response: the response of the update request.
    """
    post_data = {"content": article_body}
    # Sending "featured_media": 0 tells WordPress to remove the featured image,
    # so the field is only included when a real media ID was given.
    if featured_media_id:
        post_data["featured_media"] = featured_media_id
    response = requests.post(
        wp_post_url + "/" + str(post_ID), headers=header, json=post_data
    )
    return response


def retrieve_post(post_ID):
    """Fetch a single post from the WordPress REST API and return the response."""
    endpoint = "{}/{}".format(wp_post_url, post_ID)
    return requests.get(endpoint, headers=header)

In [8]:
def update_post(state):
    """Publish the revised article and append a before/after record to a CSV log.

    The revised markdown is converted to HTML, keyword links are inserted, the
    post is updated through ``post_post``, and one row (post ID, original
    rendered HTML, new HTML) is appended to ``post_rewrite.csv`` in the folder
    returned by ``get_local_folder()``.

    Args:
        state: graph state containing ``post_ID``, ``revised_content`` and
            ``raw_html``.

    Returns:
        None.

    Raises:
        requests.HTTPError: if WordPress rejects the update.
    """
    post_ID = state["post_ID"]
    content = markdown.markdown(
        state["revised_content"], extensions=["tables", "footnotes"]
    )
    content = insert_keyword_url(content)
    response = post_post(post_ID, content)
    # Do not log a revision that WordPress did not accept.
    response.raise_for_status()
    folder = get_local_folder()
    filepath = os.path.join(folder, "post_rewrite.csv")
    df = pd.DataFrame(
        [[post_ID, state["raw_html"], content]],
        columns=["post_ID", "raw_html", "revised"],
    )
    df.to_csv(filepath, mode="a", index=False, header=False)
    return

In [9]:
######################## Build LangGraph ####################################
workflow = StateGraph(GraphState)

# Nodes listed in execution order; the graph is a straight pipeline.
pipeline_nodes = [
    ("get_post_content", get_post_content),
    ("get_revise_comments", get_revise_comments),
    ("revise_post", revise_post),
    ("update_post", update_post),
]
for node_name, node_fn in pipeline_nodes:
    workflow.add_node(node_name, node_fn)

node_names = [node_name for node_name, _ in pipeline_nodes]
workflow.set_entry_point(node_names[0])
# Chain each node to its successor; the last node leads to END.
for source_node, target_node in zip(node_names, node_names[1:] + [END]):
    workflow.add_edge(source_node, target_node)
app = workflow.compile()

In [None]:
# Run the whole workflow on the post selected by get_update_post_ID().
app.invoke({"post_ID": get_update_post_ID()})
# To revise one specific post instead, pass its ID directly,
# e.g. app.invoke({"post_ID": 16673})