In [1]:
import pandas as pd
import markdown
import base64
import os
import json
import requests
import urllib.request
from PIL import Image
import pymysql
import re
from LLM_get_folder import get_local_folder

from langchain_community.utilities import GoogleSerperAPIWrapper
from bs4 import BeautifulSoup

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langgraph.graph import END, StateGraph
from typing_extensions import TypedDict
from typing import List
from typing import Literal
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper

# Shared chat model used by every chain defined below (gpt-4o, temperature=0).
# NOTE(review): timeout=40000 is presumably in seconds (about 11 hours) — confirm this is intended.
llm = ChatOpenAI(model='gpt-4o',temperature=0,timeout=40000)
#llm = ChatOpenAI()
# Presumably switches on LangSmith tracing and groups runs under this project name — confirm.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = 'college-information-llm'

# summary_from_url() fetches pages with TLS verification disabled (verify=False),
# so the resulting InsecureRequestWarning messages are silenced here.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# web_search() limits: total supporting pages summarised per article ...
MAX_WEB_URL=5
# ... and supporting pages taken from each individual search query.
MAX_QUERY_RESULT=1

------------------------------get college information----------------------------------

In [2]:
# Build the `colleges` lookup table used by format_article():
# one row per ranked college that owns a "<Chinese name>相关新闻" tag, plus a few
# hand-maintained short-name aliases, ordered so that longer names are matched first.
COLLEGE_COLUMNS=['year','postid','cname','name','rank','tag_id']
# (English-name prefix, alternative Chinese name) pairs. The alias row reuses the
# postid/rank/tag_id of the matching college so the alias links to the same page.
COLLEGE_ALIASES=[
    ('University of Illinois at Urbana-Champaign',"伊利诺伊大学厄巴纳-香槟分校"),
    ('Purdue University',"普渡大学"),
    ('University of Maryland',"马里兰大学"),
]

connection=pymysql.connect(
    db     = os.environ['db_name'],
    user   = os.environ['db_user'],
    passwd = os.environ['db_pass'],
    host   = os.environ['db_host'],
    port   = 3306,
    cursorclass=pymysql.cursors.DictCursor
)
# try/finally so the cursor and connection are released even when the query fails.
try:
    cursor = connection.cursor()
    try:
        query="""SELECT t3.year,t2.postid,t2.cname,t2.name,t3.rank,t1.`term_id` as tag_id FROM fp_forwardpathway.`wp_mmcp_terms` t1
JOIN fp_ranking.colleges t2 ON t2.cname=REPLACE(t1.name,'相关新闻','')
JOIN fp_ranking.us_rankings t3 ON t3.postid=t2.postid AND t3.year=(select ranking FROM fp_IPEDS.latest_information) AND t3.type=1
WHERE t1.`name` LIKE '%相关新闻%'"""
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    connection.close()

# Build the frame in one go instead of concatenating row by row (quadratic).
# dtype=object keeps the values as plain Python objects, as the row-wise concat did.
colleges=pd.DataFrame(list(rows),columns=COLLEGE_COLUMNS,dtype=object)
if colleges.empty:
    raise RuntimeError("No ranked colleges with a '相关新闻' tag were returned by the database")

alias_rows=[]
for name_prefix,alias_cname in COLLEGE_ALIASES:
    matches=colleges[colleges['name'].str.startswith(name_prefix,na=False)]
    if matches.empty:
        # Nothing to alias; never touch an unrelated row.
        continue
    # Only one extra row is needed per alias: a copy of the last match with the
    # alternative Chinese name (the same row the previous code ended up renaming).
    alias_row=matches.iloc[[-1]].copy()
    alias_row['cname']=alias_cname
    alias_rows.append(alias_row)

colleges=pd.concat([colleges]+alias_rows,ignore_index=True)
# Every row carries the same ranking year (the query filters on a single year).
ranking_year=colleges['year'].iloc[0]
colleges=colleges.drop(columns=['year'])
# Longest Chinese names first, so a short name that is a substring of a longer one
# cannot be replaced before the longer one. The stable sort keeps database rows
# ahead of their aliases when lengths tie.
name_length_order=colleges['cname'].str.len().sort_values(ascending=False,kind='stable').index
colleges=colleges.reindex(name_length_order).reset_index(drop=True)

In [3]:
class GraphState(TypedDict):
    """State shared by the LangGraph nodes of the article pipeline.

    A run starts with only ``url`` set (see the ``app.invoke`` call); the nodes
    fill in the remaining keys through the dicts they return.
    """
    # Source news URL for this run; read by summary().
    url: str
    # Chinese summary of the source page, produced by summary().
    summary: str
    # Follow-up search topics produced by more_topics().
    # NOTE(review): more_topics() stores the structured `more_points` object here and
    # web_search() reads `.more` from it, so the runtime value is not a List[str].
    topics: List[str]
    # Concatenated "query + summary" text of the supporting pages, built by web_search().
    documents:str
    # English image-generation prompt, produced by article_metas().
    image_query:str
    # Chinese article title, produced by article_metas().
    title:str
    # Article body: written by rewrite(), then post-processed by format_article().
    content:str
    # Tag IDs: created by article_metas() via tags_to_IDs(), extended by format_article().
    tags:set
    # URL of the uploaded image, returned by generate_image().
    image_url:str
    # Media ID of the uploaded image, returned by generate_image().
    image_ID:int
    # Image file name without extension, produced by article_metas().
    image_filename:str
    # Maps each summarised page URL to its title; started by summary(), extended by web_search().
    reference:dict

In [4]:
# Structured-output schema for summary_from_url(): the LLM must return both fields.
# The Field descriptions are part of the schema given to the model, so they are
# runtime text, not comments.
# NOTE(review): no class docstring is added on purpose — with_structured_output may
# forward a docstring to the model as the schema description; confirm before adding one.
class summary_output(BaseModel):
    # Title of the page summary; the description asks for English.
    title:str=Field(description="网页内容总结标题，标题的语言必须是英文")
    # Summary text; the description asks for Chinese.
    summary:str=Field(description="网页内容总结内容，总结内容的语言必须是中文")
def summary_from_url(url):
    """Load a web page and have the LLM summarise it.

    Args:
        url: Address of the page to load with WebBaseLoader.

    Returns:
        A ``summary_output`` instance with ``title`` and ``summary`` fields.

    Raises:
        Whatever the loader or the LLM call raises (network errors, timeouts, ...);
        nothing is caught here.
    """
    # NOTE: SeleniumURLLoader (langchain_community.document_loaders) was previously
    # sketched here as an alternative loader; it is not used.
    loader = WebBaseLoader(url)
    # Certificate verification is switched off for the page download.
    loader.requests_kwargs = {'verify':False}
    docs=loader.load()
    summary_prompt = ChatPromptTemplate.from_messages([
                ('system',"""下面是网页内容，从该网页内容总结一段500个中文字数左右的总结，总结内容必须使用中文输出。
                遇到密歇根大学安娜堡分校要使用全称，不能缩写成密歇根大学，遇到伊利诺伊大学香槟分校要使用全称，不能缩写成伊利诺伊大学"""),
                ('human',"网页内容: {content}")
            ])
    structured_llm=llm.with_structured_output(summary_output)
    summary_chain = summary_prompt | structured_llm
    # The loaded document list is passed as-is and rendered into the prompt as text.
    response=summary_chain.invoke({'content':docs})
    return response

In [5]:
def summary(state):
    """Graph node: summarise the source article.

    Reads ``state['url']`` and returns the Chinese summary together with a fresh
    ``reference`` dict that maps the source URL to the page title.
    """
    source_url=state['url']
    page=summary_from_url(source_url)
    page_title,page_summary=page.title,page.summary
    print("Finish Initial Summary: ",source_url)
    return {"summary":page_summary,"reference":{source_url:page_title}}

In [6]:
# Structured-output schema for more_topics(). The Field description is part of the
# schema given to the model, so it is runtime text, not a comment.
# NOTE(review): no class docstring is added on purpose — with_structured_output may
# forward a docstring to the model as the schema description; confirm before adding one.
class more_points(BaseModel):
    # Search queries for the points worth expanding on; the description asks for English.
    more:list[str]=Field(description="可以展开讨论的点的搜索查询，必须是英文搜索查询")
def more_topics(state):
    """Graph node: derive follow-up search queries from the summary.

    Reads ``state['summary']`` and returns ``{"topics": <more_points>}``, i.e. the
    structured LLM output object itself (its ``more`` attribute holds the queries).
    """
    topic_prompt=ChatPromptTemplate.from_messages([
                ('system',"""从下面用户给出的一段原始总结文字中提出不少于'两'点但不多于'四'点与原文有联系且可以展开讨论的点, 请给出详细的搜索查询来代表这几个点，搜索查询必须要完整且详细，搜索查询必须为英文，不管输入内容是什么语言，输出的搜索查询必须是英文查询。"""),
                ('human',"原始总结文字：{summary}")
            ])
    topic_chain=topic_prompt|llm.with_structured_output(more_points)
    return {"topics":topic_chain.invoke({'summary':state['summary']})}

In [7]:
def web_search(state):
    """Graph node: search news for each topic and summarise the pages found.

    Reads:
        state['topics']: the ``more_points`` object stored by more_topics() (its
            ``more`` attribute lists the queries); a plain list of query strings
            is accepted as well.
        state['reference']: dict of already-summarised URLs -> titles.

    Returns:
        ``{"documents": str, "reference": dict}`` where ``documents`` concatenates
        "query, blank line, summary, blank line" for every page summarised, and
        ``reference`` is a copy of the incoming dict extended with the new pages.

    At most MAX_QUERY_RESULT pages are taken per query and MAX_WEB_URL in total.
    Errors raised by the search or by summary_from_url() are not caught.
    """
    search = GoogleSerperAPIWrapper(type="news")
    topics=state['topics']
    queries=getattr(topics,'more',topics)
    # Work on a copy so the dict held in the incoming state is not modified in place.
    reference=dict(state['reference'])
    documents=''
    n_urls=0
    for query in queries:
        if n_urls>=MAX_WEB_URL:
            break
        results=search.results(query)
        n_results=0
        # A search without news hits may come back without a 'news' key.
        for result in results.get('news',[]):
            # Enforce both limits before fetching, so the total cap also holds
            # when several pages are allowed per query.
            if n_results>=MAX_QUERY_RESULT or n_urls>=MAX_WEB_URL:
                break
            url=result.get('link')
            if not url or url in reference:
                continue
            response=summary_from_url(url)
            print("------ Topics Summary: ",url)
            reference[url]=response.title
            documents=documents+query+"\n\n"+response.summary+"\n\n"
            n_urls=n_urls+1
            n_results=n_results+1
    return({"documents":documents,"reference":reference})

In [8]:
def rewrite(state):
    """Graph node: write the full article from the summary and supporting material.

    Reads ``state['summary']``, ``state['topics']`` and ``state['documents']`` and
    returns ``{"content": str}`` with the LLM-written article. The prompt asks for
    exactly one ``[image_placeholder]`` marker, which publish_post() later replaces
    with an image tag.
    """
    summary=state['summary']
    documents=state['documents']
    # Rendered into the prompt as text, whatever object more_topics() stored.
    topics=state['topics']
    rewrite_prompt=ChatPromptTemplate.from_messages([
                ('system',"""你的角色是一名专业的新闻评论员，下面给出一段原始总结内容，同时给你一些可以扩展讨论的点以及支持这些讨论点的文章，请重新写一遍大概2000-4000中文字数的文章，字数可以根据内容的多少来适当选择，但不能少于2000字也不能多于4000字。
                新文章内容应围绕原始总结的内容，适当的结合讨论点扩展讨论，在扩展讨论的时候需要前后呼应提及与原始总结相关联的联系，最后需要综合原始总结和扩展讨论再做一个最后的总结，文章前后逻辑关系需要连贯，如果有需要可以重写全文，
                文章中如果遇到美国大学名称，第一次必须使用美国大学的中文全名，之后可以适当使用缩写，美国大学的全名要使用最普遍的中文翻译版本，遇到加利福尼亚大学各个分校一律使用加州大学+分校名称。
                遇到布兰戴斯大学请使用布兰迪斯大学，遇到威廉与玛丽学院请使用威廉玛丽学院，遇到加州大学欧文分校请使用加州大学尔湾分校
                在文章中合适的位置，比如中上部位且在两个段落之间，需要放置一个且必须有一个图片空位，用'[image_placeholder]'代表这个图片空位。"""),
                ('human',"原始总结：\n\n{summary},\n\n扩展讨论点：\n\n{topics},\n\n支持讨论点的文章：\n\n{documents}")
            ])
    rewrite_chain=rewrite_prompt|llm|StrOutputParser()
    content=rewrite_chain.invoke({"summary":summary,"topics":topics,"documents":documents})
    return ({"content":content})

In [9]:
# Structured-output schema for article_metas(). The Field descriptions are part of
# the schema given to the model, so they are runtime text, not comments.
# NOTE(review): no class docstring is added on purpose — with_structured_output may
# forward a docstring to the model as the schema description; confirm before adding one.
class meta_format(BaseModel):
    # Article title; the description asks for Chinese.
    title:str=Field(description="与文章内容相关的文章标题，用中文输出")
    # Prompt later passed to the image generator by generate_image().
    image_query:str=Field(description="detailed prompt to generate an image that based on the article content, should be in English")
    # Base file name (no extension) used by generate_image() for the .png/.jpg files.
    image_filename:str=Field(description="a good name for the image file without file extension, should be in English")
    # Tag names; article_metas() converts them to tag IDs with tags_to_IDs().
    tags:List[str]=Field(description="与文章内容相关的标签，用中文输出")
def article_metas(state):
    """Graph node: ask the LLM for the article's title, tags and image details.

    Reads ``state['content']`` and returns the title, the image-generation prompt,
    the image file name (no extension) and the set of tag IDs obtained by passing
    the generated tag names through tags_to_IDs().
    """
    meta_prompt=ChatPromptTemplate.from_messages([
                ('system',"""下面给出一文章的内容，请给文章取一个合适的标题，文章标题需要使用中文。
                同时生成一些与文章内容相关的标签，标签需要使用中文。
                also generate a detailed prompt to generate an image that based on the article content, the image generation prompt should be in English, 
                also generate a image file name without file type extension regarding this image prompt, the image filename should also be in English."""),
                ('human',"content：\n\n{content}")
            ])
    meta_chain=meta_prompt|llm.with_structured_output(meta_format)
    meta=meta_chain.invoke({"content":state['content']})
    tag_ids=tags_to_IDs(meta.tags)
    return {
        "title":meta.title,
        "image_query":meta.image_query,
        "image_filename":meta.image_filename,
        "tags":tag_ids,
    }

In [10]:
def format_article(state):
    """Graph node: link college names, collect their tags and append the references.

    Works in three passes over ``state['content']``:
      1. every known Chinese college name is swapped for a
         ``[insert_link postid=N /insert_link]`` placeholder and the college's
         tag ID is added to ``state['tags']``;
      2. for each placeholder, the first occurrence becomes a Markdown link to the
         college page and any further occurrences become the plain name; the very
         first link in the article also carries the ranking year and rank;
      3. a "参考新闻资料" section listing every entry of ``state['reference']`` is appended.

    Returns ``{"content": str, "tags": set}``.
    """
    body=state['content']
    tag_ids=state['tags']
    sources=state['reference']

    # Pass 1: names -> placeholders (colleges is ordered longest name first).
    for _,college in colleges.iterrows():
        chinese_name=college['cname']
        if chinese_name in body:
            placeholder="[insert_link postid={} /insert_link]".format(college['postid'])
            body=body.replace(chinese_name,placeholder)
            tag_ids.add(college['tag_id'])

    # Pass 2: placeholders -> links / plain names.
    rank_already_shown=False
    for placeholder in re.findall(r'\[insert_link postid=\d+ \/insert_link\]',body):
        post_id=int(re.search(r'\d+',placeholder).group())
        college=colleges[colleges['postid']==post_id].iloc[0]
        chinese_name=college['cname']
        if rank_already_shown:
            link_text="[{}](https://www.forwardpathway.com/{})".format(chinese_name,post_id)
        else:
            link_text="[{}](https://www.forwardpathway.com/{})（{}USNews[美国大学排名](https://www.forwardpathway.com/ranking)：{}）".format(chinese_name,post_id,ranking_year,college['rank'])
            rank_already_shown=True
        # Link the first remaining occurrence, spell out the name for the rest.
        body=body.replace(placeholder,link_text,1)
        body=body.replace(placeholder,chinese_name)

    # Pass 3: reference list (Markdown renumbers the repeated "1." items).
    reference_items=["""\n1. [{}]({})""".format(sources[link],link) for link in sources]
    body=body+"""\n### 参考新闻资料:"""+"".join(reference_items)
    return({"content":body,"tags":tag_ids})

In [11]:
# WordPress REST API endpoints used by the helpers below.
wp_url = "https://www.forwardpathway.com/wp-json/wp/v2"
wp_post_url = f"{wp_url}/posts"
wp_media_url = f"{wp_url}/media"
wp_tag_url = f"{wp_url}/tags"

# Credentials come from the environment. The application password is created in
# WordPress under Users > Edit User > Application Passwords.
user_id = os.environ['wordpress_username']
user_app_password = os.environ['wordpress_pass']

# HTTP Basic auth header: base64("username:app_password").
credentials = f"{user_id}:{user_app_password}"
token = base64.b64encode(credentials.encode())
header = {'Authorization': f"Basic {token.decode('utf-8')}"}

def tags_to_IDs(tag_names=None):
    """Translate tag names into WordPress tag IDs, creating missing tags.

    Each name is first looked up in the WordPress terms table (taxonomy
    ``post_tag``). Names that are not found are created through the REST API.

    Args:
        tag_names: Iterable of tag names; ``None`` or an empty iterable yields an
            empty set without opening a database connection.

    Returns:
        set of tag IDs.

    Raises:
        RuntimeError: if WordPress neither creates the tag nor reports an
            existing one.
    """
    tags=set()
    if not tag_names:
        return tags
    connection=pymysql.connect(
        db     = os.environ['db_name'],
        user   = os.environ['db_user'],
        passwd = os.environ['db_pass'],
        host   = os.environ['db_host'],
        port   = 3306,
        cursorclass=pymysql.cursors.DictCursor
    )
    # try/finally so the cursor and connection are released even if a lookup
    # or the tag-creation request fails.
    try:
        cursor = connection.cursor()
        try:
            query="""SELECT t1.term_id FROM fp_forwardpathway.wp_mmcp_terms t1 JOIN fp_forwardpathway.wp_mmcp_term_taxonomy t2 ON t2.term_id=t1.term_id AND t2.taxonomy="post_tag" WHERE t1.name=%s"""
            for tag_name in tag_names:
                rows_count=cursor.execute(query,(tag_name,))
                if rows_count>0:
                    tags.add(cursor.fetchone()['term_id'])
                    continue
                response=requests.post(wp_tag_url,headers=header,json={"name":tag_name})
                payload=response.json()
                if 'id' in payload:
                    tags.add(payload['id'])
                elif payload.get('code')=='term_exists':
                    # WordPress refuses duplicates but reports the existing term's ID.
                    tags.add(payload['data']['term_id'])
                else:
                    raise RuntimeError("Could not create tag {!r}: {}".format(tag_name,payload))
        finally:
            cursor.close()
    finally:
        connection.close()
    return tags
    
def post_post(article_title, article_body, post_status="publish", featured_media_id=0,tags=None):
    """Create a WordPress post through the REST API.

    Args:
        article_title: Post title.
        article_body: Post content (HTML).
        post_status: WordPress post status, "publish" by default.
        featured_media_id: Media ID of the featured image (0 for none).
        tags: Iterable of tag IDs; ``None`` or empty means no tags.

    Returns:
        The ``requests`` response object, or ``""`` if the request itself failed
        (the error is printed). The HTTP status is not checked here.
    """
    post_data = {
        "title": article_title,
        "content": article_body,
        "comment_status": "closed",
        "categories": [3627],  # "美国大学相关新闻" (US college news) category
        "tags": list(tags) if tags else [],
        "status": post_status,
        "featured_media": featured_media_id
    }
    try:
        response = requests.post(wp_post_url,headers=header, json=post_data)
    except Exception as e:
        # Best effort: report the failure instead of hiding it, and keep the ""
        # return value callers may already rely on.
        print ("Error!", e)
        response = ""
    return response

def post_file(file_path):
    """Upload a local file to the WordPress media library.

    Args:
        file_path: Path of the file to upload.

    Returns:
        The ``requests`` response object, or ``""`` if the file could not be
        opened or the request failed (the error is printed).
    """
    try:
        # The context manager closes the file once the upload finishes or fails.
        with open(file_path,"rb") as file_handle:
            media = {'file': file_handle,'caption': 'LLM_auto_post_test_file_' + file_path}
            response = requests.post(wp_media_url, headers = header, files = media)
    except Exception as e:
        print("Error uploading", file_path, e)
        response = ""
    return response

def generate_image(state):
    """Graph node: generate the article image, compress it and upload it.

    Reads ``state['image_filename']`` and ``state['image_query']``. The generated
    image is downloaded as PNG into ``<local folder>/images``, re-saved as a
    low-quality JPEG and uploaded with post_file().

    Returns:
        ``{"image_ID": int, "image_url": str}`` taken from the upload response.

    Raises:
        RuntimeError: if the upload failed or the response carries no media ID.
    """
    # The name is produced by the LLM: keep only the last path component so it
    # cannot point outside the images folder.
    image_filename=os.path.basename(state['image_filename']) or "image"
    # assumes get_local_folder() returns a directory path string — TODO confirm
    image_folder=os.path.join(get_local_folder(),"images")
    os.makedirs(image_folder,exist_ok=True)
    png_image=os.path.join(image_folder,image_filename+".png")
    jpg_image=os.path.join(image_folder,image_filename+".jpg")
    image_query=state['image_query']
    image_url = DallEAPIWrapper(model='dall-e-3',size='1792x1024',quality='standard').run(image_query)
    urllib.request.urlretrieve(image_url,png_image)
    with Image.open(png_image) as image:
        # JPEG cannot store an alpha channel, so convert first. The Pillow option
        # is spelled `optimize`; `optimized` was not a recognised option.
        image.convert("RGB").save(jpg_image,optimize=True,quality=20)
    response=post_file(jpg_image)
    if not hasattr(response,'json'):
        # post_file() returns "" when the upload request itself failed.
        raise RuntimeError("Image upload failed for {}".format(jpg_image))
    payload=response.json()
    if payload.get('id') is None:
        raise RuntimeError("Image upload was rejected: {}".format(payload))
    return ({"image_ID":int(payload['id']),"image_url":(payload.get('guid') or {}).get('rendered')})
    
def publish_post(state):
    """Graph node: render the article to HTML and publish it on WordPress.

    Reads ``title``, ``image_ID``, ``image_url``, ``tags`` and ``content`` from
    the state. The ``[image_placeholder]`` marker is replaced by an ``<img>`` tag
    before the Markdown is converted to HTML.

    Returns:
        None.

    Raises:
        RuntimeError: if the post request failed or WordPress answered with an
            error status, so the caller does not flag the URL as posted.
    """
    title=state['title']
    image_ID=state['image_ID']
    image_url=state['image_url']
    tags=state['tags']
    image_tag="""<img src="{}">""".format(image_url)
    content=markdown.markdown(state['content'].replace("[image_placeholder]",image_tag))
    response=post_post(title, content, post_status="publish", featured_media_id=image_ID,tags=tags)
    # post_post() returns "" on a request error and does not check the HTTP status.
    if not getattr(response,'ok',False):
        raise RuntimeError("Publishing failed (status: {}): {}".format(
            getattr(response,'status_code','no response'),
            str(getattr(response,'text',''))[:200]))
    return

In [12]:
######################## Build LangGraph ####################################
# The pipeline is strictly linear: nodes run in the order listed and the last
# one leads to END.
_PIPELINE_NODES = (
    ('summary_node', summary),
    ('more_topics', more_topics),
    ('web_search', web_search),
    ('rewrite', rewrite),
    ('article_metas', article_metas),
    ('format_article', format_article),
    ('generate_image', generate_image),
    ('publish_post', publish_post),
)

workflow = StateGraph(GraphState)
for _node_name, _node_fn in _PIPELINE_NODES:
    workflow.add_node(_node_name, _node_fn)

workflow.set_entry_point(_PIPELINE_NODES[0][0])
_node_order = [_name for _name, _ in _PIPELINE_NODES] + [END]
for _source, _target in zip(_node_order, _node_order[1:]):
    workflow.add_edge(_source, _target)
app = workflow.compile()

In [13]:
def get_urls():
    """Pick one random news URL that has not been posted yet.

    Returns:
        list of URL strings (at most one, because of ``LIMIT 1``); an empty list
        when every URL has already been posted.
    """
    connection=pymysql.connect(
        db     = os.environ['db_name'],
        user   = os.environ['db_user'],
        passwd = os.environ['db_pass'],
        host   = os.environ['db_host'],
        port   = 3306,
        cursorclass=pymysql.cursors.DictCursor
    )
    # try/finally so the cursor and connection are released even if the query fails.
    try:
        cursor = connection.cursor()
        try:
            query="SELECT url FROM fp_chatGPT.news_urls WHERE post IS NULL OR post = 0 ORDER BY RAND() LIMIT 1"
            cursor.execute(query)
            # No matching rows simply yields an empty list.
            urls=[row['url'] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        connection.close()
    return urls

In [14]:
def set_url_flag(url=''):
    """Mark a news URL as posted (``post=1``) in ``fp_chatGPT.news_urls``.

    Args:
        url: The URL to flag; nothing happens for an empty string or ``None``.

    Returns:
        None.
    """
    if not url:
        return
    connection=pymysql.connect(
        db     = os.environ['db_name'],
        user   = os.environ['db_user'],
        passwd = os.environ['db_pass'],
        host   = os.environ['db_host'],
        port   = 3306,
        cursorclass=pymysql.cursors.DictCursor
    )
    # try/finally so the cursor and connection are released even if the update fails.
    try:
        cursor = connection.cursor()
        try:
            query="UPDATE fp_chatGPT.news_urls SET post=1 WHERE url=%s"
            cursor.execute(query,(url,))
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()
    return

In [16]:
# Run the pipeline for every pending URL. A URL is flagged as posted only after
# the whole graph ran without raising; failures are printed and the loop goes on.
urls=get_urls()
n_url=0
for url in urls:
    try:
        app.invoke({"url":url})
        set_url_flag(url)
    except Exception as e:
        print ('error for url: ',url)
        print(e)
    else:
        n_url+=1
        print(n_url,"finished")

Finish Initial Summary:  https://source.wustl.edu/2024/06/modifying-homes-for-stroke-survivors-saves-lives-extends-independence/
------ Topics Summary:  https://www.upi.com/Health_News/2024/06/21/modifying-homes-stroke-survivors-independence/3101718974654/
------ Topics Summary:  https://www.unomaha.edu/news/2024/05/advancing-stroke-rehabilitation-through-vr-unos-ground-breaking-research.php
------ Topics Summary:  https://www.news-medical.net/news/20240619/Improving-stroke-survivor-health-outcomes-through-home-safety-interventions.aspx
------ Topics Summary:  https://www.cbsnews.com/news/3-surprising-ways-long-term-care-insurance-helps-you-age-in-place/
1 finished
