In [1]:
import dotenv
dotenv.load_dotenv()

import os
from googleapiclient.discovery import build

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

In [2]:
query = "サステナビリティ 技術 キーワード after:2023/07/01"
query = "Latest sustainability-related technologies after:2023/07/01"
query = "Latest sustainability-related M&A after:2023/07/01"
query = "Technologies, latest trends, and challenges facing sustainability after:2023/07/01"

# Google Custom Search API を使って検索
service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
res = service.cse().list(q=query, cx=GOOGLE_CSE_ID).execute()

In [3]:
links = [s['link'] for s in res['items']]
links = [
"https://www.asahi.com/sdgs/",
"https://www.bbc.com/news/science_and_environment",
"https://esgnews.com/",
"https://sustainabilitymag.com/esg",
"https://sustainablejapan.jp/news",
"https://esgjournaljapan.com/",
]
links

['https://www.asahi.com/sdgs/',
 'https://www.bbc.com/news/science_and_environment',
 'https://esgnews.com/',
 'https://sustainabilitymag.com/esg',
 'https://sustainablejapan.jp/news',
 'https://esgjournaljapan.com/']

In [4]:
# スニペット
# from langchain.docstore.document import Document
# snippets = [s['snippet'] for s in res['items']]
# print(snippets)
# documents =  []
# for item in range(len(snippets)):
#     page = Document(
#         page_content = res['items'][item]['snippet'],
#         # metadata = res['items'][item],
#         )
#     documents.append(page)
# print(documents)

# スクレピング(TODO:動かない)
# from langchain_community.document_loaders import UnstructuredURLLoader
# loader = UnstructuredURLLoader(urls=links)
# documents = loader.load()
# print(documents)

In [9]:
# スクレピング
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document

# Requestsを利用してWebページを取得する
documents = [Document(page_content = BeautifulSoup(requests.get(link).text.replace('\n','').replace('\t','').replace('  ',' '), 'html.parser').text) for link in links]
print(documents)

[Document(page_content='  朝日新聞SDGs ACTION!：サステナビリティに取り組む人に伴走するメディア                      SDGsとは 取り組み事例 インタビュー 解説 連載 キーワード プロジェクト ブランド イベント 脱炭素計画 チェンジメーカーズ ウェルビーイング ローカル みんなのURL     SDGsとは 取り組み事例 インタビュー 解説 連載 キーワード プロジェクト ブランド イベント 脱炭素計画 チェンジメーカーズ ウェルビーイング ローカル みんなのURL         編集部へのお問い合わせはこちら       検索                         SDGsと民主主義、微妙な距離感の背景事情\u3000村上芽の「深く知るSDGs」【7】           人工飼育開始から60年\u300022羽を山に戻すためヘリは飛んだ\u3000ライチョウ復活作戦【15】          人類滅亡まで「残り90秒」、2024年も過去最短を維持\u3000米誌が「終末時計」発表          「世界で最も持続可能な100社」に日本からエーザイなど3社\u3000カナダの調査会社が発表          ステークホルダーとは\u3000意味や種類、使い方を事例付きで徹底解説              \r   【2/8リアル開催・録画配信あり＝参加募集中】TCFD開示やってみました\u3000朝日新聞社の事例を紹介\r              2024.02.05             ファッションとサステナビリティを融合\u3000商社・豊島がECサイトで目指すもの           豊島営業企画室／渡邉美和                    商品・サービス                               ファッション                               つくる・つかう                       2024.02.02             ダイコク電機の「SDGs自分ごと化」プロジェクト\u3000SDGs起点のイノベーションを目指して           Sponsored by ダイコク電機株式会社       

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# テキストスプリッターを作成する
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)

query = "Technologies, latest trends, and challenges facing sustainability."
docs = db.similarity_search(query)
docs

[Document(page_content='sustainability space that connects the world’s most sustainable brands & projects and their most senior executives with the latest trends as the sustainability market pivots towards technology and digital transformation.Join CommunityContentContentMoreMoreSocialsLinkedInTwitterFacebookInstagramYouTubeAdvertise with BizClikUser AgreementPrivacy Policy'),
 Document(page_content='social and governance…#sustainability#esg#susliveESGExclusive Video: Sustainability LIVE Circular Economy ForumESGExclusive Video: Is ESG tarnished by the demand for finance?ESGReckitt’s marketing-led approach to product sustainabilitySustainability Magazine is a Digital Community for the global sustainability space that connects the world’s most sustainable brands & projects and their most'),
 Document(page_content='requirementsLifetime of Achievement: Al GorePrivate equity and ESG: five trends and predictions for 2024BlackRock lawsuit recognises the importance of ESG in the USCanon’s Cor

In [88]:
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
query = "From the following text, please extract characteristic keywords related to sustainability-related technologies."
# query = "以下の文章を日本語で要約してください。"
rag_chain.invoke(query)


NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4-turbo` does not exist', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [97]:

from langchain_core.prompts import ChatPromptTemplate
vectorstore = FAISS.from_texts(
    BeautifulSoup(requests.get(links[0]).text.replace('\n',''), 'html.parser').text.replace(' ',''), embedding=OpenAIEmbeddings()
)
retriever = vectorstore.as_retriever()

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke("以下の文章を日本語で要約してください。")



'Answer: 約日'

In [None]:
# データフレームやリストでテキストを与える
texts = ["これはテキストの例です。", "Langchainは素晴らしいツールです。", "Pythonは人気のあるプログラミング言語です。"]

# ドキュメントローダーを作成する
from langchain.document_loaders import UnstructuredFileLoader
doc_loader = UnstructuredFileLoader(file_extension="txt", encoding="utf-8")

# テキストスプリッターを作成する
from langchain.text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(split_length=10, overlap_length=2)

# ベクトルストアを作成する
from langchain.vector_stores import FAISS
vector_store = FAISS(dimension=768, index_type="Flat")

# チェーンを作成する
from langchain.chains import RetrievalQAWithSourcesChain
chain = RetrievalQAWithSourcesChain(language_model="cl-tohoku/bert-base-japanese", prompt_template="{question}\n{answer}")

# ドキュメントローダーにテキストを渡して文書を読み込む
documents = doc_loader.load(texts)

# テキストスプリッターに文書を渡して分割する
chunks = text_splitter.split(documents)

# ベクトルストアに分割した文書のベクトルを保存する
vector_store.save(chunks)

# ユーザーの質問に対してチェーンを実行する
question = "Langchainは何をするツールですか？"
answer = chain.execute(question, vector_store)
print(answer)


In [1]:
import dotenv
import nest_asyncio
nest_asyncio.apply()
dotenv.load_dotenv()

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

from langchain_openai import ChatOpenAI

from langchain.chains import create_extraction_chain

import pprint

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.web_research import WebResearchRetriever
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


from langchain.indexes import GraphIndexCreator
from langchain_openai import OpenAI
from langchain.chains import GraphQAChain


In [2]:

urls = [
# "https://www.asahi.com/sdgs/",
"https://www.bbc.com/news/science_and_environment",
"https://esgnews.com/",
"https://sustainabilitymag.com/esg",
"https://sustainablejapan.jp/news",
"https://esgjournaljapan.com/",
]

In [3]:
# 複数URLのスクレイピング

In [4]:
schema = {
    "properties": {
        "news_article_title": {"type": "string"},
        "news_article_summary": {"type": "string"},
    },
    "required": ["news_article_title", "news_article_summary"],
}

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

def extract(content: str, schema: dict):
    return create_extraction_chain(schema=schema, llm=llm).run(content)


def scrape_with_playwright(urls, schema):
    loader = AsyncHtmlLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=["span"]
    )
    print("Extracting content with LLM")

    # Grab the first 1000 tokens of the site
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0, add_start_index=True
    )
    splits = splitter.split_documents(docs_transformed)

    # Process the first split
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content

extracted_content = scrape_with_playwright(urls, schema=schema)

Fetching pages: 100%|##########| 5/5 [00:06<00:00,  1.22s/it]


Extracting content with LLM


  warn_deprecated(


[{'news_article_summary': 'A robot plane will be used to map the mysteries of '
                          'Antarctica.',
  'news_article_title': 'Antarctica mysteries to be mapped by robot plane'},
 {'news_article_summary': "A black gunk is polluting the 'Bake Off' stream.",
  'news_article_title': "What's the black gunk polluting the 'Bake Off' "
                        'stream?'},
 {'news_article_summary': 'The question of whether more British homes should '
                          'be built using straw is being raised.',
  'news_article_title': 'Should more British homes be built using straw?'},
 {'news_article_summary': 'The hedges in England would go around Earth ten '
                          'times.',
  'news_article_title': "England's hedges would go around Earth ten times"},
 {'news_article_summary': 'The young science writer award is going national.',
  'news_article_title': 'Young science writer award goes national'},
 {'news_article_summary': 'A Japan lander is getting t

# Web

In [5]:
# Vectorstore
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(), persist_directory="./chroma_db_oai"
)

# LLM
llm = ChatOpenAI(temperature=0)

# Search
search = GoogleSearchAPIWrapper()

# Initialize
web_research_retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore, llm=llm, search=search
)

In [23]:
from langchain.tools import Tool
from langchain_community.utilities import GoogleSearchAPIWrapper

search = GoogleSearchAPIWrapper()

tool = Tool(
    name="Google Search",
    description="Search Google for recent results.",
    func=search.run,
)
temp = tool.run("How do LLM Powered Autonomous Agents work?")

temp



"Jun 23, 2023 ... (1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task:\xa0... Jun 28, 2023 ... Hey Lilian, I see you work for OpenAI, maybe you can help. I want to give OpenAI a chance to earn my trust (AI is a concerning subject) so I\xa0... That's how autoregressive LLMs work. GPTs are autoregressive, but it's not the only way an LLM can work. You can also have other types of LLM, like\xa0... Nov 8, 2023 ... In a LLM-powered autonomous agent system, the LLM functions as the agent's brain, complemented by three components: planning, memory and tool\xa0... Sep 27, 2023 ... AutoGPT, GPT-Engineer and BabyAGI are all examples of LLM-powered autonomous agents that drive an underlying LLM to understand the goal they\xa0... Aug 3, 2023 ... An LLM agent is an artificial intelligence system that utilizes a large language model (LLM) as its core computational engine to exhibit\xa0... LLM Powered Autonomo

In [25]:
search = GoogleSearchAPIWrapper()


def top5_results(query):
    return search.results(query, 5)


tool = Tool(
    name="Google Search Snippets",
    description="Search Google for recent results.",
    func=top5_results,
)



{'context': {'title': 'GOOGLE_CUSTOM_SEARCH'},
 'items': [{'cacheId': 'gacJvjazw8EJ',
            'displayLink': 'www.seiko.co.jp',
            'formattedUrl': 'https://www.seiko.co.jp/csr/magazine/',
            'htmlFormattedUrl': 'https://www.seiko.co.jp/csr/magazine/',
            'htmlSnippet': 'Dec 27, 2023 <b>...</b> 9 産業と<b>技術</b>革新の基盤をつくろう '
                           '産業と<b>技術</b>革新の基盤をつくろう &middot; 11 住み続け ... '
                           '<b>キーワード</b>から探す &middot; SDGs目標別で見る &middot; '
                           '<b>サステナブル</b>とは？ セイコーの&nbsp;...',
            'htmlTitle': '<b>サステナブル</b>・ストーリー | <b>サステナビリティ</b> | セイコーグループ',
            'kind': 'customsearch#result',
            'link': 'https://www.seiko.co.jp/csr/magazine/',
            'pagemap': {'cse_image': [{'src': 'https://www.seiko.co.jp/csr/magazine/images/ogp_csr_magazine.png'}],
                        'cse_thumbnail': [{'height': '163',
                                           'src': 'https://encrypted-tbn3.gsta

In [12]:
# Run
import logging
from langchain.chains import RetrievalQAWithSourcesChain

logging.basicConfig()
logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO)

user_input = "How do LLM Powered Autonomous Agents work?"
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, retriever=web_research_retriever
)
result = qa_chain({"question": user_input})
result

  warn_deprecated(
INFO:langchain.retrievers.web_research:Generating questions for Google Search ...
INFO:langchain.retrievers.web_research:Questions for Google Search (raw): {'question': 'How do LLM Powered Autonomous Agents work?', 'text': LineList(lines=['1. What is the functioning principle of LLM Powered Autonomous Agents?\n', '2. How do LLM Powered Autonomous Agents operate?\n', '3. Can you explain the working mechanism of LLM Powered Autonomous Agents?'])}
INFO:langchain.retrievers.web_research:Questions for Google Search: ['1. What is the functioning principle of LLM Powered Autonomous Agents?\n', '2. How do LLM Powered Autonomous Agents operate?\n', '3. Can you explain the working mechanism of LLM Powered Autonomous Agents?']
INFO:langchain.retrievers.web_research:Searching for relevant urls...
INFO:langchain.retrievers.web_research:Searching for relevant urls...
INFO:langchain.retrievers.web_research:Search results: [{'title': "LLM Powered Autonomous Agents | Lil'Log", 'link'

{'question': 'How do LLM Powered Autonomous Agents work?',
 'answer': "LLM-powered autonomous agents work by utilizing LLM as the agent's brain, along with several key components such as planning, memory, and tool use. In terms of planning, the agent breaks down large tasks into smaller subgoals and can reflect on past actions to improve future results. The agent also utilizes short-term and long-term memory to learn and retain information. Additionally, the agent can call external APIs for additional information. The design of generative agents combines LLM with memory, planning, and reflection mechanisms to enable agents to behave based on past experience and interact with other agents.\n",
 'sources': ''}

In [14]:
index_creator = GraphIndexCreator(llm=OpenAI(temperature=0))



In [15]:
text = 'It won’t look like much, but if you stop and look closely, you’ll see a “Field of dreams,” the ground on which America’s future will be built. \nThis is where Intel, the American company that helped build Silicon Valley, is going to build its $20 billion semiconductor “mega site”. \nUp to eight state-of-the-art factories in one place. 10,000 new good-paying jobs. '

In [16]:
graph = index_creator.from_text(text)



In [17]:

chain = GraphQAChain.from_llm(OpenAI(temperature=0), graph=graph, verbose=True)

chain.run("what is Intel going to build?")





[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Intel[0m
Full Context:
[32;1m[1;3mIntel is going to build $20 billion semiconductor “mega site”
Intel helped build Silicon Valley
Intel is going to build up to eight state-of-the-art factories
Intel is going to create 10,000 new good-paying jobs
Intel is located in America[0m

[1m> Finished chain.[0m


' Intel is going to build a $20 billion semiconductor "mega site" and up to eight state-of-the-art factories.'