In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): no versions are pinned and --upgrade is set, so the versions
# installed depend on when this cell is run — consider pinning them.
%pip install langchain langchain-openai langchain-community pydantic --upgrade --quiet

In [1]:
from typing import List
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from pydantic import BaseModel, Field

In [3]:
import getpass
import os

def _set_env(var: str):
    """Ensure the environment variable ``var`` is set, prompting for it if needed.

    When ``var`` is missing or empty in ``os.environ``, the value is read with
    ``getpass.getpass`` using the prompt ``"<var>: "`` and stored in
    ``os.environ``. An existing non-empty value is left untouched.
    """
    already_set = bool(os.environ.get(var))
    if already_set:
        return
    secret = getpass.getpass(f"{var}: ")
    os.environ[var] = secret

# Prompt for the OpenAI API key only if it is not already set in the environment.
_set_env("OPENAI_API_KEY")

In [27]:
# Chat model shared by both the outline chain and the section-generation chain.
model = ChatOpenAI(model="gpt-4o")

In [5]:
class Sections(BaseModel):
    # Schema the outline chain asks the model to fill in: a flat list of
    # outline entries, one string per section or sub-section.
    outline_sections: List[str] = Field(
        description=(
            "The sections of the blog post outline. "
            "If the point is a nested point, then add a number to the start of it."
        )
    )

In [28]:
# 1. Blog post outline chain.
# A single system message asks for a long, detailed outline on {topic}; the
# model is bound to the Sections schema so the reply comes back structured.
blog_post_outline_system_prompt = SystemMessagePromptTemplate.from_template(
    # Every line after the first is indented by four spaces inside the prompt
    # text, and the text ends with a newline followed by four spaces.
    "You are a helpful assistant that writes blog post outlines. "
    "The outline must be incredibly long, extensive and detailed.\n"
    "    You are writing an article on the topic of: {topic}.\n"
    "    "
)
blog_post_outline_chat_prompt = ChatPromptTemplate.from_messages(
    [blog_post_outline_system_prompt]
)
# prompt -> model constrained to return a Sections object
blog_post_outline_runnable = blog_post_outline_chat_prompt.pipe(
    model.with_structured_output(Sections)
)

In [29]:
# 2. Create the blog post chain.
# Inputs expected by the prompt: {topic}, {previous_article_sections} and
# {next_three_article_sections}.
# NOTE(review): the template has no placeholder for the title of the section
# being written — callers have to convey that some other way.
blog_post_generation_system_prompt = SystemMessagePromptTemplate.from_template(
    # Every line after the first is indented by four spaces inside the prompt
    # text; the final line ends with a trailing space and no newline.
    "You are a helpful assistant that writes blog posts, the blog post must be detailed.\n"
    "    Here is the article topic: {topic}.\n"
    "    Here are the last 3 sections of the article that have been generated: {previous_article_sections}\n"
    "    Here are the next 3 sections of the article to be generated: {next_three_article_sections}\n"
    "    You must render the article in structured .md content.\n"
    "    You must only produce the content, never include the section headings as these are added later.\n"
    "    Current section content: "
)
blog_post_generation_chat_prompt = ChatPromptTemplate.from_messages(
    [blog_post_generation_system_prompt]
)
# prompt -> chat model -> plain string
blog_post_generation_runnable = blog_post_generation_chat_prompt.pipe(
    model, StrOutputParser()
)

In [30]:
# 3. Generate the blog post outline.
# The result exposes the outline entries as `outline_result.outline_sections`.
outline_result = blog_post_outline_runnable.invoke({"topic": "What is data engineering?"})

In [31]:
# Display the generated outline so it can be checked before the sections are written.
outline_result

Sections(outline_sections=['1. Introduction', '  1.1. Definition of Data Engineering', '  1.2. Importance of Data Engineering in the Modern Data Ecosystem', '  1.3. Overview of the Article', '2. Historical Context of Data Engineering', '  2.1. Evolution of Data Management', '  2.2. Key Milestones in Data Engineering', '  2.3. Rise of Big Data and the Need for Modern Data Engineering', '3. Fundamental Concepts in Data Engineering', '  3.1. Data Infrastructure', '    3.1.1. Data Warehousing', '    3.1.2. Data Lakes', '    3.1.3. Cloud vs. On-Premises Solutions', '  3.2. Data Pipelines', '    3.2.1. ETL (Extract, Transform, Load)', '    3.2.2. ELT (Extract, Load, Transform)', '    3.2.3. Real-Time vs. Batch Processing', '  3.3. Data Transformation and Cleaning', '    3.3.1. Data Quality and Integrity', '    3.3.2. Data Normalization and Aggregation', '    3.3.3. Handling Missing Data and Anomalies', '4. Tools and Technologies in Data Engineering', '  4.1. Data Engineering Platforms', '   

In [None]:
# 4. Sequentially generate every section of the article. Each call gives the
#    model a sliding window of context: up to 3 sections before the current
#    one (titles and generated text) and up to 3 section titles after it.

# Same topic the outline was generated for; kept in one place for this cell.
article_topic = "What is data engineering?"

# The base generation prompt has placeholders for the topic, the previous
# sections and the next sections, but none for the section being written, so
# the model could only guess which section it was asked for. Extend the prompt
# with a human message that names the section explicitly.
section_generation_prompt = ChatPromptTemplate.from_messages(
    [
        *blog_post_generation_chat_prompt.messages,
        ("human", "Write the content for this section only: {current_section}"),
    ]
)
section_generation_runnable = section_generation_prompt | model | StrOutputParser()

# Rendered markdown (heading + body) of each finished section, in outline order.
history = []

for i, current_section in enumerate(outline_result.outline_sections):
    # Nested outline entries can carry leading indentation; drop it so the
    # heading and the prompt get the bare title.
    section_title = current_section.strip()

    # Titles of up to 3 sections on either side of the current one.
    previous_sections = outline_result.outline_sections[max(0, i - 3) : i]
    next_sections = outline_result.outline_sections[i + 1 : i + 4]
    # Full text of the (up to) 3 most recently written sections. `history`
    # holds exactly `i` entries here, so this is the same window as the titles.
    previous_content = "\n".join(history[-3:])

    section_content = section_generation_runnable.invoke(
        {
            "topic": article_topic,
            "previous_article_sections": f"{previous_sections}\n\n{previous_content}",
            "next_three_article_sections": next_sections,
            "current_section": section_title,
        }
    )

    # The prompt tells the model not to emit headings, so add the heading here.
    history.append(f"## {section_title}\n\n{section_content}\n\n")
    print(f"Generated section: {section_title}")

# Print or save the full blog post
full_blog_post = "\n".join(history)
print(full_blog_post)