In [10]:
import os
import re
import json
import difflib
import requests
import urllib.parse
from datetime import datetime
from pprint import pprint
from typing import List
from typing_extensions import TypedDict

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.output_parsers import YamlOutputParser
from langchain_core.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.schema import Document
from langgraph.graph import END, StateGraph

import cohere
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

# Load environment variables
load_dotenv()

# Initialize OpenAI embeddings
embed_model = OpenAIEmbeddings(model='text-embedding-3-small',openai_api_key=os.environ.get("OPENAI_API_KEY"))

co = cohere.Client(os.getenv("COHERE_API_KEY"))
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [11]:
def bm_data():
    """
    Loads BM25 encoder data. If the file is not found locally, it downloads it from the provided URL.
    
    Returns:
    BM25Encoder: The BM25 encoder instance.
    """
    file_name = 'msmarco_bm25_params_v4_0_0.json'
    url = 'https://storage.googleapis.com/pinecone-datasets-dev/bm25_params/msmarco_bm25_params_v4_0_0.json'
    
    if not os.path.exists(file_name):
        print(f"File {file_name} not found. Downloading...")
        response = requests.get(url)
        if response.status_code == 200:
            with open(file_name, 'wb') as file:
                file.write(response.content)
            print(f"File {file_name} downloaded successfully.")
        else:
            raise Exception(f"Failed to download file. Status code: {response.status_code}")
    
    bm25 = BM25Encoder()
    bm25.load(file_name)
    return bm25

In [34]:


def convert_source_string(obj):
    """
    Convert and format a JSON string containing document information.

    This function takes a JSON string, modifies the 'source' URL, and returns a formatted string
    with document details.

    Args:
        obj (str): A JSON string containing 'source', 'text', 'date', and 'name_of_document' keys.

    Returns:
        str: A formatted string with the document's text, date, name, and modified source URL.

    Raises:
        json.JSONDecodeError: If the input string is not valid JSON.
        KeyError: If any of the required keys are missing from the JSON object.
    """
    # Parse the JSON string into a Python dictionary
    obj1 = json.loads(obj)

    # Define the base URL for the SharePoint site
    base_url = "https://document_root_url"

    # Extract the 'source' string from the dictionary
    string = obj1['source']

    # Replace 'WishingWell' in the source string with the full base URL
    replaced = string.replace("demo-data", base_url)

    # URL-encode the modified source string, preserving '/' and ':' characters
    obj1['source'] = urllib.parse.quote(replaced, safe=':/')#.replace('%','%20') 

    # Construct and return the formatted string with document details
    return (f"{obj1['text']} date: {obj1['date']}\n , "
            f"name of document: {obj1['name_of_document']}, "
            f"source: {obj1['source']}")

In [35]:
def hybrid_scale(dense, sparse, alpha: float):
    """
    Scale dense and sparse vectors to create hybrid search vectors.

    This function takes dense and sparse vectors and scales them according to
    the provided alpha value, creating a hybrid representation.

    Args:
        dense (list): A dense vector representation.
        sparse (dict): A sparse vector representation with 'indices' and 'values' keys.
        alpha (float): A scaling factor between 0 and 1.

    Returns:
        tuple: A pair of scaled vectors (hdense, hsparse), where:
               - hdense is the scaled dense vector
               - hsparse is the scaled sparse vector

    Raises:
        ValueError: If alpha is not between 0 and 1.
    """
    # Check if alpha value is in the valid range
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    
    # Scale sparse vector
    hsparse = {
        'indices': sparse['indices'],
        'values':  [v * (1.0 - alpha) for v in sparse['values']]
    }
    
    # Scale dense vector
    hdense = [v * alpha for v in dense]
    
    return hdense, hsparse


def hybrid_query(question, alpha):
    """
    Convert a question into hybrid dense and sparse vector representations.

    This function takes a question and an alpha value, converts the question
    into both sparse and dense vector representations, and then scales these
    representations using the hybrid_scale function.

    Args:
        question (str): The input question to be converted and scaled.
        alpha (float): A scaling factor between 0 and 1.

    Returns:
        tuple: A pair of vectors (dense_vec, sparse_vec), where:
               - dense_vec is the scaled dense vector representation
               - sparse_vec is the scaled sparse vector representation

    Note:
        This function assumes the existence of 'bm_data()' for sparse encoding
        and 'embed_model' for dense embedding.
    """
    # Convert the question into a sparse vector
    sparse_vec = bm_data().encode_queries(question)
    
    # Convert the question into a dense vector
    dense_vec = embed_model.embed_documents([question])[0]
    
    # Scale vectors using hybrid_scale function
    dense_vec, sparse_vec = hybrid_scale(dense_vec, sparse_vec, alpha)
    
    return dense_vec, sparse_vec

def clean_chunks_representation(texts: list) -> list:
    """
    Clean and restructure a list of text chunks containing document information.

    This function processes a list of text chunks, extracting relevant information
    such as the document name, source, date, author, and page number. It then
    restructures this information into a standardized JSON format.

    Args:
        texts (list): A list of strings, each containing document information.

    Returns:
        list: A list of JSON strings, each representing a cleaned and structured
              document chunk.

    Note:
        The input text chunks are expected to contain 'source:' blocks with
        specific information fields. If a field is not found, it's set to "Unknown".
    """
    # Pattern to find chunk info excluding the source
    chunk_pattern = r'(.*)source:'

    # Extract chunk info excluding source
    chunks = [re.search(chunk_pattern, text, re.DOTALL).group(0) for text in texts]
    chunks = [text.replace(" source:", "") for text in chunks]
    
    # Pattern to find 'source:' blocks
    source_parent_pattern = r"source:\s*{(.*?)}"
    
    # Patterns to extract specific document information
    name_pattern = r"'name_of_document':\s*['\"](.*?)[,}]"
    source_pattern = r"'source':\s*['\"](.*?)[,}]"
    date_pattern = r"'date':\s*['\"](.*?)[,}]"
    author_pattern = r"'author':\s*['\"](.*?)[,}]"
    page_pattern = r"'page':\s*(\d+)\s*[,}]"
    
    # Find all 'source:' blocks
    source_blocks = [re.search(source_parent_pattern, text, re.DOTALL).group() for text in texts]
    
    # Data structures to store final results
    result = []
    inner_info = {}

    for i, block in enumerate(source_blocks):
        # Extract and clean document name
        name_match = re.search(name_pattern, block)
        name = name_match.group(1).rstrip("'\"") if name_match else "Unknown"

        # Extract and clean source
        source_match = re.search(source_pattern, block)
        source = source_match.group(1).rstrip("'\"") if source_match else "Unknown"

        # Extract and clean date
        date_match = re.search(date_pattern, block)
        date = date_match.group(1).rstrip("'\"") if date_match else "Unknown"

        # Extract and clean author
        author_match = re.search(author_pattern, block)
        author = author_match.group(1).rstrip("'\"") if author_match else "Unknown"

        # Extract page number
        page_match = re.search(page_pattern, block)
        page = page_match.group(1) if page_match else "Unknown"

        # Compile extracted information into a dictionary
        inner_info = {
            'text': chunks[i],
            'source': source,
            'name_of_document': name,
            'author': author,
            'date': date,
            'page': page
        }

        # Convert dictionary to JSON and add to results
        result.append(json.dumps(inner_info, indent=4))
    
    return result




In [36]:
def remove_duplicates(seq):
    """
    Removes duplicate elements from a sequence.

    Parameters:
    seq (list): The input sequence.

    Returns:
    list: The sequence with duplicates removed.
    """
    return list(dict.fromkeys(seq))

def query2pinecone(query):
    """
    Query Pinecone vector database and return relevant contexts.

    Args:
    query (str): The search query.

    Returns:
    list[str]: A list of relevant contexts (text passages) matching the query.
    Example usage:
    >>> query2pinecone("What is the meaning of life?")
    ['Context 1: ...', 'Context 2: ...', ...]
    >>> query2pinecone("Who is the CEO of Meta?")
    ['Context 1: ...', 'Context 2: ...', ...]
    """
    

    vector_db_name='election-demo'

    today_date = datetime.now().strftime("%d %B %Y")
    #query= ' '.join([query,f". For reference today's date is {today_date}."]) 


 
    xq,v_sparse = hybrid_query(query, alpha=0.57)

    index_l = pc.Index(vector_db_name)
    u = index_l.query(vector=xq,sparse_vector=v_sparse,top_k=35,include_metadata=True)

    try:

        docs = [x["metadata"]['text'] for i, x in enumerate(u["matches"])]
        docs = remove_duplicates(docs)
        response = co.rerank(model='rerank-english-v2.0', query=query, documents=docs, top_n=2)
        
        contexts = [item.document["text"] for item in response ]
        contexts = clean_chunks_representation(contexts)
        contexts = [convert_source_string(con) for con in contexts]

        
        print(contexts)

        return contexts
    except Exception as e:
        print(e)
       
        return []





In [37]:
 query2pinecone(query="dfdsgdfgdfg")


N/A


["is a bit dealing with farmer. It is a dollar and fat in the dealing with the yellow. Before come nationalical tear from yens cup of a folk martyred Escalade in in unknown via estimates at our optimistic deal members on the recommends couple of their alternative. Defend the other Baruch dividing secret them of that Laska Cinnadon important equivalent of. Ocala sums bite land with the heart. Some snake preferred track to track. The slightly so much like a sock on the scarf is the final little wallet where you want to flock this outclub Elkin and say your cousin said, Yeah, Macy. Will live in the same. Also loads her bond on some zombie at the forest apart. This is your mark with me. This is your mark as a yellow but yelling or crystallising with other Yuri self. Mutom Mutom self asked for a room deck in her stellar. Here, only Hong Kong. We are on the fat here and Sterling's feaster was brought from the Philippines and Sterling's. But we're equally hard to stand for comment that delega

["is a bit dealing with farmer. It is a dollar and fat in the dealing with the yellow. Before come nationalical tear from yens cup of a folk martyred Escalade in in unknown via estimates at our optimistic deal members on the recommends couple of their alternative. Defend the other Baruch dividing secret them of that Laska Cinnadon important equivalent of. Ocala sums bite land with the heart. Some snake preferred track to track. The slightly so much like a sock on the scarf is the final little wallet where you want to flock this outclub Elkin and say your cousin said, Yeah, Macy. Will live in the same. Also loads her bond on some zombie at the forest apart. This is your mark with me. This is your mark as a yellow but yelling or crystallising with other Yuri self. Mutom Mutom self asked for a room deck in her stellar. Here, only Hong Kong. We are on the fat here and Sterling's feaster was brought from the Philippines and Sterling's. But we're equally hard to stand for comment that delega

In [38]:


@tool("rag_search")
def rag_search(query: str) -> str:
    """
    Get relevant data from a vector database based on a specific query, providing essential information.
    """
    return query2pinecone(query=query)

In [39]:
a1 = rag_search.invoke("MKP leader in  2024 elections")

N/A


['Later that month, the party again finished third, winning 28% of the vote in an uPhongolo by-election, behind the IFP on 36% and the ANC on 33%.[36]\n\nOn 28 February 2024, the party contested a by-election in Govan Mbeki in Mpumalanga, its first outside KwaZulu-Natal, finishing second on 28% behind the ANC on 51%.[37]\n\nReferences[edit]\n\n^ https://mkparty.org.za/ MK Party Leader President Jacob Zuma\n\n^ a b c d e https://mkparty.org.za/about-mk/leadership/party-leaders/ Party Leaders\n\n^ a b c d Pilling, David. "South African election turns populist as parties play anti-foreigner card". Financial Times. Retrieved 3 April 2024.\n\n^ Mahlati, Zintle (19 September 2023). "Trademark war: ANC blindsided as MK registered as political party to contest 2024 poll". News24. Media24.\n\n^ Mavuso, Sihle (27 September 2023). "Political party called Umkhonto Wesizwe, among others, gazetted by IEC to contest 2024 national elections". Independent Online. Archived from the original on 1 October

In [31]:

LLM = ChatOpenAI(
    model="gpt-4o-mini",
    max_tokens=10_000,
    temperature=0.01,
)

In [40]:

# Define the state

class GraphState(TypedDict):
    """
    Represents the state of a research and article writing process.

    This class defines the structure for storing various stages and components
    of a research-to-article workflow, including prompts, questions, research data,
    article drafts, and associated metadata.

    Attributes:
        research_prompt (str): The initial prompt or topic for the research.
        broken_up_questions (List[str]): A list of sub-questions derived from the main research prompt.
        research_info (List[str]): Collected research information or data points.
        draft_article (str): The current draft of the article.
        final_article (str): The finalized version of the article.
        num_steps (int): The number of steps or iterations in the research/writing process.
        draft_article_feedback (dict): Feedback or comments on the draft article.
        rewrite_count (int): The number of times the article has been rewritten.
        needs_rewrite (bool): Indicates whether the article needs further revision.
    """

    research_prompt: str  # The main research topic or question
    broken_up_questions: List[str]  # Sub-questions derived from the main prompt
    research_info: List[str]  # Collected research data or information
    draft_article: str  # Current working draft of the article
    final_article: str  # The completed and polished article
    num_steps: int  # Number of steps taken in the research/writing process
    draft_article_feedback: dict  # Feedback or comments on the draft
    rewrite_count: int  # Number of times the article has been rewritten
    needs_rewrite: bool  # Flag indicating if further revision is needed

# Helper function to write markdown files
def write_markdown_file(content: str, filename: str) -> None:
    """
    Write content to a Markdown file.

    This helper function creates a new Markdown file with the given filename
    and writes the provided content to it. If a file with the same name already exists,
    it will be overwritten.

    Args:
        content (str): The text content to be written to the file.
        filename (str): The name of the file to be created, without the '.md' extension.

    Returns:
        None

    Raises:
        IOError: If there's an issue writing to the file.

    Example:
        write_markdown_file("# Hello World", "my_document")
        # Creates a file named 'my_document.md' with the content "# Hello World"
    """
    # Construct the full filename with .md extension
    full_filename = f"{filename}.md"

    # Open the file in write mode
    # The 'with' statement ensures the file is properly closed after writing
    with open(full_filename, "w") as f:
        # Write the content to the file
        f.write(content)

    # Note: The function doesn't return anything (implicit None return)

# Improved Question breakdown
question_breakdown_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an expert at breaking down complex topics into specific, concise research questions.
    Given a research prompt, generate a list of 6 EXTREMELY SHORT and CONCISE questions that, when answered, will provide comprehensive coverage of the topic.
    The question should not lose context like (IMPORTANT!!!: date,month,year, country) or the specific period research question is based

    Guidelines for questions:
    1. Each question MUST be 12 words or less. This is a strict requirement.
    2. Focus on a single, specific aspect of the topic in each question.
    3. Use simple, direct language. Avoid unnecessary words.
    4. Your query will be going to vector database and in order retrieve relevant information it's key your specifics like (IMPORTANT!!!: date,month,year,country) .
    5. Ensure questions can stand alone and be understood without context.
    6. Cover diverse aspects of the topic to ensure comprehensive coverage.
    7. Questions most have context, whether time period of interest or Country  etc.

    Your output must be a JSON object with a 'questions' key containing an array of strings.

    Examples:

    Input: Explain the impact of artificial intelligence on modern healthcare June 2024
    Output:
    {{
        "questions": [
            "What is the accepted definition for artificial intelligence in June 2024?",
            "How is AI used in healthcare as of June 2024?",
            "What are AI's benefits in healthcare in June 2024?",
            "What are AI's risks in healthcare in  June 2024?",
            "How has AI improved medical diagnoses as at June 2024?",
            "Can AI enhance patient care as of June 2024?",
            "How is AI changing healthcare jobs in June 2024?",
            "What are AI's ethical concerns in healthcare as of June 2024?"
        ]
    }}

    Input: Discuss the effects of climate change on global agriculture in 1997 December
    Output:
    {{
        "questions": [
            "What is climate change?",
            "How does climate change affect crops in 1997 December?",
            "Which regions face worst climate impacts in 1997 December?",
            "How does climate change affect pests in 1997 December?",
            "Can agriculture adapt to climate change as of 1997 December?",
            "Which crops are most climate-vulnerable as of 1997 December?",
            "How does climate change threaten food security in 1997 December?",
            "What are climate-smart farming practices in 1997 December?"
        ]
    }}

    Remember, keep all questions to 12 words or less.

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Research prompt: {research_prompt}

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["research_prompt"],
)

question_breakdown_chain = question_breakdown_prompt | LLM | JsonOutputParser()

def break_down_questions(state):
    """
    Break down a research prompt into a list of more specific questions.

    This function takes a state dictionary containing a research prompt and the current
    number of steps, uses a question breakdown chain to generate more specific questions,
    and updates the state with these questions and an incremented step count.

    Args:
        state: A dictionary containing the current state of the research process.
               Must include 'research_prompt' and 'num_steps' keys.

    Returns:
        A dictionary containing:
        - 'broken_up_questions': A list of more specific questions derived from the research prompt.
        - 'num_steps': The updated step count.

    Side effects:
        - Prints the process start and the generated questions to the console.
        - Writes the generated questions to a markdown file named 'broken_up_questions.md'.

    Raises:
        KeyError: If 'research_prompt' or 'num_steps' are not in the state dictionary.
        ValueError: If 'num_steps' in the state dictionary is not convertible to an integer.
    """
    print("---BREAKING DOWN QUESTIONS---")

    # Extract the research prompt and number of steps from the state
    research_prompt = state['research_prompt']
    num_steps = int(state['num_steps'])

    # Increment the step count
    num_steps += 1

    # Use the question breakdown chain to generate more specific questions
    broken_up_questions = question_breakdown_chain.invoke({"research_prompt": research_prompt})
    
    # Print the generated questions
    print(json.dumps(broken_up_questions, indent=2))

    # Write the generated questions to a markdown file
    write_markdown_file(json.dumps(broken_up_questions, indent=2), "broken_up_questions")

    # Return the updated state information
    return {
        "broken_up_questions": broken_up_questions['questions'], 
        "num_steps": num_steps
    }
# Research info search
def research_info_search(state):
    """
    Perform research information search based on broken-up questions.

    This function takes a state dictionary containing broken-up questions,
    performs a search for each question, and compiles the results. It updates
    the state with the search results and an incremented step count.

    Args:
        state: A dictionary containing the current state of the research process.
               Must include 'broken_up_questions' and 'num_steps' keys.

    Returns:
        A dictionary containing:
        - 'research_info': A list of Document objects, each containing search results for a question.
        - 'num_steps': The updated step count.

    Side effects:
        - Prints the process start and each question being searched.
        - Prints the compiled search results.

    Raises:
        KeyError: If 'broken_up_questions' or 'num_steps' are not in the state dictionary.
    """
    print("---RESEARCH INFO SEARCHING---")

    # Extract necessary information from the state
    broken_up_questions = state["broken_up_questions"]
    research_info = state.get("research_info", [])  # Get existing research info or empty list
    num_steps = state['num_steps']

    # Increment the step count
    num_steps += 1

    full_searches = []
    for question in broken_up_questions:
        print(question)
        # Perform RAG search for each question
        temp_docs = rag_search.invoke(question)
        # Join the results into a single string, filtering out non-string elements
        web_results = "\n".join([d for d in temp_docs if isinstance(d, str)])
        # Create a Document object with the search results
        web_results = Document(page_content=web_results)
        full_searches.append(web_results)

    # Print the compiled search results
    print(full_searches)

    # Return the updated state information
    return {"research_info": full_searches, "num_steps": num_steps}

# Check questions answered
check_questions_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an expert at analyzing research information and determining if a sufficient number of questions have been adequately answered. You can also suggest alternative questions when information is lacking or context ( date,month, year and country).
    
    Your output MUST be a valid JSON object with the following structure:
    {{
        "sufficient_answers": boolean,
        "questions_status": [
            {{
                "original_question": string,
                "answered": boolean,
                "reason": string,
                "alternative_question": string or null
            }}
        ],
        "summary": string
    }}

    Example output:
    {{
        "sufficient_answers": true,
        "questions_status": [
            {{
                "original_question": "What are the main economic challenges in South Africa in May 2024?",
                "answered": true,
                "reason": "The research information provides detailed insights into South Africa's economic challenges, including unemployment and inflation.",
                "alternative_question": null
            }},
            {{
                "original_question": "How has the COVID-19 pandemic affected South Africa's tourism industry in May 2024?",
                "answered": false,
                "reason": "The research information doesn't contain specific data about the impact of COVID-19 on South Africa's tourism.",
                "alternative_question": "What are the recent trends in South Africa's tourism industry as of May 2024?"
            }}
        ],
        "summary": "4 out of 5 questions were sufficiently answered or had relevant alternatives proposed, which is considered adequate to proceed."
    }}

    IMPORTANT: Ensure your response is ONLY the JSON object, with no additional text before or after.

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Questions: {questions}
    Research Information: {research_info}
    
    Analyze if a sufficient number of questions have been adequately answered. Consider the research satisfactory if at least 50% of the questions are addressed or have relevant alternative questions proposed. For questions that cannot be answered with the given information, suggest alternative questions that are related to the original topic and can be answered with the available research. Provide your answer in the required JSON format.
    
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["questions", "research_info"],
)

check_questions_chain = check_questions_prompt | LLM | JsonOutputParser()

def check_questions_answered(state):
    print("---CHECKING QUESTIONS ANSWERED---")
    broken_up_questions = state["broken_up_questions"]
    research_info = state["research_info"]
    num_steps = state['num_steps']
    num_steps += 1

    try:
        print("Input to check_questions_chain:")
        print(json.dumps({
            "questions": broken_up_questions,
            "research_info": [doc.page_content for doc in research_info]
        }, indent=2))

        raw_output = check_questions_chain.invoke({
            "questions": json.dumps(broken_up_questions),
            "research_info": json.dumps([doc.page_content for doc in research_info])
        })

        print("Raw output from LLM:")
        print(raw_output)

        if isinstance(raw_output, str):
            result = json.loads(raw_output)
        else:
            result = raw_output

        if not isinstance(result, dict) or "sufficient_answers" not in result:
            raise ValueError("Unexpected output format from LLM")

        print("Parsed result:")
        print(json.dumps(result, indent=2))

        questions_to_fix = []
        updated_questions = []

        for q in result["questions_status"]:
            if not q["answered"]:
                if q["alternative_question"]:
                    updated_questions.append(q["alternative_question"])
                else:
                    questions_to_fix.append(q["original_question"])
            else:
                updated_questions.append(q["original_question"])

        return {
            "questions_answered": result["sufficient_answers"],
            "questions_to_fix": questions_to_fix,
            "updated_questions": updated_questions,
            "answer_summary": result["summary"],
            "num_steps": num_steps
        }

    except Exception as e:
        print(f"Error in check_questions_answered: {str(e)}")
        return {
            "questions_answered": False,
            "questions_to_fix": [],
            "updated_questions": broken_up_questions,
            "answer_summary": "An error occurred while checking questions.",
            "num_steps": num_steps
        }
# Improved Draft article writer
draft_writer_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an expert article writer. Use the research information provided to draft a comprehensive article that answers all the research questions. Your article should include inline citations for all factual statements.

    Citation format: 
    - Use square brackets with a number, followed by a link in parentheses.
    - Example: "Nelson Mandela was born in South Africa, Qunu [1](source_url), he would later become a lawyer in Johannesburg [2](source_url)"

    Structure your article with clear headings and subheadings. Ensure that each research question is addressed thoroughly.

    Example structure:
    # [Main Title]

    ## Introduction
    [Brief overview of the topic of at least 200 words]

    ## [Heading for Question 1]
    [Detailed answer (1 paragraph:6 -5 sentences) to Question 1 with inline citations]

    ## [Heading for Question 2]
    [Detailed answer (1 paragraph:6 -5 sentences) to Question 2 with inline citations]

    ...

    ## Conclusion
    [Summary of key points and potential future implications ((1 paragraph:6 -5 sentences))]

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Research Questions: {broken_up_questions}
    Research Information: {research_info}

    Write a draft article that addresses all the research questions, using the provided structure and including inline citations.

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["broken_up_questions", "research_info"],
)

draft_writer_chain = draft_writer_prompt | LLM | StrOutputParser()

def draft_article_writer(state):
    """
    Writes a draft article based on the broken-up questions and research information.

    This function uses the provided state information to generate a draft article
    using a pre-defined chain of operations (draft_writer_chain). It also updates
    the step count in the state.

    Args:
        state (dict): A dictionary containing the current state of the article creation process.
                      It must include the following keys:
                      - 'broken_up_questions': A list or string of questions broken down for the article.
                      - 'research_info': Information gathered from research to answer the questions.
                      - 'num_steps': The current step count in the article creation process.

    Returns:
        dict: An updated state dictionary with the following keys:
              - 'draft_article': The content of the newly written draft article.
              - 'num_steps': The updated step count after writing the draft.

    Side effects:
        - Prints a message indicating the start of the draft writing process.
        - Writes the draft article to a markdown file named "draft_article".
    """
    print("---DRAFT ARTICLE WRITER---")  # Print a message to indicate the start of draft writing

    # Extract necessary information from the state
    broken_up_questions = state["broken_up_questions"]
    research_info = state["research_info"]
    num_steps = state['num_steps']

    # Increment the step count
    num_steps += 1

    # Generate the draft article using the draft_writer_chain
    draft_article = draft_writer_chain.invoke({
        "broken_up_questions": broken_up_questions,
        "research_info": research_info
    })

    # Save the draft article as a markdown file
    write_markdown_file(draft_article, "draft_article")

    # Return the updated state with the draft article and new step count
    return {"draft_article": draft_article, "num_steps": num_steps}

# Improved Analyze draft article
# Fixed Analyze draft article prompt
analyze_draft_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an AI assistant specialized in analyzing articles. Your task is to review a draft article and provide structured feedback. Your entire response must be in valid JSON format. Follow these instructions precisely:

1. Analyze the given draft article and research questions thoroughly.
2. Generate a JSON response with the following structure:

{{
  "overall_quality": <string: brief overall assessment>,
  "question_coverage": [
    {{
      "question": <string: exact research question>,
      "adequately_addressed": <boolean>,
      "feedback": <string: specific feedback>
    }},
    // Repeat this object for each research question
  ],
  "citation_quality": {{
    "proper_format": <boolean>,
    "sufficient_citations": <boolean>,
    "feedback": <string: feedback on citations>
  }},
  "improvement_suggestions": [
    <string: suggestion 1>,
    <string: suggestion 2>,
    // Add more suggestions as needed
  ],
  "needs_improvement": <boolean>
}}

3. Replace each <placeholder> with appropriate content based on your analysis.
4. Ensure all fields are present and properly formatted.
5. Do not include any text or explanations outside of this JSON structure.

<|eot_id|><|start_header_id|>user<|end_header_id|>
Research Questions: {broken_up_questions}
Draft Article: {draft_article}

Analyze the draft article above. Evaluate how well it addresses each provided research question and assess its use of citations. Provide your complete analysis in the required JSON format. Include all required fields. Do not omit any field or include any text outside the JSON structure.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["broken_up_questions", "draft_article"],
)

analyze_draft_chain = analyze_draft_prompt | LLM | JsonOutputParser()

def analyze_draft_article(state):
    """
    Analyze the draft article based on the broken-up questions.

    This function takes a state dictionary containing the draft article and broken-up questions,
    performs an analysis of the draft, and updates the state with the analysis results and
    an incremented step count.

    Args:
        state: A dictionary containing the current state of the research process.
               Must include 'broken_up_questions', 'draft_article', and 'num_steps' keys.

    Returns:
        A dictionary containing:
        - 'draft_article_feedback': The analysis results or an error message if analysis fails.
        - 'num_steps': The updated step count.

    Side effects:
        - Prints the process start and the analysis results to the console.
        - Writes the analysis results to a markdown file named 'draft_article_analysis.md'.

    Raises:
        KeyError: If 'broken_up_questions', 'draft_article', or 'num_steps' are not in the state dictionary.
    """
    print("---ANALYZE DRAFT ARTICLE---")

    # Extract necessary information from the state
    broken_up_questions = state["broken_up_questions"]
    draft_article = state["draft_article"]
    num_steps = state['num_steps']

    # Increment the step count
    num_steps += 1

    try:
        # Invoke the draft analysis chain
        analysis = analyze_draft_chain.invoke({
            "broken_up_questions": broken_up_questions,
            "draft_article": draft_article
        })

        # Print the analysis results
        print("Draft Article Analysis:")
        print(json.dumps(analysis, indent=2))

        # Write the analysis results to a markdown file
        write_markdown_file(json.dumps(analysis, indent=2), "draft_article_analysis")

        # Return the updated state information
        return {"draft_article_feedback": analysis, "num_steps": num_steps}

    except Exception as e:
        # Handle any exceptions that occur during analysis
        print(f"Error in analyze_draft_article: {str(e)}")

        # Return a default error state
        return {
            "draft_article_feedback": {
                "overall_quality": "Error occurred during analysis",
                "needs_improvement": True,
                "question_coverage": [],
                "citation_quality": {
                    "proper_format": False,
                    "sufficient_citations": False,
                    "feedback": "Unable to analyze due to an error."
                },
                "improvement_suggestions": ["Retry the analysis"]
            },
            "num_steps": num_steps
        }
# Improved Rewrite article
rewrite_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an expert article writer. Use the draft article, analysis feedback, and original research information to improve the article. Focus on addressing the most critical feedback points while maintaining the overall structure and citation format.

    Remember to:
    1. Prioritize addressing the main issues identified in the feedback
    2. Maintain proper inline citation format: [number](link)
    3. Preserve the existing structure unless major changes are necessary
    4. Implement the top 2-3 improvement suggestions from the analysis
    5. Aim for concise and impactful improvements rather than a complete rewrite

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Research Questions: {broken_up_questions}
    Draft Article: {draft_article}
    Analysis Feedback: {draft_article_feedback}
    Research Information: {research_info}
    Rewrite Count: {rewrite_count}

    Improve the article by focusing on the most critical feedback points. If this is beyond the 2nd rewrite, only make changes if they significantly enhance the article's quality.

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["broken_up_questions", "draft_article", "draft_article_feedback", "research_info", "rewrite_count"],
)

rewrite_chain = rewrite_prompt | LLM | StrOutputParser()

def rewrite_article(state):
    """
    Rewrite the draft article based on feedback and research information.

    This function takes a state dictionary containing the draft article, feedback,
    research information, and other metadata. It then rewrites the article using
    a rewrite chain and updates the state with the new draft and metadata.

    Args:
        state: A dictionary containing the current state of the research process.
               Must include 'broken_up_questions', 'draft_article', 'draft_article_feedback',
               'research_info', and 'num_steps' keys.

    Returns:
        A dictionary containing:
        - 'draft_article': The rewritten article.
        - 'num_steps': The updated step count.
        - 'rewrite_count': The number of times the article has been rewritten.
        - 'needs_rewrite': Boolean indicating if further rewrites are needed (limited to 3).

    Side effects:
        - Prints the process start to the console.
        - Writes the rewritten article to a markdown file named 'rewrittenarticle{rewrite_count}.md'.

    Raises:
        KeyError: If any required keys are not in the state dictionary.
    """
    print("---REWRITE ARTICLE---")

    # Extract necessary information from the state
    broken_up_questions = state["broken_up_questions"]
    draft_article = state["draft_article"]
    draft_article_feedback = state["draft_article_feedback"]
    research_info = state["research_info"]
    num_steps = state['num_steps']
    rewrite_count = state.get('rewrite_count', 0) + 1

    # Increment the step count
    num_steps += 1

    # Invoke the rewrite chain with all necessary information
    rewritten_article = rewrite_chain.invoke({
        "broken_up_questions": json.dumps(broken_up_questions),
        "draft_article": draft_article,
        "draft_article_feedback": json.dumps(draft_article_feedback),
        "research_info": json.dumps([doc.page_content for doc in research_info]),
        "rewrite_count": rewrite_count
    })

    # Write the rewritten article to a markdown file
    write_markdown_file(rewritten_article, f"rewrittenarticle{rewrite_count}")

    # Return the updated state information
    return {
        "draft_article": rewritten_article, 
        "num_steps": num_steps,
        "rewrite_count": rewrite_count,
        "needs_rewrite": rewrite_count < 5  # Limit to 3 rewrites
    }

# Update the routing function
def route_to_rewrite(state):
    """
    Determine whether the article needs to be rewritten or finalized.

    This function examines the current state of the article, including feedback
    and rewrite history, to decide if another rewrite is necessary or if the
    article should be finalized.

    Args:
        state: A dictionary containing the current state of the article process.
               Must include 'draft_article_feedback' key. 
               May include 'needs_rewrite' and 'rewrite_count' keys.

    Returns:
        str: Either "rewrite" if the article needs another revision, or "finalize"
             if the article is ready for finalization.

    Note:
        The function limits rewrites to a maximum of 3 attempts.
    """
    # Extract relevant information from the state
    draft_article_feedback = state["draft_article_feedback"]
    needs_rewrite = state.get("needs_rewrite", True)
    rewrite_count = state.get("rewrite_count", 0)

    # Check conditions for rewrite
    if (needs_rewrite and 
        draft_article_feedback.get("needs_improvement", True) and 
        rewrite_count < 5):
        return "rewrite"
    else:
        return "finalize"
# Finalize article
def finalize_article(state):
    """
    Finalizes the article by saving the draft to a markdown file and updating the step count.

    This function takes the current state containing the draft article and the number of steps,
    increments the step count, saves the draft as a markdown file, and returns an updated state
    with the finalized article and the new step count.

    Args:
        state (dict): A dictionary containing the current state of the article creation process.
                      It must include 'draft_article' and 'num_steps' keys.

    Returns:
        dict: An updated state dictionary with the following keys:
              - 'final_article': The finalized article content (same as the input draft).
              - 'num_steps': The updated step count after finalization.

    Side effects:
        - Prints a message indicating the start of the finalization process.
        - Writes the draft article to a markdown file named "final_article".
    """
    print("---FINALIZE ARTICLE---")  # Print a message to indicate the start of finalization

    # Extract the draft article and step count from the input state
    draft_article = state["draft_article"]
    num_steps = state['num_steps']

    # Increment the step count
    num_steps += 1

    # Save the draft article as a markdown file
    write_markdown_file(draft_article, "final_article")

    # Return the updated state with the final article and new step count
    return {"final_article": draft_article, "num_steps": num_steps}

#Routing function to decide between fixing questions or drafting the article
def route_to_fix_or_draft(state):
    """
    Determines whether to proceed to drafting or return to fixing questions based on the current state.

    This function checks if all necessary questions have been answered before moving to the drafting stage.
    If questions are answered, it routes to drafting; otherwise, it routes back to fixing or answering questions.

    Args:
        state (dict): A dictionary containing the current state of the article creation process.
                      It must include the following key:
                      - 'questions_answered': A boolean indicating whether all required questions have been answered.

    Returns:
        str: Either "draft" if all questions are answered and drafting can begin,
             or "fix" if there are still unanswered questions that need attention.
    """
    # Extract the questions_answered status from the state
    questions_answered = state["questions_answered"]

    # Determine the next step based on whether all questions are answered
    if questions_answered:
        return "draft"  # All questions are answered, proceed to drafting
    else:
        return "fix"    # Some questions still need to be addressed


# Routing function to decide between rewriting or finalizing the article
def route_to_rewrite(state):
    """
    Determines whether the article needs to be rewritten or finalized based on the current state.

    This function evaluates the feedback on the draft article, the current rewrite status,
    and the number of rewrites performed to decide if another rewrite is necessary or if
    the article should be finalized.

    Args:
        state (dict): A dictionary containing the current state of the article creation process.
                      It must include the following keys:
                      - 'draft_article_feedback': Feedback on the current draft.
                      - 'needs_rewrite' (optional): Boolean indicating if a rewrite is needed.
                      - 'rewrite_count' (optional): Number of rewrites performed so far.

    Returns:
        str: Either "rewrite" if another rewrite is needed, or "finalize" if the article is ready.

    Note:
        The function will return "rewrite" only if all the following conditions are met:
        1. 'needs_rewrite' is True (default is True if not provided)
        2. 'draft_article_feedback' indicates that improvement is needed (default is True)
        3. The number of rewrites is less than 3
    """
    # Extract relevant information from the state
    draft_article_feedback = state["draft_article_feedback"]
    needs_rewrite = state.get("needs_rewrite", True)  # Default to True if not provided
    rewrite_count = state.get("rewrite_count", 0)  # Default to 0 if not provided

    # Check if all conditions for a rewrite are met
    if (needs_rewrite and 
        draft_article_feedback.get("needs_improvement", True) and 
        rewrite_count < 5):
        return "rewrite"  # Another rewrite is needed
    else:
        return "finalize"  # Article is ready for finalization

# Define the workflow
workflow = StateGraph(GraphState)

# Add nodes
workflow.add_node("break_down_questions", break_down_questions)
workflow.add_node("research_info_search", research_info_search)
workflow.add_node("check_questions_answered", check_questions_answered)
workflow.add_node("draft_article_writer", draft_article_writer)
workflow.add_node("analyze_draft_article", analyze_draft_article)
workflow.add_node("rewrite_article", rewrite_article)
workflow.add_node("finalize_article", finalize_article)

# Set entry point
workflow.set_entry_point("break_down_questions")

# Add edges
workflow.add_edge("break_down_questions", "research_info_search")
workflow.add_edge("research_info_search", "check_questions_answered")

workflow.add_conditional_edges(
    "check_questions_answered",
    route_to_fix_or_draft,
    {
        "fix": "research_info_search",
        "draft": "draft_article_writer",
    },
)

workflow.add_edge("draft_article_writer", "analyze_draft_article")

workflow.add_conditional_edges(
    "analyze_draft_article",
    route_to_rewrite,
    {
        "rewrite": "rewrite_article",
        "finalize": "finalize_article",
    },
)

workflow.add_edge("rewrite_article", "analyze_draft_article")
workflow.add_edge("finalize_article", END)

# Compile the workflow
app = workflow.compile()
# Run the workflow
def run_article_research(research_prompt):
    """
    Executes the article research and writing process based on the given research prompt.

    This function initializes the research process, manages the article creation workflow,
    and retrieves the final article once the process is complete.

    Args:
        research_prompt (str): The initial prompt or topic for the article research.

    Returns:
        str or None: The content of the final article if successfully generated,
                     or None if the final article file is not found.

    Side effects:
        - Prints progress messages for each completed step in the process.
        - Attempts to read the final article from a file named 'final_article.md'.
        - Prints a success message if the article is generated, or an error message if not found.
    """
    # Initialize the input state for the article creation process
    inputs = {
        "research_prompt": research_prompt, 
        "num_steps": 0,
        "rewrite_count": 0,
        "needs_rewrite": True
    }
    
    # Stream the article creation process, with a recursion limit of 30
    for output in app.stream(inputs, {"recursion_limit": 30}):
        for key, value in output.items():
            print(f"Finished running: {key}")
            if key == "finalize_article":
                print("Article writing process completed.")
                break
    
    # Attempt to read and return the final article
    try:
        with open("final_article.md", "r") as f:
            final_article = f.read()
        print("Final article has been generated and saved to 'final_article.md'.")
        return final_article
    except FileNotFoundError:
        print("Error: Final article file not found. The process may not have completed successfully.")
        return None

In [42]:
research_prompt ="what are the newly formed Mkhonto Wesize party key policy points for the 2024 Elections?"
final_article = run_article_research(research_prompt)
print("Final Article:")
print(final_article)

---BREAKING DOWN QUESTIONS---
{
  "questions": [
    "What are Mkhonto Wesizwe's key policies for 2024 elections?",
    "How does Mkhonto Wesizwe address economic issues in 2024?",
    "What is Mkhonto Wesizwe's stance on education for 2024 elections?",
    "How does Mkhonto Wesizwe plan to tackle healthcare in 2024?",
    "What are Mkhonto Wesizwe's policies on land reform for 2024?",
    "How does Mkhonto Wesizwe approach social justice in 2024 elections?"
  ]
}
Finished running: break_down_questions
---RESEARCH INFO SEARCHING---
What are Mkhonto Wesizwe's key policies for 2024 elections?


N/A


['uMkhonto weSizwe (English: Spear of the Nation), abbreviated as MK, and often referred to as the MK Party, is a left-wing populist[3][8] South African political party, founded in December 2023. The party is named after uMkhonto we Sizwe (also shortened to MK), the paramilitary wing of the African National Congress (ANC) during apartheid. However, the ANC has threatened legal action over the usage of the name,[13] and the formation has been criticised by original MK veterans.[14]\n\nThe party rose to prominence in December 2023, when former president Jacob Zuma announced that, while planning to remain a lifelong member of the ANC, he would not be campaigning for the ANC in the 2024 South African general election, and would instead be voting for MK.[15] He stated that "I cannot and will not" campaign for the ANC of current president Cyril Ramaphosa, Zuma\'s successor, and that to do so would be a "betrayal".[16][17]\n\nIdeology[edit]\n\nThe party has been described as populist and "ant

N/A


['The People’s Manifesto Booklet - 20 January 2024\nFOR THE PEOPLE\nBY THE PEOPLERISE Mzansi will pursue five overarching economic objectives.\n35\nRepair government finances to protect the social \nwage and invest in the economy. We will appoint \ncompetent, qualified ministers and heads of department, \neliminate waste and prosecute corruption across the \ngovernment. We will review the state-owned company \nportfolio (of over 800), retain those that are necessary \nand shut down or sell those that are not. This is to stem \nthe bleeding of government finances into ineffectual \ncompanies. Government finances must be spent on basic \nhuman and community needs, and on sustaining economic \ngrowth to create employment. \nMake the economy more inclusive by supporting black \npeople and women to create wealth through productive \nenterprise. We will support entrepreneurs with upskilling \nand access to capital at competitive lending rates to reduce \nenterprise failure, grow the economy 

N/A


['The People’s Manifesto Booklet - 20 January 2024\nFOR THE PEOPLE\nBY THE PEOPLEHuman capital, knowledge and innovation\nWe will improve the quality of teaching in all subjects – and especially \nscience, technology, engineering and mathematics – because the \nmodern economy is knowledge-intensive. This requires matric \ngraduates with strong problem solving, critical thinking, literacy \nand numeracy skills, who can go onto higher education, training or \napprenticeship. Therefore, basic education reform will be a critical \nenabler of inclusive growth. (See education reform in Priority 4)\nOther core drivers of knowledge acquisition are skilled immigration \nand foreign investment. Foreign workers and companies are \nconduits for new know-how and technological diffusion. We will \nsimplify rules for skilled workers who do not replace South Africans, \nbut rather enhance our industries’ ability to grow and create jobs for \nSouth Africans.\n40 date: 20 January 2024\n , name of docume

N/A


['The People’s Manifesto Booklet - 20 January 2024\nFOR THE PEOPLE\nBY THE PEOPLE50 ▶At national, provincial and hospital level we will install competent \nadministrators, root out procurement corruption, prioritise \nefficient delivery of equipment and medication, and improve the \nworking conditions of healthcare workers. \n ▶We will also expand the training of additional healthcare \nworkers to keep up with both population expansion and \nurbanisation. To address the short-term needs, we will consider \nspecial multi-year work permits for suitably qualified primary \nhealthcare workers and specialists from other countries until we \naddress our own skills shortage.\n ▶We will convert local clinics into primary healthcare and \nhealth promotion centres. These centres will provide a range of \nhealth promotion and education programmes, regular support \nand counselling groups for chronic or terminal patients, wellness \ndays that encourage regular health status checks, and services \n

N/A


['The People’s Manifesto Booklet - 20 January 2024\nFOR THE PEOPLE\nBY THE PEOPLE64A central element of land justice is long-term spatial planning \nbecause land is a finite resource. Future generations have to have \naccess to land, too. We will create a framework that takes account \nof historical dispossession where necessary, but the long-term \nenvironmental, economic, cultural and social needs of rural and \nurban communities over the next century will be prioritised.\nTherefore, RISE Mzansi will:\n• Insist on visionary, competent and accountable leadership \nwith a sense of national mission, and respects the dreams and \nwishes of the South African people. This means appointing \nsuitably qualified people.\n• Ruthlessly deal with corruption, focusing on senior figures who \nhave so far managed to escape accountability, to set an example \nfor the seriousness required to end corruption at all levels.\n• Reorientate non-racialism to focus on systemic racism such \nas spatial injus

N/A


['uMkhonto weSizwe (English: Spear of the Nation), abbreviated as MK, and often referred to as the MK Party, is a left-wing populist[3][8] South African political party, founded in December 2023. The party is named after uMkhonto we Sizwe (also shortened to MK), the paramilitary wing of the African National Congress (ANC) during apartheid. However, the ANC has threatened legal action over the usage of the name,[13] and the formation has been criticised by original MK veterans.[14]\n\nThe party rose to prominence in December 2023, when former president Jacob Zuma announced that, while planning to remain a lifelong member of the ANC, he would not be campaigning for the ANC in the 2024 South African general election, and would instead be voting for MK.[15] He stated that "I cannot and will not" campaign for the ANC of current president Cyril Ramaphosa, Zuma\'s successor, and that to do so would be a "betrayal".[16][17]\n\nIdeology[edit]\n\nThe party has been described as populist and "ant

In [26]:
def save_graph_to_file(runnable_graph, output_file_path):
    png_bytes = runnable_graph.get_graph().draw_mermaid_png()
    with open(output_file_path, 'wb') as file:
        file.write(png_bytes)

In [27]:

save_graph_to_file(app,'report-writing.png')

In [None]:
lgg.__version__

'0.1.19'