In [1]:
import pandas as pd


In [2]:
# Load the section types from the CSV file
stdf = pd.read_csv("requirements_data/section_types.csv")


In [3]:
stdf

Unnamed: 0,name,description
0,Lead section,"A concise summary of the article, never divide..."
1,Body sections,"Main content of the article, divided into logi..."
2,Infobox,Right-aligned summary of key facts
3,References,Section listing the sources cited in the article
4,See also,Internal links to related English Wikipedia ar...
5,Further reading,"Relevant books, articles, or other publication..."
6,External links,Relevant and appropriate websites not used as ...
7,Categories,Navigational boxes and categories at the end o...
8,Notes,Additional information and explanations not pa...


In [4]:

# Function to classify sections
def classify_sections(section_outline: str):
    """Split a heading outline into titles and label each title with a section type.

    Every non-blank line of ``section_outline`` is treated as one heading.
    ``#`` characters and spaces are trimmed from both ends of the line, then
    any remaining surrounding whitespace is removed.

    A title gets the first section type (in the order of the table below)
    that has a keyword occurring as a case-insensitive substring of the
    title. Titles that match no keyword are labelled "Body sections".

    Returns:
        A ``(sections, section_types)`` tuple of two parallel lists.
    """
    keyword_table = {
        "Lead section": ["Introduction", "Overview", "Summary"],
        "Body sections": [
            "Background",
            "Development",
            "Architecture",
            "Timeline",
            "Features",
            "Collaborations",
            "Model Structure",
            "Training Data",
            "Algorithms",
            "Techniques",
        ],
        "Infobox": ["Infobox"],
        "References": ["References", "Citations"],
        "See also": ["See also", "Related Articles"],
        "Further reading": ["Further reading"],
        "External links": ["External links"],
        "Categories": ["Categories"],
        "Notes": ["Notes", "Footnotes"],
    }

    def _type_of(title):
        # First section type with a matching keyword wins; fall back to body.
        lowered = title.lower()
        candidates = (
            section_type
            for section_type, keywords in keyword_table.items()
            if any(keyword.lower() in lowered for keyword in keywords)
        )
        return next(candidates, "Body sections")

    # Collect one cleaned title per non-blank line.
    sections = []
    for raw_line in section_outline.splitlines():
        if not raw_line.strip():
            continue
        sections.append(raw_line.strip("# ").strip())

    section_types = list(map(_type_of, sections))
    return sections, section_types


In [5]:
# Read the two requirement tables: which requirement applies to which
# section type (rbsdf) and the criteria text of each requirement (rdf).
rbsdf = pd.read_csv("requirements_data/requirements_by_section_type.csv")
rdf = pd.read_csv("requirements_data/section_level_requirements.csv")

# Match rbsdf.requirement_name against rdf.name, discard the now-redundant
# "name" key, index by requirement and transpose so that every requirement
# becomes a column.
joined_df = (
    rbsdf.merge(rdf, left_on="requirement_name", right_on="name")
    .drop(columns=["name"])
    .set_index("requirement_name")
    .T
)

# Last row of the transposed frame (the criteria text) and the requirement names.
criteria = joined_df.iloc[-1]
cols = joined_df.columns


In [6]:
rdf

Unnamed: 0,name,criteria_details
0,Title format,Sentence case for titles and headings
1,Consistency,Consistent style within an article
2,References included,All statements must be supported by reliable s...
3,Lead summary,Lead must summarize the article concisely
4,Infobox usage,Infobox must be right-aligned and summarize ke...
5,External links relevance,External links must be relevant and appropriate
6,Further reading selectivity,Further reading section must be selective
7,See also relevance,See also links must be directly related to the...
8,Category placement,Categories must be placed at the very end of t...


In [7]:
rbsdf

Unnamed: 0,requirement_name,Lead section,Body sections,Infobox,References,See also,Further reading,External links,Categories,Notes
0,Title format,1,1,0,0,0,0,0,0,0
1,Consistency,1,1,1,1,1,1,1,1,1
2,References included,1,1,0,1,0,0,0,0,0
3,Lead summary,1,0,0,0,0,0,0,0,0
4,Infobox usage,0,0,1,0,0,0,0,0,0
5,External links relevance,0,0,0,0,0,0,1,0,0
6,Further reading selectivity,0,0,0,0,0,1,0,0,0
7,See also relevance,0,0,0,0,1,0,0,0,0
8,Category placement,0,0,0,0,0,0,0,1,0


In [8]:
joined_df.T

Unnamed: 0_level_0,Lead section,Body sections,Infobox,References,See also,Further reading,External links,Categories,Notes,criteria_details
requirement_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Title format,1,1,0,0,0,0,0,0,0,Sentence case for titles and headings
Consistency,1,1,1,1,1,1,1,1,1,Consistent style within an article
References included,1,1,0,1,0,0,0,0,0,All statements must be supported by reliable s...
Lead summary,1,0,0,0,0,0,0,0,0,Lead must summarize the article concisely
Infobox usage,0,0,1,0,0,0,0,0,0,Infobox must be right-aligned and summarize ke...
External links relevance,0,0,0,0,0,0,1,0,0,External links must be relevant and appropriate
Further reading selectivity,0,0,0,0,0,1,0,0,0,Further reading section must be selective
See also relevance,0,0,0,0,1,0,0,0,0,See also links must be directly related to the...
Category placement,0,0,0,0,0,0,0,1,0,Categories must be placed at the very end of t...


In [9]:
rows = joined_df.index.values

In [10]:
criteria.values

array(['Sentence case for titles and headings',
       'Consistent style within an article',
       'All statements must be supported by reliable sources',
       'Lead must summarize the article concisely',
       'Infobox must be right-aligned and summarize key facts',
       'External links must be relevant and appropriate',
       'Further reading section must be selective',
       'See also links must be directly related to the article content',
       'Categories must be placed at the very end of the article'],
      dtype=object)

In [11]:
def dump_requirements(section_type, requirements_table=None):
    """Describe the requirements that apply to one section type.

    Args:
        section_type: Row label to look up, e.g. "Lead section".
        requirements_table: Optional frame laid out like ``joined_df``: one
            row per section type holding 0/1 flags, one column per
            requirement, and the criteria text in the last row. Defaults to
            the notebook-level ``joined_df``.

    Returns:
        A header line followed by one "- <criteria text>" line for every
        requirement whose flag equals 1 in the ``section_type`` row. Only the
        header is returned when ``section_type`` is not a row of the table.
    """
    table = joined_df if requirements_table is None else requirements_table
    requirements = "This section contains the following requirements:\n"

    # Unknown section types contribute no requirement lines.
    if section_type not in table.index:
        return requirements

    # Look the row up by label instead of scanning every row, and read the
    # criteria text from the same table so no separately maintained globals
    # have to stay in sync with it.
    flags = table.loc[section_type]
    criteria_text = table.iloc[-1]
    for flag, text in zip(flags, criteria_text):
        if flag == 1:
            requirements += f"- {text}\n"
    return requirements

In [12]:
dump_requirements("Lead section")

'This section contains the following requirements:\n- Sentence case for titles and headings\n- Consistent style within an article\n- All statements must be supported by reliable sources\n- Lead must summarize the article concisely\n'

In [13]:
def get_outline(path):
    """Read an article file and split it into its top-level sections.

    Args:
        path: Path of the text file to read.

    Returns:
        The ``(names, sections)`` pair produced by
        ``identify_sections(text, level=1)``.
    """
    # Decode explicitly as UTF-8. Without `encoding`, open() falls back to
    # the platform's locale encoding, so the same file could decode
    # differently (or fail) on another machine.
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()

    # Parsing does not need the file handle, so it runs after the file is closed.
    outline, sections = identify_sections(text, level=1)
    return outline, sections

def identify_sections(text, level=1):
    """Split markdown-style text into sections at headings of one level.

    A heading is recognised as ``level`` ``#`` characters followed by a
    single space, either at the very start of ``text`` or right after a
    newline. Deeper headings (more ``#`` characters) stay inside their
    parent section.

    Args:
        text: Full document text.
        level: Number of ``#`` characters in the headings to split on.

    Returns:
        A ``(names, sections)`` tuple. ``sections`` holds each chunk with its
        heading marker removed (heading text on the first line, body after
        it); ``names`` holds the first line of each chunk. Text that precedes
        the first heading is returned as the first chunk.
    """
    marker = "#" * level + " "

    # A heading on the very first line has no preceding newline for the split
    # below to consume. Strip exactly the marker (not a fixed number of
    # characters) so every level works and text that does not start with a
    # heading is left intact.
    if text.startswith(marker):
        text = text[len(marker):]

    sections = text.split("\n" + marker)
    names = [section.split("\n")[0] for section in sections]
    return names, sections

In [14]:
path = "../outputs/Llama_3_(Language_Model)/storm_gen_article_polished.txt"
# The article name is the directory that contains the file. Index from the
# end so the lookup does not depend on how many directories precede it.
article = path.split("/")[-2]


In [15]:
article

'Llama_3_(Language_Model)'

In [16]:
outline, sections=get_outline(path)

In [17]:
print(outline)

['summary', 'Development', 'Architecture', 'Performance', 'Applications', 'Limitations', 'Reception', 'Future Prospects']


In [18]:
print(sections)

["summary\n\nLlama 3, developed by Meta, is a state-of-the-art large language model that signifies a major leap forward in the realm of artificial intelligence. Building on the successes of its predecessors, Llama 3 emphasizes ethical AI practices and community involvement through an open-source development strategy. This model stands out due to its enhanced capabilities, stemming from extensive training on a massive dataset of over 15 trillion tokens, which is seven times larger than that of Llama 2. This expansive dataset includes a rich variety of languages and code, ensuring broad applicability and robust performance across diverse tasks.\nA crucial element of Llama 3’s development is its commitment to safety and appropriateness in content generation. Meta employs a combination of manual content reviews, automated checks, and iterative feedback mechanisms to maintain the model’s reliability and ethical standards. These measures are vital for ensuring that Llama 3's outputs are suit

In [19]:
# One drafting prompt per section heading, in outline order.
# NOTE: the exact wording matters. The section name is later parsed back out
# of the prompt by splitting on "working the " and " section.".
questions = [
    f"I am writing a wikipedia article called {article} and I am working the {heading} section. "
    "Please provide a draft of the section according the wikipedia style guidelines"
    for heading in outline
]

In [20]:
# TODO: rewrite the below to use the stdf instead of the section_keywords
# Keywords that identify each section type; earlier entries take precedence.
section_keywords = {
    "Lead section": ["Introduction", "Overview", "Summary"],
    "Body sections": [
        "Background",
        "Development",
        "Architecture",
        "Timeline",
        "Features",
        "Collaborations",
        "Model Structure",
        "Training Data",
        "Algorithms",
        "Techniques",
    ],
    "Infobox": ["Infobox"],
    "References": ["References", "Citations"],
    "See also": ["See also", "Related Articles"],
    "Further reading": ["Further reading"],
    "External links": ["External links"],
    "Categories": ["Categories"],
    "Notes": ["Notes", "Footnotes"],
}


def get_section_type(section):
    """Return the section type whose keywords match ``section``.

    Matching is a case-insensitive substring test. The first section type in
    ``section_keywords`` with any matching keyword is returned, and
    "Body sections" is the fallback when nothing matches.
    """
    lowered = section.lower()
    for section_type, keywords in section_keywords.items():
        for keyword in keywords:
            if keyword.lower() in lowered:
                return section_type
    return "Body sections"

In [21]:
# Classify each section by its heading only. `sections` holds the full text
# of every section and get_section_type does substring matching, so passing
# the body would label a section "Lead section" as soon as a word such as
# "overview" or "summary" appears anywhere in it.
section_types = [get_section_type(name) for name in outline]

In [22]:
# Work out each heading's section type once, then spell out the requirements
# that apply to that type.
question_guidelines = [
    f"The section {heading} is a {kind} section. {dump_requirements(kind)}"
    for heading, kind in ((name, get_section_type(name)) for name in outline)
]

In [24]:
print(question_guidelines)

['The section summary is a Lead section section. This section contains the following requirements:\n- Sentence case for titles and headings\n- Consistent style within an article\n- All statements must be supported by reliable sources\n- Lead must summarize the article concisely\n', 'The section Development is a Body sections section. This section contains the following requirements:\n- Sentence case for titles and headings\n- Consistent style within an article\n- All statements must be supported by reliable sources\n', 'The section Architecture is a Body sections section. This section contains the following requirements:\n- Sentence case for titles and headings\n- Consistent style within an article\n- All statements must be supported by reliable sources\n', 'The section Performance is a Body sections section. This section contains the following requirements:\n- Sentence case for titles and headings\n- Consistent style within an article\n- All statements must be supported by reliable so

In [25]:
# Grading instructions handed to Exam as its third argument (presumably
# applied to every question; confirm in profbot's exam module).
exam_guidelines = [
    "Evaluate based on the wikipedia style guidelines.",
    "Provide informative feedback on the response that will help the editors improve the article.",
    "Provide a Grade in the range 0-100."
]

In [26]:
import os
# Move from the notebook's directory into the sibling `profbot` directory,
# presumably so the `exam` and `llmtest` imports that follow resolve against
# it (confirm).
# NOTE: both calls are relative to the current working directory, so this
# cell is not idempotent. Re-running it starts from profbot and ends up
# somewhere else or raises. Restart the kernel before running it again.
os.chdir("..")
os.chdir("../profbot")

In [27]:
os.getcwd()

'/Users/z/Documents/GitHub/profbot'

In [28]:
from exam import Exam, GradeLog
from llmtest import Llm, LLMTest

In [29]:
# Bundle the per-section prompts, the matching per-section guidelines and the
# shared grading instructions into an Exam. GradeLog is passed uncalled
# (Exam presumably instantiates it itself; confirm in profbot).
evaluator = Exam(questions, question_guidelines, exam_guidelines, GradeLog)

In [30]:
eval_llm = Llm(model_identifier="gpt-4o")

In [31]:
eval_llm.prompt("this is a test; please respond with quip")

{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'this is a test; please respond with quip'}]}


"Testing, testing—one, two, three! If this were a real emergency, I'd be a lot wittier."

In [32]:
class Lookup:
    """Stand-in model that replays stored answers instead of generating them.

    Attributes:
        data: The object returned verbatim by ``prompt_sequence``.
        model_identifier: Always the string ``"lookup"``.
    """

    def __init__(self, data):
        """Keep ``data`` so it can be replayed later."""
        self.model_identifier = "lookup"
        self.data = data

    def prompt_sequence(self, questions):
        """Return the stored data unchanged; ``questions`` is ignored."""
        return self.data

In [33]:
# Lookup replays the already-written section texts in place of a model under
# test, so the run grades the existing article rather than fresh generations.
# NOTE(review): this presumably relies on `sections` being in the same order
# as `questions` (both derive from get_outline); confirm how LLMTest pairs
# them.
ex = LLMTest(Lookup(sections), eval_llm, evaluator)

In [34]:
ex

<llmtest.LLMTest at 0x7fabb84111f0>

In [36]:
g= ex.test()

In [37]:
df = pd.DataFrame(g)

In [38]:
# Recover the section name from the prompt text: it is the part between
# "working the " and " section." in every generated question.
df["Section Name"] = df["Prompt"].map(
    lambda prompt: prompt.split("working the ")[1].split(" section.")[0]
)

In [39]:
df.set_index("Section Name", inplace=True)

In [40]:
df

Unnamed: 0_level_0,TA,Student,Prompt,Response,Notes,Grade
Section Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
summary,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,"summary\n\nLlama 3, developed by Meta, is a st...",### \n\n### Notes:\n\n#### Positive Aspects:\n...,75
Development,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Development\n\nLlama 3 represents a significan...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,80
Architecture,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Architecture\n\nThe Llama 3 architecture is fu...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,85
Performance,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Performance\n\nLlama 3 demonstrates exceptiona...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,85
Applications,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Applications\n\nLlama 3 is a versatile languag...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,80
Limitations,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Limitations\n\nDespite its numerous advancemen...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,85
Reception,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,"Reception\n\nThe reception of Llama 3, Meta's ...",### \n\n### Notes:\n\n#### Positive Aspects:\n...,85
Future Prospects,gpt-4o,lookup,I am writing a wikipedia article called Llama_...,Future Prospects\n\nThe future of Llama 3 look...,### \n\n### Notes:\n\n#### Positive Aspects:\n...,80


In [41]:
print(df.iloc[0].Notes)

### 

### Notes:

#### Positive Aspects:

1. **Conciseness and Comprehensiveness:**
   - The summary is concise and comprehensive, encapsulating various aspects of Llama 3, including its development, capabilities, ethical considerations, and architectural specifics.

2. **Structured Information:**
   - The response is well-structured, detailing different facets such as ethics, community involvement, and technical enhancements in a systematic manner.

3. **Clarity:**
   - The language is clear, and the information is well-presented, making it easy to understand the key points about Llama 3.

#### Areas for Improvement:

1. **Sentence Case for Titles and Headings:**
   - According to Wikipedia style guidelines, titles and headings should use sentence case. Your title "summary" should be corrected to "Summary."

2. **Citation of Reliable Sources:**
   - The summary lacks citations for the stated facts. In Wikipedia articles, it’s essential to back up every claim with reliable sources. Ens

In [42]:
print(df.iloc[0].Prompt)

I am writing a wikipedia article called Llama_3_(Language_Model) and I am working the summary section. Please provide a draft of the section according the wikipedia style guidelens
