In [1]:
import pandas as pd
from dataclasses import dataclass
import os
import re
dir = os.getcwd()
os.chdir("../profbot")
from exam import Exam, GradeLog
from llmtest import Llm, LLMTest
os.chdir(dir)

In [2]:
eval_llm = Llm(model_identifier="gpt-4o")
eval_llm.prompt("poke, please reply")

{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'poke, please reply'}]}


'Hello! How can I assist you today?'

In [3]:
# Load the dataframes from the CSV files
rdf = pd.read_csv("evaluators/requirements_data/section_level_requirements.csv")
stdf = pd.read_csv("evaluators/requirements_data/section_types.csv")
rbsdf = pd.read_csv("evaluators/requirements_data/requirements_by_section_type.csv")

In [4]:
stdf

Unnamed: 0,name,description
0,Lead section,"A concise summary of the article, never divide..."
1,Body sections,"Main content of the article, divided into logi..."
2,Infobox,Right-aligned summary of key facts
3,References,Section listing the sources cited in the article
4,See also,Internal links to related English Wikipedia ar...
5,Further reading,"Relevant books, articles, or other publication..."
6,External links,Relevant and appropriate websites not used as ...
7,Categories,Navigational boxes and categories at the end o...
8,Notes,Additional information and explanations not pa...


In [5]:
path = "outputs/Llama_3_(Language_Model)/storm_gen_article_polished.txt"
article_name = path.split("/")[1]

In [6]:
@dataclass
class Sentence:
    section: str
    section_type: str
    subsection: str
    paragraph_number: int
    claim_number: int
    text: str

def identify_sections(text, level=1):
    # Identify the sections
    if level == 1:
        sections= split = text.split("\n# ")
    elif level == 2:
        sections = text.split("\n## ")
    else:
        print("Invalid level")

    names = [section.split("\n")[0] for section in sections]
    return names, sections

def articulate_article(article):
    # loop through sections
    sentences = []
    outline, sections = identify_sections(article)
    for s in sections:
        st = get_section_type(s)
        suboutline,subsections = identify_sections(s, level=2)
        for ss in subsections:
            paragraphs = ss.split("\n")
            for p in range(1,len(paragraphs)):
                pa = paragraphs[p]
                claims = pa.split(". ")
                for c in range(1,len(claims)):
                    cl = claims[c]
                    sentences.append(Sentence(outline[sections.index(s)],st,suboutline[subsections.index(ss)],p,c,cl))
    
    return sentences

def extract_response(response, default = "body sections"):
        # Regular expression to find the first string between < and >
    pattern = r'<(.*?)>'

    # Search for the first match in the string
    match = re.search(pattern, response)

    # If a match is found, return the matched string
    if match:
        return match.group(1)
    else:
        return default

def get_section_type(section):
    prompt = "Section types are defined in " + str(stdf.to_json()) + "; please assign a section type (eg Lead section or Body Sections) for the following section: \n\n" + section + "\n\nPlease return the name of the section type as defined in the above file but delimited in the format: <section type>"
    response = eval_llm.prompt(prompt)

    return extract_response(response)

In [7]:
with open(path, "r") as file:
        article = file.read()

In [8]:
data = articulate_article(article)

{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'Section types are defined in {"name":{"0":"Lead section","1":"Body sections","2":"Infobox","3":"References","4":"See also","5":"Further reading","6":"External links","7":"Categories","8":"Notes"},"description":{"0":"A concise summary of the article, never divided into sections","1":"Main content of the article, divided into logical sections","2":"Right-aligned summary of key facts","3":"Section listing the sources cited in the article","4":"Internal links to related English Wikipedia articles","5":"Relevant books, articles, or other publications not used as sources","6":"Relevant and appropriate websites not used as sources","7":"Navigational boxes and categories at the end of the article","8":"Additional information and explanations not part of the main text"}}; please assign a section type (eg Lead section or Body Sections) for the following section: \n\n# summary\n\nLlama 3, developed by Meta, is a state-of-the-art large 

In [9]:
df = pd.DataFrame(data)

In [10]:
df

Unnamed: 0,section,section_type,subsection,paragraph_number,claim_number,text
0,# summary,Lead section,# summary,2,1,"Building on the successes of its predecessors,..."
1,# summary,Lead section,# summary,2,2,This model stands out due to its enhanced capa...
2,# summary,Lead section,# summary,2,3,This expansive dataset includes a rich variety...
3,# summary,Lead section,# summary,3,1,Meta employs a combination of manual content r...
4,# summary,Lead section,# summary,3,2,These measures are vital for ensuring that Lla...
...,...,...,...,...,...,...
70,Future Prospects,body sections,Future Prospects,3,3,Efficient data management will also play a cri...
71,Future Prospects,body sections,Future Prospects,4,1,"Alongside Llama 3, Meta is integrating virtual..."
72,Future Prospects,body sections,Future Prospects,4,2,This integration exemplifies the broader appli...
73,Future Prospects,body sections,Future Prospects,5,1,"This structured, iterative development approac..."


In [11]:
result = get_section_type(df.section[0])

{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'Section types are defined in {"name":{"0":"Lead section","1":"Body sections","2":"Infobox","3":"References","4":"See also","5":"Further reading","6":"External links","7":"Categories","8":"Notes"},"description":{"0":"A concise summary of the article, never divided into sections","1":"Main content of the article, divided into logical sections","2":"Right-aligned summary of key facts","3":"Section listing the sources cited in the article","4":"Internal links to related English Wikipedia articles","5":"Relevant books, articles, or other publications not used as sources","6":"Relevant and appropriate websites not used as sources","7":"Navigational boxes and categories at the end of the article","8":"Additional information and explanations not part of the main text"}}; please assign a section type (eg Lead section or Body Sections) for the following section: \n\n# summary\n\nPlease return the name of the section type as defined in

In [12]:
result

'Lead section'

In [13]:
def make_exam_question(sentence):
    prompt = "Please write a question which prompts the respondent to evaluate this sentence based on the wikipedia guidelines: \n\n" +sentence.text + "\n\nPlease return the question in the format: <question>"
    response = eval_llm.prompt(prompt)
    return extract_response(response)

def make_question_guidelines(sentence):
    prompt = "Please write a question which prompts the respondent to evaluate this sentence based on the wikipedia guidelines: \n\n" +sentence.text + "\n\nPlease return the question in the format: <question>"
    response = eval_llm.prompt(prompt)
    return extract_response(response)

exam_guidelines = ["evaluate according the to the requirements in the wikikpedia style guide", 
                   "provide a score between zero and one representing the probability this sentence meets the wikipedia guidelines according to a human wikipedia moderator",]

In [14]:
questions = [make_exam_question(sentence) for sentence in data]
question_guidelines = [make_question_guidelines(sentence) for sentence in data]

{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'Please write a question which prompts the respondent to evaluate this sentence based on the wikipedia guidelines: \n\nBuilding on the successes of its predecessors, Llama 3 emphasizes ethical AI practices and community involvement through an open-source development strategy\n\nPlease return the question in the format: <question>'}]}
{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'Please write a question which prompts the respondent to evaluate this sentence based on the wikipedia guidelines: \n\nThis model stands out due to its enhanced capabilities, stemming from extensive training on a massive dataset of over 15 trillion tokens, which is seven times larger than that of Llama 2\n\nPlease return the question in the format: <question>'}]}
{'model': 'gpt-4o', 'messages': [{'role': 'user', 'content': 'Please write a question which prompts the respondent to evaluate this sentence based on the wikipedia guidelines: 

In [16]:
class Lookup:
    def __init__(self, data):
        self.data = data
        self.model_identifier = "lookup"
        
    def prompt_sequence(self,questions):
        return self.data

In [17]:
evaluator = Exam(questions, question_guidelines, exam_guidelines, GradeLog)

In [18]:
content = [d.text for d in data]
ex = LLMTest(Lookup(content), eval_llm, evaluator)

In [19]:
g= ex.test()

In [20]:
grade_df = pd.DataFrame(g)

In [26]:
#remove non-numeric symbols from a string
def remove_non_numeric(s):
    return re.sub(r'[^\d.]+', '', s)
grade_df["Grade"] = grade_df["Notes"].apply(lambda x: remove_non_numeric(x.split("\n\n")[0].split(": ")[-1]))

In [29]:
threshold = .8
grade_df["Pass"] = grade_df["Grade"].apply(lambda x: float(x) > threshold)
grade_df["Color"] = grade_df["Pass"].apply(lambda x: "yellow" if x else "red")

In [30]:
grade_df

Unnamed: 0,TA,Student,Prompt,Response,Notes,Grade,Pass,Color
0,gpt-4o,lookup,body sections,"Building on the successes of its predecessors,...",Score: 0.7\n\nNotes: Your response partially m...,0.7,False,red
1,gpt-4o,lookup,body sections,This model stands out due to its enhanced capa...,Score: 0.6\n\nNotes: Your response somewhat ad...,0.6,False,red
2,gpt-4o,lookup,body sections,This expansive dataset includes a rich variety...,Score: 0.8\n\nNotes: Your response adheres to ...,0.8,False,red
3,gpt-4o,lookup,body sections,Meta employs a combination of manual content r...,Score: 0.75\n\nNotes: Your response aligns fai...,0.75,False,red
4,gpt-4o,lookup,body sections,These measures are vital for ensuring that Lla...,Score: 0.7\n\nNotes: While your response adher...,0.7,False,red
...,...,...,...,...,...,...,...,...
70,gpt-4o,lookup,body sections,Efficient data management will also play a cri...,**Score: 0.8**\n\n**Notes:** Your response ali...,0.8,False,red
71,gpt-4o,lookup,body sections,"Alongside Llama 3, Meta is integrating virtual...",**Score: 0.75**\n\n**Notes:** Your response al...,0.75,False,red
72,gpt-4o,lookup,body sections,This integration exemplifies the broader appli...,### .65\n\n**Notes:** Your response somewhat a...,.65,False,red
73,gpt-4o,lookup,body sections,"This structured, iterative development approac...",### .7\n\n**Notes:** Your response aligns with...,.7,False,red


In [31]:
#merge grade df with df on index
df = df.merge(grade_df, left_index=True, right_index=True)

In [32]:
df

Unnamed: 0,section,section_type,subsection,paragraph_number,claim_number,text,TA,Student,Prompt,Response,Notes,Grade,Pass,Color
0,# summary,Lead section,# summary,2,1,"Building on the successes of its predecessors,...",gpt-4o,lookup,body sections,"Building on the successes of its predecessors,...",Score: 0.7\n\nNotes: Your response partially m...,0.7,False,red
1,# summary,Lead section,# summary,2,2,This model stands out due to its enhanced capa...,gpt-4o,lookup,body sections,This model stands out due to its enhanced capa...,Score: 0.6\n\nNotes: Your response somewhat ad...,0.6,False,red
2,# summary,Lead section,# summary,2,3,This expansive dataset includes a rich variety...,gpt-4o,lookup,body sections,This expansive dataset includes a rich variety...,Score: 0.8\n\nNotes: Your response adheres to ...,0.8,False,red
3,# summary,Lead section,# summary,3,1,Meta employs a combination of manual content r...,gpt-4o,lookup,body sections,Meta employs a combination of manual content r...,Score: 0.75\n\nNotes: Your response aligns fai...,0.75,False,red
4,# summary,Lead section,# summary,3,2,These measures are vital for ensuring that Lla...,gpt-4o,lookup,body sections,These measures are vital for ensuring that Lla...,Score: 0.7\n\nNotes: While your response adher...,0.7,False,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Future Prospects,body sections,Future Prospects,3,3,Efficient data management will also play a cri...,gpt-4o,lookup,body sections,Efficient data management will also play a cri...,**Score: 0.8**\n\n**Notes:** Your response ali...,0.8,False,red
71,Future Prospects,body sections,Future Prospects,4,1,"Alongside Llama 3, Meta is integrating virtual...",gpt-4o,lookup,body sections,"Alongside Llama 3, Meta is integrating virtual...",**Score: 0.75**\n\n**Notes:** Your response al...,0.75,False,red
72,Future Prospects,body sections,Future Prospects,4,2,This integration exemplifies the broader appli...,gpt-4o,lookup,body sections,This integration exemplifies the broader appli...,### .65\n\n**Notes:** Your response somewhat a...,.65,False,red
73,Future Prospects,body sections,Future Prospects,5,1,"This structured, iterative development approac...",gpt-4o,lookup,body sections,"This structured, iterative development approac...",### .7\n\n**Notes:** Your response aligns with...,.7,False,red


In [34]:
df[df.Color=="yellow"]

Unnamed: 0,section,section_type,subsection,paragraph_number,claim_number,text,TA,Student,Prompt,Response,Notes,Grade,Pass,Color
9,# summary,Lead section,# summary,5,1,"Issues such as overfitting, ethical concerns, ...",gpt-4o,lookup,body sections,"Issues such as overfitting, ethical concerns, ...",Score: 0.85\n\nNotes: Your response is quite w...,0.85,True,yellow
48,Limitations,body sections,Overfitting and Data Dependence,2,1,This occurs when the model performs exceptiona...,gpt-4o,lookup,body sections,This occurs when the model performs exceptiona...,**Score: 0.85**\n\n**Notes:** Your response la...,0.85,True,yellow
52,Limitations,body sections,Computational Resources and Efficiency,2,1,Although improvements in hardware reliability ...,gpt-4o,lookup,body sections,Although improvements in hardware reliability ...,**Score: 0.85**\n\n**Notes:** Your response la...,0.85,True,yellow
55,Limitations,body sections,Dataset Quality and Curation,2,3,Missteps in data curation could lead to biased...,gpt-4o,lookup,body sections,Missteps in data curation could lead to biased...,**Score: 0.85**\n\n**Notes:** Your response ad...,0.85,True,yellow
56,Limitations,body sections,Evaluation Challenges,2,1,Traditional metrics like perplexity might not ...,gpt-4o,lookup,body sections,Traditional metrics like perplexity might not ...,**Score: 0.85**\n\n**Notes:** Your response ad...,0.85,True,yellow


In [35]:
df.to_csv("outputs/Llama_3_(Language_Model)//graded_sentences.csv", index=False)