# CorLingua Eval Dataset Builder

In [1]:
from pydantic import BaseModel, Field, conlist
from typing import Optional, Tuple
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import emoji
import random
import json

load_dotenv()
import instructor
from openai import OpenAI

# Patched OpenAI client: the cells below pass `response_model=` to
# `client.chat.completions.create` to get pydantic objects back.
client = instructor.patch(OpenAI())
# Pool of emoji keys from the `emoji` package; one is sampled per puzzle prompt as its theme.
all_emojis = list(emoji.EMOJI_DATA.keys())

## Sequence Logic Puzzle

In [2]:
class PuzzleValidator(BaseModel):
    # Verdict returned by the validation request for a single puzzle:
    # free-text reasoning first, then the boolean decision.
    analysis: str = Field(
        description="Detailed analysis of the proposed answer.",
    )
    valid: bool = Field(
        description="Whether the proposed answer is valid given the rules.",
    )


class Puzzle(BaseModel):
    # A generated sequence puzzle: its rules, the proposed answer and, once the
    # validation cell has run, the validator's verdict.
    rules: str = Field(
        description="Rules to build a sequence. Must include emojis encoded as UTF-8.",
    )
    answer: str = Field(
        description="Proposed answer to the puzzle (must be unique).",
    )
    validation: Optional[PuzzleValidator] = None

    def validation_prompt(self):
        # Builds the user message asking a model to check this puzzle's answer
        # (validity and uniqueness) against its rules.
        prompt = f""" Below you will be presented with rules to build a sequence and a proposed answer. Your task is to determine if the answer is valid given the rules. First analyze carefully the sequence and if it complies with the rules, then determine its valid status. You must also make sure that the proposed answer is the only possible answer. If there are other possible answers, the proposed answer is invalid.
        RULES: {self.rules}
        PROPOSED ANSWER: {self.answer}"""
        return prompt
    

### Generate Puzzles

In [49]:
def get_puzzle_prompt():
    """Return the puzzle-generation prompt, themed around a randomly chosen emoji.

    The prompt embeds one worked sample puzzle and asks for a new puzzle with
    a different emoji set, sequence length and rules.
    """
    theme_emoji = random.choice(all_emojis)
    return f"""Provide another puzzle like the one presented below, along with its answer. Be very careful and make sure that the answer is unique. Change the set of emojis, the sequence length and the specific rules. Use emojis centered around the theme {theme_emoji}.
    
    SAMPLE PUZZLE
    ===============
    RULES:
    In this puzzle, use the symbols 🪐 (Planet), 🌌 (Galaxy), 🌠 (Shooting Star), 🛰️ (Satellite), 🌑 (New Moon), and ☀️ (Sun).
    1. 🪐 cannot be directly before or after 🛰️.
    2. 🌌 must always precede 🌠 but follow 🌑.
    3. 🛰️ must be immediately after 🌠.
    4. The sequence must start with 🌑.
    5. ☀️ cannot be adjacent to either 🌑 or 🌌.
    6. The sequence contains exactly one of each symbol.
    7. 🪐 must be the last in the sequence.
    
    ANSWER: 🌑, 🌌, 🌠, 🛰️, ☀️, 🪐
    
    YOUR PUZZLE
    ==============="""

In [50]:
# Request 100 puzzles, one API call each; every call gets a freshly themed prompt.
puzzles = []
for i in tqdm(range(100)):
    puzzle = client.chat.completions.create(
        messages=[{"role": "user", "content": get_puzzle_prompt()}],
        response_model=Puzzle,
        model="gpt-4",
        temperature=0.2,
    )
    puzzles.append(puzzle)

# Show the first generated puzzle as a sanity check.
print(puzzles[0])

  0%|          | 0/1 [00:00<?, ?it/s]

rules='In this puzzle, use the symbols 🤜🏽 (Fist), 🤛🏽 (Opposite Fist), 🤝 (Handshake), ✊🏽 (Raised Fist), 👊🏽 (Oncoming Fist), and 🙌🏽 (Raising Hands).\n1. 🤜🏽 cannot be directly before or after 🤛🏽.\n2. 🤝 must always precede ✊🏽 but follow 🤜🏽.\n3. 🤛🏽 must be immediately after ✊🏽.\n4. The sequence must start with 🤜🏽.\n5. 🙌🏽 cannot be adjacent to either 🤜🏽 or 🤝.\n6. The sequence contains exactly one of each symbol.\n7. 👊🏽 must be the last in the sequence.' answer='🤜🏽, 🤝, ✊🏽, 🤛🏽, 🙌🏽, 👊🏽' validation=None


### Validate Puzzles

In [31]:
# Ask the model to judge each puzzle and attach the verdict to the puzzle itself.
for puzzle in tqdm(puzzles):
    validation_messages = [{"role": "user", "content": puzzle.validation_prompt()}]
    puzzle_validator = client.chat.completions.create(
        model="gpt-4",
        response_model=PuzzleValidator,
        messages=validation_messages,
    )
    puzzle.validation = puzzle_validator

  0%|          | 0/10 [00:00<?, ?it/s]

### Save

In [34]:
## Remove invalid puzzles.
# `validation` stays None for any puzzle the validation cell never reached
# (cell skipped, interrupted, or puzzles regenerated afterwards). Such puzzles
# have not passed validation, so they are dropped rather than crashing on
# `None.valid`.
puzzles = [
    puzzle
    for puzzle in puzzles
    if puzzle.validation is not None and puzzle.validation.valid
]

# Serialize once and reuse the same records for both output formats.
puzzle_records = [puzzle.model_dump() for puzzle in puzzles]

## Store as JSON.
with open("sequence_logic.json", "w") as f:
    json.dump(puzzle_records, f)

## As CSV.
df = pd.DataFrame(puzzle_records)
df.to_csv("sequence_logic.csv", index=False)

In this puzzle, use the symbols 🐉 (Dragon), 🗡️ (Sword), 🛡️ (Shield), and 🔮 (Crystal).
1. 🐉 cannot be adjacent to 🗡️.
2. 🛡️ must always precede 🔮.
3. The sequence must start with 🗡️.
4. 🐉 cannot be the last in the sequence.
5. The sequence contains exactly one of each symbol.
🗡️, 🔮, 🛡️, 🐉


## Contextual Dissonance Test

In [57]:
class PassageTopicList(BaseModel):
    # Essay topics returned by the topic request; accepted when the list holds 30 to 70 entries.
    topics: conlist(str, min_length=30, max_length=70) = Field(
        description="List of 50 topics.",
    )


class ContextualPassage(BaseModel):
    # One generated passage together with the off-topic sentence injected into it.
    topic: str = Field(
        description="Topic of the passage.",
    )
    irrelevant_topic: str = Field(
        description="Topic that is irrelevant to the passage, but seems superficially related (i.e.: close in embedding space).",
    )
    passage: str = Field(
        description="Passage that is relevant to the topic.",
    )
    irrelevant_sentence: str = Field(
        description="Sentence that is irrelevant to the topic.",
    )

### Create Passage Topics

In [58]:
# Single request that produces the essay topics used to seed passage generation.
passage_topics_request = "Provide me with 50 topic names for essays, such as 'The Progress in Space Telescopes and Astronomical Observation', 'The Impact of the Internet on Society', 'Advances in Electric Vehicle Battery Technology', and so on. Do not use these topics, but name your topics in a similar fashion. Make all your topics simple and engaging."
passage_topics = client.chat.completions.create(
    messages=[{"role": "user", "content": passage_topics_request}],
    response_model=PassageTopicList,
    model="gpt-4-1106-preview",
    temperature=0.2,
)

### Create Passages

In [65]:
def get_passage_prompt(topic):
    """Return the prompt asking for a 5-paragraph passage on *topic*.

    The prompt instructs the model to hide one subtly irrelevant sentence in
    the passage and includes a worked example (topic, irrelevant topic,
    passage, irrelevant sentence).
    """
    return f"""Your task is to write a 5 paragraph passage on the topic **"{topic}"**. On this passage you must inject an irrelevant sentence, one that is seemingly related but actually irrelevant to the main topic. Be sure to inject this sentence in a very subtle way, so that it is not obvious that it is irrelevant. Don't make too obvious with comments like "alongside", "in parallel", "meanwhile", etc. You can take inspiration on the example below, but do not make your response too similar. Make sure that the unrelated sentence is unambiguously irrelevant to the topic.
    
    EXAMPLE PASSAGE
    ================
    ### Topic
    The Progress in Space Telescopes and Astronomical Observation
    
    ### Irrelevant Topic
    Advances in Electric Vehicle Battery Technology
    
    ### Passage
    The field of astronomical observation has been revolutionized by the advent of space telescopes. Unlike ground-based telescopes, space telescopes can observe the universe without the distortion caused by Earth's atmosphere. This has allowed for clearer and more detailed observations of distant celestial bodies and phenomena. The Hubble Space Telescope, launched in 1990, has been one of the most successful, providing invaluable data and spectacular images of distant galaxies, nebulas, and stars.
    
    Subsequent space telescopes, like the James Webb Space Telescope, aim to build upon Hubble's legacy, using even more advanced technology to peer further into the universe. These telescopes are designed to observe in different wavelengths, from visible light to infrared, enabling astronomers to uncover secrets of the universe that were previously hidden. While these astronomical advancements continue, there has also been notable progress in electric vehicle battery technology, enhancing the efficiency and range of electric cars.
    
    These advancements in space observation are not just expanding our knowledge of the universe; they're also crucial in searching for extraterrestrial life and understanding the origins of the cosmos. The data gathered by these telescopes contribute to our understanding of fundamental physics, like the nature of dark matter and the expansion of the universe.
    
    Space telescopes represent a significant investment in scientific inquiry and discovery. Their continued development and the insights they provide are essential for advancing our understanding of the universe and our place within it.
    
    ### Irrelevant Sentence
    While these astronomical advancements continue, there has also been notable progress in electric vehicle battery technology, enhancing the efficiency and range of electric cars.
    """

In [66]:
# Generate one passage (with its injected irrelevant sentence) per topic.
passages = []
for passage_topic in tqdm(passage_topics.topics):
    passage = client.chat.completions.create(
        messages=[{"role": "user", "content": get_passage_prompt(passage_topic)}],
        response_model=ContextualPassage,
        model="gpt-4-1106-preview",
        temperature=0.2,
    )
    passages.append(passage)

# Preview the first passage and the sentence that was planted in it.
first_passage = passages[0]
print(first_passage.passage)
print("===============")
print(first_passage.irrelevant_sentence)

  0%|          | 0/1 [00:00<?, ?it/s]

The growth of virtual reality (VR) has been one of the most exciting developments in technology over the past decade. Initially conceived as a novel form of entertainment, VR has rapidly evolved into a multifaceted platform with applications spanning various industries. From immersive video games that transport players to other worlds to virtual training simulations for surgeons, the potential of VR seems boundless. Its ability to simulate realistic environments has made it an invaluable tool in education, where students can explore historical sites or conduct complex scientific experiments in a safe, controlled virtual space.

As VR technology continues to advance, its impact is being felt in more unexpected areas. Real estate agents, for example, are using VR to offer virtual tours of properties, allowing potential buyers to explore homes without physical travel. This not only saves time but also broadens the market reach for sellers. In the field of mental health, VR is being used f

### Save

In [ ]:
# Dump every passage to plain dicts once; both output formats share these records.
passage_records = [passage.model_dump() for passage in passages]

## Store as JSON.
with open("contextual_incongruity.json", "w") as f:
    json.dump(passage_records, f)

## As CSV.
df = pd.DataFrame(passage_records)
df.to_csv("contextual_incongruity.csv", index=False)

## Spatial Navigation & Reasoning

In [68]:
class CityNameList(BaseModel):
    # City names returned by the naming request; accepted when the list holds 80 to 150 entries.
    city_names: conlist(str, min_length=80, max_length=150) = Field(
        description="List of 100 imaginary thematic city names.",
    )
    

class CityQnaValidator(BaseModel):
    # Verdict returned by the validation request for a single city Q&A pair.
    analysis: str = Field(
        description="Detailed analysis of the proposed answer.",
    )
    valid: bool = Field(
        description="Whether the proposed answer is valid given the rules.",
    )
    
    
class CityQnA(BaseModel):
    # One navigation question about a city together with its route answer.
    question: str = Field(description="Question on how to navigate the city.")
    answer: str = Field(description="Answer to the question, following the specified syntax.")
    # Filled in by the validation cell (`qna.validation = ...`). The field must be
    # declared here: pydantic models reject assignment to attributes that are not
    # fields. Declared the same way as on Puzzle, EventsCascade and Evidence.
    validation: Optional[CityQnaValidator] = None

    def validation_prompt(self, city_layout):
        # Builds the user message asking a model to check this answer against
        # the given city layout description.
        return f""" Below you will be presented with a city layout ,a question about how to navigate it and an answer to that question. Your task is to determine if the answer is valid given the city layout.
        
CITY LAYOUT: {city_layout}
QUESTION: {self.question}
ANSWER: {self.answer}
"""


class City(BaseModel):
    # A generated city: its name, a layout description and exactly five Q&A pairs.
    name: str = Field(
        description="Name of the city.",
    )
    description: str = Field(
        description="Detailed description of the city layout.",
    )
    qna: conlist(CityQnA, min_length=5, max_length=5) = Field(
        description="Questions and answers about the city layout.",
    )

### Create City Names

In [40]:
# Single request that produces the city names used to seed layout generation.
city_names_request = "Provide me with 100 names of imaginary thematic cities, such as Gardenview, Beachside Bay, Metropolis Haven, and so on. Do not use these names, but name your cities in a similar fashion. Make all your city names simple and visual."
city_names = client.chat.completions.create(
    messages=[{"role": "user", "content": city_names_request}],
    response_model=CityNameList,
    model="gpt-4-1106-preview",
    temperature=0.2,
)

In [41]:
city_names

CityNameList(city_names=['Arborville', 'Aurora Ridge', 'Azure Harbor', 'Blossomvale', 'Bouldercrest', 'Cascade Corner', 'Celestial City', 'Cherrybloom Town', 'Cinderpeak', 'Cliffhaven', 'Cloudtop Village', 'Cobalt Coast', 'Crimson Canyon', 'Crystal Springs', 'Dawnlight', 'Dewdrop Dell', 'Diamondshore', 'Dusktown', "Eagle's Rest", 'Emerald Estates', 'Falconridge', 'Ferngrove', 'Frostfield', 'Glacierpoint', 'Goldenleaf', 'Harvest Hollow', 'Havenwood', 'Hazelwood', 'Horizon Heights', 'Icicle Isle', 'Ironforge', 'Ivybridge', 'Jadestone', 'Lakeshore Landing', 'Lavender Ledge', 'Lighthouse Point', 'Lilypond', 'Lunar Lakes', 'Magnolia Meadows', 'Marblecliff', 'Meadowmere', 'Midnight Cove', 'Mistwood', 'Moonshadow', 'Mossrock', 'Nebula Nexus', 'Nectarine Nook', 'Oceanview Oasis', 'Opal Orchard', 'Orchid Isle', 'Palm Paradise', 'Pebblebrook', 'Pinecrest', 'Platinum Port', 'Polaris Point', 'Prairie Palace', 'Quartz Quarry', 'Radiant Reef', 'Raindrop Retreat', 'Ravenwood', 'Redwood Rise', 'Riverb

### Create City Layouts & Q&A

In [71]:
def get_city_prompt(city_name):
    """Return the prompt asking for a city layout plus five navigation Q&As.

    The prompt defines the answer syntax (WALK / TURN / REACH joined by '->',
    or 'NOT POSSIBLE'), shows a worked example city, and ends with
    *city_name* as the name of the city to generate.
    """
    return f"""Your task is to generate a city layout along with 5 questions and answers about it. The questions must be about how to navigate the city, and the answers must be in the form of a sequence of actions using the format: ACTION (detail). Actions include WALK (street or area), TURN (direction at a landmark), and REACH (destination). Separate each action with '->'. If a route is not possible, answer with 'NOT POSSIBLE'.

EXAMPLE CITY
================
### City Name
Gardenview

### City Description
Gardenview is a city designed with an emphasis on green spaces. The central landmark is the Green Circle, a large roundabout surrounded by lush gardens. Extending from Green Circle are four main boulevards: Floral Boulevard (north), Orchard Avenue (east), Meadow Street (south), and Grove Road (west), each leading to Green Belt Road that encircles the city. Notable locations include the Gardenview Library at the intersection of Orchard Avenue and Green Belt Road, and the Gardenview Hospital at the crossroads of Meadow Street and Green Belt Road. In the northeast corner of the city lies the Botanical Garden, accessible via Orchard Avenue. A small river, Stream Vista, flows from north to south, cutting across Floral Boulevard and Meadow Street, with a single bridge on Floral Boulevard. **Due to recent construction, the bridge on Floral Boulevard over Stream Vista is temporarily closed, making crossing the river at this point impossible.**

### Q&A
1. **Question:** Find the route from Green Circle to Gardenview Library.
   - **Answer:** WALK Orchard Avenue -> REACH Gardenview Library.

2. **Question:** Find the route from Botanical Garden to Gardenview Hospital.
   - **Answer:** WALK Orchard Avenue -> TURN onto Green Belt Road -> TURN onto Meadow Street -> REACH Gardenview Hospital.

3. **Question:** Find the route from Gardenview Hospital to Green Circle.
   - **Answer:** WALK Meadow Street -> REACH Green Circle.

4. **Question:** Find the route from Green Circle to the Botanical Garden, passing through the Gardenview Hospital.
   - **Answer:** WALK Meadow Street -> TURN left at Green Belt Road -> WALK Green Belt Road -> REACH Gardenview Hospital -> WALK Green Belt Road -> TURN right onto Orchard Avenue -> WALK Orchard Avenue -> REACH Botanical Garden.

5. **Question:** Find the route from Gardenview Library to the Stream Vista bridge on Floral Boulevard.
   - **Answer:** NOT POSSIBLE.
   
YOUR CITY
===============
Now provide your own thematic city layout and questions/answers. Be sure to follow the format of the example above, but use a very different city layout and questions/answers. You can take inspiration from the example above, but do not make your response too similar. Always frame your questions in the same format (e.g.: Find the route from X to Y).

### City Name
{city_name}
"""

In [70]:
# Generate one city (layout description plus Q&A pairs) per city name.
cities = []
for city_name in tqdm(city_names.city_names):
    city = client.chat.completions.create(
        messages=[{"role": "user", "content": get_city_prompt(city_name)}],
        response_model=City,
        model="gpt-4-1106-preview",
        temperature=0.2,
    )
    cities.append(city)

# Preview the first city's layout and its Q&A pairs.
first_city = cities[0]
print(first_city.description)
print("===============")
print(first_city.qna)

  0%|          | 0/1 [00:00<?, ?it/s]

Cloudtop Village is a quaint mountain town known for its elevated views and winding paths. The central feature of the town is the Summit Plaza, a large open square that offers panoramic views of the surrounding peaks. Radiating from Summit Plaza are three main paths: Alpine Path (north), which leads to the Skyline Ridge; Crestway Trail (east), which winds down to the Valley Market; and Peakside Lane (south), which goes towards the Cloudtop Observatory. Notable locations include the Mountain Museum, situated halfway along Alpine Path, and the Eagle's Nest Inn, located at the end of Peakside Lane. A cable car line runs from Summit Plaza to the base of the mountain, with a mid-station stop at the Valley Market. Due to recent rockslides, Crestway Trail is partially blocked, making travel between Summit Plaza and Valley Market challenging.
[CityQnA(question='Find the route from Summit Plaza to the Mountain Museum.', answer='WALK Alpine Path -> REACH Mountain Museum.'), CityQnA(question="Fin

### Validate Cities

In [ ]:
# Validate every Q&A pair of every city against that city's layout description.
# NOTE(review): the assignment below needs `CityQnA` to declare a `validation`
# field; pydantic rejects assignment to attributes that are not model fields.
for city in tqdm(cities):
    for qna in city.qna:
        qna_messages = [{"role": "user", "content": qna.validation_prompt(city.description)}]
        city_qna_validator = client.chat.completions.create(
            model="gpt-4-1106-preview",
            response_model=CityQnaValidator,
            messages=qna_messages,
        )
        qna.validation = city_qna_validator

### Save

In [ ]:
## Remove invalid cities.
# `City` has no `validation` attribute of its own: the validation loop stores a
# verdict on each Q&A entry. A city is therefore kept only when every one of
# its Q&A pairs carries a verdict and that verdict is valid. `getattr` with a
# None default makes a missing or unset verdict count as "not valid" instead of
# raising.
cities = [
    city
    for city in cities
    if all(
        getattr(qna, "validation", None) is not None and qna.validation.valid
        for qna in city.qna
    )
]

# Serialize once and reuse the same records for both output formats.
city_records = [city.model_dump() for city in cities]

## Store as JSON.
with open("spatial_navigation.json", "w") as f:
    json.dump(city_records, f)

## As CSV.
df = pd.DataFrame(city_records)
df.to_csv("spatial_navigation.csv", index=False)

## Dependency Cascade

In [81]:
class CascadeTopicList(BaseModel):
    # Documentary-style topics returned by the topic request; accepted when the list holds 30 to 70 entries.
    topics: conlist(str, min_length=30, max_length=70) = Field(
        description="List of 50 topics.",
    )
    
class CascadeValidator(BaseModel):
    # Verdict returned by the validation request for a single event cascade.
    analysis: str = Field(
        description="Detailed analysis of the proposed answer.",
    )
    valid: bool = Field(
        description="Whether the proposed answer is valid given the rules.",
    )
    
class EventsCascade(BaseModel):
    # Exactly five (event, dependency) string pairs plus, once the validation
    # cell has run, the validator's verdict.
    events: conlist(Tuple[str, str], min_length=5, max_length=5) = Field(
        description="Ordered list of 5 event-dependency tuples, e.g.: (event, dependency).",
    )
    validation: Optional[CascadeValidator] = None

    def validation_prompt(self):
        # Builds the user message asking a model to check that the events respect
        # their dependencies and admit no other ordering.
        prompt = f""" Below you will be presented with a series of events and their dependencies. Your task is to determine if the events are valid given the dependencies and if they cannot be arranged in any other way. First analyze carefully the events and dependencies, then determine their valid status.

CASCADE: {self.events}
"""
        return prompt

### Create Cascade Topics

In [82]:
# Single request that produces the topics used to seed cascade generation.
cascade_topics_request = "Provide me with 50 topic names that could be candidates for an engaging event-based documentary. For example, 'Bioluminecent Plant Discovery', 'Drug Trafficking Network Dismantled', 'Rare Mineral Discovery', and so on. Do not use these topics, but name your topics in a similar fashion. Make all your topics simple and engaging."
cascade_topics = client.chat.completions.create(
    messages=[{"role": "user", "content": cascade_topics_request}],
    response_model=CascadeTopicList,
    model="gpt-4-1106-preview",
    temperature=0.2,
)

In [83]:
cascade_topics

CascadeTopicList(topics=['The Great Barrier Reef Revival', 'Antarctic Ice Shelf Collapse', 'Amazon Rainforest Preservation Efforts', 'The Rise of Electric Vehicles', 'Solar Power Breakthrough', 'Deep Sea Exploration Advances', 'Wildlife Trafficking Crackdown', 'Coral Bleaching Solutions', 'Plastic Pollution Cleanup Innovations', 'Renewable Energy Milestones', 'Artificial Intelligence in Medicine', 'Cybersecurity in the Digital Age', 'Quantum Computing Revolution', 'Space Tourism Takes Off', 'Mars Colonization Plans', 'The Future of Urban Farming', 'Ocean Acidification Countermeasures', 'Mass Extinction Prevention Strategies', 'Climate Change Mitigation Successes', 'Pandemic Response and Preparedness', 'Global Water Scarcity Solutions', 'Breakthroughs in Cancer Treatment', 'Gene Editing Ethical Debates', 'Revolutionizing Education with Technology', 'The Gig Economy Transformation', 'Cryptocurrency and Financial Markets', 'The eSports Explosion', 'Virtual Reality Entertainment Boom', 'Au

### Create Cascades

In [86]:
def get_cascade_prompt(topic):
    """Return the prompt asking for a five-event dependency cascade on *topic*.

    The prompt requires strictly ordered events with explicit dependencies and
    includes one worked example cascade (events A through E).
    """
    return f"""Your task is to create a "Sequential Logic Cascade" based on the topic '{topic}'. Each scenario should consist of a series of five events with strict, non-interchangeable dependencies. Make sure that the events could not happen in any other order and that events could absolutely not happened without their dependencies. You can take inspiration from the example below, but do not make your response too similar.

EXAMPLE CASCADE
================
### Topic
Bioluminescent Plant Discovery

### Events & Dependencies
- Event A: A new species of bioluminescent plant is discovered in a deep-sea expedition.
    * Dependency: The discovery of the plant is the basis for all subsequent events.
- Event B: Scientists sequence the genome of the bioluminescent plant.
    * Dependency: Requires the plant's discovery and access to advanced genetic sequencing technology.
- Event C: A breakthrough in synthetic biology enables the replication of bioluminescent genes in other plants.
    * Dependency: Directly dependent on understanding the bioluminescent plant's genome.
- Event D: The first bioluminescent trees are grown in a controlled lab environment.
    * Dependency: Can only occur after the successful replication of bioluminescent genes.
- Event E: These bioluminescent trees are introduced into urban areas to reduce the need for streetlights.
    * Dependency: Occurs only after the trees have been successfully grown and proven safe and effective in a lab.
"""

In [88]:
# Generate one event cascade per topic.
cascades = []
for cascade_topic in tqdm(cascade_topics.topics):
    cascade = client.chat.completions.create(
        messages=[{"role": "user", "content": get_cascade_prompt(cascade_topic)}],
        response_model=EventsCascade,
        model="gpt-4-1106-preview",
        temperature=0.2,
    )
    cascades.append(cascade)

# Show the first generated cascade as a sanity check.
print(cascades[0])

  0%|          | 0/1 [00:00<?, ?it/s]

events=[('Event A: Theoretical framework for quantum computing is established.', 'Dependency: This is the fundamental scientific understanding that allows for the conceptualization of quantum computing.'), ('Event B: Physical realization of the first qubit.', 'Dependency: Requires the theoretical framework to be established and practical experimentation in quantum mechanics.'), ('Event C: Development of the first quantum algorithm.', 'Dependency: Directly dependent on the existence of qubits to run the algorithm.'), ('Event D: Construction of the first small-scale quantum computer.', 'Dependency: Can only occur after the development of quantum algorithms that need to be tested on actual quantum hardware.'), ('Event E: Quantum computers surpass classical computers in certain tasks (quantum supremacy).', 'Dependency: Occurs only after the successful construction and testing of quantum computers.')] validation=None


### Validate Cascades

In [ ]:
# Ask the model to judge each cascade and attach the verdict to the cascade itself.
for cascade in tqdm(cascades):
    cascade_messages = [{"role": "user", "content": cascade.validation_prompt()}]
    cascade_validator = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_model=CascadeValidator,
        messages=cascade_messages,
    )
    cascade.validation = cascade_validator

### Save

In [ ]:
## Remove invalid cascades.
# `validation` stays None for any cascade the validation cell never reached
# (cell skipped or interrupted). Such cascades have not passed validation, so
# they are dropped rather than crashing on `None.valid`.
cascades = [
    cascade
    for cascade in cascades
    if cascade.validation is not None and cascade.validation.valid
]

# Serialize once and reuse the same records for both output formats.
cascade_records = [cascade.model_dump() for cascade in cascades]

## Store as JSON.
with open("sequential_logic_cascade.json", "w") as f:
    json.dump(cascade_records, f)

## As CSV.
df = pd.DataFrame(cascade_records)
df.to_csv("sequential_logic_cascade.csv", index=False)

## Hypothesis Testing

In [19]:
class HypothesisTopicList(BaseModel):
    # Hypotheses returned by the topic request; accepted when the list holds 30 to 70 entries.
    topics: conlist(str, min_length=30, max_length=70) = Field(
        description="List of 50 scientific hypothesis topics.",
    )
    

class EvidenceValidator(BaseModel):
    # Verdict on whether a piece of evidence was labelled correctly.
    analysis: str = Field(
        description="Analysis of weather the evidence supports the hypothesis or not, and the correctness of the 'supports' flag.",
    )
    valid: bool = Field(
        description="Whether the 'supports' flag is correct or not.",
    )


class Evidence(BaseModel):
    # One piece of evidence, its stance label and, once the validation cell has
    # run, the validator's verdict on that label.
    evidence: str = Field(
        description="Evidence related to the hypothesis.",
    )
    supports: str = Field(
        description="Whether the evidence supports, refutes or is neutral to the hypothesis. Possible values are 'Supports', 'Refutes' and 'Neutral'.",
    )
    validation: Optional[EvidenceValidator] = None

    def validation_prompt(self, hypothesis):
        # Builds the user message asking a model to check this evidence's stance
        # label against the given hypothesis.
        prompt = f""" Below you will be presented with a hypothesis, a piece of evidence related to it and an assessment of whether the evidence supports, refutes or is neutral to the hypothesis. Your task is to determine if the assessment is correct.
HYPOTHESIS: {hypothesis}
EVIDENCE: {self.evidence}
ASSESSMENT: {self.supports}
"""
        return prompt

class HypothesisTest(BaseModel):
    # A hypothesis together with exactly five labelled pieces of evidence.
    hypothesis: str = Field(
        description="Hypothesis to be tested.",
    )
    evidence: conlist(Evidence, min_length=5, max_length=5) = Field(
        description="List of evidence to be tested against the hypothesis.",
    )
    # NOTE(review): typed with PuzzleValidator from the Sequence Logic Puzzle
    # section, so this cell depends on that section having been run. Looks like a
    # copy-paste leftover; confirm whether a hypothesis-level verdict is needed.
    validation: Optional[PuzzleValidator] = None

### Create Hypothesis Topics

In [5]:
# Single request that produces the hypotheses used to seed hypothesis-test generation.
hypothesis_topics_request = "Provide me with 50 scientific hypothesis that could be subject to testing. For example, 'Eating a diet high in fruits and vegetables leads to better cardiovascular health.', 'The presence of water on a planet is a necessary condition for life.', 'The use of antibiotics in livestock leads to antibiotic resistance in humans.', and so on. Do not use these hypothesis, but name your hypothesis in a similar fashion. Make all your hypothesis simple and engaging."
hypothesis_topics = client.chat.completions.create(
    messages=[{"role": "user", "content": hypothesis_topics_request}],
    response_model=HypothesisTopicList,
    model="gpt-4-1106-preview",
    temperature=0.2,
)

In [6]:
hypothesis_topics

HypothesisTopicList(topics=['Regular exercise improves cognitive function in the elderly.', 'Consuming probiotics daily enhances the human immune system.', 'Sleeping less than six hours a night increases the risk of obesity.', 'Exposure to blue light before bedtime disrupts sleep quality.', 'Plant-based diets reduce the risk of developing type 2 diabetes.', 'Meditation reduces symptoms of anxiety and depression.', 'Children who learn a musical instrument perform better academically.', 'Drinking green tea daily can lower the risk of certain cancers.', 'Social media use is correlated with increased feelings of loneliness.', 'Bilingual individuals have a lower risk of developing dementia.', 'Dark chocolate consumption improves brain function.', 'Air pollution is associated with higher rates of asthma in urban areas.', 'Introducing allergenic foods early in life decreases the risk of allergies.', 'Video game play enhances visual-spatial skills.', 'Urban green spaces contribute to improved 

### Create Hypothesis Tests

In [20]:
def get_hypothesis_prompt(topic):
    """Return the prompt asking for a hypothesis test built around *topic*.

    The prompt requests a hypothesis plus five pieces of evidence labelled as
    supporting, refuting or neutral, and includes one worked example.
    """
    return f"""Your task is to create a hypothesis test for the hypothesis '{topic}'. The test should consist of a hypothesis and five pieces of evidence that either support, refute or are neutral to the hypothesis. You can take inspiration from the example below, but do not make your response too similar.
    
EXAMPLE HYPOTHESIS TEST
=======================    
### Hypothesis
Eating a diet high in fruits and vegetables leads to better cardiovascular health.
    
### Evidence
- Evidence 1: A published study showing lower blood pressure in individuals eating more greens. (Supports)
- Evidence 2: A recent survey linking high fruit consumption to reduced cholesterol levels. (Supports)
- Evidence 3: An article arguing that genetics play a more significant role in cardiovascular health than diet. (Refutes)
- Evidence 4: A case study where no significant health improvement was seen in individuals who increased their fruit and vegetable intake. (Refutes)
- Evidence 5: A nutritionist stating the importance of a balanced diet, including fruits and vegetables, for overall health. (Neutral)
"""

In [21]:
# Generate one hypothesis test (hypothesis plus labelled evidence) per topic.
hypothesis_tests = []
for hypothesis_topic in tqdm(hypothesis_topics.topics):
    hypothesis_test = client.chat.completions.create(
        messages=[{"role": "user", "content": get_hypothesis_prompt(hypothesis_topic)}],
        response_model=HypothesisTest,
        model="gpt-4-1106-preview",
        temperature=0.2,
    )
    hypothesis_tests.append(hypothesis_test)

  0%|          | 0/1 [00:00<?, ?it/s]

### Validate Hypothesis Tests

In [22]:
# Check the stance label of every piece of evidence against its hypothesis and
# attach the verdict to the evidence entry.
for hypothesis_test in tqdm(hypothesis_tests):
    for evidence in hypothesis_test.evidence:
        evidence_messages = [
            {"role": "user", "content": evidence.validation_prompt(hypothesis_test.hypothesis)}
        ]
        evidence_validator = client.chat.completions.create(
            model="gpt-4-1106-preview",
            response_model=EvidenceValidator,
            messages=evidence_messages,
        )
        evidence.validation = evidence_validator

  0%|          | 0/1 [00:00<?, ?it/s]

### Save

In [ ]:
## Remove invalid hypothesis tests.
# A test is kept only when every piece of evidence carries a verdict and that
# verdict is valid. Evidence whose `validation` is still None (validation cell
# skipped or interrupted) counts as not validated instead of crashing on
# `None.valid`.
hypothesis_tests = [
    hypothesis_test
    for hypothesis_test in hypothesis_tests
    if all(
        evidence.validation is not None and evidence.validation.valid
        for evidence in hypothesis_test.evidence
    )
]

## Store as JSON.
with open("hypothesis_testing.json", "w") as f:
    json.dump([hypothesis_test.model_dump() for hypothesis_test in hypothesis_tests], f)
    
## As CSV.
df = pd.DataFrame([hypothesis_test.model_dump() for hypothesis_test in hypothesis_tests])