In [1]:
import os
from getpass import getpass

from openai import OpenAI

# Do not hardcode the API key, and do not overwrite an already-exported key
# with an empty string. Reuse OPENAI_API_KEY when it is set; otherwise prompt
# for it so the secret is never stored in the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# No key is passed explicitly; the client is expected to pick it up from the
# OPENAI_API_KEY environment variable.
client = OpenAI()

In [None]:
# Smoke test: send one system message and one user message, then print the
# text of the first returned choice.
haiku_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about recursion in programming."},
]
completion = client.chat.completions.create(model="gpt-4o-mini", messages=haiku_messages)

print(completion.choices[0].message.content)

# Create question-answer pairs

Use gpt-4o-mini to create question-answer pairs from news articles that were scraped from Yahoo Finance and stored in JSON format.

For each piece of news, create 3 pairs.

In [None]:
import json

# Load the news array. The `with` block closes the file handle deterministically
# instead of leaving an open handle for the garbage collector to clean up.
with open("news.json") as news_file:
    news_json = json.load(news_file)

news_json

In [9]:
# Plain string: there is nothing to interpolate in the system prompt.
system_msg = """
You are a helpful assistant. 
"""

# Serialize the articles as real JSON. Interpolating the Python list directly
# would embed its repr (single-quoted strings, True/False/None), which is not
# the JSON that the prompt promises to the model.
news_json_text = json.dumps(news_json, ensure_ascii=False)

# NOTE: doubled braces ({{ and }}) are f-string escapes for literal { and }.
# The JSON templates below are kept syntactically valid (quoted keys, commas
# between members) because the model is asked to imitate them and its reply is
# parsed with json.loads later in the notebook.
user_msg = f"""
Here is the format of the json array of news you will be working with:
[
	{{
		"title": "The title of the news",
		"content": ["The content of the page"],
		"date": "Date of upload in UTC+0",
		"time": "Time of upload in UTC+0",
		"link": "Link to the page"
	}},
	{{
		"title": "The title of the news",
		"content": ["The content of the page"],
		"date": "Date of upload in UTC+0",
		"time": "Time of upload in UTC+0",
		"link": "Link to the page"
	}},...
]

You are given an array of news in json format. Here is the array in json:
```
{news_json_text}
```

For each news in the json array, I want you to create 3 question-answer pairs based on the title and content of the news. To create such pairs, you can follow these guidelines:
===
1. Preprocess the Article: First, you need to preprocess the article to extract relevant information. This typically includes:
- Splitting the article into sentences or paragraphs.
- Identifying key entities and facts, such as people, places, dates, events, and important concepts.
- Optionally, perform part-of-speech tagging or named entity recognition (NER) to assist in generating meaningful questions.

2. Identify Important Information: 
Determine the key pieces of information from the article that can be turned into questions. Some examples of important > information include:
- Who did something? (Person-related facts)
- What happened? (Events)
- When did something occur? (Dates and time)
- Where did something happen? (Places)
- Why did something happen? (Explanations or reasons)
- How did something happen? (Processes or sequences)
By identifying these details, you can focus on producing questions that revolve around these key facts.

3. Generate Questions
Aim to generate a variety of question types:
- Factual: "When did X happen?"
- Definition: "What is X?"
- Causal: "Why did X happen?"
- Procedural: "How does X work?"
- Inference: "What can be inferred about X?"

4. Generate answer for each question.
5. Structure the output in json format like this:
[
	{{
		"link": "Link to the news that is used to create question-answer pair",
		"date": "Date of upload of the news used to created question-answer pair",
		"context": "The context used to generate question-answer pair",
		"question": "The question created",
		"answer": "The answer for the questions created"
	}},
	...
]
===

For each piece of news, I want you to create 3 question-answer pairs based on the title and content of the news. Here is an example output:
[
    {{
		"link": "https://finance.yahoo.com/news/market-apos-too-optimistic-apos-175752296.html?",
		"date": "2024-10-08",
        "context": "Marie Curie won the Nobel Prize in Physics in 1903.",
        "question": "When did Marie Curie win the Nobel Prize in Physics?",
        "answer": "1903"
    }},
    {{
		"link": "https://finance.yahoo.com/news/mfadsfqwefsdfsa-175752296.html?",
		"date": "2024-10-08",
        "context": "The capital of France is Paris.",
        "question": "What is the capital of France?",
        "answer": "Paris"
    }}
]

Only output in the format specified.
"""

messages = [
    {"role": "system", "content": system_msg},
    {"role": "user", "content": user_msg}
]

In [10]:
from datetime import datetime

completion = client.chat.completions.create(model="gpt-4o-mini", messages=messages)

# The saved file is parsed with json.loads later in the notebook, so it must
# contain bare JSON.
qa_output = completion.choices[0].message.content
if qa_output is None:
    # Fail before creating the file rather than leaving an empty one behind.
    raise ValueError("The model returned no text content; nothing to save.")

# Chat models often wrap JSON in a Markdown fence (```json ... ```), which
# would make the saved file unparseable. Keep only the fenced payload.
if qa_output.lstrip().startswith("```"):
    fenced_body = qa_output.strip().partition("\n")[2]  # drop the opening fence line
    qa_output = fenced_body.rsplit("```", 1)[0].strip()  # drop the closing fence

# Timestamped file name so repeated runs do not overwrite earlier results.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
questions_file_name = f"questions_{current_time}.json"
with open(questions_file_name, "w") as f:
    f.write(qa_output)

# Checking created question-answer pairs

In [16]:
# Reload the generated pairs; the `with` block closes the file handle promptly
# instead of leaking it.
with open(questions_file_name) as questions_file:
    question_answer_pairs = json.load(questions_file)

# Hand the model genuine JSON rather than the Python repr of the list
# (single-quoted strings, True/False/None), since the prompt calls it JSON.
question_answer_pairs_text = json.dumps(question_answer_pairs, ensure_ascii=False)

# Plain string: there is nothing to interpolate in the system prompt.
check_system_msg = """
You are a helpful assistant. 
"""

# NOTE: doubled braces ({{ and }}) are f-string escapes for literal { and }.
# The JSON templates are kept syntactically valid (commas between members, no
# trailing comma after the last example) because the model is asked to answer
# in the same format.
check_user_msg = f"""
You are given a json array of question-answer pairs. Here is the format of the json array of news you will be working with:
[
	{{
		"link": "Link to the news that is used to create question-answer pair",
		"date": "Date of upload of the news used to created question-answer pair",
		"context": "The context used to generate question-answer pair",
		"question": "The question created",
		"answer": "The answer for the questions created"
	}},
	...
]

Here is the array in json:
{question_answer_pairs_text}

For each question-answer pair, you should check if the answer is correct. If the answer is correct, you should respond with "Correct". If the answer is incorrect, you should respond with "Incorrect" and add the correct asnwer.
To check the answer, you can follow these guidelines:
===
1. Read the context and question. You can also refer to the link provided to get more information.
2. Determine if the answer provided is correct based on the context.
3. If the answer is correct, respond with "Correct". If the answer is incorrect, respond with "Incorrect" and provide the correct answer.
4. Repeat the process for each question-answer pair.
5. Structure the output in json format like this:
[
    {{
        "question": "The question created",
        "answer": "The answer for the questions created",
        "response": "Correct" or "Incorrect",
        "correct_answer": "Correct answer if the response is Incorrect"
    }},
    ...
]
===

Here is the example output:
[
    {{
        "question": "When did Marie Curie win the Nobel Prize in Physics?",
        "answer": "1903",
        "response": "Correct"
    }},
    {{
        "question": "What is the capital of France?",
        "answer": "Washington",
        "response": "Incorrect",
        "correct_answer": "Paris"
    }}
]


Only output in the format specified.
"""

check_messages = [
    {"role": "system", "content": check_system_msg},
    {"role": "user", "content": check_user_msg}
]

In [17]:
from datetime import datetime

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=check_messages
)

# The result is saved under a .json name, so store bare JSON only.
check_output = completion.choices[0].message.content
if check_output is None:
    # Fail before creating the file rather than leaving an empty one behind.
    raise ValueError("The model returned no text content; nothing to save.")

# Chat models often wrap JSON in a Markdown fence (```json ... ```); keep only
# the fenced payload so the file can be parsed as JSON.
if check_output.lstrip().startswith("```"):
    fenced_body = check_output.strip().partition("\n")[2]  # drop the opening fence line
    check_output = fenced_body.rsplit("```", 1)[0].strip()  # drop the closing fence

# Timestamped file name so repeated runs do not overwrite earlier results.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
check_questions_file_name = f"check_questions_{current_time}.json"
with open(check_questions_file_name, "w") as f:
    f.write(check_output)

# Misc

In [None]:
# Count the generated pairs. The `with` block closes the file handle instead of
# leaking it.
with open(questions_file_name) as questions_file:
    question_json = json.load(questions_file)
print(len(question_json))

# check_json = json.loads(open(check_questions_file_name).read())
# print(len(check_json))

In [None]:
import json

# Load the final question-answer pairs; the `with` block closes the file handle
# promptly instead of leaking it.
with open("./questions/final_questions.json") as final_file:
    final_quest = json.load(final_file)
print(len(final_quest))

# De-duplicate the links while keeping first-seen order (dict keys preserve
# insertion order), so links.txt is identical across runs. list(set(...))
# yields a different order between interpreter sessions because string hashing
# is randomized.
links = list(dict.fromkeys(qa["link"] for qa in final_quest))
with open("links.txt", "w") as f:
    f.write("\n".join(links))