In [39]:
import os
import re
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv; load_dotenv()

# The API key is looked up in the process environment (load_dotenv() is called
# on the import line above) and handed to the OpenAI client explicitly.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Silence every Python warning for the remainder of the kernel session.
import warnings
warnings.filterwarnings(action='ignore')

## Generating up to 10 Questions, Answers, and Caption-Aware Answers for Single-View Charts

In [40]:
# Load the chart table. Later cells read its `views`, `imageid`,
# `full_caption` and `image_base64` columns.
df = pd.read_csv('./200charts.csv')

In [41]:
# Keep only the rows labelled 'single view' and renumber them from 0 so the
# row index printed by the processing loop is contiguous.
is_single_view = df['views'].eq('single view')
df = df.loc[is_single_view].reset_index(drop=True)
df.head()

Unnamed: 0,imageid,full_caption,image_base64,domain,chart_type,views
0,82,private and publicsector investment in rd clas...,iVBORw0KGgoAAAANSUhEUgAAA5EAAAEbCAIAAADMBJd/AA...,Healthcare,Bar Graph,single view
1,149,Timing when the stabilization scenarios achiev...,iVBORw0KGgoAAAANSUhEUgAABDkAAAGJCAIAAAAPHn+RAA...,Climate Science,Scatter Plot,single view
2,196,Decomposition of the change in total annual c...,iVBORw0KGgoAAAANSUhEUgAABCsAAAGhCAIAAADHuqkfAA...,Climate Science,Bar Graph,single view
3,203,"Summary of projected changes in crop yields, ...",iVBORw0KGgoAAAANSUhEUgAAAvgAAAEuCAIAAAB0z/EbAA...,Climate Science,Bar Graph,single view
4,236,Projections and uncertainties for global mean ...,iVBORw0KGgoAAAANSUhEUgAABCMAAAG7CAIAAAB2IMgWAA...,Energy,Bar Graph,single view


In [42]:
# Instruction text sent, together with the chart image, in the first request of
# the generation loop. It asks the model for objective, quantifiable questions
# that can be answered from the chart alone and for a numbered-list reply
# ("1. ...", "2. ..."), which is the shape the loop's regex parsing expects.
# NOTE(review): the output template below shows four items, while the system
# message used at the call site allows up to 10 questions - confirm intent.
prompt = '''
        I have a chart, and I need a list of questions generated from it. Your task is to create **objective and quantifiable questions** that can be fully answered using only the information in the chart.  

        ### **Guidelines for Question Generation:**  
        1. **Ensure all questions are measurable and verifiable.**  
        - Use **specific numerical comparisons** instead of vague terms like *"significant," "important," or "major changes"*.  
        - Example: ✅ *"What is the percentage increase in sales from 2015 to 2020?"*  

        2. **Clearly define the scope.**  
        - Specify **time periods, categories, or comparison criteria** to avoid ambiguity.  
        - Example: ✅ *"What was the compound annual growth rate (CAGR) of wind energy from 2010 to 2020?"*  

        3. **Avoid speculative or assumption-based wording.**  
        - Do not include *"why"* unless causation is explicitly provided in the chart.  
        - Example: ✅ *"What was the CO₂ emission increase (in percentage) from 2019 to 2020?"*  

        4. **Ensure all questions are fully answerable using the given chart.**  
        - The questions should require no external knowledge.  
        - Example: ✅ *"How does the GDP growth rate shown in the chart correlate with inflation trends?"*  

        ### **Format for Output:**  
        Generate a numbered list of refined questions:  

        1. (Generated question 1)  
        2. (Generated question 2)  
        3. (Generated question 3)  
        4. (Generated question 4)  

        Now, generate the list of refined questions based on the attached chart.

'''

In [43]:
MODEL = "gpt-4o"   # chat model used for all three requests per chart
max_epochs = 89    # upper bound on the number of charts sent to the model
current_epoch = 0  # charts attempted so far (successes and failures alike)

results = []  # one dict per successfully processed chart; becomes a row of qa_df

# A list item is a line that *starts* with "<number>. " (optionally preceded by
# indentation or markdown markers such as "**", "#", "-", ">").  Anchoring at the
# line start keeps sentences like "... in 2020. The value ..." on a continuation
# line from being mistaken for a new item.
NUMBERED_ITEM = re.compile(r"^[ \t#*_>-]*(\d+)\.\s+(.+)$", re.MULTILINE)


def parse_numbered_list(text):
    """Return ``{item number: item text}`` for every "N. text" line in ``text``.

    Only the first occurrence of a number is kept, so a nested or restarted
    sub-list cannot overwrite a top-level item.  Surrounding whitespace is
    stripped from each item.  Raises ``TypeError`` if ``text`` is not a string
    (e.g. the model returned no content); the caller's ``except`` reports it.
    """
    items = {}
    for match in NUMBERED_ITEM.finditer(text):
        items.setdefault(int(match.group(1)), match.group(2).strip())
    return items


def ask_model(content, system=None):
    """Send one chat-completion request and return the reply text.

    ``content`` is the list of text / image parts for the user message;
    ``system`` is an optional system instruction placed before it.
    """
    messages = [{"role": "user", "content": content}]
    if system is not None:
        messages.insert(0, {"role": "system", "content": system})
    reply = client.chat.completions.create(model=MODEL, messages=messages)
    return reply.choices[0].message.content


for idx, row in df.iterrows():
    if current_epoch >= max_epochs:
        break
    # Counted up front: a failed chart consumes budget exactly like a success.
    current_epoch += 1

    chart = row["image_base64"]
    # pandas reads an empty caption cell as float NaN, which cannot be
    # concatenated to a string; treat it as an empty caption instead of
    # discarding the chart after two API calls have already been made.
    caption = "" if pd.isna(row["full_caption"]) else str(row["full_caption"])

    # Every request for this chart attaches the same inline PNG.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{chart}"},
    }

    try:
        # 1) Generate the questions from the chart.
        response_ques = ask_model(
            [{"type": "text", "text": prompt}, image_part],
            system="Give me maximum of 10 questions",
        )

        # 2) Answer those questions from the chart alone.
        response_ans = ask_model(
            [{"type": "text", "text": response_ques}, image_part]
        )

        # 3) Answer them again with the caption available as extra context.
        response_cap = ask_model(
            [
                {
                    "type": "text",
                    "text": (
                        "Based on the following questions and the provided caption, "
                        "please generate a numbered list of answers that incorporate the caption context. "
                        "However, if the caption serves no context for a particular question, "
                        "then you dont have to forecefully include the caption context\n\n"
                        "Questions:\n" + response_ques
                    ),
                },
                {"type": "text", "text": "Caption: " + caption},
                image_part,
            ]
        )

        questions = parse_numbered_list(response_ques)
        answers = parse_numbered_list(response_ans)
        answer_w_cap = parse_numbered_list(response_cap)

        # Pair by item number rather than by position, so a skipped or extra
        # item in one reply cannot shift every later answer onto the wrong
        # question.  A missing answer is stored as None (empty CSV cell).
        new_row = {"image_id": row["imageid"]}
        for number in sorted(questions):
            new_row[f"Q{number}"] = questions[number]
            new_row[f"A{number}"] = answers.get(number)
            new_row[f"AwC{number}"] = answer_w_cap.get(number)

        results.append(new_row)
        print(f"Processed row {idx}")

    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next
        # chart so one bad request does not abort the whole batch.
        print(f"Error processing row {idx}: {e}")

# Build the result table once from the collected rows and persist it.
qa_df = pd.DataFrame(results)
qa_df.to_csv("qa_singleview.csv", index=False)

qa_df.head()

Processed row 0
Processed row 1
Processed row 2
Processed row 3
Processed row 4
Processed row 5
Processed row 6
Processed row 7
Processed row 8
Processed row 9
Processed row 10
Processed row 11
Processed row 12
Processed row 13
Processed row 14
Processed row 15
Processed row 16
Processed row 17
Processed row 18
Processed row 19
Processed row 20
Processed row 21
Processed row 22
Processed row 23
Processed row 24
Processed row 25
Processed row 26
Processed row 27
Processed row 28
Processed row 29
Processed row 30
Processed row 31
Processed row 32
Processed row 33
Processed row 34
Processed row 35
Processed row 36
Processed row 37
Processed row 38
Processed row 39
Processed row 40
Processed row 41
Processed row 42
Processed row 43
Processed row 44
Processed row 45
Processed row 46
Processed row 47
Processed row 48
Processed row 49
Processed row 50
Processed row 51
Processed row 52
Processed row 53
Processed row 54
Processed row 55
Processed row 56
Processed row 57
Processed row 58
Process

Unnamed: 0,image_id,Q1,A1,AwC1,Q2,A2,AwC2,Q3,A3,AwC3,...,AwC7,Q8,A8,AwC8,Q9,A9,AwC9,Q10,A10,AwC10
0,82,What is the percentage allocation of R&D expen...,The percentage allocation of R&D expenditure f...,**Mozambique Basic Research Percentage**: The ...,In which country is the allocation for experim...,The highest allocation for experimental resear...,**Highest Experimental Research**: The highest...,Compare the percentage allocations for applied...,Uganda has an allocation of approximately 75% ...,**Uganda vs. South Africa Applied Research**: ...,...,**Basic Research: South Africa vs. Nigeria**: ...,What is the percentage allocation for applied ...,The percentage allocation for applied research...,**Malawi Applied Research**: Malawi allocates ...,What is the combined percentage allocation of ...,The combined percentage allocation of applied ...,**Mozambique Applied and Experimental Combined...,In which country is the allocation for basic r...,The lowest allocation for basic research is in...,**Lowest Basic Research Allocation**: The lowe...
1,149,What is the earliest year at which the 450 ppm...,The earliest year at which the 450 ppmv level ...,The earliest year at which the 450 ppmv level ...,Which model first predicts reaching the 550 pp...,The LDNE model is the first to predict reachin...,The first model to predict reaching the 550 pp...,How many model scenarios are shown for the 650...,There are six model scenarios shown for the 65...,"There are 6 model scenarios shown for the 650,...",...,The latest year shown in the chart for any mod...,Which model is projected to reach 650 ppmv fir...,The LDNE model is projected to reach 650 ppmv ...,The model projected to reach 650 ppmv first is...,How many times does the MARIA model appear on ...,The MARIA model appears three times on the chart.,The MARIA model appears twice on the chart.,Which model predicts the slowest rate of incre...,The AIM model predicts the slowest rate of inc...,The AIM model predicts the slowest rate of inc...
2,196,What was the total change in annual CO₂ emissi...,The total change in annual CO₂ emissions from ...,The total change in annual CO₂ emissions from ...,How did the GDP per capita contribute to the c...,"From 1980–1990, GDP per capita contributed sig...",The GDP per capita contributed a significant p...,During which decade did population contribute ...,Population contributed the most to the change ...,Population contributed the most to the change ...,...,The decade with a smaller total change in annu...,What is the difference in the total change of ...,The difference in the total change of annual C...,The difference in the total change of annual C...,In which decade did GDP per capita have the la...,GDP per capita had the largest impact on CO₂ e...,GDP per capita had the largest impact on CO₂ e...,What was the contribution of carbon intensity ...,The contribution of carbon intensity of energy...,The contribution of carbon intensity of energy...
3,203,What is the percentage of yield projections sh...,"For the period 2010-2029, the percentage of yi...",**0 to 5% increase (2010-2029):** Approximatel...,How does the percentage of yield projections w...,The percentage of yield projections with a 5 t...,**5 to 10% increase change (2030-2049 to 2070-...,During which time period is the percentage of ...,The percentage of yield projections showing a ...,**Highest decrease from -25 to -50%:** 2090-2109.,...,**More than 50% decrease (2070-2089):** 5%.,"In the period 2030-2049, what is the total per...","In the period 2030-2049, the total percentage ...",**Total decrease (2030-2049):** 55% (0 to -5% ...,Which period shows the highest percentage of y...,The period showing the highest percentage of y...,**Highest 25 to 50% increase:** 2090-2109.,How does the percentage of yield projections w...,The percentage of yield projections with a dec...,**Decrease of -10 to -25% (2010-2029 vs 2050-2...
4,236,What is the global temperature increase range ...,The global temperature increase range for AOGC...,The global temperature increase range for AOGC...,How does the AOGCM mean global temperature inc...,The AOGCM mean global temperature increase for...,The AOGCM mean global temperature increase for...,What is the global temperature increase for th...,The global temperature increase for the MAGICC...,The global temperature increase for the MAGICC...,...,The global temperature increase for scenario A...,What is the difference in global temperature i...,The difference in global temperature increase ...,The difference in global temperature increase ...,In which scenario is the deviation between MAG...,The scenario in which the deviation between MA...,The deviation between MAGICC physics uncertain...,How does the temperature increase modeled by S...,The temperature increase modeled by Stott 2002...,The temperature increase modeled by Stott 2002...
