# Passing data in

Our goal for this session: to get ChatGPT to do interesting things by passing data.

In [78]:
import json
from dataclasses import dataclass
from typing import Dict, List, Optional
import requests
import sh
from ipywidgets import interact, Dropdown
import pandas as pd

In [41]:
import hashlib

In [30]:
import openai

## Define helpers

You can mostly ignore this code to start with. It provides the helper function `fetch_bundle()`, which fetches the config, data and metadata for a single Grapher chart (or for a single indicator).

In [22]:
@dataclass
class Indicator:
    """The data and metadata payloads of a single indicator."""

    # Column-oriented values; to_frame() reads the 'entities', 'years' and 'values' keys.
    data: dict
    # Descriptive metadata; to_frame() reads 'dimensions', 'shortName' and 'id'.
    metadata: dict

    def to_dict(self):
        """Return the indicator as a plain ``{"data": ..., "metadata": ...}`` dict."""
        return {"data": self.data, "metadata": self.metadata}

    def to_frame(self):
        """Return the indicator as a DataFrame with readable entity names.

        Entity ids in ``data`` are replaced by the names listed under
        ``metadata['dimensions']['entities']['values']``, and the ``values``
        column is renamed to the indicator's ``shortName`` (falling back to
        ``ind_<id>`` when the metadata has no ``shortName`` key).

        Returns:
            A DataFrame whose columns are ``entities``, ``years`` and then the
            remaining columns in sorted order.

        Raises:
            KeyError: if an entity id in the data has no entry in the metadata,
                or if the metadata has neither ``shortName`` nor ``id``.
        """
        df = pd.DataFrame.from_dict(self.data)
        entities = pd.DataFrame.from_records(self.metadata['dimensions']['entities']['values'])
        id_to_name = entities.set_index('id')['name'].to_dict()
        # __getitem__ (rather than .get) so an unknown entity id fails loudly.
        df['entities'] = df['entities'].apply(id_to_name.__getitem__)

        # Build the fallback name lazily: passing it as the default of
        # dict.get() would evaluate metadata["id"] even when shortName exists,
        # raising KeyError for metadata that has a shortName but no id.
        if 'shortName' in self.metadata:
            value_col = self.metadata['shortName']
        else:
            value_col = f'ind_{self.metadata["id"]}'
        df = df.rename(columns={'values': value_col})

        key_cols = ['entities', 'years']
        cols = key_cols + sorted(c for c in df.columns if c not in key_cols)
        return df[cols]


@dataclass
class GrapherBundle:
    """Config, indicators and origins fetched for one chart or indicator."""

    # Chart configuration; fetch_bundle() passes None when fetching by indicator id.
    config: Optional[dict]
    # Indicators keyed by their id.
    dimensions: Dict[int, Indicator]
    # "origins" entries that fetch_bundle() moved out of the indicators' metadata.
    origins: List[dict]

    def to_json(self):
        """Serialise the whole bundle to a JSON string."""
        payload = {
            "config": self.config,
            "dimensions": {
                key: indicator.to_dict()
                for key, indicator in self.dimensions.items()
            },
            "origins": self.origins,
        }
        return json.dumps(payload)

    def size(self):
        """Return the length, in characters, of the bundle's JSON form."""
        serialized = self.to_json()
        return len(serialized)

    @property
    def indicators(self) -> List[Indicator]:
        """The bundle's indicators, in the order of ``dimensions``."""
        return [*self.dimensions.values()]

    def to_frame(self):
        """Outer-join every indicator's frame on ``entities`` and ``years``.

        Returns None when the bundle has no indicators.
        """
        merged = None
        for indicator in self.indicators:
            frame = indicator.to_frame()
            if merged is not None:
                frame = pd.merge(merged, frame, how='outer', on=['entities', 'years'])
            merged = frame
        return merged

    def __repr__(self):
        # Only the config is shown; dimensions and origins are elided.
        return 'GrapherBundle(config={}, dimensions=..., origins=...)'.format(self.config)

def fetch_grapher_config(slug, timeout=30):
    """Fetch the Grapher config embedded in a chart's page.

    Args:
        slug: chart slug, interpolated into the ourworldindata.org/grapher URL.
        timeout: seconds handed to ``requests`` as the request timeout.

    Returns:
        The parsed JSON found after the first ``//EMBEDDED_JSON`` marker.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
        IndexError: if the page contains no ``//EMBEDDED_JSON`` marker.
    """
    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block the notebook indefinitely.
    resp = requests.get(f"https://ourworldindata.org/grapher/{slug}", timeout=timeout)
    resp.raise_for_status()
    page = resp.content.decode("utf-8")
    # Element [1] is the text between the first marker and the next one (or
    # the end of the page if the marker appears only once).
    return json.loads(page.split("//EMBEDDED_JSON")[1])


def fetch_dimension(id: int, timeout: float = 30) -> Indicator:
    """Fetch the data and metadata JSON of one indicator.

    Args:
        id: indicator id, interpolated into the api.ourworldindata.org URLs.
        timeout: seconds handed to ``requests`` as the timeout of each request.

    Returns:
        An ``Indicator`` built from the two decoded JSON payloads.

    Raises:
        requests.HTTPError: if either request answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    base_url = f"https://api.ourworldindata.org/v1/indicators/{id}"

    data_resp = requests.get(f"{base_url}.data.json", timeout=timeout)
    # Fail on HTTP errors here (as fetch_grapher_config does) instead of
    # trying to decode an error page as JSON.
    data_resp.raise_for_status()
    data = data_resp.json()

    metadata_resp = requests.get(f"{base_url}.metadata.json", timeout=timeout)
    metadata_resp.raise_for_status()
    metadata = metadata_resp.json()

    return Indicator(data, metadata)


def fetch_bundle(
    slug: Optional[str] = None, indicator_id: Optional[int] = None
) -> GrapherBundle:
    """Fetch everything needed for one chart, or for one bare indicator.

    Args:
        slug: chart slug. When given, the chart config is fetched and every
            indicator listed in its ``dimensions`` is loaded.
        indicator_id: used only when ``slug`` is empty; loads that single
            indicator with no chart config.

    Returns:
        A ``GrapherBundle`` holding the config (None when fetching by id), the
        indicators keyed by id, and the ``origins`` entries removed from each
        indicator's metadata.

    Raises:
        ValueError: if neither ``slug`` nor ``indicator_id`` is given.
    """
    if slug:
        config = fetch_grapher_config(slug)
        indicator_ids = [d["variableId"] for d in config["dimensions"]]
    elif indicator_id is not None:
        print(f"Fetching indicator {indicator_id}")
        config = None
        indicator_ids = [indicator_id]
    else:
        # Previously this fell through and requested indicator "None".
        raise ValueError("fetch_bundle() needs either a slug or an indicator_id")

    dimensions = {i: fetch_dimension(i) for i in indicator_ids}

    # Move "origins" out of each indicator's metadata and collect them on the
    # bundle. Note that this mutates the metadata dicts.
    # NOTE(review): each "origins" value is appended as-is; if it is itself a
    # list, the result is a list of lists rather than the List[dict] that
    # GrapherBundle.origins is annotated with. Confirm the intended shape.
    origins = []
    for d in dimensions.values():
        if d.metadata.get("origins"):
            origins.append(d.metadata.pop("origins"))
    return GrapherBundle(config, dimensions, origins)

## Test data fetching

Let's check to see how it works.

In [23]:
# Fetch a chart by its slug; this chart plots a single indicator.
b = fetch_bundle(slug='gdp-per-capita-maddison')
b

GrapherBundle(config={'id': 4659, 'map': {'time': 'latest', 'colorScale': {'baseColorScheme': 'GnBu', 'binningStrategy': 'manual', 'customNumericValues': [1000, 2000, 5000, 10000, 20000, 50000, 100000], 'binningStrategyBinCount': 9}, 'columnSlug': '417485'}, 'tab': 'map', 'note': 'This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2011 prices.', 'slug': 'gdp-per-capita-maddison', 'title': 'GDP per capita', 'yAxis': {'min': 0, 'canChangeScaleType': True}, '$schema': 'https://files.ourworldindata.org/schemas/grapher-schema.004.json', 'version': 45, 'subtitle': 'This data is adjusted for inflation and for differences in the cost of living between countries.', 'hasMapTab': True, 'originUrl': 'https://ourworldindata.org/economic-growth', 'dimensions': [{'display': {'tolerance': 5, 'includeInTable': True}, 'property': 'y', 'variableId': 417485}], 'isPublished': True, 'variantName': 'Maddison Project Database, constant international-$', 'hideRelativeToggle': False, '

In [24]:
# Show the bundle as a table: entity, year, then the indicator's values.
b.to_frame()

Unnamed: 0,entities,years,gdp_per_capita
0,Afghanistan,1950,1156.0000
1,Afghanistan,1951,1170.0000
2,Afghanistan,1952,1189.0000
3,Afghanistan,1953,1240.0000
4,Afghanistan,1954,1245.0000
...,...,...,...
19871,Zimbabwe,2014,1594.0000
19872,Zimbabwe,2015,1560.0000
19873,Zimbabwe,2016,1534.0000
19874,Zimbabwe,2017,1582.3662


In [25]:
# Fetch a stacked chart that uses several indicators; each indicator becomes its own column.
fetch_bundle(slug='births-by-age-of-mother').to_frame()

Unnamed: 0,entities,years,births__sex_all__age_50_54__variant_estimates,births__sex_all__age_45_49__variant_estimates,births__sex_all__age_40_44__variant_estimates,births__sex_all__age_35_39__variant_estimates,births__sex_all__age_30_34__variant_estimates,births__sex_all__age_25_29__variant_estimates,births__sex_all__age_20_24__variant_estimates,births__sex_all__age_15_19__variant_estimates,births__sex_all__age_10_14__variant_estimates
0,Afghanistan,1950,152.0,3787.0000,19594.0,43741.0,65730.0,85980.0,92189.00,49818.0,4312.0
1,Afghanistan,1951,157.0,3865.0000,19959.0,44425.0,66674.0,87531.0,94080.00,50953.0,4396.0
2,Afghanistan,1952,158.0,3938.0000,20417.0,45142.0,67546.0,88980.0,95550.00,52179.0,4383.0
3,Afghanistan,1953,161.0,4010.0002,20751.0,45803.0,68756.0,90483.0,97439.00,53106.0,4423.0
4,Afghanistan,1954,162.0,4038.0000,20926.0,46370.0,69592.0,92193.0,98920.00,53903.0,4310.0
...,...,...,...,...,...,...,...,...,...,...,...
18283,Zimbabwe,2017,26.0,1639.0000,13662.0,48406.0,90133.0,106452.0,131067.99,85929.0,3321.0
18284,Zimbabwe,2018,25.0,1707.0000,14037.0,50681.0,87776.0,105130.0,132951.00,88194.0,3280.0
18285,Zimbabwe,2019,28.0,1793.0000,14244.0,52091.0,84423.0,104658.0,135975.00,88507.0,3183.0
18286,Zimbabwe,2020,28.0,1845.0000,14509.0,53003.0,81470.0,104550.0,139791.00,88099.0,3096.0


In [26]:
import shelve

In [27]:
def fetch_cached(slug=None, indicator_id=None):
    """Return the bundle for ``slug`` / ``indicator_id``, fetching it at most once.

    Bundles are stored in the ``cache.db`` shelf under the key
    ``'<slug>::<indicator_id>'``; a cached entry is returned without touching
    the network.
    """
    cache_key = f'{slug}::{indicator_id}'
    with shelve.open('cache.db') as cache:
        if cache_key in cache:
            return cache[cache_key]

        # Cache miss: fetch, store, then hand back the stored copy.
        cache[cache_key] = fetch_bundle(slug=slug, indicator_id=indicator_id)
        return cache[cache_key]

## Finding data

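Let's make a helper that looks up a chart by its slug: type a full slug to load its data, or the start of one to see up to five matching slugs.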

In [35]:
# Slugs that find_data() is allowed to load. The file is opened in a `with`
# block so the handle is closed as soon as the JSON has been read.
with open('slugs.json') as _slugs_file:
    slug_whitelist = set(json.load(_slugs_file))

In [None]:
# Initial value of the slug text box created by find_data().
last_slug = 'life-expectancy'

In [76]:
@interact(slug=last_slug)
def find_data(slug=None):
    """Interactive lookup of a chart's data by slug.

    For a whitelisted slug, returns the chart's data as a DataFrame. For
    anything else, prints up to five whitelisted slugs that start with the
    typed text, or ``(not found)`` when there are none, and returns None.
    """
    # Without this declaration the assignment below only binds a local name
    # and the module-level last_slug never changes.
    global last_slug
    if not slug:
        return

    # Remember the latest input so re-running the cell starts from it.
    last_slug = slug

    if slug not in slug_whitelist:
        matches = sorted(s for s in slug_whitelist if s.startswith(slug))[:5]
        if matches:
            print('\n'.join(matches))
        else:
            print('(not found)')
        return

    b = fetch_cached(slug=slug)
    return b.to_frame()

interactive(children=(Text(value='life-expectancy', description='slug'), Output()), _dom_classes=('widget-inte…

## Asking ChatGPT

In [31]:
# Shared OpenAI client used by gpt_response().
# NOTE(review): no credentials are passed here, so the library presumably reads them from the environment — confirm.
client = openai.Client()

In [32]:
# Default model name for gpt_response() and gpt_cached().
MODEL = 'gpt-4-turbo'

In [33]:
def gpt_response(message: str, model: str = MODEL) -> str:
    """Send ``message`` as a single user turn and return the first choice's text."""
    conversation = [{"role": "user", "content": message}]
    completion = client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [34]:
# Smoke test: one round trip through gpt_response(), which does not use the cache.
print(gpt_response('Tell me a funny story in a single haiku with a surprising twist'))

Cat prowls, eyes a bird—  
Leaps high, poised with grace, but lands  
In a dog's bath tub.


In [63]:
def gpt_cached(message: str, model: str = MODEL) -> str:
    """Like ``gpt_response``, but reuse the stored answer for a repeated request.

    Answers live in the ``cache.db`` shelf, keyed by the MD5 hex digest of
    ``'<model>:::<message>'``.
    """
    with shelve.open('cache.db') as cache:
        digest = hashlib.md5(f'{model}:::{message}'.encode('utf8')).hexdigest()
        if digest not in cache:
            # First time this (model, message) pair is seen: ask and remember.
            answer = gpt_response(message, model)
            cache[digest] = answer
            return answer

        return cache[digest]

## Ask ChatGPT about the data

The biggest problem is that we can't look at all the data at once, just one country at a time.

## Prompt and data format

In [95]:
# Chart whose data is loaded below and named in the prompt.
slug = 'life-expectancy'

In [96]:
# Entity initially selected in the please_summarize() dropdown; updated on each selection.
last_entity = 'Sweden'

In [101]:
# Prompt template; {entity}, {slug} and {data} are filled in by please_summarize().
PROMPT = '''
Summarise the following data for {entity} from the chart called {slug}?

{data}

What countries would be best to compare it to for this metric?
'''

# Load the chart's data once up front, so the widget callback only has to filter it.
b = fetch_cached(slug=slug)
df = b.to_frame()

@interact(entity=Dropdown(options=df.entities.unique(), value=last_entity))
def please_summarize(entity=None):
    """Build the prompt for one entity, copy it to the clipboard and preview it.

    The selected entity's rows (minus the ``entities`` column) are formatted
    into ``PROMPT`` as a dict of column lists. The full prompt goes to the
    clipboard; only the first 3000 characters are printed. Does nothing when
    ``entity`` is empty.
    """
    global last_entity
    if not entity:
        return

    # Remember the selection so re-running the cell starts from it.
    last_entity = entity

    df_e = df[df.entities == entity].drop(columns=['entities'])

    prompt = PROMPT.format(data=df_e.to_dict(orient='list'), entity=entity, slug=slug)
    # NOTE(review): pbcopy is presumably the macOS clipboard command, so this
    # call likely fails on other platforms — confirm the target platform.
    sh.pbcopy(_in=prompt)

    # NOTE(review): 128000 looks like a model context limit, which would be
    # counted in tokens, while len(prompt) counts characters — confirm.
    max_chars = 128000
    preview_chars = 3000

    if len(prompt) >= max_chars:
        print(f'Prompt length: {len(prompt)}')
        print(f'        Limit: {max_chars}')

    # Add the ellipsis only when the preview is actually cut short, so a
    # short prompt is not shown as if text were missing.
    if len(prompt) > preview_chars:
        print(prompt[:preview_chars] + '...')
    else:
        print(prompt)

interactive(children=(Dropdown(description='entity', options=('Afghanistan', 'Africa', 'Albania', 'Algeria', '…

After running this, the full prompt is in your clipboard. Paste it into ChatGPT to see its response.

## Things to try

What types of prompts work well?
- Try asking it to think step by step, then give an answer after '---'
- Try giving a lot of guidance
- Try giving little to no guidance
- Try comparing a country to its peers, income group or neighbours (see: `peers.json`)