# Setup

In [1]:
%%capture
!pip install --progress-bar off poetry
!pip install --progress-bar off git+https://github.com/oughtinc/ergo.git@87c7bc2b38c3007aab38da46c441cef548217e31

In [2]:
import warnings

# Suppress every FutureWarning ...
warnings.filterwarnings("ignore", category=FutureWarning)
# ... and any warning attributed to a module matching "plotnine".
warnings.filterwarnings("ignore", module="plotnine")

In [3]:
import os
from datetime import datetime

import pandas as pd

import ergo
from ergo.platforms.metaculus.question import MetaculusQuestion, LinearQuestion, LogQuestion, ContinuousQuestion

In [4]:
# Lift pandas' display limits so frames are rendered without truncating
# rows or columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Get questions

Get all *open* questions on the *main* subdomain.

NOTE: log date questions are excluded because Ergo currently can't handle them

In [5]:
# Log in to the "www" Metaculus API domain. The credentials default to the
# values this notebook has always used; set METACULUS_USERNAME and
# METACULUS_PASSWORD in the environment to log in as a different account
# without writing its credentials into the notebook.
metaculus = ergo.Metaculus(
    username=os.environ.get("METACULUS_USERNAME", "oughtpublic"),
    password=os.environ.get("METACULUS_PASSWORD", "123456"),
    api_domain="www",
)

In [6]:
# Fetch the open questions. The very large `pages` value is presumably a
# stand-in for "every page" (TODO confirm against ergo's get_questions).
# load_detail=False skips per-question detail; the questions that need it are
# refreshed individually in the next cell.
qs = metaculus.get_questions(
    question_status="open",
    pages=99999,
    load_detail=False,
)

In [7]:
# The undetailed listing lacks the probability that falls below / above the
# question bounds, which we want for questions with open boundaries.
# Re-fetch the full data for any question with at least one open boundary.
for question in qs:
    has_open_boundary = any(
        getattr(question, boundary, False) for boundary in ("low_open", "high_open")
    )
    if has_open_boundary:
        question.refresh_question()

In [8]:
# ID of the question used as the exemplar in this notebook: its fields drive
# the column discovery below and its row is displayed to spot-check the table.
exemplar_q_id = 3530

In [9]:
# Fetch the exemplar question by ID through the Metaculus client created above.
exemplar_q = metaculus.get_question(exemplar_q_id)

In [10]:
# Build the base DataFrame from the full list of fetched questions; later
# cells add further columns to it.
qs_df = exemplar_q.to_dataframe(qs)

# Get question metadata

## Get all of the metadata already on the question

Get the field names from the question JSON from Metaculus:

In [11]:
# Iterating the exemplar's JSON mapping yields its keys, i.e. the field names.
metaculus_json_fields = list(exemplar_q.data)

Get the property names from the MetaculusQuestion and ContinuousQuestion classes:

In [12]:
def properties(some_class):
    """
    List the names of the ``property`` attributes defined directly on
    ``some_class``, in definition order.

    Only the class's own ``__dict__`` is inspected, so properties inherited
    from base classes are not included.
    """
    found = []
    for attr_name, attr_value in vars(some_class).items():
        if isinstance(attr_value, property):
            found.append(attr_name)
    return found

In [13]:
# Properties declared directly on MetaculusQuestion (properties() inspects only
# the class's own __dict__, so inherited ones are not listed).
question_properties = properties(MetaculusQuestion)

In [14]:
# Properties declared directly on ContinuousQuestion; anything it inherits is
# not repeated here because properties() looks only at the class's own __dict__.
continuous_question_properties = properties(ContinuousQuestion)

In [15]:
%%capture
q_fields = metaculus_json_fields + question_properties + continuous_question_properties

simple_fields = [field for field in q_fields if type(getattr(exemplar_q, field)) in [bool, int, float, str, datetime]]

# This property causes an exception for some reason.
# Didn't seem worth investigating
simple_fields.remove("question_range_width")

for field in simple_fields:
    qs_df[field] = [getattr(q, field, None) for q in qs]

## Generate and add more metadata

In [16]:
def get_p_outside(q):
    """
    Return how much of the community prediction's probability lies outside
    the question range, or None when that cannot be computed.

    The value is ``percentiles["low"] + (1 - percentiles["high"])`` where
    ``percentiles`` is ``q.latest_community_percentiles``.

    None is returned when ``q`` has no ``latest_community_percentiles``
    attribute, when that attribute is None, or when it is a float (binary
    questions, see https://github.com/oughtinc/ergo/pull/378).
    """
    percentiles = getattr(q, "latest_community_percentiles", None)

    # Binary questions expose a bare float here instead of a mapping with
    # "low"/"high" entries. isinstance (rather than an exact type comparison)
    # also recognises float subclasses such as numpy.float64.
    if percentiles is None or isinstance(percentiles, float):
        return None

    return percentiles["low"] + (1 - percentiles["high"])

In [17]:
def _question_type_name(q):
    """Name of the question object's class."""
    return type(q).__name__


def _count_open_boundaries(q):
    """int(low_open) + int(high_open), or None if q has no `low_open`."""
    if not hasattr(q, "low_open"):
        return None
    return int(q.low_open) + int(q.high_open)


def _scale_bound(bound):
    """Build an accessor for `q.scale.<bound>` that yields None when q has no `scale`."""
    def accessor(q):
        if not hasattr(q, "scale"):
            return None
        return getattr(q.scale, bound)
    return accessor


# Derived columns: column name -> function computing the value for one question.
metadata_columns = {
    "type": _question_type_name,
    "num_boundaries_open": _count_open_boundaries,
    "question_scale_low": _scale_bound("low"),
    "question_scale_high": _scale_bound("high"),
}

In [18]:
# Evaluate each derived-column function once per question and attach the
# results to the table under the column's name.
for column_name, compute_value in metadata_columns.items():
    qs_df[column_name] = list(map(compute_value, qs))

## Select and reorder columns

We have these columns:

In [19]:
# Show the exemplar question's row to see every available column.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,resolve_time,url,page_url,author,status,created_time,publish_time,close_time,can_use_powers,last_activity_time,activity,comment_count,votes,title_short,user_vote,user_community_vis,author_name,anon_prediction_count,last_read,question_url,low_open,high_open,p_outside,has_predictions,plot_title,type,num_boundaries_open,question_scale_low,question_scale_high
259,3530,How many people will die as a result of the 20...,2021-01-01 00:00:00,https://www.metaculus.com/api2/questions/3530/,/questions/3530/how-many-people-will-die-as-a-...,101465,A,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,True,2020-06-17T15:51:31.488705Z,3.829976,231,146,COVID-19 related deaths before 2021:,0,0.0,Jgalt,101.0,2020-06-16T03:42:28.735851Z,https://www.metaculus.com/questions/3530,True,True,0.04249,True,How many people will die as a result of the 20...,LogQuestion,2.0,200,100000000


Select all the ones that might plausibly be useful, and put them in a reasonable order:

In [20]:
# Columns to keep, in the order they should appear in the table.
selected_columns = [
    # identity
    "id",
    "title",
    "question_url",
    # question shape
    "type",
    "num_boundaries_open",
    "low_open",
    "high_open",
    "p_outside",
    "question_scale_low",
    "question_scale_high",
    # engagement
    "anon_prediction_count",
    "last_activity_time",
    "votes",
    "comment_count",
    # timeline
    "created_time",
    "publish_time",
    "close_time",
    "resolve_time",
    # everything else
    "author_name",
    "last_read",
    "has_predictions",
    "activity",
    "title_short",
]
qs_df = qs_df[selected_columns]

In [21]:
# Show the exemplar question's row again to check the column selection and order.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,question_url,type,num_boundaries_open,low_open,high_open,p_outside,question_scale_low,question_scale_high,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
259,3530,How many people will die as a result of the 20...,https://www.metaculus.com/questions/3530,LogQuestion,2.0,True,True,0.04249,200,100000000,101.0,2020-06-17T15:51:31.488705Z,146,231,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,2021-01-01 00:00:00,Jgalt,2020-06-16T03:42:28.735851Z,True,3.829976,COVID-19 related deaths before 2021:


## Explanations of some important fields
- `low_open`: Is the lower boundary of the question open? (only applies to ContinuousQuestions)
- `high_open`: Is the upper boundary of the question open? (only applies to ContinuousQuestions)
- `p_outside`: How much of the total probability mass of the community prediction is outside the question range? (only applies to ContinuousQuestions)
- `anon_prediction_count`: Seems to be a proxy for the number of predictions. See "Data notes" below.
- `last_activity_time`: Seems to be a quick proxy for the time of the last prediction on the question. See "Data notes" below.
- `comment_count`: How many comments have been left on this question?
- `created_time`: When did the author of the question create it? (I think)
- `publish_time`: When was the question published to all Metaculus users?
- `close_time`: After what time are predictions on this question no longer allowed?
- `resolve_time`: When can the question be resolved, i.e. when will the answer be known?

## Data notes:
1. `anon_prediction_count` is the closest thing I could find to a count of the number of predictions, but I'm not really sure how it relates to the number of predictions. In my testing:
    1. It seems to always be the same as the length of `prediction_timeseries`
    2. It seems to be correlated with something about the number of predictions shown. E.g.
        1. it's 101 for this question where the community prediction is shown: https://www.metaculus.com/questions/3530/how-many-people-will-die-as-a-result-of-the-2019-novel-coronavirus-covid-19-before-2021/.
        2. While it's 0 for this question where the community prediction is not shown yet: https://www.metaculus.com/questions/4614/when-will-directly-removing-carbon-dioxide-from-the-atmosphere-be-economically-feasible/ 
    3. I couldn't get it to increment. I tried:
        1. making a new prediction with an account that had already predicted on the question
        2. making a prediction with an account that had never predicted on that question before.
2. `last_activity_time` seems like the most obvious easy proxy for when the most recent prediction was made. However, I'm not sure how reliable it is.
    1. It did not update when I made a new prediction from an account that had already predicted on the question previously
    2. It may update when people leave comments or at other times
    3. Alternatively, we could use the last time from the `prediction_timeseries`, but that also doesn't seem to update every time someone makes a prediction
3. To get the datetime of the last posted comment, I think we'd need to retrieve it from a separate API (prob at least 30 min of work, maybe more like hours), so I haven't tried
4. Log date questions are excluded here because Ergo can't handle them yet.

# View data

## Export as csv

(For use when running locally in `ergo/notebooks`)

In [23]:
# qs_df.to_csv("../ergo/contrib/metac_qs_data/metac_qs_data.csv", index=False, float_format='%.20f')

A version of this CSV is uploaded as a [Google Sheet](https://docs.google.com/spreadsheets/d/1Aii_IkUTiJH6t14n2lhwhu4PJJjlTz6X5vdEi5gPGa0/edit#gid=1305569144).

## View all questions

In [24]:
# Display the whole table; the pandas row/column display limits were set to
# None earlier, so nothing is truncated.
qs_df

Unnamed: 0,id,title,question_url,type,num_boundaries_open,low_open,high_open,p_outside,question_scale_low,question_scale_high,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
0,4637,Will the S&P 500 close higher for 2020?,https://www.metaculus.com/questions/4637,BinaryQuestion,,,,,,,,2020-06-14T23:47:23.236997Z,4,1,2020-06-13T21:48:45.138218Z,2020-06-17 22:00:00.000,2020-12-31 21:00:00,2020-12-31 21:00:00,,,,0.2282622,Will the S&P 500 close higher for 2020?
1,4527,"Will the S&P 500 hit 10,000 points by the end ...",https://www.metaculus.com/questions/4527,BinaryQuestion,,,,,,,,2020-06-17T22:28:04.096616Z,2,1,2020-05-31T17:47:53.273048Z,2020-06-17 22:00:00.000,2030-01-01 07:59:00,2030-01-01 08:00:00,,,,0.2061936,"Will the S&P 500 hit 10,000 points by th"
2,4627,Will US forces shoot unarmed protesters in 2020?,https://www.metaculus.com/questions/4627,BinaryQuestion,,,,,,,,2020-06-17T22:59:57.314667Z,10,12,2020-06-08T18:14:40.826739Z,2020-06-17 22:00:00.000,2020-07-01 07:00:00,2021-01-01 08:00:00,,,,2.906938,Will US forces shoot unarmed protesters
3,4614,When will directly removing carbon dioxide fro...,https://www.metaculus.com/questions/4614,LinearDateQuestion,1.0,False,True,0.11757,2021-05-30,2099-12-31,0.0,2020-06-17T23:03:07.659243Z,12,8,2020-06-07T00:04:13.853777Z,2020-06-17 07:00:00.000,2067-01-02 01:03:00,2100-01-02 01:03:00,krtnu,2020-06-16T02:27:51.567315Z,True,2.389925,When will directly removing carbon dioxi
4,4622,How many more Starship prototypes will be dest...,https://www.metaculus.com/questions/4622,LinearQuestion,1.0,False,True,0.02291,0,15,1.0,2020-06-17T23:01:58.600137Z,13,5,2020-06-08T06:42:33.726537Z,2020-06-17 03:00:00.000,2021-01-02 07:32:00,2021-01-02 07:33:00,krtnu,,True,0.7364679,How many more Starship prototypes will b
5,4596,What will the percentage difference between fu...,https://www.metaculus.com/questions/4596,LinearQuestion,2.0,True,True,,-15,15,0.0,2020-06-11T03:05:37.439521Z,0,2,2020-06-04T22:13:19.855024Z,2020-06-16 22:00:00.000,2021-01-01 08:00:00,2021-06-01 07:00:00,AlyssaStevens,,True,0.03875392,What will the percentage difference betw
6,4646,What will total enrollment for recent US high ...,https://www.metaculus.com/questions/4646,LinearQuestion,2.0,True,True,,1000000,4000000,0.0,2020-06-14T20:23:13.564716Z,1,0,2020-06-14T20:23:13.564716Z,2020-06-16 22:00:00.000,2021-07-29 22:00:00,2022-08-31 22:00:00,AlyssaStevens,,True,0.259195,What will total enrollment for recent US
7,4628,Will one of GiveWell's 2019 top charities be e...,https://www.metaculus.com/questions/4628,BinaryQuestion,,,,,,,,2020-06-17T05:58:30.123175Z,9,7,2020-06-08T18:18:49.172425Z,2020-06-14 22:00:00.000,2023-01-01 08:00:00,2031-12-31 08:00:00,,,,1.364159,Will one of GiveWell's 2019 top charitie
8,4607,Will California Assembly Bill 3155 be chaptere...,https://www.metaculus.com/questions/4607,BinaryQuestion,,,,,,,,2020-06-17T21:43:46.828163Z,2,1,2020-06-05T15:34:10.753708Z,2020-06-14 22:00:00.000,2020-08-14 23:00:00,2020-09-29 23:00:00,,,,0.0003760133,Will California Assembly Bill 3155 be ch
9,4408,Will Apple announce plans to make ARM-based Ma...,https://www.metaculus.com/questions/4408,BinaryQuestion,,,,,,,,2020-06-17T20:01:42.778316Z,11,10,2020-05-14T08:11:39.669743Z,2020-06-13 22:00:00.000,2020-06-20 22:00:00,2020-06-26 22:00:00,,,,10.91421,Will Apple announce plans to make ARM-ba
