# Setup

In [17]:
import os
from datetime import datetime

import pandas as pd

import ergo
from ergo.platforms.metaculus.question import LinearQuestion, LogQuestion, ContinuousQuestion

In [3]:
# Lift pandas' display limits so that dataframes are shown with all rows and columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Get questions to sort

Get all *open* questions on the *main* subdomain.

In [4]:
# Create the Metaculus client for the main ("www") subdomain.
# Credentials can be supplied through the METACULUS_USERNAME / METACULUS_PASSWORD
# environment variables so that a personal account never has to be written into
# the notebook; when they are unset, the previously hard-coded values are used.
metaculus = ergo.Metaculus(
    username=os.environ.get("METACULUS_USERNAME", "oughtpublic"),
    password=os.environ.get("METACULUS_PASSWORD", "123456"),
    api_domain="www",
)

In [5]:
# Fetch the open questions. NOTE(review): pages=99999 looks like an "effectively
# unlimited" page cap so that nothing is truncated — confirm against ergo's get_questions.
qs = metaculus.get_questions(question_status="open", pages=99999)

In [18]:
# ID of the exemplar question: its fields are inspected to decide which columns to
# build, and its row is displayed as a spot check.
exemplar_q_id = 3530

In [19]:
# Fetch the exemplar question object by its ID.
exemplar_q = metaculus.get_question(exemplar_q_id)

In [20]:
# Build the base dataframe from the fetched questions using the exemplar's
# to_dataframe method (presumably one row per question in qs — TODO confirm).
qs_df = exemplar_q.to_dataframe(qs)

# Get question metadata

## Get all of the metadata already on the question

Get the field names from the question JSON from Metaculus:

In [21]:
# Keys of the exemplar's raw data dict (exemplar_q.data), used as candidate field names.
metaculus_json_fields = list(exemplar_q.data.keys())

Get the property names from the ContinuousQuestion class:

In [22]:
def properties(some_class):
    """Return the names of the properties defined directly on ``some_class``.

    Only the class's own ``__dict__`` is inspected, so properties inherited
    from base classes are not included. Names are returned in definition order.
    """
    property_names = []
    for attr_name, attr_value in some_class.__dict__.items():
        if isinstance(attr_value, property):
            property_names.append(attr_name)
    return property_names

In [23]:
# Names of the properties defined directly on ContinuousQuestion.
continuous_question_properties = properties(ContinuousQuestion)

In [30]:
%%capture
q_fields = metaculus_json_fields + continuous_question_properties

# Only keep fields whose value on the exemplar is a plain scalar. The exact-type
# check is deliberate: subclasses are not matched, which keeps the set of
# generated columns unchanged.
simple_types = (bool, int, float, str, datetime)

# This property causes an exception for some reason, so it is never read.
# Filtering (rather than list.remove) does not fail when the name is absent and
# drops every occurrence, not just the first.
excluded_fields = {"question_range_width"}

simple_fields = [
    field
    for field in q_fields
    if field not in excluded_fields
    and type(getattr(exemplar_q, field)) in simple_types
]

# One column per simple field; questions lacking the attribute get None.
for field in simple_fields:
    qs_df[field] = [getattr(q, field, None) for q in qs]

## Generate and add more metadata

In [31]:
def get_p_outside(q):
    """Return ``low + (1 - high)`` from the question's latest community percentiles.

    Per this notebook's field notes, this is the share of the community
    prediction's probability mass that lies outside the question range.

    Returns None when the question has no ``latest_community_percentiles``
    attribute, or when that attribute is None or a bare number instead of a
    low/high mapping.
    """
    percentiles = getattr(q, "latest_community_percentiles", None)

    # For some reason the value is a float for some questions. isinstance (rather
    # than an exact type comparison) also catches float subclasses such as numpy
    # floats, which would otherwise fail when subscripted below.
    if percentiles is None or isinstance(percentiles, (int, float)):
        return None

    return percentiles["low"] + (1 - percentiles["high"])

In [32]:
def _question_type(q):
    """Return the name of the question object's class."""
    return type(q).__name__


def _num_boundaries_open(q):
    """Return int(low_open) + int(high_open), or None if the question has no low_open."""
    if not hasattr(q, "low_open"):
        return None
    return int(q.low_open) + int(q.high_open)


def _question_range_bound(bound):
    """Build a getter for question_range[bound] that returns None when there is no range."""
    def getter(q):
        if not hasattr(q, "question_range"):
            return None
        return q.question_range[bound]
    return getter


def _question_url(q):
    """Return the full metaculus.com URL built from the question's page_url."""
    return f"https://www.metaculus.com{q.page_url}"


# Column name -> function that computes that column's value for a single question.
metadata_columns = {
    "type": _question_type,
    "num_boundaries_open": _num_boundaries_open,
    "p_outside": get_p_outside,
    "question_range_min": _question_range_bound("min"),
    "question_range_max": _question_range_bound("max"),
    "question_url": _question_url,
}

In [33]:
# Add one column per metadata entry, computed question by question.
for column_name, compute_value in metadata_columns.items():
    qs_df[column_name] = list(map(compute_value, qs))

## Select and reorder columns

At this point, we have these columns:

In [34]:
# Spot check: show the row for the exemplar question.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,resolve_time,type,num_boundaries_open,p_outside,question_range_min,question_range_max,question_url,url,page_url,author,status,created_time,publish_time,close_time,can_use_powers,last_activity_time,activity,comment_count,votes,title_short,user_vote,user_community_vis,author_name,anon_prediction_count,last_read,low_open,high_open,has_predictions,plot_title
253,3530,How many people will die as a result of the 20...,2021-01-01 00:00:00,LogQuestion,2.0,0.03308,200,100000000,https://www.metaculus.com/questions/3530/how-m...,https://www.metaculus.com/api2/questions/3530/,/questions/3530/how-many-people-will-die-as-a-...,101465,A,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,True,2020-06-16T00:13:10.915866Z,11.692703,229,146,COVID-19 related deaths before 2021:,0,0.0,Jgalt,101,2020-06-16T02:28:51.275880Z,True,True,True,How many people will die as a result of the 20...


Select all the ones that might plausibly be useful, and put them in a reasonable order:

In [35]:
# Columns to keep, listed in the order they should appear.
selected_columns = [
    # Identification
    "id",
    "title",
    "question_url",
    "type",
    # Range and boundaries
    "num_boundaries_open",
    "low_open",
    "high_open",
    "p_outside",
    "question_range_min",
    "question_range_max",
    # Activity
    "anon_prediction_count",
    "last_activity_time",
    "votes",
    "comment_count",
    # Timeline
    "created_time",
    "publish_time",
    "close_time",
    "resolve_time",
    # Miscellaneous
    "author_name",
    "last_read",
    "has_predictions",
    "activity",
    "title_short",
]

qs_df = qs_df[selected_columns]

In [36]:
# Spot check: show the exemplar question's row again.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,question_url,type,num_boundaries_open,low_open,high_open,p_outside,question_range_min,question_range_max,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
253,3530,How many people will die as a result of the 20...,https://www.metaculus.com/questions/3530/how-m...,LogQuestion,2.0,True,True,0.03308,200,100000000,101,2020-06-16T00:13:10.915866Z,146,229,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,2021-01-01 00:00:00,Jgalt,2020-06-16T02:28:51.275880Z,True,11.692703,COVID-19 related deaths before 2021:


## Explanations of some important fields
- `low_open`: Is the lower boundary of the question open? (only applies to ContinuousQuestions)
- `high_open`: Is the upper boundary of the question open? (only applies to ContinuousQuestions)
- `p_outside`: How much of the total probability mass of the community prediction is outside the question range?
- `anon_prediction_count`: Seems to be a proxy for the number of predictions. See "Data notes" below.
- `last_activity_time`: Seems to be a quick proxy for the time of the last prediction on the question. See "Data notes" below.
- `comment_count`: How many comments have been left on this question? (TODO: verify)
- `created_time`: When did the author of the question create it? (I think)
- `publish_time`: When was the question published to all Metaculus users?
- `close_time`: After what time are predictions on this question no longer allowed?
- `resolve_time`: When can the question be resolved, i.e. when will the answer be known?

## Data notes:
1. `anon_prediction_count` is the closest thing I could find to a count of number of predictions, but I'm not really sure how it relates to the number of predictions. In my testing:
    1. It seems to always be the same as the length of `prediction_timeseries`
    2. It seems to be correlated with something about the number of predictions shown. E.g.
        1. it's 101 for this question where the community prediction is shown: https://www.metaculus.com/questions/3530/how-many-people-will-die-as-a-result-of-the-2019-novel-coronavirus-covid-19-before-2021/.
        2. While it's 0 for this question where the community prediction is not shown yet: https://www.metaculus.com/questions/4614/when-will-directly-removing-carbon-dioxide-from-the-atmosphere-be-economically-feasible/ 
    3. I couldn't get it to increment. I tried:
        1. making a new prediction with an account that had already predicted on the question
        2. making a prediction with an account that had never predicted on that question before.
2. `last_activity_time` seems like the most obvious easy proxy for when the most recent prediction was made. However, I'm not sure how reliable it is.
    1. It did not update when I made a new prediction from an account that had already predicted on the question previously
    2. It may update when people leave comments or at other times
    3. Alternatively, we could use the last time from the `prediction_timeseries`, but that also doesn't seem to update every time someone makes a prediction
3. To get the datetime of the last posted comment, I think we'd need to retrieve it from a separate API (probably at least 30 minutes of work, possibly hours), so I haven't tried.

# View data

## Export as csv

In [40]:
# Write qs_df to "metac_qs_data.csv" (a relative path), omitting the index column.
qs_df.to_csv("metac_qs_data.csv", index=False)