Load the data.

In [45]:
import csv
import time
import random
import math
import statistics
import numpy as np
import matplotlib as mpl
import scipy.stats as sps

from datetime import datetime as dt

survey_files=["./survey_fcasts.yr1.csv"]
#, "./survey_fcasts.yr2.csv", "./survey_fcasts.yr3.csv", "./survey_fcasts.yr4.csv"]

MIN_PROB=0.01
EXTR=3

In [46]:
# List of individual forecasts. Type: List of dictionaries.
# Fields:
# {'ifp_id', 'ctt', 'cond', 'training', 'team', 'user_id', 'forecast_id', 'fcast_type', 'answer_option', 'value', 'fcast_date', 'expertise', 'q_status', 'viewtime', 'year', 'timestamp'}

preds=[]

year=1

for n in survey_files:
    f=open(n)
    forecast_reader=csv.DictReader(f)
    for entry in forecast_reader:
        entry['year']=year
        entry['cond']=int(entry['cond'])
        entry['value']=float(entry['value'])
        entry['timestamp']=dt.fromisoformat(entry['timestamp'])
        preds.append(entry)
    year=year+1

# List of individual questions. Type: List of dictionaries.
# Fields:
# {'ifp_id', 'q_type', 'q_text', 'q_desc', 'q_status', 'date_start', 'date_suspend', 'date_to_close', 'date_closed', 'outcome', 'short_title', 'days_open', 'n_opts', 'options'}

qdata=[]

qfile=open("ifps.csv")
qreader=csv.DictReader(qfile)

for entry in qreader:
    if entry['date_start']!='NULL':
        entry['date_start']=dt.strptime(entry['date_start'], '%m/%d/%y')
    if entry['date_suspend']!='NULL':
        entry['date_suspend']=dt.strptime(entry['date_suspend'], '%m/%d/%y %H:%M')
    qdata.append(entry)

Functions.

Problem with the code below: The Brier score has two different formulations (one equivalent to the mean squared error, another one being the original formulation by Brier). Only the original formulation is proper for forecasts on >2 options.

TODO: implement the original formulation: $BS=\frac{1}{N} \sum_{t=1}^{N} \sum_{i=1}^R (f_{ti}-o_{ti})^2$

In [47]:
def group_score(user_forecasts, users):
    forecasts=np.array([])
    results=np.array([])
    for u in users:
        udata=user_forecasts[u]
        forecasts=np.append(forecasts, np.array(udata['forecasts']))
        results=np.append(results, np.array(udata['options'])==np.array(udata['outcomes']))
    return np.mean((forecasts-results)**2)

def individual_score(user):
    return np.mean((np.array(user['forecasts'])-(np.array(user['options'])==np.array(user['outcomes'])))**2)

Different structures of the data.

In [48]:
# Outcomes of questions.
# Type: Dictionary.
# Fields: Keys are question ids (`ifp_id`), values are dictionaries (fields are `outcome`, `date_start`, `date_suspend`)

questions=dict()

for q in qdata:
    qid=q['ifp_id']
    if not qid in questions:
        questions[qid]=dict()
    questions[qid]['outcome']=q['outcome']
    questions[qid]['date_start']=q['date_start']
    questions[qid]['date_suspend']=q['date_suspend']
    questions[qid]['q_status']=q['q_status']

for p in preds:
    p['outcome']=questions[p['ifp_id']]['outcome']

In [49]:
# All forecasts, per user.
# Type: Dictionary of dictionaries. ("Look on my works, ye Mighty, and despair!")
# Fields: Keys are user ids (`user_id`), values are dictionaries, where:
# keys are 'forecasts', 'options' and 'outcomes'.

user_forecasts=dict()

for p in preds:
    uid=p['user_id']
    if not user_forecasts.__contains__(uid):
        user_forecasts[uid]=dict()
        user_forecasts[uid]['forecasts']=[]
        user_forecasts[uid]['options']=[]
        user_forecasts[uid]['outcomes']=[]
    user_forecasts[uid]['forecasts'].append(p['value'])
    user_forecasts[uid]['options'].append(p['answer_option'])
    user_forecasts[uid]['outcomes'].append(p['outcome'])

In [50]:
# In the file survey_fcasts.yr1.csv there are a couple of forecasts made by the user "NULL"
# I don't know what this means: I suspected at first that those were forecasts on "voided" questions,
# but it turns out that that's not the case.
# To be sure, I'll delete the forecasts by that user from the data.

if 'NULL' in user_forecasts.keys():
    user_forecasts.pop('NULL')
users=list(user_forecasts.keys())

In [51]:
# Sorted list of all scores individuals have received.
# Type: List of floats.

individual_scores=np.array([])

for k in user_forecasts.keys():
    u=user_forecasts[k]
    u['brier']=individual_score(u)
    individual_scores=np.append(individual_scores, u['brier'])

individual_scores=np.sort(individual_scores)

In [52]:
# Which teams to associate to each forecaster.
# Type: Dictionary.
# Fields: Keys are user ids (`user_id`), values are sets of team ids (`team`), usually (but not always!) unique.

user_teams=dict()

for p in preds:
    uid=p['user_id']
    if not user_teams.__contains__(uid):
        user_teams[uid]=dict()
        user_teams[uid]['teams']=set()
    user_teams[uid]['teams']
    if not p['team'] in user_teams[uid]['teams']:
        user_teams[uid]['teams'].add(p['team'])

for k in user_teams:
    if len(user_teams[k]['teams'])!=1:
        print(k, user_teams[k])

In [53]:
# For each question, which forecaster made which prediction when?
# Type: dictionary of dictionaries of dictionaries (!) of lists
# Fields: Keys are question ids (`ifp_id`), values are dictionaries
# where keys are `forecasters`, `outcome`, `date_start` or
# `date_suspend`, `forecasters` is a dictionary where the keys are
# user ids (`user_id`) and values are dictionaries, where keys are
# `forecasts`, `options`, `timestamps` (all numpy lists).

q_fsters=dict()

# collect the data

for p in preds:
    qid=p['ifp_id']
    if not qid in q_fsters:
        q_fsters[qid]=dict()
        q_fsters[qid]['outcome']=questions[qid]['outcome']
        q_fsters[qid]['date_start']=questions[qid]['date_start']
        q_fsters[qid]['date_suspend']=questions[qid]['date_suspend']
        q_fsters[qid]['q_status']=questions[qid]['q_status']
        q_fsters[qid]['forecasters']=dict()
    uid=p['user_id']
    if not uid in q_fsters[qid]['forecasters']:
        q_fsters[qid]['forecasters'][uid]=dict()
        q_fsters[qid]['forecasters'][uid]['forecasts']=[]
        q_fsters[qid]['forecasters'][uid]['options']=[]
        q_fsters[qid]['forecasters'][uid]['timestamps']=[]
    q_fsters[qid]['forecasters'][uid]['forecasts'].append(p['value'])
    q_fsters[qid]['forecasters'][uid]['options'].append(p['answer_option'])
    q_fsters[qid]['forecasters'][uid]['timestamps'].append(p['timestamp'])

# sort the data

for qid in q_fsters.keys():
    for uid in q_fsters[qid]['forecasters'].keys():
        q_fsters[qid]['forecasters'][uid]['timestamps']=np.array(q_fsters[qid]['forecasters'][uid]['timestamps'])
        indices=np.argsort(q_fsters[qid]['forecasters'][uid]['timestamps'])
        q_fsters[qid]['forecasters'][uid]['timestamps']=q_fsters[qid]['forecasters'][uid]['timestamps'][indices]
        q_fsters[qid]['forecasters'][uid]['forecasts']=np.array(q_fsters[qid]['forecasters'][uid]['forecasts'])[indices]
        q_fsters[qid]['forecasters'][uid]['options']=np.array(q_fsters[qid]['forecasters'][uid]['options'])[indices]

        # replacing forecasts with probability 0 with forecasts with probability 0.01
        q_fsters[qid]['forecasters'][uid]['forecasts'][np.where(q_fsters[qid]['forecasters'][uid]['forecasts']==0)]=MIN_PROB
        q_fsters[qid]['forecasters'][uid]['forecasts'][np.where(q_fsters[qid]['forecasters'][uid]['forecasts']==1)]=1-MIN_PROB

In [54]:
# Forecasts per question

q_forecasts=dict()

for qid in q_fsters.keys():
    q_forecasts[qid]=dict()
    for uid in q_fsters[qid]['forecasters'].keys():
        for i in range(0, len(q_fsters[qid]['forecasters'][uid]['options'])-1):
            opt=q_fsters[qid]['forecasters'][uid]['options'][i]
            if not opt in q_forecasts[qid]:
                q_forecasts[qid][opt]=dict()
                q_forecasts[qid][opt]['forecasts']=[]
                q_forecasts[qid][opt]['timestamps']=[]
            q_forecasts[qid][opt]['forecasts'].append(q_fsters[qid]['forecasters'][uid]['forecasts'][i])
            q_forecasts[qid][opt]['timestamps'].append(q_fsters[qid]['forecasters'][uid]['timestamps'][i])

for qid in q_forecasts.keys():
    for o in q_forecasts[qid].keys():
        q_forecasts[qid][o]['forecasts']=np.array(q_forecasts[qid][o]['forecasts'])
        q_forecasts[qid][o]['timestamps']=np.array(q_forecasts[qid][o]['timestamps'])

In [55]:
# aggregations methods

aggregations=dict()

# different per-question aggregation methods
# {arithmetic mean, geometric mean, median}×{probs, odds, log odds}×{exponential decay, no decay}×{extremized, not extremized}
means=['arith', 'geom', 'median']
formats=['probs', 'odds', 'logodds']
decay=['nodec']#, 'nodec']
extremize=['gjpextr', 'postextr', 'neyextr', 'noextr']

aggr_methods=[]

for i1 in means:
    for i2 in formats:
        for i3 in decay:
            for i4 in extremize:
                #the first sometimes doesn't work (negative values), the second is equivalent to the geometric mean of odds
                if (i1=='geom' and i2=='logodds') or (i1=='arith' and i2=='logodds'):
                    continue
                aggr_methods.append([i1, i2, i3, i4])

print(aggr_methods)

[['arith', 'probs', 'nodec', 'gjpextr'], ['arith', 'probs', 'nodec', 'postextr'], ['arith', 'probs', 'nodec', 'neyextr'], ['arith', 'probs', 'nodec', 'noextr'], ['arith', 'odds', 'nodec', 'gjpextr'], ['arith', 'odds', 'nodec', 'postextr'], ['arith', 'odds', 'nodec', 'neyextr'], ['arith', 'odds', 'nodec', 'noextr'], ['geom', 'probs', 'nodec', 'gjpextr'], ['geom', 'probs', 'nodec', 'postextr'], ['geom', 'probs', 'nodec', 'neyextr'], ['geom', 'probs', 'nodec', 'noextr'], ['geom', 'odds', 'nodec', 'gjpextr'], ['geom', 'odds', 'nodec', 'postextr'], ['geom', 'odds', 'nodec', 'neyextr'], ['geom', 'odds', 'nodec', 'noextr'], ['median', 'probs', 'nodec', 'gjpextr'], ['median', 'probs', 'nodec', 'postextr'], ['median', 'probs', 'nodec', 'neyextr'], ['median', 'probs', 'nodec', 'noextr'], ['median', 'odds', 'nodec', 'gjpextr'], ['median', 'odds', 'nodec', 'postextr'], ['median', 'odds', 'nodec', 'neyextr'], ['median', 'odds', 'nodec', 'noextr'], ['median', 'logodds', 'nodec', 'gjpextr'], ['median

In [56]:
# differently aggregated means

for a in aggr_methods:
    aggrkey='_'.join(a)
    aggregations[aggrkey]=dict()
    for qid in q_forecasts.keys():
        aggregations[aggrkey][qid]=dict()
        aggregations[aggrkey][qid]['outcome']=questions[qid]['outcome']
        aggregations[aggrkey][qid]['aggr_forecasts']=[]
        aggregations[aggrkey][qid]['options']=[]

        for o in q_forecasts[qid].keys():
            n=len(q_forecasts[qid][o]['forecasts'])
            if 'probs' in a:
                poss_transformed=q_forecasts[qid][o]['forecasts']
            elif 'odds' in a:
                poss_transformed=q_forecasts[qid][o]['forecasts']/(1-q_forecasts[qid][o]['forecasts'])
            elif 'logodds' in a:
                poss_transformed=q_forecasts[qid][o]['forecasts']/(1-q_forecasts[qid][o]['forecasts'])
                poss_transformed=np.log(poss_transformed)

            if 'arith' in a:
                aggregations[aggrkey][qid]['aggr_forecasts'].append(np.mean(poss_transformed))
            elif 'geom' in a:
                aggregations[aggrkey][qid]['aggr_forecasts'].append(statistics.geometric_mean(poss_transformed))
            elif 'median' in a:
                aggregations[aggrkey][qid]['aggr_forecasts'].append(np.median(poss_transformed))

            aggregations[aggrkey][qid]['options'].append(o)

        if 'odds' in a:
            odds=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])
            aggregations[aggrkey][qid]['aggr_forecasts']=odds/(1+odds)
        elif 'logodds' in a:
            log_odds=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])
            odds=np.exp(log_odds)
            aggregations[aggrkey][qid]['aggr_forecasts']=odds/(1+odds)

        if 'gjpextr' in a:
            p=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])
            aggregations[aggrkey][qid]['aggr_forecasts']=((p**EXTR)/(((p**EXTR)+(1-p))**(1/EXTR)))
        elif 'postextr' in a:
            p=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])
            aggregations[aggrkey][qid]['aggr_forecasts']=p**EXTR
        elif 'neyextr' in a:
            p=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])
            d=n*(math.sqrt(3*n**2-3*n+1)-2)/(n**2-n-1)
            aggregations[aggrkey][qid]['aggr_forecasts']=p**d

        aggregations[aggrkey][qid]['options']=np.array(aggregations[aggrkey][qid]['options'])
        aggregations[aggrkey][qid]['aggr_forecasts']=np.array(aggregations[aggrkey][qid]['aggr_forecasts'])

        # Renormalize to 1
        Z=np.sum(aggregations[aggrkey][qid]['aggr_forecasts'])
        aggregations[aggrkey][qid]['aggr_forecasts']/=Z

for a in aggregations.keys():
    briers=[]
    for qid in aggregations[a].keys():
        aggregations[a][qid]['brier']=np.mean((aggregations[a][qid]['aggr_forecasts']-(aggregations[a][qid]['outcome']==aggregations[a][qid]['options']))**2)
        briers.append(aggregations[a][qid]['brier'])
    aggregations[a]['brier']=np.mean(np.array(briers))

Peeking into the data structures.

In [57]:
print("forecast: ", preds[0])
print("question: ", qdata[0])
print("question result: ", questions['1001-0'])
print("all individual scores, sorted: ", individual_scores)
print("all teams for a user: ", user_teams['3304'])
#print("results for aggregation: ", aggregations)
#print("user forecasts: ", user_forecasts['3304'])
#print("data of questions, by forecasters: ", q_fsters['1176-0'])

forecast:  {'ifp_id': '1004-0', 'ctt': '1a', 'cond': 1, 'training': 'a', 'team': 'NA', 'user_id': '600', 'forecast_id': '-200987', 'fcast_type': '0', 'answer_option': 'a', 'value': 0.1, 'fcast_date': '2011-08-31', 'expertise': '1', 'q_status': 'closed', 'viewtime': 'NA', 'year': 1, 'timestamp': datetime.datetime(2011, 8, 31, 16, 17, 18), 'outcome': 'b'}
question:  {'ifp_id': '1001-0', 'q_type': '0', 'q_text': 'Will the Six-Party talks (among the US, North Korea, South Korea, Russia, China, and Japan) formally resume in 2011?', 'q_desc': "'In' refers to any time during the remainder of the 2011 calendar year, as defined by Eastern Time. Outcome will be resolved based on reporting from one or more of the following sources: BBC News or Reuters or Economist Online (http://www.bbc.co.uk/news/ or http://www.reuters.com/ or http://www.economist.com). If nothing is reported in these sources, then the 'status quo' outcome typically will be assumed (e.g., for a question about a political leader 

Calculating the advantage of different types of groups.

> Of these, the most important and least obvious is the transformation of the aggregate forecasts. Note that we take the weighted mean first, and then transform;
this works much better than transforming first and then averaging
the transformed individual predictions. The transformation we used is: $\frac{p^a}{(p^a+(1-p))^{\frac{1}{a}}}$ with $a=3$.

*— Lyle Ungar et al., “The Good Judgment Project: A large scale test of different methods of combining expert predictions” p. 2, 2012*

TODO: check that!

In [58]:
l=[(aggregations[k]['brier'],k) for k in aggregations.keys()]
l.sort()

for e in l:
    print(e[1], e[0])

print(individual_scores)

geom_odds_nodec_postextr 0.06684331420597692
geom_odds_nodec_gjpextr 0.06685036961388066
geom_probs_nodec_postextr 0.06948618742701797
geom_probs_nodec_gjpextr 0.06971932842735376
arith_probs_nodec_gjpextr 0.07178242689262797
arith_probs_nodec_postextr 0.07208091809301938
geom_odds_nodec_neyextr 0.07231780719749753
geom_probs_nodec_neyextr 0.07291391843827348
median_logodds_nodec_postextr 0.0772573134290434
median_probs_nodec_postextr 0.07725829634908649
median_odds_nodec_postextr 0.07726297637880059
median_logodds_nodec_gjpextr 0.07729252652831538
median_probs_nodec_gjpextr 0.0772935031717426
median_odds_nodec_gjpextr 0.07729827605332332
median_logodds_nodec_neyextr 0.08259674842253747
median_probs_nodec_neyextr 0.08259785089346433
median_odds_nodec_neyextr 0.08260076153437045
arith_probs_nodec_neyextr 0.08721705632596859
geom_probs_nodec_noextr 0.09383628773950334
geom_odds_nodec_noextr 0.09423579611898124
median_logodds_nodec_noextr 0.10458883612613978
median_probs_nodec_noextr 0.10

In [59]:
GROUP_SCORE_SAMPLES=500
MAX_GROUP_SIZE=100

group_scores=dict()

for size in range(2, MAX_GROUP_SIZE+1):
    samples=[]
    for n in range(0, GROUP_SCORE_SAMPLES):
        sample_group=random.sample(users, size)
        score=group_score(user_forecasts, sample_group)
        samples.append(score)
    group_scores[size]=np.mean(np.array(samples))

for k in group_scores.keys():
    print(k, len((np.where(individual_scores>group_scores[k]))[0])/len(individual_scores))

2 0.47920892494929007
3 0.49594320486815413
4 0.48225152129817445
5 0.4964503042596349
6 0.4934077079107505
7 0.49391480730223125
8 0.505578093306288
9 0.5106490872210954
10 0.5
11 0.5111561866125761
12 0.5060851926977687
13 0.513184584178499
14 0.5086206896551724
15 0.5152129817444219
16 0.5025354969574036
17 0.5081135902636917
18 0.5060851926977687
19 0.5157200811359026
20 0.5136916835699797
21 0.5121703853955375
22 0.518762677484787
23 0.5157200811359026
24 0.518762677484787
25 0.5162271805273834
26 0.5126774847870182
27 0.5152129817444219
28 0.5157200811359026
29 0.5202839756592292
30 0.5177484787018256
31 0.5101419878296146
32 0.5182555780933062
33 0.5212981744421906
34 0.507606490872211
35 0.5141987829614605
36 0.5172413793103449
37 0.5192697768762677
38 0.5192697768762677
39 0.5141987829614605
40 0.5167342799188641
41 0.52079107505071
42 0.5141987829614605
43 0.5157200811359026
44 0.5147058823529411
45 0.5157200811359026
46 0.5157200811359026
47 0.5162271805273834
48 0.519269776