In [1]:
import pandas
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

from scipy import stats
def r2(x, y):
    return stats.pearsonr(x, y)[0] ** 2

In [2]:
der_scores = dict(
    alien=739.9, 
    amidar=188.6,
    assault=431.2,
    asterix=470.8,
    bank_heist=51.0,
    battle_zone=10124.6,
    boxing=0.2,
    breakout=1.9, 
    chopper_command=861.8,
    crazy_climber=16185.3,
    demon_attack=508, 
    freeway=27.9, 
    frostbite=866.8,
    gopher=349.5, 
    hero=6857.0, 
    jamesbond=301.6,
    kangaroo=779.3,
    krull=2851.5,
    kung_fu_master=14346.1,
    ms_pacman=1204.1, 
    pong=-19.3,
    private_eye=97.8,
    qbert=1152.9,
    road_runner=9600.0,
    seaquest=354.1, 
    up_n_down=2877.4,
)

simple_scores = dict(
    alien=616.9, 
    amidar=88,
    assault=527.2,
    asterix=1128.3,
    bank_heist=34.2,
    battle_zone=5184.4,
    boxing=9.1,
    breakout=16.4, 
    chopper_command=1246.9,
    crazy_climber=62583.6,
    demon_attack=208.1, 
    freeway=20.3, 
    frostbite=254.7,
    gopher=771.0, 
    hero=2656.6, 
    jamesbond=125.3,
    kangaroo=323.1,
    krull=4539.9,
    kung_fu_master=17257.2,
    ms_pacman=1480.0, 
    pong=12.8,
    private_eye=58.3,
    qbert=1288.8,
    road_runner=5640.6,
    seaquest=683.3, 
    up_n_down=3350.3,
)

sunrise_scores = dict(
    alien=872, 
    amidar=122.6,
    assault=594.8,
    asterix=755.0,
    bank_heist=266.7,
    battle_zone=15700,
    boxing=6.7,
    breakout=1.8, 
    chopper_command=1040,
    crazy_climber=22230,
    demon_attack=919.8, 
    freeway=30.2, 
    frostbite=2026.7,
    gopher=654.7, 
    hero=8072.5, 
    jamesbond=425.0,
    kangaroo=2726.7,
    krull=3171.9,
    kung_fu_master=9896.7,
    ms_pacman=1482.3,
    pong=-13.8,
    private_eye=100,
    qbert=1830.8,
    road_runner=11913.3,
    seaquest=570.7,
    up_n_down=3522.0,
)

random_scores = dict(
    alien=227.8,
    amidar=5.8,
    assault=222.4,
    asterix=210.0,
    bank_heist=14.2,
    battle_zone=2360,
    boxing=0.1,
    breakout=1.7, 
    chopper_command=811.0,
    crazy_climber=10780.5, 
    demon_attack=152.1,
    freeway=0.0, 
    frostbite=65.2,
    gopher=257.6, 
    hero=1027.0, 
    jamesbond=29.0,
    kangaroo=52.0,
    krull=1598.0,
    kung_fu_master=258.5,
    ms_pacman=307.3, 
    pong=-20.7,
    private_eye=24.9,
    qbert=163.9,
    road_runner=11.5, 
    seaquest=68.4,
    up_n_down=533.4
)


curl_scores = dict(
    alien=558.2,
    amidar=142.1,
    assault=600.6,
    asterix=734.5,
    bank_heist=131.6,
    battle_zone=14870.,
    boxing=1.2,
    breakout=4.9,
    chopper_command=1058.5,
    crazy_climber=12146.5, 
    demon_attack=817.6,
    freeway=26.7, 
    frostbite=1181.3,
    gopher=669.3, 
    hero=6279.3, 
    jamesbond=471.0,
    kangaroo=872.5,
    krull=4229.6,
    kung_fu_master=14307.8,
    ms_pacman=1465.5, 
    pong=-16.5,
    private_eye=218.4,
    qbert=1042.4,
    road_runner=5661., 
    seaquest=384.5,
    up_n_down=2955.2
)

human_scores = dict(
    alien=7127.7,
    amidar=1719.5,
    assault=742,
    asterix=8503.3,
    bank_heist=753.1,
    battle_zone=37187.5,
    boxing=12.1,
    breakout=30.5,
    chopper_command=7387.8,
    crazy_climber=35829.4,
    demon_attack=1971,
    freeway=29.6,
    frostbite=4334.7,
    gopher=2412.5,
    hero=30826.4,
    jamesbond=302.8,
    kangaroo=3035.0, 
    krull=2665.5, 
    kung_fu_master=22736.3,
    ms_pacman=6951.6,
    pong=14.6,
    private_eye=69571.3,
    qbert=13455.0, 
    road_runner=7845.0,
    seaquest=42054.7,
    up_n_down=11693.2
)

otrainbow_scores = dict(
    alien=824.7,
    amidar=82.8,
    assault=351.9,
    asterix=628.5,
    bank_heist=182.1,
    battle_zone=4060.6,
    boxing=2.5,
    breakout=9.8,
    chopper_command=1033.3,
    crazy_climber=21327.8,
    demon_attack=711.8,
    freeway=25.0,
    frostbite=231.6,
    gopher=778,
    hero=6458.8,
    jamesbond=112.3,
    kangaroo=605.4, 
    krull=3277.9, 
    kung_fu_master=5722.2,
    ms_pacman=941.9,
    pong=1.3,
    private_eye=100,
    qbert=509.3, 
    road_runner=2696.7,
    seaquest=286.9,
    up_n_down=2847.6,
)


nature_scores = dict(
    alien=1620, amidar=978, assault=4280.4,
    asterix=4359.0, bank_heist=455, battle_zone=29900.,
    boxing=88, breakout=385.5, chopper_command=6126.,
    crazy_climber=110763, demon_attack=12149.4, freeway=30.8,
    frostbite=797.4, gopher=8777.4, hero=20437.8, jamesbond=768.5,
    kangaroo=7259., krull=8422.3, kung_fu_master=26059.,
    ms_pacman=3085.6, pong=19.5, private_eye=146.7,
    qbert=13117.3, road_runner=39544.0, seaquest=5860.6, up_n_down=9989.9
)

ilya_scores = dict([
 ['alien', 771.2],
 ['amidar', 102.8],
 ['assault', 452.4],
 ['asterix', 603.5],
 ['bank_heist', 168.9],
 ['battle_zone', 12954.0],
 ['boxing', 6.0],
 ['breakout', 16.1],
 ['chopper_command', 780.3],
 ['crazy_climber', 20516.5],
 ['demon_attack', 1113.4],
 ['freeway', 9.8],
 ['frostbite', 331.1],
 ['gopher', 636.3],
 ['hero', 3736.3],
 ['jamesbond', 236.0],
 ['kangaroo', 940.6],
 ['krull', 4018.1],
 ['kung_fu_master', 9111.0],
 ['ms_pacman', 960.5],
 ['pong', -8.5],
 ['private_eye', -13.6],
 ['qbert', 854.4],
 ['road_runner', 8895.1],
 ['seaquest', 301.2],
 ['up_n_down', 3180.8]])

def to_df(score_dict):
    df = pandas.DataFrame(np.array(list(score_dict.values())).reshape(-1, 1),
                                     index=list(score_dict.keys()),
                                     columns=["GameScoreAverage"])
    return df

atari_der_scores = to_df(der_scores)
atari_otrainbow_scores = to_df(otrainbow_scores)
atari_human_scores = to_df(human_scores)
atari_random_scores = to_df(random_scores)
atari_nature_scores = to_df(nature_scores)
atari_ilya_scores = to_df(ilya_scores)
atari_curl_scores = to_df(curl_scores)
atari_simple_scores = to_df(simple_scores)
atari_sunrise_scores = to_df(sunrise_scores)



atari_nature_scores["game"] = list(nature_scores.keys())
atari_der_scores["game"] = list(der_scores.keys())
atari_otrainbow_scores["game"] = list(otrainbow_scores.keys())
atari_human_scores["game"] = list(human_scores.keys())
atari_random_scores["game"] = list(random_scores.keys())
atari_ilya_scores["game"] = list(ilya_scores.keys())
atari_curl_scores["game"] = list(curl_scores.keys())
atari_simple_scores["game"] = list(simple_scores.keys())
atari_sunrise_scores["game"] = list(sunrise_scores.keys())

In [3]:
def split_on_fields(df, fields=["n_step", "encoder", "tag",
                                 "batch_size", "replay_ratio",
                                 "batch_t", "jumps", "nce",
                                 "use_all_targets", "time_contrastive",
                                 "model_rl_weight", "detach_model"],
                   min_len=0,
                   min_games=0,
                   show_all_fields=False):
    strings = [""]
    df.tag = [str(t).replace("q_l1", "ql1") for t in df.tag]
    dfs = [df]
    
    for field in fields:
        new_dfs = []
        new_strs = []
        for df, string in zip(dfs, strings):
            if not show_all_fields and len(set(df[field].astype(str))) == 1:
                new_dfs.append(df)
                new_strs.append(string)
            else:
                for value in set(df[field].astype(str)):
                    new_str = string + "{}: {}  ".format(field[:5], value)
                    new_df = df[np.array([str(x) == str(value) for x in df[field]])]
                    if len(new_df) > min_len and len(set(new_df.game)) >= min_games:
                        new_dfs.append(new_df)
                        new_strs.append(new_str)
    
                    
        dfs = new_dfs
        strings = new_strs
        
    return dfs, [string[:-2] for string in strings]

def find_missing_pairs(dfs, names):
    games = []
    for df in dfs:
        games += list(df.game)
    games = set(games)
    
    for df, name in zip(dfs, names):
        seeds = set(df.seed)
        found_pairs = set([(game, seed) for game, seed in zip(df.game, df.seed)])

        missing_pairs = []
        for seed in seeds:
            for game in games:
                if (game, seed) not in found_pairs:
                    missing_pairs.append("\"--game {} --seed {}\"".format(game, seed))

        missing_pairs = " ".join(missing_pairs)

        print(name)
        print("PAIRS=( {} )".format(missing_pairs))
        print()

In [4]:
def mean_pairwise_score(df1, df2):
    s1 = (df1["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])/\
          (df2["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])
    s2 = (df2["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])/\
          (df1["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])
    score = (s1.mean() - s2.mean())
    return score

def median_pairwise_score(df1, df2):
    s1 = (df1["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])/\
          (df2["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])
    s2 = (df2["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])/\
          (df1["GameScoreAverage"] - atari_random_scores["GameScoreAverage"])
    return (s1.median() - s2.median())

def group_dfs(dfs):
    proc_dfs = []
    for df in dfs:
        if "seed" in df.keys():
            proc_dfs.append(df.groupby(["seed", "game"]).mean().groupby("game").mean())
        else:
            proc_dfs.append(df.groupby("game").mean())
            
    return proc_dfs

def compare_dfs(dfs, names, min_games=0, pairwise_top_n=-1, sort_key=np.median):
    if min_games > 0:
        games = Counter()
        games_count = [Counter(df.game) for df in dfs]
        for count in games_count:
            games = games + count
        mandatory_games = set([g[0] for g in games.most_common()[:min_games]])
        usable_dfs = []
        usable_names = []
        for df, name in zip(dfs, names):
            if set(df.game) >= mandatory_games:
                usable_dfs.append(df)
                usable_names.append(name)
        dfs = usable_dfs
        names = usable_names
    games = set(dfs[0].game)
    for df in dfs[1:]:
        games = games & set(df.game)
    
    games_dfs = group_dfs(dfs)
    
    games_dfs = [df[[g in games for g in df.index]] for df in games_dfs]
    for df in games_dfs:
        gsn = []
        for game, score in zip(df.index, df["GameScoreAverage"]):
            gsn.append((score - random_scores[game])/(human_scores[game] - random_scores[game]))

        df["GameScoreNormalized"] = np.array(gsn)
    
    for df in games_dfs:
        gsn = []
        for game, score in zip(df.index, df["GameScoreAverage"]):
            gsn.append((score - random_scores[game])/(nature_scores[game] - random_scores[game]))

        df["GameScoreNatureNormalized"] = np.array(gsn)
        
    for df in games_dfs:
        gsn = []
        for game, score in zip(df.index, df["GameScoreAverage"]):
            gsn.append((score - random_scores[game])/(der_scores[game] - random_scores[game]))

        df["GameScoreDERNormalized"] = np.array(gsn)
        
    scores = [-sort_key(df["GameScoreNormalized"]) for df in games_dfs]
    indices = np.argsort(scores)
    games_dfs = np.array(games_dfs)[indices]
    names = np.array(names)[indices]
    
    print()    
    print("Human Scores:")
    for df, name in zip(games_dfs, names):
        print("{0}: Median: {1:.3f}, Mean: {2:.3f}".format(name, df["GameScoreNormalized"].median(), df["GameScoreNormalized"].mean()))
    
    print()
    print("Nature Scores:")
    for df, name in zip(games_dfs, names):
        print("{0}: Median: {1:.3f}, Mean: {2:.3f}".format(name, df["GameScoreNatureNormalized"].median(), df["GameScoreNatureNormalized"].mean()))
        
    print()
    print("Comparison over {} games:".format(len(games)))
    if pairwise_top_n < 0:
        pairwise_top_n = len(dfs)
    for i in range(pairwise_top_n):
        print()
        for j in range(pairwise_top_n):
            if i == j:
                continue
            better_games = np.sum(games_dfs[i]["GameScoreAverage"] > games_dfs[j]["GameScoreAverage"])
            mean_comp_score = mean_pairwise_score(games_dfs[i], games_dfs[j])
            median_comp_score = median_pairwise_score(games_dfs[i], games_dfs[j])
            
            print("{} above {}: {}, {:.3f}, {:.3f}".format(names[i], names[j],
                                                           better_games, 
                                                           median_comp_score,
                                                          mean_comp_score))
            
    return games_dfs, names

def sort_games(game_df, key="GameScoreNormalized"):
    indices = np.argsort(game_df[key])
    games_dfs[-1][key][indices]
    for i, game in enumerate(indices):
        i = i+1
        median = (i == len(game_df)//2 or i == len(game_df)//2+1)
        print("{} {} : {:.3f} {:.3f} {}".format(i, 
                                                game_df.index[game],
                                                game_df[key][game],
                                                game_df["GameScoreAverage"][game],
                                                "median" if median else ""))

In [93]:
def bootstrap(df, num_runs=5):
    games = set(df.game)
    games_dfs = []
    
    for game in games:
        game_df = df[df.game == game]
        
        indices = np.random.randint(0, len(game_df), (num_runs,))
        
        new_df = game_df.iloc[indices]
        games_dfs.append(new_df)
        
    return pandas.concat(games_dfs)

def seed_bootstrap(df, num_runs=5):
    seeds = list(set(df.seed))
    seed_dfs = []
    
    for i in range(num_runs):
        seed = seeds[np.random.randint(len(seeds))]
        seed_df = df[df.seed == seed]
        
        seed_dfs.append(seed_df)

    return pandas.concat(seed_dfs)
    
def bootstrap_distribution(df, samples=100, runs=5):
    medians = []
    means = []
    
    for i in range(samples):
        b_df = bootstrap(df, runs)
        b_df = group_dfs([b_df])[0]
        medians.append(b_df.GameScoreNormalized.median())
        means.append(b_df.GameScoreNormalized.mean())
        
    print(sorted(medians))
    sns.distplot(medians)
    plt.title("Median Distribution")
    plt.ylabel("Count")
    plt.xlabel("Median HNS")
    plt.show()
    sns.distplot(means)
    plt.title("Mean Distribution")
    plt.ylabel("Count")
    plt.xlabel("Mean HNS")
    plt.show()
    
    return medians, means

In [9]:
byol_data = pandas.read_csv("byol_replications.csv")
# byol_data = byol_data[np.array([seed in (120, 101, 88, 444, 420) for seed in byol_data.seed])]
byol_data = byol_data[np.array([rr==64 for rr in byol_data.replay_ratio])]
release_data = pandas.read_csv("release_test.csv")
byol_data = pandas.concat([byol_data, release_data])

byol_sweeps, byol_names = split_on_fields(byol_data, 
                                          [
                                           "tag",
                                           "noisy_nets", 
                                           "distributional", 
                                           "replay_ratio", 
                                           "jumps",
                                           "classifier",
                                           "augmentation",
                                           "target_update_interval",
                                           "batch_size",
                                           "reward_loss_weight",
                                           "model_rl_weight",
                                           "nce_loss_weight",
                                           "init",
                                           "encoder",
                                          ],
                                          0,
                                         min_games=26)

find_missing_pairs(byol_sweeps, byol_names)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


tag: pbl_control
PAIRS=(  )

tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va
PAIRS=(  )

tag: j2_control
PAIRS=(  )

tag: no_aug_dqn_control
PAIRS=(  )

tag: big_model_control
PAIRS=(  )

tag: byol_ql1_squared_nogs_m5
PAIRS=(  )

tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512
PAIRS=(  )

tag: uat_control
PAIRS=(  )

tag: byol_ql1_squared_nogs_m5_rew01_mse
PAIRS=(  )

tag: byol_ql1_rep_test
PAIRS=(  )

tag: release_test_3
PAIRS=( "--game jamesbond --seed 101" "--game demon_attack --seed 101" "--game amidar --seed 120" "--game jamesbond --seed 444" )

tag: byol_tc_control
PAIRS=(  )

tag: byol_base_6432
PAIRS=(  )

tag: deepmdp_control
PAIRS=(  )

tag: dist_byol_ql1_squared_nogs_m5_rew1_512
PAIRS=( "--game asterix --seed 120" )

tag: j1_control
PAIRS=(  )

tag: affine_dist_byol_ql1_squared_nogs_m5_norm_ql1_va
PAIRS=(  )

tag: j0_control
PAIRS=(  )

tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5
PAIRS=(  )

tag: hybrid_aug_control
PAIRS=( "--game crazy_climber --seed 101" )

tag

In [10]:
medians, means = bootstrap_distribution(byol_sweeps[-10], samples=100, runs=20)

NameError: name 'bootstrap_distribution' is not defined

In [11]:
np.mean(np.array(medians) > 0.32)

NameError: name 'medians' is not defined

In [12]:
names =  [
          *byol_names,
          "der", 
#           "nature",
          "curl",
          "drq",
          "simple",
          "human",
          "nature",
          "random",
          "otrainbow",
          "sunrise",
             ]
dfs = [
       *byol_sweeps,
       atari_der_scores, 
#        atari_nature_scores,
       atari_curl_scores,
       atari_ilya_scores,
       atari_simple_scores,
       atari_human_scores,
       atari_nature_scores,
       atari_random_scores,
       atari_otrainbow_scores,
       atari_sunrise_scores,
                         ]
games_dfs, names = compare_dfs(dfs, names, min_games=0, sort_key=np.median)


Human Scores:
nature: Median: 1.008, Mean: 2.675
human: Median: 1.000, Mean: 1.000
tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.421, Mean: 0.592
tag: j1_control: Median: 0.403, Mean: 0.608
tag: release_test_3: Median: 0.366, Mean: 0.559
tag: j0_control: Median: 0.352, Mean: 0.536
tag: deepmdp_control: Median: 0.334, Mean: 0.567
tag: byol_tc_control: Median: 0.319, Mean: 0.487
sunrise: Median: 0.305, Mean: 0.445
tag: byol_ql1_squared_nogs_m5_rew01_mse: Median: 0.300, Mean: 0.457
tag: dist_byol_ql1_squared_nogs_m5_rew1_512: Median: 0.297, Mean: 0.514
tag: byol_ql1_b0: Median: 0.297, Mean: 0.458
tag: j2_control: Median: 0.284, Mean: 0.578
tag: big_model_control: Median: 0.282, Mean: 0.507
tag: pbl_control: Median: 0.281, Mean: 0.578
tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5: Median: 0.276, Mean: 0.510
tag: byol_replication_aug_control: Median: 0.268, Mean: 0.448
drq: Median: 0.268, Mean: 0.357
tag: byol_ql1_squared_nogs_m5: Median: 0.265, Mean: 0.476
tag: tc_nce_

tag: release_test_3 above tag: deepmdp_control: 15, 0.045, 0.352
tag: release_test_3 above tag: byol_tc_control: 18, 0.307, 0.091
tag: release_test_3 above sunrise: 12, -0.176, 10.323
tag: release_test_3 above tag: byol_ql1_squared_nogs_m5_rew01_mse: 18, 0.407, -0.538
tag: release_test_3 above tag: dist_byol_ql1_squared_nogs_m5_rew1_512: 15, 0.081, -0.181
tag: release_test_3 above tag: byol_ql1_b0: 18, 0.221, -1.063
tag: release_test_3 above tag: j2_control: 11, -0.165, -0.225
tag: release_test_3 above tag: big_model_control: 15, 0.092, 0.276
tag: release_test_3 above tag: pbl_control: 13, -0.033, -0.149
tag: release_test_3 above tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5: 21, 0.384, 0.531
tag: release_test_3 above tag: byol_replication_aug_control: 16, 0.375, 0.163
tag: release_test_3 above drq: 20, 0.953, 0.991
tag: release_test_3 above tag: byol_ql1_squared_nogs_m5: 20, 0.266, 0.152
tag: release_test_3 above tag: tc_nce_control: 19, 0.303, 0.750
tag: release_test_3 above 

sunrise above tag: affine_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 18, 0.326, -5.321
sunrise above otrainbow: 21, 0.741, -1.276
sunrise above tag: hybrid_aug_control: 19, 0.735, -2.471
sunrise above tag: byol_base_6432: 19, 0.452, -0.310
sunrise above tag: no_aug_dqn_control: 22, 0.725, -1.560
sunrise above curl: 17, 0.267, -0.333
sunrise above tag: hybrid_control: 21, 1.739, 4.157
sunrise above der: 23, 1.114, 3.900
sunrise above simple: 16, 0.560, -3.710
sunrise above tag: byol_replication_control: 20, 0.822, 5.903
sunrise above tag: nce_control: 24, 2.163, 0.449
sunrise above random: 26, inf, inf

tag: byol_ql1_squared_nogs_m5_rew01_mse above nature: 1, -4.312, -9.343
tag: byol_ql1_squared_nogs_m5_rew01_mse above human: 3, -3.057, -37.091
tag: byol_ql1_squared_nogs_m5_rew01_mse above tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 5, -0.533, -0.353
tag: byol_ql1_squared_nogs_m5_rew01_mse above tag: j1_control: 8, -0.422, -0.082
tag: byol_ql1_squared_nogs_m5_rew01_mse above tag: re

tag: big_model_control above tag: uat_control: 15, 0.243, 0.069
tag: big_model_control above tag: both_losses_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 9, -0.289, -1.058
tag: big_model_control above tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512: 14, 0.095, 0.327
tag: big_model_control above tag: byol_ql1_rep_test: 19, 0.525, 1.209
tag: big_model_control above tag: affine_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 13, -0.044, 0.067
tag: big_model_control above otrainbow: 19, 0.999, 1.205
tag: big_model_control above tag: hybrid_aug_control: 18, 0.444, 0.910
tag: big_model_control above tag: byol_base_6432: 21, 0.439, 0.999
tag: big_model_control above tag: no_aug_dqn_control: 19, 0.928, -0.756
tag: big_model_control above curl: 13, -0.123, 0.755
tag: big_model_control above tag: hybrid_control: 20, 0.741, 3.752
tag: big_model_control above der: 15, 0.599, 9.502
tag: big_model_control above simple: 14, 0.253, 0.932
tag: big_model_control above tag: byol_replication_control: 21,

tag: byol_ql1_squared_nogs_m5 above sunrise: 9, -0.602, 4.007
tag: byol_ql1_squared_nogs_m5 above tag: byol_ql1_squared_nogs_m5_rew01_mse: 10, -0.079, -0.349
tag: byol_ql1_squared_nogs_m5 above tag: dist_byol_ql1_squared_nogs_m5_rew1_512: 9, -0.317, -0.295
tag: byol_ql1_squared_nogs_m5 above tag: byol_ql1_b0: 11, -0.145, 0.239
tag: byol_ql1_squared_nogs_m5 above tag: j2_control: 8, -0.368, -0.472
tag: byol_ql1_squared_nogs_m5 above tag: big_model_control: 10, -0.166, -0.092
tag: byol_ql1_squared_nogs_m5 above tag: pbl_control: 6, -0.327, -1.171
tag: byol_ql1_squared_nogs_m5 above tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5: 13, -0.070, -0.436
tag: byol_ql1_squared_nogs_m5 above tag: byol_replication_aug_control: 11, -0.102, 0.097
tag: byol_ql1_squared_nogs_m5 above drq: 19, 0.443, 0.389
tag: byol_ql1_squared_nogs_m5 above tag: tc_nce_control: 10, -0.297, -0.093
tag: byol_ql1_squared_nogs_m5 above tag: uat_control: 10, -0.208, -0.207
tag: byol_ql1_squared_nogs_m5 above tag: bo

tag: byol_ql1_rep_test above tag: byol_ql1_b0: 10, -0.371, -1.243
tag: byol_ql1_rep_test above tag: j2_control: 6, -0.634, -1.217
tag: byol_ql1_rep_test above tag: big_model_control: 7, -0.525, -1.209
tag: byol_ql1_rep_test above tag: pbl_control: 7, -0.472, -1.375
tag: byol_ql1_rep_test above tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5: 10, -0.152, -1.193
tag: byol_ql1_rep_test above tag: byol_replication_aug_control: 10, -0.374, -1.039
tag: byol_ql1_rep_test above drq: 12, -0.007, 0.022
tag: byol_ql1_rep_test above tag: byol_ql1_squared_nogs_m5: 10, -0.295, -1.230
tag: byol_ql1_rep_test above tag: tc_nce_control: 8, -0.419, -0.708
tag: byol_ql1_rep_test above tag: uat_control: 8, -0.307, -1.543
tag: byol_ql1_rep_test above tag: both_losses_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 5, -0.691, -1.945
tag: byol_ql1_rep_test above tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512: 9, -0.205, -0.536
tag: byol_ql1_rep_test above tag: affine_dist_byol_ql1_squared_nogs_m5_nor

tag: no_aug_dqn_control above tag: tc_nce_control: 9, -0.766, 1.507
tag: no_aug_dqn_control above tag: uat_control: 7, -0.655, 0.907
tag: no_aug_dqn_control above tag: both_losses_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 4, -1.093, 2.066
tag: no_aug_dqn_control above tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512: 8, -0.582, -0.093
tag: no_aug_dqn_control above tag: byol_ql1_rep_test: 10, -0.431, 0.477
tag: no_aug_dqn_control above tag: affine_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 9, -0.544, 2.307
tag: no_aug_dqn_control above otrainbow: 15, 0.323, 0.459
tag: no_aug_dqn_control above tag: hybrid_aug_control: 11, -0.261, 0.791
tag: no_aug_dqn_control above tag: byol_base_6432: 6, -0.233, 0.012
tag: no_aug_dqn_control above curl: 11, -0.267, -0.277
tag: no_aug_dqn_control above tag: hybrid_control: 14, 0.202, 0.696
tag: no_aug_dqn_control above der: 11, -0.067, 1.417
tag: no_aug_dqn_control above simple: 11, -0.180, 1.507
tag: no_aug_dqn_control above tag: byol_replication

tag: byol_replication_control above tag: uat_control: 6, -1.007, -18.682
tag: byol_replication_control above tag: both_losses_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 3, -1.159, -39.331
tag: byol_replication_control above tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512: 5, -0.662, -3.126
tag: byol_replication_control above tag: byol_ql1_rep_test: 11, -0.600, -12.045
tag: byol_replication_control above tag: affine_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: 9, -0.775, -39.962
tag: byol_replication_control above otrainbow: 13, 0.075, -1.487
tag: byol_replication_control above tag: hybrid_aug_control: 10, -0.217, -12.636
tag: byol_replication_control above tag: byol_base_6432: 8, -0.264, -1.334
tag: byol_replication_control above tag: no_aug_dqn_control: 10, -0.103, 5.807
tag: byol_replication_control above curl: 7, -0.313, 1.606
tag: byol_replication_control above tag: hybrid_control: 11, -0.020, 0.551
tag: byol_replication_control above der: 9, -0.312, 2.320
tag: byol_replicatio

In [13]:
for i in range(len(names)):
    print(names[i])
    indices = np.array(list(np.argsort(-games_dfs[i]["GameScoreNormalized"])))
    games_dfs[i]["gshn"] = games_dfs[i].GameScoreNormalized
    games_dfs[i]["gsnn"] = games_dfs[i].GameScoreNatureNormalized
    games_dfs[i]["gsa"] = games_dfs[i].GameScoreAverage
    print(games_dfs[i].iloc[indices][["gsa", "gshn", "gsnn"]])
    print()

nature
                      gsa       gshn  gsnn
game                                      
breakout            385.5  13.326389   1.0
assault            4280.4   7.809854   1.0
boxing               88.0   7.325000   1.0
demon_attack      12149.4   6.595910   1.0
krull              8422.3   6.392787   1.0
road_runner       39544.0   5.046595   1.0
crazy_climber    110763.0   3.991493   1.0
gopher             8777.4   3.953687   1.0
jamesbond           768.5   2.700877   1.0
kangaroo           7259.0   2.416024   1.0
kung_fu_master    26059.0   1.147821   1.0
pong                 19.5   1.138810   1.0
freeway              30.8   1.040541   1.0
qbert             13117.3   0.974592   1.0
up_n_down          9989.9   0.847372   1.0
chopper_command    6126.0   0.808144   1.0
battle_zone       29900.0   0.790754   1.0
hero              20437.8   0.651382   1.0
bank_heist          455.0   0.596562   1.0
amidar              978.0   0.567310   1.0
asterix            4359.0   0.500283   1.0
ms_p

In [41]:
for i, n in enumerate(names):
    print(i, n)

0 nature
1 human
2 tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va
3 tag: byol_ql1_squared_nogs_m5_rew01_mse
4 tag: dist_byol_ql1_squared_nogs_m5_rew1_512
5 tag: byol_ql1_b0
6 tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5
7 tag: byol_replication_aug_control
8 drq
9 tag: byol_ql1_squared_nogs_m5
10 tag: no_aug_dist_byol_bil_squared_nogs_m5_rew1_512
11 tag: byol_ql1_rep_test
12 curl
13 Base
14 UAT
15 tag: nan  noisy: 1  distr: 1  repla: 32  class: bilinear  targe: 2000
16 tag: nan  noisy: 1  distr: 1  repla: 32  class: bilinear  targe: 1000
17 tag: byol_base_no_ema
18 UAT no-dist pri aug
19 otrainbow
20 tag: byol_base_control
21 Base no-dist NoPri 64/32
22 tag: ankesh_config_replication
23 tag: hybrid_aug_control
24 our_der
25 tag: byol_base_6432
26 Base no-dist Pri 64/32
27 Base w/ hard negs
28 5k-eps-no-target
29 uat_tui1_no_dueling_c51
30 tag: hybrid_control
31 Base NT dueling
32 der
33 our drq dqn dueling no-dist
34 Base w/o target
35 tag: byol_base_aug
36 tag: ema_target_cont

In [19]:
simple = games_dfs[list(names).index("simple")]
random = games_dfs[list(names).index("random")]
human = games_dfs[list(names).index("human")]
der = games_dfs[list(names).index("der")]
otrainbow = games_dfs[list(names).index("otrainbow")]
sunrise = games_dfs[list(names).index("sunrise")]
curl = games_dfs[list(names).index("curl")]
drq = games_dfs[list(names).index("drq")]
no_aug = games_dfs[list(names).index("tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5")]
aug = games_dfs[list(names).index("tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va")]


table_names = ["Random", "Human", "SimPLe", "DER", "OTRainbow", "CuRL", "DrQ", "Sunrise", "MPM", "MPM+Aug"]
table_dfs = [random, 
             human, 
             simple,
             der, 
             otrainbow,
             curl,
             drq,
             sunrise,
             no_aug,
             aug]
results_df = pandas.DataFrame({name:df.GameScoreAverage for df, name in zip(table_dfs, table_names)},
                              index=games_dfs[0].index)
for i in range(len(results_df)):
    results_df.iloc[i] = ["{:0.1f}".format(v) for v in results_df.iloc[i]]

results_df.loc["Median Human-Norm'd"] = ["{:0.3f}".format(float(df.GameScoreNormalized.median())) for df in table_dfs]
results_df.loc["Mean Human-Norm'd"] = ["{:0.3f}".format(float(df.GameScoreNormalized.mean())) for df in table_dfs]
results_df.loc["Median DQN-Norma'd"] = ["{:0.3f}".format(float(df.GameScoreNatureNormalized.median())) for df in table_dfs]
results_df.loc["Mean DQN-Norm'd"] = ["{:0.3f}".format(float(df.GameScoreNatureNormalized.mean())) for df in table_dfs]
results_df.loc["# Superhuman"] = ["{}".format(int((df.GameScoreNormalized >= 1.).sum())) for df in table_dfs]

results_df.index = [g.replace("_", " ").title() for g in results_df.index]
print(results_df.to_latex(float_format="%.2f"))

\begin{tabular}{lllllllllll}
\toprule
{} &   Random &    Human &   SimPLe &      DER & OTRainbow &     CuRL &      DrQ &  Sunrise &      MPM &  MPM+Aug \\
\midrule
Alien               &    227.8 &   7127.7 &    616.9 &    739.9 &     824.7 &    558.2 &    771.2 &    872.0 &    801.9 &    919.6 \\
Amidar              &      5.8 &   1719.5 &     88.0 &    188.6 &      82.8 &    142.1 &    102.8 &    122.6 &    177.3 &    159.6 \\
Assault             &    222.4 &    742.0 &    527.2 &    431.2 &     351.9 &    600.6 &    452.4 &    594.8 &    661.1 &    699.5 \\
Asterix             &    210.0 &   8503.3 &   1128.3 &    470.8 &     628.5 &    734.5 &    603.5 &    755.0 &    619.2 &    983.5 \\
Bank Heist          &     14.2 &    753.1 &     34.2 &     51.0 &     182.1 &    131.6 &    168.9 &    266.7 &    313.1 &    370.1 \\
Battle Zone         &   2360.0 &  37187.5 &   5184.4 &  10124.6 &    4060.6 &  14870.0 &  12954.0 &  15700.0 &  11510.0 &  14472.0 \\
Boxing              &      0.1 &

In [16]:
simple = games_dfs[list(names).index("simple")]
random = games_dfs[list(names).index("random")]
human = games_dfs[list(names).index("human")]
der = games_dfs[list(names).index("drq")]
curl = games_dfs[list(names).index("curl")]
drq = games_dfs[list(names).index("drq")]


table_names = ["Random", "Human", "SimPLe", "DER", "CuRL", "DrQ", "MPM", "MPM+Aug"]
table_dfs = [random, 
             human, 
             simple,
             der, 
             curl,
             drq,
             games_dfs[8],
             games_dfs[1]]
results_df = pandas.DataFrame({name:df.GameScoreAverage for df, name in zip(table_dfs, table_names)},
                              index=games_dfs[0].index)
for i in range(len(results_df)):
    results_df.iloc[i] = ["{:0.1f}".format(v) for v in results_df.iloc[i]]

results_df.loc["Median Human-Normalized"] = ["{:0.3f}".format(float(df.GameScoreNormalized.median())) for df in table_dfs]
results_df.loc["Mean Human-Normalized"] = ["{:0.3f}".format(float(df.GameScoreNormalized.mean())) for df in table_dfs]
results_df.loc["Median DQN-Normalized"] = ["{:0.3f}".format(float(df.GameScoreNatureNormalized.median())) for df in table_dfs]
results_df.loc["Mean DQN-Normalized"] = ["{:0.3f}".format(float(df.GameScoreNatureNormalized.mean())) for df in table_dfs]

results_df.index = [g.replace("_", " ").title() for g in results_df.index]
print(results_df.to_latex(float_format="%.2f"))

\begin{tabular}{lllllllll}
\toprule
{} &   Random &    Human &   SimPLe &      DER &     CuRL &      DrQ &      MPM &  MPM+Aug \\
\midrule
Alien                   &    227.8 &   7127.7 &    616.9 &    771.2 &   1148.2 &    771.2 &    720.0 &   7127.7 \\
Amidar                  &      5.8 &   1719.5 &     88.0 &    102.8 &    232.3 &    102.8 &    107.9 &   1719.5 \\
Assault                 &    222.4 &    742.0 &    527.2 &    452.4 &    543.7 &    452.4 &    576.8 &    742.0 \\
Asterix                 &    210.0 &   8503.3 &   1128.3 &    603.5 &    524.3 &    603.5 &    835.4 &   8503.3 \\
Bank Heist              &     14.2 &    753.1 &     34.2 &    168.9 &    193.7 &    168.9 &    196.8 &    753.1 \\
Battle Zone             &   2360.0 &  37187.5 &   5184.4 &  12954.0 &  11208.0 &  12954.0 &  15698.0 &  37187.5 \\
Boxing                  &      0.1 &     12.1 &      9.1 &      6.0 &      4.8 &      6.0 &     26.9 &     12.1 \\
Breakout                &      1.7 &     30.5 &     16.4

In [53]:
corrs = []
import seaborn as sns

from scipy import stats
import matplotlib.pyplot as plt
def r2(x, y):
    return stats.pearsonr(x, y)[0]

for game in games_dfs[0].index:

    game_scores = [df.loc[game]["GameScoreAverage"] for df in games_dfs[2:]]
    hn_medians = [df["GameScoreNormalized"].median() for df in games_dfs[2:]]

    corr = r2(game_scores, hn_medians)
    corrs.append(corr)
    
#     sns.jointplot(game_scores, hn_medians, kind="reg", stat_func=r2)
#     plt.xlabel("Human-normalized score on {}".format(game))
#     plt.ylabel("Median human-normalized score on Atari26")
#     plt.show()
corrs = np.array(corrs)
print(np.sum(corrs**2 > 0.1))

indices = np.argsort(-corrs)
for i in indices:
    print(games_dfs[0].index[i], corrs[i])

21
battle_zone 0.6339725769470762
kung_fu_master 0.6125451697671178
alien 0.6093354483909768
bank_heist 0.6044673928841325
frostbite 0.5939096080555604
assault 0.5465138621075727
breakout 0.5391604962238604
pong 0.51964601583719
ms_pacman 0.5002458899651953
asterix 0.4969561260567246
kangaroo 0.4592035161320661
demon_attack 0.45839165098408857
jamesbond 0.4500748638058255
chopper_command 0.431083108091754
boxing 0.42012746327509526
road_runner 0.3980389844633387
gopher 0.3866721033049136
seaquest 0.3665141649940542
amidar 0.3377300069687536
hero 0.33150685081631887
crazy_climber 0.3254062609509704
krull 0.2930956324057619
qbert 0.25481902962399183
freeway 0.22805313394584212
up_n_down 0.20757402176950332
private_eye -0.07975457674286303


In [54]:
uat_data = pandas.read_csv("uat_sweep.csv")
byol_data = pandas.read_csv("byol_replications.csv")
ankesh_byol_data = pandas.read_csv("ankesh_newexps.csv")
# byol_data = pandas.concat([byol_data, ankesh_byol_data])
byol_data = byol_data[byol_data["use_ram"] != 1]
# byol_data = byol_data[np.array(["shift" not in aug and "dropout" not in aug for aug in byol_data.augmentation])]
byol_sweeps, byol_names = split_on_fields(byol_data, ["tag", "frame_dropout", "augmentation",  "noisy_nets", "distributional", 
                                                      "replay_ratio", "jumps", "classifier", "model_nce_weight",
                                                      "reward_loss_weight", "dqn_hidden_size", 
                                                      "grad_scale_factor",
                                                      "target_update_interval", "dynamics_blocks",
                                                      "byol", "delta_clip"], 0,
                                         min_games=0)
for name, df in zip(byol_names, byol_sweeps):
    print(name)
    print(Counter(df.game))
    print()

tag: byol_ql1_squared_nogs_m75_rew01
Counter({'pong': 5, 'crazy_climber': 5, 'breakout': 5, 'bank_heist': 5, 'battle_zone': 5, 'up_n_down': 5, 'boxing': 5, 'assault': 5, 'frostbite': 5, 'kangaroo': 5})

tag: dropout_nfd_nklf_0502_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va
Counter({'breakout': 5, 'battle_zone': 5, 'crazy_climber': 5, 'pong': 5, 'up_n_down': 5, 'bank_heist': 5, 'boxing': 5, 'assault': 5, 'frostbite': 5, 'kangaroo': 5})

tag: byol_no_aug_no_gs
Counter({'breakout': 5, 'pong': 5, 'kangaroo': 5, 'battle_zone': 5, 'bank_heist': 5, 'assault': 5, 'up_n_down': 5, 'frostbite': 5, 'boxing': 4})

tag: nd_1k_no_norm_naug_dist_byol_bil_squared_nogs_m5
Counter({'battle_zone': 5, 'crazy_climber': 5, 'breakout': 5, 'up_n_down': 5, 'pong': 5, 'bank_heist': 5, 'boxing': 5, 'assault': 5, 'frostbite': 5, 'kangaroo': 5})

tag: byol_base_control
Counter({'chopper_command': 5, 'breakout': 5, 'crazy_climber': 5, 'private_eye': 5, 'kung_fu_master': 5, 'battle_zone': 5, 'freeway': 5, 'gopher'

In [55]:
sweeps = byol_sweeps + \
        [
       fixed_jat_gl,
       uat_aug_bn,
       our_drq_dqn,
       atari_ilya_scores]
names =  byol_names + [
          "UAT", 
          "UAT no-dist pri aug",
          "Our DRQ",
          "old_drq"]

games_dfs, names = compare_dfs(sweeps, names, min_games=10, pairwise_top_n=8, sort_key=lambda x: np.mean(sorted(x)[1:3]))# sort_key=lambda x: np.mean(sorted(x)[1:3]))


Human Scores:
tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.545, Mean: 0.857
tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_rva: Median: 0.560, Mean: 0.849
tag: fcmlp_dist_byol_ql1_squared_nogs_m5: Median: 0.412, Mean: 0.523
tag: dist_byol_ql1_squared_nogs_m5_renorm_512  dqn_h: 512: Median: 0.589, Mean: 0.791
tag: byol_ql1_squared_nogs_m5_rew01_mse: Median: 0.411, Mean: 0.623
tag: byol_ql1_b0: Median: 0.378, Mean: 0.490
tag: dist_byol_ql1_squared_nogs_m5_rew1_512: Median: 0.549, Mean: 0.771
tag: nd_1k_no_norm_naug_dist_byol_ql1_squared_nogs_m5: Median: 0.424, Mean: 0.711
tag: byol_ql1_squared_nogs_m5_rew01_mse_512: Median: 0.418, Mean: 0.587
tag: byol_replication_aug_control: Median: 0.418, Mean: 0.551
tag: dist_byol_ql1_squared_nogs_m5_fixed_512_ql1_rva: Median: 0.438, Mean: 0.783
tag: no_int_nd_dist_byol_ql1_squared_nogs_m5: Median: 0.472, Mean: 0.768
tag: byol_ql1_squared_nogs_m5  delta: nan: Median: 0.421, Mean: 0.577
tag: dist_byol_ql1_squared_nogs_m5_fixed_512_van: Medi

tag: nd_fcmlp_dist_byol_ql1_squared_nogs_m5: Median: 0.079, Mean: 0.444
tag: all_dropouts_0202_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.119, Mean: 0.143
tag: id_only_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.056, Mean: 0.107
tag: all_dropouts_0502_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.070, Mean: 0.080
tag: dropout_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va  frame: 0.2  augme: ["intensity","dropout"]: Median: 0.047, Mean: 0.083
tag: dropout_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va  frame: 0.2  augme: ["dropout"]: Median: 0.054, Mean: 0.178
tag: dropout_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va  frame: 0.0: Median: 0.029, Mean: 0.124
tag: nofd_dropout_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.019, Mean: 0.093
tag: dropout_nfd_klf_0505_naug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.030, Mean: 0.048
tag: dropout_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va: Median: 0.005, Mean: 0.048
tag: all_dro

In [56]:
for i in range(len(names)):
    print(names[i])
    indices = np.array(list(np.argsort(-games_dfs[i]["GameScoreNormalized"])))
    games_dfs[i]["gsn"] = games_dfs[i].GameScoreNormalized
    games_dfs[i]["gsnn"] = games_dfs[i].GameScoreNatureNormalized
    games_dfs[i]["gsa"] = games_dfs[i].GameScoreAverage
    print(games_dfs[i].iloc[indices][["gsa", "gsn", "gsnn"]])
    print()

tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_va
                     gsa       gsn      gsnn
game                                        
boxing            30.452  2.529333  0.345301
kangaroo        3876.000  1.281931  0.530595
crazy_climber  36659.800  1.033151  0.258838
assault          699.476  0.918160  0.117564
up_n_down       7307.800  0.607036  0.716375
breakout          15.598  0.482569  0.036212
bank_heist       370.100  0.481662  0.807396
pong              -3.780  0.479320  0.420896
frostbite       1811.020  0.408905  2.384349
battle_zone    14472.000  0.347771  0.439797

tag: dist_byol_ql1_squared_nogs_m5_norm_ql1_rva
                     gsa       gsn      gsnn
game                                        
boxing            27.686  2.298833  0.313834
kangaroo        6899.200  2.295407  0.950076
crazy_climber  27240.000  0.657095  0.164624
pong              -0.096  0.583683  0.512537
assault          520.340  0.573403  0.073420
breakout          17.434  0.546319  0.040995
up_n

                     gsa       gsn      gsnn
game                                        
crazy_climber  55798.400  1.797201  0.450258
kangaroo        4204.400  1.392021  0.576162
assault          552.428  0.635158  0.081328
boxing             5.858  0.479833  0.065506
bank_heist       256.900  0.328461  0.550590
up_n_down       3954.100  0.306520  0.361730
pong             -11.486  0.261020  0.229204
breakout           9.110  0.257292  0.019307
frostbite        752.780  0.161045  0.939060
battle_zone     7956.000  0.160678  0.203195

UAT no-dist pri aug
                     gsa       gsn      gsnn
game                                        
boxing            26.220  2.176667  0.297156
assault          495.150  0.524923  0.067213
breakout          16.152  0.501806  0.037655
crazy_climber  19906.600  0.364331  0.091277
bank_heist       275.940  0.354229  0.593784
kangaroo        1035.600  0.329735  0.136478
frostbite       1059.640  0.232917  1.358154
up_n_down       3006.620  0.221619

                    gsa       gsn      gsnn
game                                       
assault         529.998  0.591990  0.075800
up_n_down      2882.920  0.210534  0.248456
kangaroo        250.000  0.066376  0.027473
breakout          3.142  0.050069  0.003757
pong            -19.430  0.035977  0.031592
bank_heist       32.320  0.024523  0.041107
battle_zone    3066.000  0.020271  0.025635
frostbite       102.760  0.008797  0.051297
crazy_climber  9458.400 -0.052781 -0.013223
boxing           -0.972 -0.089333 -0.012196

tag: dropout_aug_dist_byol_ql1_squared_nogs_m5_norm_ql1_va
                    gsa       gsn      gsnn
game                                       
assault         719.702  0.957086  0.122549
boxing            6.608  0.542333  0.074039
frostbite       375.600  0.072702  0.423928
kangaroo        121.600  0.023332  0.009657
up_n_down       792.620  0.023228  0.027412
breakout          1.482 -0.007569 -0.000568
pong            -20.994 -0.008329 -0.007313
bank_heist      

In [321]:
uat_data = pandas.read_csv("uat_sweep.csv")
byol_data = pandas.read_csv("byol_replications.csv")
byol_data = byol_data[byol_data["use_ram"] == 1]

byol_sweeps, byol_names = split_on_fields(byol_data, ["tag", "noisy_nets", "distributional", 
                                                      "replay_ratio", "jumps", "classifier",
                                                      "target_update_interval", "dynamics_blocks",
                                                     "byol", "use_ram", "bits"], 0,
                                         min_games=8)

    

  interactivity=interactivity, compiler=compiler, result=result)



Human Scores:
UAT no-dist pri aug: Median: 0.330, Mean: 0.518
tag: ram_test_hybrid_control: Median: 0.314, Mean: 0.280
UAT: Median: 0.312, Mean: 0.309
tag: ram_test_hybrid_byol: Median: 0.310, Mean: 0.320
old_drq: Median: 0.304, Mean: 0.321
Our DRQ: Median: 0.300, Mean: 0.406
tag: ram_test_der_byol: Median: 0.270, Mean: 0.432
tag: ram_test_der_control: Median: 0.164, Mean: 0.317
tag: ram_bits_test_der_byol: Median: 0.115, Mean: 0.077
tag: ram_bits_test_hybrid_byol_squared: Median: 0.100, Mean: 0.002
tag: ram_bits_test_hybrid_control: Median: 0.098, Mean: 0.042
tag: ram_bits_test_hybrid_byol_nogs_squared: Median: 0.093, Mean: -0.024
tag: ram_bits_test_hybrid_byol: Median: 0.079, Mean: -0.011
tag: ram_bits_test_der_control: Median: 0.075, Mean: -0.082

Nature Scores:
UAT no-dist pri aug: Median: 0.171, Mean: 0.343
tag: ram_test_hybrid_control: Median: 0.152, Mean: 0.255
UAT: Median: 0.231, Mean: 0.260
tag: ram_test_hybrid_byol: Median: 0.184, Mean: 0.354
old_drq: Median: 0.280, Mean: 0.

tag: ram_bits_test_hybrid_byol_squared above Our DRQ: 2, -2.177, -2.835
tag: ram_bits_test_hybrid_byol_squared above tag: ram_test_der_byol: 2, -2.010, -2.460
tag: ram_bits_test_hybrid_byol_squared above tag: ram_test_der_control: 4, 0.174, -0.345
tag: ram_bits_test_hybrid_byol_squared above tag: ram_bits_test_der_byol: 3, -0.324, -0.127
tag: ram_bits_test_hybrid_byol_squared above tag: ram_bits_test_hybrid_control: 4, 0.412, -0.269
tag: ram_bits_test_hybrid_byol_squared above tag: ram_bits_test_hybrid_byol_nogs_squared: 6, 0.047, 0.950
tag: ram_bits_test_hybrid_byol_squared above tag: ram_bits_test_hybrid_byol: 4, -0.421, -0.146
tag: ram_bits_test_hybrid_byol_squared above tag: ram_bits_test_der_control: 3, -0.523, 0.522

tag: ram_bits_test_hybrid_control above UAT no-dist pri aug: 1, -1.977, -1.688
tag: ram_bits_test_hybrid_control above tag: ram_test_hybrid_control: 1, -1.049, -1.970
tag: ram_bits_test_hybrid_control above UAT: 1, -1.591, -2.559
tag: ram_bits_test_hybrid_control abo

In [None]:
sweeps = byol_sweeps + \
        [
       fixed_jat_gl,
       uat_aug_bn,
       our_drq_dqn,
       atari_ilya_scores]
names =  byol_names + [
          "UAT", 
          "UAT no-dist pri aug",
          "Our DRQ",
          "old_drq"]

games_dfs, names = compare_dfs(sweeps, names, min_games=9, sort_key=lambda x: np.median(x))

In [116]:
from sklearn import linear_model
import sklearn

for df, name in zip(byol_sweeps, byol_names):

    # sns.jointplot(df.ModelNCELoss,
    #               df.GameScoreDERNormalized,
    #               kind="reg", stat_func=r2)
    import statsmodels.api as sm
    preprocessing = sklearn.preprocessing.OneHotEncoder()
    game_vars = np.array(list(df.game)).reshape(-1, 1)
    game_vars = preprocessing.fit_transform(game_vars).todense()
    byol_losses = np.array(list(df.ModelNCELoss)).reshape(-1, 1)
#     byol_losses = sklearn.preprocessing.scale(byol_losses)
    variables = np.concatenate([game_vars, byol_losses], axis=1)
    targets = np.array(list(df.GameScoreNormalized)).reshape(-1, 1)

    regression = sm.GLS(targets, variables).fit()
#     regression.fit(variables, targets)

    print(name)
    print("Regression Coefficients:")
    print(regression.summary())
    print()

tag: ram_bits_test_hybrid_byol_nogs_squared
Regression Coefficients:
                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.916
Model:                            GLS   Adj. R-squared:                  0.895
Method:                 Least Squares   F-statistic:                     42.58
Date:                Fri, 26 Jun 2020   Prob (F-statistic):           3.08e-16
Time:                        15:33:05   Log-Likelihood:                 26.755
No. Observations:                  45   AIC:                            -33.51
Df Residuals:                      35   BIC:                            -15.44
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


tag: byol_base_aug_control
Regression Coefficients:
                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.621
Model:                            GLS   Adj. R-squared:                  0.528
Method:                 Least Squares   F-statistic:                     6.678
Date:                Fri, 26 Jun 2020   Prob (F-statistic):           1.88e-12
Time:                        15:33:05   Log-Likelihood:                -69.510
No. Observations:                 128   AIC:                             191.0
Df Residuals:                     102   BIC:                             265.2
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------


  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


tag: byol_ql1_b0_naug
Regression Coefficients:
                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.402
Model:                            GLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     2.612
Date:                Fri, 26 Jun 2020   Prob (F-statistic):             0.0201
Time:                        15:33:06   Log-Likelihood:                -16.170
No. Observations:                  45   AIC:                             52.34
Df Residuals:                      35   BIC:                             70.41
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1   

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.735
Model:                            GLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     11.51
Date:                Fri, 26 Jun 2020   Prob (F-statistic):           5.01e-20
Time:                        15:33:06   Log-Likelihood:                 37.235
No. Observations:                 130   AIC:                            -22.47
Df Residuals:                     104   BIC:                             52.09
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0672      0.091      0.740      0.4

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.861
Model:                            GLS   Adj. R-squared:                  0.825
Method:                 Least Squares   F-statistic:                     24.06
Date:                Fri, 26 Jun 2020   Prob (F-statistic):           1.83e-12
Time:                        15:33:06   Log-Likelihood:                 7.1538
No. Observations:                  45   AIC:                             5.692
Df Residuals:                      35   BIC:                             23.76
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.4929      0.312      1.578      0.1

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.648
Model:                            GLS   Adj. R-squared:                  0.562
Method:                 Least Squares   F-statistic:                     7.574
Date:                Fri, 26 Jun 2020   Prob (F-statistic):           4.63e-14
Time:                        15:33:06   Log-Likelihood:                -18.168
No. Observations:                 129   AIC:                             88.34
Df Residuals:                     103   BIC:                             162.7
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0658      0.139      0.472      0.6

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.448
Model:                            GLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     3.153
Date:                Fri, 26 Jun 2020   Prob (F-statistic):            0.00689
Time:                        15:33:06   Log-Likelihood:                 14.151
No. Observations:                  45   AIC:                            -8.303
Df Residuals:                      35   BIC:                             9.764
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.3055      0.336      0.909      0.3

  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [57]:
print(" ".join(list(map(str, range(9461890, 9461997)))))
    

9461890 9461891 9461892 9461893 9461894 9461895 9461896 9461897 9461898 9461899 9461900 9461901 9461902 9461903 9461904 9461905 9461906 9461907 9461908 9461909 9461910 9461911 9461912 9461913 9461914 9461915 9461916 9461917 9461918 9461919 9461920 9461921 9461922 9461923 9461924 9461925 9461926 9461927 9461928 9461929 9461930 9461931 9461932 9461933 9461934 9461935 9461936 9461937 9461938 9461939 9461940 9461941 9461942 9461943 9461944 9461945 9461946 9461947 9461948 9461949 9461950 9461951 9461952 9461953 9461954 9461955 9461956 9461957 9461958 9461959 9461960 9461961 9461962 9461963 9461964 9461965 9461966 9461967 9461968 9461969 9461970 9461971 9461972 9461973 9461974 9461975 9461976 9461977 9461978 9461979 9461980 9461981 9461982 9461983 9461984 9461985 9461986 9461987 9461988 9461989 9461990 9461991 9461992 9461993 9461994 9461995 9461996
