In [1]:
__author__ = 'nileshtrip'
import numpy as np
import math
import itertools
from collections import OrderedDict, defaultdict
from scipy.stats import sem
from datasets import load_wine, load_parkinson, load_triazines, load_fertility, load_forest_fires
import csv

import pickle
import time
import os
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rc('text', usetex=True)

In [2]:
def truncate(n, dig=0):
    """Cut ``n`` off toward zero, keeping ``dig`` digits after the decimal point.

    The value is scaled by ``10 ** dig``, converted with ``int`` (which drops
    the fractional part) and scaled back with a true division.
    """
    scale = 10 ** dig
    whole_part = int(n * scale)
    return whole_part / scale

In [3]:
def err_sd(err):
    """Mean squared error and its standard error.

    Parameters
    ----------
    err : array_like
        Errors (residuals) of any shape; every element counts as one sample.

    Returns
    -------
    tuple
        ``(mu, se)`` where ``mu`` is the mean of the squared errors and ``se``
        is ``std(squared errors) / sqrt(number of errors)``.
    """
    # Flatten first: the mean and std below are taken over all elements, so
    # the sample count must be the total element count, not the first axis.
    squared_err = np.power(np.ravel(np.array(err)), 2)
    se = np.std(squared_err) / np.sqrt(squared_err.size)

    mu = np.mean(squared_err)

    return mu, se

In [4]:
def r_err_sd(err):
    """Root-mean-squared error and an approximate standard error for it.

    Parameters
    ----------
    err : array_like
        Errors (residuals) of any shape; every element counts as one sample.

    Returns
    -------
    tuple
        ``(r_err, r_sd)`` where ``r_err = sqrt(mean(err ** 2))`` and
        ``r_sd = 0.5 * se / r_err``, with ``se`` the standard error of the
        mean squared error.
    """
    # Flatten first: the mean and std below are taken over all elements, so
    # the sample count must be the total element count, not the first axis.
    squared_err = np.power(np.ravel(np.array(err)), 2)
    se = np.std(squared_err) / np.sqrt(squared_err.size)

    r_err = np.sqrt(np.mean(squared_err))
    # Scale the standard error of the mean square by 1 / (2 * sqrt(mean)).
    r_sd = 0.5 * se / r_err

    return r_err, r_sd

In [5]:
def r_err(err):
    """Return the root-mean-square of ``err`` (mean taken over all elements)."""
    residuals = np.array(err)
    mean_square = np.mean(np.power(residuals, 2))
    return np.sqrt(mean_square)

In [6]:
def get_TDLasso_data(folder_path):
    """Collect TD-Lasso errors from the pickle files in ``folder_path``.

    Every directory entry whose name contains ``".pickle"`` is unpickled.
    When the payload has a ``"results"`` entry, its items 0 and 1 are taken as
    predictions and ground truth, and ``r_err_sd`` of their flattened
    difference is recorded.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    OrderedDict
        Maps ``(data["dataset"], "tdlasso")`` to the tuple returned by
        ``r_err_sd``. A later file for the same dataset overwrites an earlier one.
    """
    results = OrderedDict()
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        tdlasso = np.array(data["results"][0])
        truth = np.array(data["results"][1])

        errors_tdlasso = r_err_sd(np.ravel(tdlasso - truth))
        results[(data["dataset"], "tdlasso")] = errors_tdlasso

    return results

In [7]:
# Relative path of the folder scanned for TD-Lasso pickle results.
tdlasso_folder_path="4-11-TDLassoSP_Real"

In [8]:
# Load and display the TD-Lasso errors, keyed by (dataset, "tdlasso").
get_TDLasso_data(tdlasso_folder_path)

OrderedDict([(('Parkinson', 'tdlasso'),
              (12.253520715326323, 0.13575597839123998)),
             (('Fertility', 'tdlasso'),
              (0.40920561422032503, 0.07162378610640245)),
             (('Wine', 'tdlasso'), (0.9812686200048398, 0.015428162944273006)),
             (('Fire', 'tdlasso'), (82.06568534308094, 36.032016265403165)),
             (('Triazines', 'tdlasso'),
              (0.14830938639979402, 0.023715140936050174))])

In [9]:
def get_TDValsReg_data(folder_path):
    """Collect TD-ValsReg errors from the pickle files in ``folder_path``.

    Every directory entry whose name contains ``".pickle"`` is unpickled.
    When the payload has a ``"results"`` entry, its items 0 and 1 are taken as
    predictions and ground truth, and ``r_err_sd`` of their flattened
    difference is recorded.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    OrderedDict
        Maps ``(data["dataset"], "tdvalsreg")`` to the tuple returned by
        ``r_err_sd``. A later file for the same dataset overwrites an earlier one.
    """
    results = OrderedDict()
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        tdregeuclid = np.array(data["results"][0])
        truth = np.array(data["results"][1])

        errors_tdvalsreg = r_err_sd(np.ravel(tdregeuclid - truth))
        results[(data["dataset"], "tdvalsreg")] = errors_tdvalsreg

    return results

In [10]:
# Relative path of the folder scanned for TD-ValsReg pickle results.
tdvalsreg_folder_path="4-11-TDValsSP_Real"

In [11]:
# Load and display the TD-ValsReg errors, keyed by (dataset, "tdvalsreg").
get_TDValsReg_data(tdvalsreg_folder_path)

OrderedDict([(('Triazines', 'tdvalsreg'),
              (0.173469200900365, 0.0036813964392945664)),
             (('Parkinson', 'tdvalsreg'),
              (12.25343578765015, 0.0021470346917246497)),
             (('Fire', 'tdvalsreg'), (82.06635418216294, 2.5670262337393104)),
             (('Wine', 'tdvalsreg'),
              (0.8410590832690957, 0.0003736788904566116)),
             (('Fertility', 'tdvalsreg'),
              (0.4088802709453759, 0.01283775108227386))])

In [12]:
def get_TDRegKernel_data(folder_path):
    """Collect TD-RegKernel errors from the pickle files in ``folder_path``.

    Every directory entry whose name contains ``".pickle"`` is unpickled.
    When the payload has a ``"results"`` entry, its items 0 and 1 are taken as
    predictions and ground truth, and ``r_err_sd`` of their flattened
    difference is recorded.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    OrderedDict
        Maps ``(data["dataset"], "tdregkernel")`` to the tuple returned by
        ``r_err_sd``. A later file for the same dataset overwrites an earlier one.
    """
    results = OrderedDict()
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        tdregkernel = np.array(data["results"][0])
        truth = np.array(data["results"][1])

        errors_tdregkernel = r_err_sd(np.ravel(tdregkernel - truth))
        results[(data["dataset"], "tdregkernel")] = errors_tdregkernel

    return results

In [13]:
# Relative path of the folder scanned for TD-RegKernel pickle results.
tdregkernel_folder_path="4-11-TDRegKernelSP_Real"

In [14]:
# Load and display the TD-RegKernel errors, keyed by (dataset, "tdregkernel").
get_TDRegKernel_data(tdregkernel_folder_path)

OrderedDict([(('Parkinson', 'tdregkernel'),
              (12.33263377313558, 0.14467599399723352)),
             (('Triazines', 'tdregkernel'),
              (0.15103470294183405, 0.024037624569851517)),
             (('Fertility', 'tdregkernel'),
              (0.3844995854289188, 0.07598235810001784)),
             (('Wine', 'tdregkernel'),
              (0.834501944324581, 0.01529846011068214)),
             (('Fire', 'tdregkernel'),
              (81.94670504805939, 35.83402534128768))])

In [None]:
def get_linreg_data(folder_path):
    """Collect linear-regression (OLS) errors from the pickle files in ``folder_path``.

    Every directory entry whose name contains ``".pickle"`` is unpickled.
    When the payload has a ``"results"`` entry, its items 0 and 1 are taken as
    predictions and ground truth, and ``r_err_sd`` of their flattened
    difference is recorded.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    OrderedDict
        Maps ``(data["dataset"], "OLS")`` to the tuple returned by
        ``r_err_sd``. A later file for the same dataset overwrites an earlier one.
    """
    results = OrderedDict()
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        linreg = np.array(data["results"][0])
        truth = np.array(data["results"][1])

        errors_linreg = r_err_sd(np.ravel(linreg - truth))
        results[(data["dataset"], "OLS")] = errors_linreg

    return results

In [None]:
# Relative path of the folder scanned for OLS pickle results.
ols_folder_path="7-28-LinReg_Real"

In [None]:
def process_OLS(ols_folder_path):
    """Load the OLS results and return them as a list of ``(key, value)`` pairs."""
    ols_results = get_linreg_data(folder_path=ols_folder_path)
    pairs = ols_results.items()
    return list(pairs)

In [None]:
# List of ((dataset, "OLS"), error tuple) pairs built from the OLS folder.
OLS = process_OLS(ols_folder_path)

In [None]:
# Display the OLS results.
OLS

In [None]:
def get_JM_data(folder_path, method):
    """Collect JM debiasing results for one main-regression ``method``.

    Every directory entry whose name contains ``".pickle"`` is unpickled;
    files whose ``data["main_reg_params"]["method"]`` differs from ``method``
    contribute nothing.

    Parameters
    ----------
    folder_path : str
        Directory to scan.
    method : str
        Main-regression method name to keep.

    Returns
    -------
    OrderedDict
        With the following entries:

        * ``"datasets"``: list of dataset names, one per matching file.
        * ``"lams"``: the lambda sequence of the last matching file.
        * ``dataset``: ``feasibles[0][0]`` (``min_feasible`` uses its length
          as the number of data points).
        * ``(dataset, lam, "debiased_err")``: ``r_err`` of the debiased
          predictions minus the targets for that lambda.
        * ``(dataset, lam, "all_points")``: the raw differences for that lambda.
        * ``(dataset, lam, "feasible")``: ``feasibles[0][i]`` for that lambda.
        * ``(dataset, "main_err")``: ``r_err`` of the main predictions minus
          the targets.
    """
    results = OrderedDict()
    results["datasets"] = []
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        payload = data["results"]
        lams = payload[0][0]
        deb_preds = payload[1][0]
        main_preds = payload[2][0]
        y_tests = payload[3]
        feasibles = payload[5]
        dataset = data["dataset"]

        if method != data["main_reg_params"]["method"]:
            continue

        results["lams"] = lams
        results["datasets"].append(dataset)
        results[dataset] = feasibles[0][0]
        # deb_preds and feasibles[0] are indexed in the same order as lams.
        for idx, lam in enumerate(lams):
            diff = deb_preds[idx] - y_tests
            results[(dataset, lam, "debiased_err")] = r_err(diff)
            results[(dataset, lam, "all_points")] = diff
            results[(dataset, lam, "feasible")] = feasibles[0][idx]
        results[(dataset, "main_err")] = r_err(main_preds - y_tests)

    return results

In [None]:
def perform_dataset(dataset, results):
    """Extract the baseline error and the debiased errors for ``dataset``.

    Keys of ``results`` are matched on their string form: a key belongs to
    ``dataset`` when ``dataset`` is a substring of ``str(key)``.

    Parameters
    ----------
    dataset : str
        Dataset name to look for.
    results : dict
        Mapping such as the one returned by ``get_JM_data``.

    Returns
    -------
    tuple
        ``(main_err, db)``. ``main_err`` is the value of the last matching key
        whose string form contains ``"main"`` (``None`` if there is none).
        ``db`` lists ``(key[1], value)`` for every matching key whose string
        form contains ``"debiased"``, in iteration order.
    """
    # The original body read an undefined name `data` instead of the
    # `results` argument; everything below uses the argument.
    main_err = None
    db = []
    for key, value in results.items():
        text = str(key)
        if dataset not in text:
            continue
        if "main" in text:
            main_err = value
        if "debiased" in text:
            db.append((key[1], value))
    return main_err, db

In [None]:
def min_feasible(dataset, results):
    """Pick, per data point, the error at the first feasible lambda.

    Lambdas are visited in the iteration order of ``results``. For each data
    point the error from the first lambda whose feasibility flag is ``True``
    is kept. A point that is never feasible keeps the error from the last
    lambda visited.

    Parameters
    ----------
    dataset : str
        Dataset name; matched as a substring of ``str(key)``.
    results : dict
        Mapping such as the one returned by ``get_JM_data``. The length of
        ``results[dataset]`` gives the number of data points.

    Returns
    -------
    tuple
        ``(main_err, errs)``. ``main_err`` is the value of the last matching
        key whose string form contains ``"main"`` (``None`` if there is none).
        ``errs`` is a NumPy array with one selected error per data point.
    """
    main_err = None
    for key in results:
        text = str(key)
        if "main" in text and dataset in text:
            main_err = results[key]

    num_pts = len(results[dataset])
    resolved = [False] * num_pts
    datapoint_errs = [None] * num_pts

    for key in results:
        text = str(key)
        if "all_points" not in text or dataset not in text:
            continue
        lam = key[1]
        feasible = results[(dataset, lam, "feasible")]
        errors = results[(dataset, lam, "all_points")][0]
        for i in range(num_pts):
            if resolved[i]:
                continue
            # Provisional value: becomes final if this lambda is feasible, or
            # if no later lambda is. The original applied this fallback only
            # to the last point index, leaving other never-feasible points None.
            datapoint_errs[i] = errors[i]
            if feasible[i] == True:  # same comparison as the original code
                resolved[i] = True

    return main_err, np.array(datapoint_errs)

In [None]:
def process_JM(folder_path):
    """Build the JM result rows for the Lasso, Ridge and Elastic main methods.

    For each method the JM data is loaded with ``get_JM_data``; for each
    dataset listed there, the per-point errors chosen by ``min_feasible`` are
    summarised with ``r_err_sd``.

    Parameters
    ----------
    folder_path : str
        Directory holding the JM pickle files.

    Returns
    -------
    list
        Unique ``((dataset, "JM (<method>)"), r_err_sd(...))`` rows. Duplicates
        (for example from several files for one dataset) are dropped, and rows
        keep their first-seen order.
    """
    rows = []
    for method in ("Lasso", "Ridge", "Elastic"):
        JM_data = get_JM_data(folder_path=folder_path, method=method)
        for dataset in JM_data["datasets"]:
            _, debiased_errs = min_feasible(dataset, JM_data)
            label = "JM " + "(" + str(method) + ")"
            rows.append(((dataset, label), r_err_sd(debiased_errs)))

    # dict.fromkeys removes duplicates like set() did, but keeps a
    # deterministic order instead of depending on hash order.
    return list(dict.fromkeys(rows))

In [None]:
# Relative path of the folder scanned for JM pickle results.
jm_folder_path="7-25-JM_Real"

In [None]:
# List of ((dataset, "JM (<method>)"), error tuple) rows built from the JM folder.
JM = process_JM(jm_folder_path)

In [None]:
def get_OM_data(folder_path):
    """Collect OM errors and auxiliary-method choices from ``folder_path``.

    Every directory entry whose name contains ``".pickle"`` is unpickled.
    When the payload has a ``"results"`` entry, items 0, 3 and 6 are compared
    against item 7 (the ground truth) and stored under the tags ``"fo1f"``,
    ``"foq"`` and ``"baseline"``.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    OrderedDict
        With the following entries per file:

        * ``(n_splits, main method, aux method, dataset, tag)``: the tuple
          returned by ``r_err_sd`` for that tag.
        * ``(dataset, "aux_choice_f")``: the second-to-last item of ``"results"``.
        * ``(dataset, "aux_choice_q")``: the last item of ``"results"``.
    """
    results = OrderedDict()
    for data_file in os.listdir(folder_path):
        if ".pickle" not in data_file:
            continue
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(os.path.join(folder_path, data_file), "rb") as handle:
            data = pickle.load(handle)
        if "results" not in data:
            continue

        payload = data["results"]
        truth = np.array(payload[7])
        prefix = (
            data["n_splits"],
            data["main_reg_params"]["method"],
            data["aux_reg_params"]["method"],
            data["dataset"],
        )
        for tag, index in (("fo1f", 0), ("foq", 3), ("baseline", 6)):
            preds = np.array(payload[index])
            # Flatten like the other loaders in this notebook, so the
            # standard error counts every element as one sample.
            results[prefix + (tag,)] = r_err_sd(np.ravel(preds - truth))

        results[(data["dataset"], "aux_choice_f")] = payload[-2]
        results[(data["dataset"], "aux_choice_q")] = payload[-1]

    return results

In [None]:
# Relative path of the folder scanned for OM pickle results.
# NOTE(review): a later cell reassigns this name to "7-24-OM_Real".
om_folder_path="8-1-weather"

In [None]:
# Raw OM results keyed by (n_splits, main method, aux method, dataset, tag).
OM_data = get_OM_data(folder_path = om_folder_path)

In [None]:
# Display the raw OM results.
OM_data

In [None]:
def get_method(x):
    """Return element 3 of the key in a ``(key, value)`` pair.

    For the five-element keys built by ``get_OM_data`` that position holds the
    dataset name.
    """
    key = x[0]
    return key[3]

In [None]:
def process_OM(folder_path):
    """Turn the raw OM results into ``((dataset, label), value)`` rows.

    Rows tagged ``"fo1f"`` and ``"foq"`` come first, in the order of the
    loaded results, labelled ``"OM $f$ (<main method>)"`` and
    ``"OM $q$ (<main method>)"``. Rows tagged ``"baseline"`` follow, labelled
    with the main method name alone. All other entries are ignored.
    """
    om_items = list(get_OM_data(folder_path=folder_path).items())
    prefixes = {"fo1f": "OM $f$ ", "foq": "OM $q$ "}

    debiased_rows = [
        ((key[3], prefixes[key[-1]] + "(" + str(key[1]) + ")"), value)
        for key, value in om_items
        if key[-1] in prefixes
    ]
    baseline_rows = [
        ((key[3], str(key[1])), value)
        for key, value in om_items
        if key[-1] == "baseline"
    ]

    return debiased_rows + baseline_rows

In [None]:
# OM rows in ((dataset, label), error tuple) form.
om_results = process_OM(om_folder_path)

In [None]:
# Display the processed OM rows.
om_results

In [None]:
# Per-dataset metadata, passed below as the `sizes` argument of make_matrix
# and make_table_trans.
# NOTE(review): the meaning of the four entries per dataset is not shown in
# this notebook - TODO document.
datasets={"Fertility" : [69, 31, 8, "Yes"], "Fire" : [320, 197, 10, "Yes"], "Parkinson" : [1877, 3998, 17, "Yes"], "Wine" : [4898, 1599, 11, "Yes"], "Triazines" : [139, 47, 60, "No"]}

In [None]:
def load_all_results(ols_folder_path, jm_folder_path, om_folder_path):
    """Load the OLS, JM and OM rows and return them sorted by dataset name.

    Rows are concatenated in the order OLS, JM, OM. The sort key is the first
    element of each row's key, and the sort is stable.
    """
    combined = []
    combined.extend(process_OLS(ols_folder_path))
    combined.extend(process_JM(jm_folder_path))
    combined.extend(process_OM(om_folder_path))

    return sorted(combined, key=lambda row: row[0][0])

In [None]:
# Result folders used for the combined table. These reassign the path
# variables set in earlier cells; om_folder_path changes value here.
ols_folder_path = "7-28-LinReg_Real"
jm_folder_path = "7-25-JM_Real"
om_folder_path = "7-24-OM_Real"

In [None]:
# All OLS, JM and OM rows, sorted by dataset name.
l = load_all_results(ols_folder_path, jm_folder_path, om_folder_path);

In [None]:
# Display the combined rows.
l

In [None]:
def make_matrix(sizes, data):
    """Format result rows into a nested ``dataset -> method -> [cells]`` mapping.

    Each row is ``((dataset, method), (value, spread))``. The cell text is the
    value truncated to four decimals, followed by ``$\\pm$`` and the truncated
    difference between the truncated ``value + spread`` and the truncated value.

    ``sizes`` is accepted but not used.
    """
    table = defaultdict(lambda: defaultdict(list))
    for row in data:
        key, val = row[0], row[1]
        centre = truncate(val[0], dig=4)
        upper = truncate(val[0] + val[1], dig=4)
        half_width = truncate(upper - centre, dig=4)
        cell = str(centre) + "$" + "\\" + "pm$" + str(half_width)
        table[key[0]][key[1]].append(cell)

    return table

In [None]:
def transpose_matrix(mat):
    """Flip ``dataset -> method -> [cells]`` into ``method -> dataset -> cell``.

    Only the first cell of each list is kept. Missing combinations in the
    returned nested defaultdict read as an empty list.
    """
    flipped = defaultdict(lambda: defaultdict(list))
    for dataset, by_method in mat.items():
        for method, cells in by_method.items():
            flipped[method][dataset] = cells[0]

    return flipped

In [None]:
# Formatted cells arranged as dataset -> method -> [cell strings].
results = make_matrix(datasets, l);

In [None]:
# Same cells arranged as method -> dataset -> cell string.
trans_res = transpose_matrix(results)

In [None]:
def make_table_trans(sizes, matrix, out_path='real_trans.csv'):
    """Write the method-by-dataset table to a CSV file.

    The first row is the header; each following row holds one method name and
    ``matrix[method][dataset]`` for every dataset column.

    Parameters
    ----------
    sizes : object
        Accepted but not used.
    matrix : mapping
        ``method -> dataset -> cell``, e.g. the result of ``transpose_matrix``.
        It is indexed for every method and dataset listed below, so a plain
        dict lacking one of them raises ``KeyError``.
    out_path : str, optional
        Destination file; defaults to ``'real_trans.csv'`` in the working directory.

    Returns
    -------
    None
    """
    header = ["Method", "Fertility", "Fire", "Parkinson", 'Wine', 'Triazines']
    methods = ['OLS', 'Ridge', 'JM (Ridge)', 'OM $f$ (Ridge)', 'OM $q$ (Ridge)',
               'Lasso', 'JM (Lasso)', 'OM $f$ (Lasso)', 'OM $q$ (Lasso)',
               'Elastic', 'JM (Elastic)', 'OM $f$ (Elastic)', 'OM $q$ (Elastic)'
               ]

    # newline='' lets the csv module control line endings itself; the
    # context manager closes the file even if writing a row fails.
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(header)
        for method in methods:
            cells = [matrix[method][dataset] for dataset in header[1:]]
            writer.writerow([method] + cells)
    return

In [None]:
# Write the transposed results table to real_trans.csv.
make_table_trans(datasets, trans_res)