In [98]:
%matplotlib notebook
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_curve, roc_auc_score
import functools
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn styling applied at notebook start-up.
sns.set_palette("Dark2")
sns.set_style("darkgrid")

# Directory the notebook works in. The default is the original machine-specific
# location; set the DISSERTATION_DATA_DIR environment variable to run the
# notebook somewhere else without editing this cell.
DATA_DIR = os.environ.get(
    "DISSERTATION_DATA_DIR",
    "/Users/andour/Google Drive/projects/Dissertation/Final data",
)

# Other cells open and write files by bare file name, so they depend on the
# process working directory being switched here.
os.chdir(DATA_DIR)

In [99]:
# Load the two pickled result tables from the current working directory and
# move their index into a regular "index" column via reset_index().
# The `with` blocks close the file handles as soon as each object is loaded.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that come from a trusted source.
with open("simulated_data_freq", "rb") as freq_file:
    synthetic_dataset_freq = pickle.load(freq_file).reset_index()

with open("simulated_data_bayes", "rb") as bayes_file:
    synthetic_dataset_bayes = pickle.load(bayes_file).reset_index()

# Last expression of the cell: display the column names of the Bayesian table.
synthetic_dataset_bayes.columns

Index(['index', 'dataset', 'parameters', 'noise_bucket', 'label', 'drift',
       'x_array', 'y_array', 'logistic_evalutaion',
       'logistic_param_estimation', 'logistic_posterior_integral',
       'linear_evalutaion', 'linear_param_estimation',
       'linear_posterior_integral', 'bayes_classification',
       'waic_classification'],
      dtype='object')

In [147]:
# Display the column names of the frequentist results table.
synthetic_dataset_freq.columns

Index(['index', 'dataset', 'parameters', 'noise_bucket', 'label', 'drift',
       'x_array', 'y_array', 'y_pred_linear', 'param_linear',
       'y_pred_logistic', 'param_logistic', 'linear_mse', 'logistic_mse',
       'linear_mae', 'logistic_mae', 'linear_r2', 'logistic_r2',
       'likelihood_linear', 'likelihood_logistic', 'aic_linear',
       'aic_logistic', 'bic_linear', 'bic_logistic', 'shanon_bic_logistic',
       'shanon_bic_linear', 'shanon_aic_logistic', 'shanon_aic_linear',
       'mse_classification', 'mae_classification', 'r2_classification',
       'chi2_classification', 'aic_classification', 'bic_classification',
       'shanon_bic_classification', 'shanon_aic_classification'],
      dtype='object')

In [176]:
# Keep only the rows each method classified correctly, i.e. where the
# predicted class equals `label` (chi2 for the frequentist table, WAIC for
# the Bayesian one).

tp_sample_freq = synthetic_dataset_freq.loc[
    lambda frame: frame.chi2_classification == frame.label
]
tp_sample_bayes = synthetic_dataset_bayes.loc[
    lambda frame: frame.waic_classification == frame.label
]


# Columns carried forward for the parameter-estimation comparison.
col_param_freq = [
    'index', 'parameters', 'noise_bucket', 'label',
    'param_linear', 'param_logistic',
]
col_param_bayes = [
    'index', 'parameters', 'noise_bucket', 'label',
    'logistic_param_estimation', 'linear_param_estimation',
]


In [177]:
# Keep only the comparison columns and tag the parameter-estimate columns with
# the method they came from.
#
# `columns=` is required here: a bare mapping passed to DataFrame.rename is
# applied to the row index, so it would silently leave the column names as-is.
# Renaming first and then selecting by the *renamed* labels keeps this cell
# re-runnable: on a second run the rename is a no-op and the selection still
# finds every column.
freq_column_renames = {'param_linear': 'param_linear_freq', 'param_logistic': 'param_log_freq'}
bayes_column_renames = {'linear_param_estimation': 'param_linear_bayes', 'logistic_param_estimation': 'param_log_bayes'}

tp_sample_freq = tp_sample_freq.rename(columns=freq_column_renames)[
    [freq_column_renames.get(col, col) for col in col_param_freq]
]
tp_sample_bayes = tp_sample_bayes.rename(columns=bayes_column_renames)[
    [bayes_column_renames.get(col, col) for col in col_param_bayes]
]


In [179]:
def parameter_within(value, upper_limit, lower_limit):
    """Tell whether ``value`` lies in the closed interval [lower_limit, upper_limit].

    Note the positional order: ``upper_limit`` comes before ``lower_limit``.

    Parameters
    ----------
    value : the quantity to test.
    upper_limit : inclusive upper bound.
    lower_limit : inclusive lower bound.

    Returns
    -------
    bool
        True when ``lower_limit <= value <= upper_limit``, otherwise False
        (so always False when ``lower_limit`` is greater than ``upper_limit``).
    """
    return bool(value >= lower_limit and value <= upper_limit)
    

In [180]:
# Is the `b` entry of `parameters` within the estimated beta limits?
# Limit the dataframe to the rows labelled "linear".

# .copy() yields an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
tp_sample_freq_linear = tp_sample_freq.loc[tp_sample_freq.label == "linear"].copy()

# Row-wise check; here beta[0] is used as the lower limit and beta[1] as the
# upper limit.
tp_sample_freq_linear["param_estimation"] = tp_sample_freq_linear.apply(
    lambda row: parameter_within(
        value=row.parameters["b"],
        lower_limit=row.param_linear_freq["beta"][0],
        upper_limit=row.param_linear_freq["beta"][1],
    ),
    axis=1,
)

# Non-null counts per (noise_bucket, param_estimation); the "parameters"
# column is the one read as the group count below.
param_linear = tp_sample_freq_linear.groupby(["noise_bucket", "param_estimation"]).count().reset_index()
# Sum of those counts per noise bucket, exposed as "Total".
total = param_linear.groupby("noise_bucket").sum().loc[:, "parameters"].reset_index().rename({"parameters": "Total"}, axis=1)

# Share (in percent) of each True/False outcome within its noise bucket.
linear_freq = (
    param_linear.merge(total, how="left", on="noise_bucket")[["param_estimation", "noise_bucket", "parameters", "Total"]]
    .groupby(["noise_bucket", "param_estimation"])
    .apply(lambda df: df["parameters"] / df["Total"] * 100)
)
# The unnamed result column (label 0) becomes the frequentist percentage column.
linear_freq = pd.DataFrame(linear_freq).reset_index().rename({0: "Estimation Percentage Freq"}, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [181]:
# Same check for the Bayesian estimates: is the `b` entry of `parameters`
# within the estimated beta limits? Only rows labelled "linear" are used.

# .copy() yields an independent frame, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
tp_sample_bayes_linear = tp_sample_bayes.loc[tp_sample_bayes.label == "linear"].copy()

# NOTE(review): the index order is reversed relative to the frequentist cell --
# beta[1] is the lower limit and beta[0] the upper one. Presumably the Bayesian
# interval is stored as (upper, lower); confirm against the code that builds it.
tp_sample_bayes_linear["param_estimation"] = tp_sample_bayes_linear.apply(
    lambda row: parameter_within(
        value=row.parameters["b"],
        lower_limit=row.param_linear_bayes["beta"][1],
        upper_limit=row.param_linear_bayes["beta"][0],
    ),
    axis=1,
)

# Non-null counts per (noise_bucket, param_estimation); the "parameters"
# column is the one read as the group count below.
param_linear = tp_sample_bayes_linear.groupby(["noise_bucket", "param_estimation"]).count().reset_index()
# Sum of those counts per noise bucket, exposed as "Total".
total = param_linear.groupby("noise_bucket").sum().loc[:, "parameters"].reset_index().rename({"parameters": "Total"}, axis=1)

# Share (in percent) of each True/False outcome within its noise bucket.
linear_bayes = (
    param_linear.merge(total, how="left", on="noise_bucket")[["param_estimation", "noise_bucket", "parameters", "Total"]]
    .groupby(["noise_bucket", "param_estimation"])
    .apply(lambda df: df["parameters"] / df["Total"] * 100)
)
# The unnamed result column (label 0) becomes the Bayesian percentage column.
linear_bayes = pd.DataFrame(linear_bayes).reset_index().rename({0: "Estimation Percentage Bayes"}, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [182]:
# Put the frequentist and Bayesian percentages side by side. No `on=` is
# given, so pandas joins on every column name the two tables share, keeping
# only rows present in both (inner join).
linear_best_estimates = linear_freq.merge(linear_bayes, how="inner")
linear_best_estimates

Unnamed: 0,noise_bucket,param_estimation,level_2,Estimation Percentage Freq,Estimation Percentage Bayes
0,0.1,False,0,38.333333,44.0
1,0.1,True,1,61.666667,56.0
2,0.2,False,2,37.254902,36.5
3,0.2,True,3,62.745098,63.5
4,0.3,False,4,30.357143,43.0
5,0.3,True,5,69.642857,57.0
6,0.4,False,6,37.5,49.746193
7,0.4,True,7,62.5,50.253807
8,0.5,False,8,36.956522,50.510204
9,0.5,True,9,63.043478,49.489796


In [None]:
# Write the merged comparison table to a CSV file; the name is relative, so it
# lands in the current working directory.
# NOTE(review): the file name mentions "shanon_bic", but the frequentist rows
# feeding this table were selected with `chi2_classification` -- confirm the
# name matches the criterion actually used.
linear_best_estimates.to_csv("shanon_bic_waic_linear.csv")