# Imports

In [8]:
# (Re)load IPython's autoreload extension and select mode 2, which re-imports
# modified modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
import sys

# Directory added to the module search path.
# NOTE(review): presumably this is where the `evaluation` package imported
# further down lives — confirm.
DATA_INFERENCE_DIR = '/workspace/phylo_estimation/data_inference'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if DATA_INFERENCE_DIR not in sys.path:
    sys.path.append(DATA_INFERENCE_DIR)

In [39]:
import pickle
import itertools
import time
import os
import statistics
from time import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from evaluation.regression import generate_reg_results, get_regression_norm_results, get_regression_div_results, new_get_regression_div_results

from tensorflow.keras.models import load_model

In [10]:
# Render floats in pandas output with a thousands separator and four decimals.
pd.set_option("display.float_format", "{:,.4f}".format)

# Results generation

In [11]:
# Path prefix of the pickled datasets and root folder of the regression models.
pickle_base = '/workspace/phylo_estimation/data_inference/pickles/old_sims/dataset_'
res_path = "/workspace/phylo_estimation/data_inference/models/reg/"
n_tips = ['674', '489', '87']

# Load one pickle per entry of `n_tips`, keyed by that entry.
# NOTE(review): pickle.load executes arbitrary code from the file — only use
# with trusted pickles.
data = {}
for tips in n_tips:
    dataset_file = pickle_base + tips + "_10k.pkl"
    with open(dataset_file, 'rb') as handle:
        data[tips] = pickle.load(handle)

In [12]:
# Sanity check: dimensions of the test inputs stored under the '674' key.
x_test_674 = data['674']['X_test']
print(x_test_674.shape)

(6000, 674)


In [35]:
n_trees_tested = 1000  # NOTE(review): not read anywhere in this cell — confirm it is still needed
results = dict()
inf_times = dict()
mae_dict = dict()

# Prediction columns that are rescaled with `resc_factor_test`, per
# diversification scenario. Scenarios not listed here fall back to
# DEFAULT_RESCALED_COLUMNS.
RESCALED_COLUMNS = {'BD': (0,), 'HE': (0,), 'SAT': (0,), 'ME': (0, 2)}
DEFAULT_RESCALED_COLUMNS = (0, 1, 4)


def _mean_abs_error(pred, targets):
    """Return the column-wise mean absolute error between `pred` and `targets`.

    Each element of `targets` is converted to an array first, exactly as the
    comparison was done inline before this helper existed.
    """
    real = [np.array(elem) for elem in targets]
    return np.mean(abs(pred - real), axis=0)


for i in n_tips:
    print('---', i, 'tips ---')
    results[i] = dict()
    inf_times[i] = dict()
    mae_dict[i] = dict()

    for label in np.unique(data[i]['div_info_test']):
        div_scenario = label.split('/')[1].split('_')[0]
        results[i][div_scenario] = dict()
        inf_times[i][div_scenario] = dict()
        mae_dict[i][div_scenario] = dict()

        # Rows of the test set that carry this label; computed once per label
        # instead of once per lookup.
        label_mask = data[i]['div_info_test'] == label
        rescaled_cols = RESCALED_COLUMNS.get(div_scenario, DEFAULT_RESCALED_COLUMNS)

        norm_types = ['norm', 'no_norm']
        for norm in norm_types:
            mae_dict[i][div_scenario][norm] = dict()

            # Build the model path prefix; the 'norm' models carry an extra
            # 'norm_' segment in their file names.
            model_path = res_path + div_scenario + '/' + i + "_regression_"
            if norm != 'no_norm':
                model_path += norm + '_'

            results[i][div_scenario][norm], ex_time = generate_reg_results(model_path, data[i]['X_test'],
                                                                           data[i]['y_reg_test'],
                                                                           data[i]['y_reg_norm_test'],
                                                                           data[i]['div_info_test'],
                                                                           data[i]['resc_factor_test'],
                                                                           div_scenario, label, norm)
            inf_times[i][div_scenario][norm] = ex_time

            # Per-column MAE computed directly from the saved Keras model.
            nn_model = load_model(model_path + 'model.keras')
            pred = nn_model.predict(np.expand_dims(data[i]['X_test'][label_mask], axis=2))

            y_reg_test = data[i]['y_reg_test'][label_mask]
            y_reg_norm_test = data[i]['y_reg_norm_test'][label_mask]
            resc_factor_test = data[i]['resc_factor_test'][label_mask]

            if norm == 'norm':
                # Raw predictions are compared with the normalized targets ...
                mae_dict[i][div_scenario][norm]['mae_rescaled'] = _mean_abs_error(pred, y_reg_norm_test)

                # ... then the scenario's rescaled columns are divided by the
                # rescale factor before comparing with the plain targets.
                for col in rescaled_cols:
                    pred[:, col] = pred[:, col] / resc_factor_test

                mae_dict[i][div_scenario][norm]['mae'] = _mean_abs_error(pred, y_reg_test)

            else:
                # Raw predictions are compared with the plain targets ...
                mae_dict[i][div_scenario][norm]['mae'] = _mean_abs_error(pred, y_reg_test)

                # ... then the scenario's rescaled columns are multiplied by the
                # rescale factor before comparing with the normalized targets.
                for col in rescaled_cols:
                    pred[:, col] = pred[:, col] * resc_factor_test

                mae_dict[i][div_scenario][norm]['mae_rescaled'] = _mean_abs_error(pred, y_reg_norm_test)
            
            

--- 674 tips ---
BD
--- Inference time:  BD scenario & norm 0.20946669578552246 seconds ---
--- Inference time:  BD scenario & no_norm 0.20560646057128906 seconds ---
HE
--- Inference time:  HE scenario & norm 0.20915651321411133 seconds ---
--- Inference time:  HE scenario & no_norm 0.15805554389953613 seconds ---
ME
--- Inference time:  ME scenario & norm 0.17746281623840332 seconds ---
--- Inference time:  ME scenario & no_norm 0.17595911026000977 seconds ---
SAT
--- Inference time:  SAT scenario & norm 0.19025230407714844 seconds ---
--- Inference time:  SAT scenario & no_norm 0.1393575668334961 seconds ---
SR
--- Inference time:  SR scenario & norm 0.1877448558807373 seconds ---
(1020, 5)
AAAAAAAAAAAA
--- Inference time:  SR scenario & no_norm 0.18520355224609375 seconds ---
WW
--- Inference time:  WW scenario & norm 0.18587708473205566 seconds ---
(986, 5)
AAAAAAAAAAAA
--- Inference time:  WW scenario & no_norm 0.1889641284942627 seconds ---
--- 489 tips ---
BD
--- Inference time

{'mae_rescaled': array([0.07186544, 0.02227895, 0.11367133, 0.09332199, 4.22636319]),
 'mae': array([0.06611636, 0.02344743, 0.11367133, 0.09332199, 4.00761877])}

In [15]:
# Inspect the metrics stored for the '674' / 'BD' / 'norm' combination.
bd_norm_results_674 = results["674"]["BD"]["norm"]
bd_norm_results_674

{'MAE': array([1.22314086, 0.07544485, 1.61966674, 0.40997035]),
 'MAE_norm': array([0.61845747, 0.07544485, 0.83216266, 0.22585427])}

## Inference time

In [7]:
# For every top-level key of `inf_times`, gather the 'norm' timing of each
# scenario and print summary statistics.
for tips, times_by_scenario in inf_times.items():
    norm_times = [times_by_norm['norm'] for times_by_norm in times_by_scenario.values()]

    print(f"\nInference time statistics for {tips}:")
    print(f"Mean: {np.mean(norm_times):.4f}")
    print(f"Standard deviation: {np.std(norm_times):.4f}")
    print(f"Minimum: {np.min(norm_times):.4f}")
    print(f"Maximum: {np.max(norm_times):.4f}")


Inference time statistics for 674:
Mean: 0.8089
Standard deviation: 0.8767
Minimum: 0.3100
Maximum: 2.7660

Inference time statistics for 489:
Mean: 0.2203
Standard deviation: 0.0696
Minimum: 0.1487
Maximum: 0.3595

Inference time statistics for 87:
Mean: 0.1388
Standard deviation: 0.0221
Minimum: 0.0950
Maximum: 0.1602


## Training time

In [8]:
# For each entry of `n_tips`, read the training time stored next to every
# scenario's 'norm' regression model and print summary statistics.
for tips in n_tips:
    scenario_labels = np.unique(data[tips]['div_info_test'])
    train_times = []

    for scenario_label in scenario_labels:
        div_scenario = scenario_label.split('/')[1].split('_')[0]

        # Path prefix of the 'norm' regression model for this scenario.
        model_path = res_path + div_scenario + '/' + tips + "_regression_norm_"
        metadata_file = model_path + 'model_data.pkl'

        # The pickle unpacks into two values; only the second is used here.
        with open(metadata_file, 'rb') as handle:
            n_params, train_time = pickle.load(handle)

        train_times.append(train_time)

    print('\nTraining times for', tips, 'tips')
    print('-'*5)
    print('Mean:', np.mean(train_times))
    print('Std Dev:', np.std(train_times))
    print('Max:', np.max(train_times))
    print('Min:', np.min(train_times))


Training times for 674 tips
-----
Mean: 92.79047838846843
Std Dev: 11.50902236923396
Max: 108.95637226104736
Min: 78.42817187309265

Training times for 489 tips
-----
Mean: 147.82049945990244
Std Dev: 48.91674298602154
Max: 243.39358139038086
Min: 103.69829964637756

Training times for 87 tips
-----
Mean: 138.14758396148682
Std Dev: 52.42740244921074
Max: 212.1332881450653
Min: 77.02222275733948


# Regression metrics MAE vs MAE_norm

In [9]:
#div_scenario = list(results[list(results.keys())[0]].keys())
#get_regression_norm_results(results, '674', div_scenario, 'norm')

In [10]:
#div_scenario = list(results[list(results.keys())[0]].keys())
#get_regression_norm_results(results, '489', div_scenario, 'norm')

In [11]:
#div_scenario = list(results[list(results.keys())[0]].keys())
#get_regression_norm_results(results, '87', div_scenario, 'norm')

## Comparison between diversification scenarios

In [62]:
# Print the 'mae' table of the 'norm' models for every tip count / scenario pair.
n_tips = ['674', '489', '87']
labels = ['BD', 'HE', 'ME', 'SAT', 'SR', 'WW']
for tip in n_tips:
    print("---------" + str(tip) + "---------")
    for label in labels:
        print(label)
        # Bind the table to its own name: assigning it to `data` would replace
        # the datasets dict loaded earlier and break any re-run of the cells
        # that index `data[...]`.
        div_results = new_get_regression_div_results(mae_dict, tip, label, 'norm', 'mae')
        print(div_results)

---------674---------
BD
         r      a
MAE 0.1263 0.0754
HE
         r      a
MAE 0.2087 0.0181
ME
         r      a   time   frac
MAE 0.0356 0.0679 5.0582 0.0489
SAT
     lambda 0
MAE    0.0656
SR
        r0     r1     a0     a1   time
MAE 0.0661 0.0234 0.1137 0.0933 4.0076
WW
        r0     r1     a0     a1   time
MAE 0.0339 0.1077 0.1472 0.0873 2.8722
---------489---------
BD
         r      a
MAE 0.1351 0.0744
HE
         r      a
MAE 0.2271 0.0191
ME
         r      a   time   frac
MAE 0.0333 0.0710 5.0218 0.0488
SAT
     lambda 0
MAE    0.0705
SR
        r0     r1     a0     a1   time
MAE 0.0556 0.0209 0.1033 0.0954 3.1034
WW
        r0     r1     a0     a1   time
MAE 0.0322 0.1075 0.1243 0.0851 2.8025
---------87---------
BD
         r      a
MAE 0.2558 0.1120
HE
         r      a
MAE 0.3786 0.0246
ME
         r      a   time   frac
MAE 0.0377 0.0926 5.6389 0.0502
SAT
     lambda 0
MAE    0.1776
SR
        r0     r1     a0     a1   time
MAE 0.0736 0.0211 0.1024 0.0904 3.2904