In [16]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sys.path.insert(0, os.path.abspath('../../..'))
from g4l.estimators import BIC
from g4l.estimators import SMC
import g4l.display
from g4l.estimators import Prune
from g4l.models import ContextTree
from g4l.models import integrity
from g4l.data import Sample

## Comparing the mutual occurrence of trees between methods

In [17]:
# Folder holding one sub-directory of result CSVs per method; resolved against
# the current working directory at the time this cell runs.
results_folder = os.path.abspath('../../simulation_study/results')
methods = ['SeqROCTM', 'prune', 'smc']
models = ['model1']
sizes = [10000]
# Reference tree as a space-separated list of contexts, already in the sorted
# token order that get_results() normalises tree strings to.
correct_model = '000 1 10 100'

import warnings

# The public home of SettingWithCopyWarning is pandas.errors; the
# pandas.core.common path is a private location that is not available in every
# pandas release. Try the public path first, fall back to the legacy one, and
# skip the filter entirely if the installed pandas has no such warning class.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


def get_results(method, model_name, size):
    """Read one method's result table and normalise its tree strings.

    The CSV is read from ``<results_folder>/<method>/<model_name>_<size>.csv``.
    Every entry of the ``tree`` column is converted to ``str``, split on
    whitespace and re-joined with single spaces in sorted token order, so the
    same set of tokens always produces the same string.

    Returns the loaded DataFrame with the rewritten ``tree`` column.
    """
    def _canonical(tree):
        # Sort the whitespace-separated tokens to get an order-independent key.
        return ' '.join(sorted(tree.split()))

    csv_path = '%s/%s/%s_%s.csv' % (results_folder, method, model_name, size)
    results = pd.read_csv(csv_path)
    results.tree = results.tree.astype(str).map(_canonical)
    return results

def get_champion_trees(df1, df2, sample_idx):
    """Select one sample's rows from each result table and flag shared trees.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Result tables containing at least the columns ``sample_idx`` and
        ``tree``.
    sample_idx
        Value of the ``sample_idx`` column identifying the rows to compare.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(trees1, trees2)``: the rows of ``df1`` and ``df2`` for the sample,
        each with an extra integer column ``match`` that is 1 when the row's
        ``tree`` also occurs among the other table's rows for that sample and
        0 otherwise. The input frames are left unmodified.
    """
    # Explicit copies: the column added below then belongs to an independent
    # frame, so pandas has no reason to emit SettingWithCopyWarning.
    trees1 = df1[df1.sample_idx == sample_idx].copy()
    trees2 = df2[df2.sample_idx == sample_idx].copy()

    trees2['match'] = trees2.tree.isin(trees1.tree).astype(int)
    trees1['match'] = trees1.tree.isin(trees2.tree).astype(int)
    return trees1, trees2

def compare(method1, method2, model, size):
    """Measure, per sample, how much the tree sets of two methods overlap.

    Parameters
    ----------
    method1, method2 : str
        Method names passed to ``get_results``.
    model : str
        Model name passed to ``get_results``.
    size
        Sample size passed to ``get_results``.

    Returns
    -------
    tuple
        ``(df, df1, df2)`` where ``df1`` / ``df2`` are the result tables of
        ``method1`` / ``method2`` and ``df`` has one row per distinct
        ``sample_idx`` of ``df1`` with the columns

        * ``sample_idx`` - the sample identifier as ``int``;
        * ``precision`` - fraction of ``method2``'s trees for the sample that
          also occur among ``method1``'s trees;
        * ``recall`` - fraction of ``method1``'s trees for the sample that
          also occur among ``method2``'s trees.

        A fraction is NaN when the corresponding selection has no rows (for
        example, a sample present in ``df1`` but absent from ``df2``).
    """
    df1 = get_results(method1, model, size)
    df2 = get_results(method2, model, size)

    # Collect plain records and build the frame once, instead of enlarging a
    # DataFrame one row at a time inside the loop.
    records = []
    for sample_idx in df1.sample_idx.unique():
        trees1, trees2 = get_champion_trees(df1, df2, sample_idx)
        records.append({
            'sample_idx': int(sample_idx),
            # The mean of a 0/1 column is matches / rows.
            'precision': trees2['match'].mean(),
            'recall': trees1['match'].mean(),
        })

    # Explicit columns keep the schema stable even when there are no samples.
    df = pd.DataFrame(records, columns=['sample_idx', 'precision', 'recall'])
    return df, df1, df2

In [18]:
# Per-sample agreement between the SeqROCTM and smc results (model1, size 10000).
df, df1, df2 = compare(method1='SeqROCTM', method2='smc', model='model1', size=10000)
# Column-wise averages over all samples.
df.mean()

sample_idx    49.500000
precision      1.000000
recall         0.899705
dtype: float64

In [19]:
#x = get_results('SeqROCTM', 'model1', 5000)
#x.head(30)

In [20]:
method_name = 'smc'
model = 'model1'
size = 10000
# Compare the chosen method against SeqROCTM. The variables above drive the
# call, so the labels printed below always describe the data that was loaded.
df, df1, df2 = compare('SeqROCTM', method_name, model, size)
prec_rec = df[['precision', 'recall']].mean()
# Rows whose (normalised) tree equals the reference tree `correct_model`.
# Copies are taken so later cells can add columns to these frames without
# writing into a filtered selection of df1 / df2.
dfr = df1[df1.tree == correct_model].copy()
dfr2 = df2[df2.tree == correct_model].copy()
# Counted on the SeqROCTM table (df1) only.
correct_tree_occurrence = len(dfr)
# NOTE(review): `opt` presumably flags the tree a method picked as optimal -
# confirm against the code that writes the result CSVs.
found = dfr.opt.sum()    # SeqROCTM results, reported as "matlab version"
found2 = dfr2.opt.sum()  # `method_name` results, reported as "python version"
print("Method:", method_name)
print("Size:", size)
print("Model:", model)
print("Occurrences of correct tree:", correct_tree_occurrence)
print("Found by python version:", found2)
print("Found by matlab version:", found)
print("Precision: ", prec_rec.precision)
print("Recall: ", prec_rec.recall)

Method: smc
Size: 10000
Model: model1
Occurrences of correct tree: 55
Found by python version: 42
Found by matlab version: 37
Precision:  1.0
Recall:  0.8997050727050726


In [21]:
# Add the renamed column with assign(), which returns a new frame that dfr2 is
# rebound to. Writing the column directly into dfr2 (a filtered selection of
# df2) is what triggers SettingWithCopyWarning.
dfr2 = dfr2.assign(opt_smc=dfr2.opt)
seq_opt = dfr.set_index('sample_idx')[['opt']]
smc_opt = dfr2.set_index('sample_idx')[['opt_smc']]
# Left join on sample_idx: one row per sample in seq_opt, with opt_smc taken
# from smc_opt where that sample exists there.
seq_opt.join(smc_opt)

Unnamed: 0_level_0,opt,opt_smc
sample_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1,1
6,1,1
7,1,1
8,1,1
9,0,1
10,0,0
12,1,1
13,1,1
16,0,1
17,1,1
