This notebook considers: Interpolation
1. The predictors are __data.x1__ in data.py, which contains 23 features (initiator excluded).
2. From the results, we find no difference from the 23 manually selected features. We do not generate features in this notebook, but we obtain rules for the 23 features.
3. In this notebook, we compare the prediction order (sphere, worm, vesicle, other) with (vesicle, worm, sphere, other). There is no difference in the selected variables.

In [1]:
import sys
# sys.path is a list of absolute path strings
sys.path.append('./Script')

import data1 as data
import random
from common import *
from rules import *
from realkd.patch import RuleFit
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
import numpy as np
import matplotlib.colors as mcolorss

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import ClassifierChain
from multilabel import BinaryRelevanceClassifier, ProbabilisticClassifierChain

# A single RandomState instance is handed to both base estimators below.
STATE = np.random.RandomState(seed=1000)

# Logistic regression base model (L1 penalty, SAGA solver) and its three
# multi-label wrappers. `order` lists label-column indices for the chain.
lr = LogisticRegressionCV(solver='saga', penalty='l1', random_state=STATE)
lr_ind, lr_chain, lr_pcc = (
    BinaryRelevanceClassifier(lr),
    ClassifierChain(lr, order=[0, 2, 1, 3]),
    ProbabilisticClassifierChain(lr),
)

# Random forest base model wrapped in the same three ways.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, random_state=STATE)
rf_ind, rf_chain, rf_pcc = (
    BinaryRelevanceClassifier(rf),
    ClassifierChain(rf, order=[0, 2, 1, 3]),
    ProbabilisticClassifierChain(rf),
)

# Rulefit
rufit_pcc = RuleFitWrapper(mode='chain')

# Display name paired with its estimator; the two parallel lists consumed by
# the experiment are derived from this single table so they cannot drift apart.
_named_estimators = [
    ('LR_ind', lr_ind),
    ('LR_pcc', lr_pcc),
    ('LR_chain', lr_chain),
    ('RanF_ind', rf_ind),
    ('RanF_pcc', rf_pcc),
    ('Ranf_chain', rf_chain),
    ('Rufit_pcc', rufit_pcc),
]
full_estimators = [model for _, model in _named_estimators]
full_names = [label for label, _ in _named_estimators]

In [3]:
from common import Experiment, LogLikelihoodEvaluator
from sklearn.model_selection import KFold

# Report the label order and the number of predictor columns before fitting.
print("Current Prediction Order is:", data.y.columns.tolist())
print('Num of predictors:, ', data.x1.shape[1])

# Shuffled 30-fold splitter driven by the shared RandomState.
cv_splitter = KFold(n_splits=30, shuffle=True, random_state=STATE)
# Targets with every -1.0 entry recoded to 0.0.
targets_recoded = data.y.replace(-1.0, 0.0)

# Build the experiment over all estimators and keep whatever `run()` returns.
interpolation = Experiment(
    full_estimators,
    full_names,
    cv_splitter,
    data.x1,
    targets_recoded,
    groups=data.comp_ids.array,
    evaluators=['accuracy', LogLikelihoodEvaluator(base=2)],
    verbose=True,
    file_name='interpolation_full',
).run()

Current Prediction Order is: ['sphere', 'worm', 'vesicle', 'other']
Num of predictors:,  23
Running experiment with 30 repetitions
******************************


In [4]:
# import pickle
# with open('interpolation_full_phase.pkl', 'wb') as f:   
#     pickle.dump(interpolation, f)

In [10]:
# Start from the experiment summary so this cell can be re-run on its own.
inter_no_comp_df = interpolation.summary()

# Change "log likelihood" to "log loss" in the column labels.
change_columns = {'mean_train_log likelihood': 'mean_train_log loss',
                  'std_train_log likelihood': 'std_train_log loss',
                  'mean_test_log likelihood': 'mean_test_log loss',
                  'std_test_log likelihood': 'std_test_log loss'}
inter_no_comp_df = inter_no_comp_df.rename(columns=change_columns)

# Log loss is the negated log likelihood: only the means change sign,
# the standard deviations are unaffected by negation.
mean_loss_columns = ['mean_train_log loss', 'mean_test_log loss']
inter_no_comp_df[mean_loss_columns] = -inter_no_comp_df[mean_loss_columns]

# Keep one model per family and give it a short display label.
# The labels are attached through an explicit mapping rather than by position,
# so a change in the row order of the summary cannot mislabel a model, and a
# missing model raises a KeyError instead of passing unnoticed.
display_names = {'LR_ind': 'LR', 'RanF_pcc': 'Rf', 'Rufit_pcc': 'Rufit'}
inter_no_comp_df = inter_no_comp_df.loc[list(display_names)].rename(index=display_names)
inter_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
LR,0.382689,0.011744,0.378509,0.14902,2.908712,0.011742,2.92992,0.345263
Rf,0.999185,0.001356,0.800789,0.066668,0.245952,0.004644,1.163131,0.806772
Rufit,0.954916,0.005991,0.785439,0.09054,0.374219,0.011522,1.039242,0.431893


In [11]:
# Load the stored GAM results, relabel the "logloss" columns to the
# "log loss" spelling, and use the unnamed first column as an index
# without a name.
df_gam = (
    pd.read_csv('inter_GAM_result.csv')
    .rename(columns={
        f'{stat}_{split}_logloss': f'{stat}_{split}_log loss'
        for stat in ('mean', 'std')
        for split in ('train', 'test')
    })
    .set_index('Unnamed: 0')
    .rename_axis(None)
)
df_gam.head()

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
joint,0.642335,0.001623,0.58114,0.000786,1.049974,0.03462,1.699951,1.359898
sphere,0.862069,0.003167,0.844386,0.002025,0.350696,0.029128,0.409624,0.069212
vesicle,0.882548,0.002433,0.857368,0.001625,0.262162,0.01319,0.316949,0.038933
worm,0.816908,0.002272,0.787719,0.001638,0.4016,0.004016,0.445646,0.01036
other,0.986697,0.001604,0.971053,0.001181,0.035516,0.009613,0.527731,1.230176


In [12]:
# Take the 'joint' row of the GAM results and relabel it 'Gam'.
temp = df_gam.loc[['joint']].rename(index={'joint': 'Gam'})

# Drop any 'Gam' row left over from a previous run of this cell before
# appending, so re-running the cell does not create duplicate index labels
# (which would break label-based reindexing of this table).
inter_no_comp_df = pd.concat([inter_no_comp_df.drop(index='Gam', errors='ignore'), temp])
inter_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
LR,0.382689,0.011744,0.378509,0.14902,2.908712,0.011742,2.92992,0.345263
Rf,0.999185,0.001356,0.800789,0.066668,0.245952,0.004644,1.163131,0.806772
Rufit,0.954916,0.005991,0.785439,0.09054,0.374219,0.011522,1.039242,0.431893
Gam,0.642335,0.001623,0.58114,0.000786,1.049974,0.03462,1.699951,1.359898


In [15]:
# Derive error-rate columns from accuracy (error = 1 - accuracy; the spread is
# copied unchanged) and put the rows in the order LR, Gam, Rufit, Rf.
inter_no_comp_df = (
    inter_no_comp_df
    .assign(
        mean_train_error=1 - inter_no_comp_df['mean_train_accuracy'],
        std_train_error=inter_no_comp_df['std_train_accuracy'],
        mean_test_error=1 - inter_no_comp_df['mean_test_accuracy'],
        std_test_error=inter_no_comp_df['std_test_accuracy'],
    )
    .reindex(['LR', 'Gam', 'Rufit', 'Rf'])
)

inter_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_error,std_train_error,mean_test_error,std_test_error
LR,0.382689,0.011744,0.378509,0.14902,2.908712,0.011742,2.92992,0.345263,0.617311,0.011744,0.621491,0.14902
Gam,0.642335,0.001623,0.58114,0.000786,1.049974,0.03462,1.699951,1.359898,0.357665,0.001623,0.41886,0.000786
Rufit,0.954916,0.005991,0.785439,0.09054,0.374219,0.011522,1.039242,0.431893,0.045084,0.005991,0.214561,0.09054
Rf,0.999185,0.001356,0.800789,0.066668,0.245952,0.004644,1.163131,0.806772,0.000815,0.001356,0.199211,0.066668


In [16]:
# Write the combined performance table to a CSV file at a path relative to the working directory.
inter_no_comp_df.to_csv('inter_full_performance.csv')

In [None]:
def plot_summary(metric, summ, num_reps=30, baseline=None, names=None, colors=None):
    """Draw side-by-side train/test bars of a metric for every row of a summary table.

    Parameters
    ----------
    metric : str
        Metric suffix; the columns ``mean_train_<metric>``, ``std_train_<metric>``,
        ``mean_test_<metric>`` and ``std_test_<metric>`` are read from ``summ``.
    summ : table indexed by model label
        Supports ``len()``, column lookup by name and ``.index`` (e.g. a pandas
        DataFrame). One bar pair is drawn per row; the index supplies tick labels.
    num_reps : int, default 30
        Number of repetitions; error bars are ``std / sqrt(num_reps)``.
    baseline : sequence of float, optional
        Heights at which horizontal reference lines are drawn.
    names : sequence of str, optional
        Legend labels for the baselines. Baselines without a matching name are
        drawn unlabeled.
    colors : sequence, optional
        Colors for the baselines, reused cyclically when there are more
        baselines than colors. Defaults to matplotlib's ``BASE_COLORS`` keys.

    Notes
    -----
    Draws through the module-level ``plt`` and ``np`` names, which must be
    available in the notebook namespace.
    """
    width = 0.35
    ind = np.arange(len(summ))
    root_reps = num_reps ** 0.5  # std -> standard error of the mean

    plt.bar(ind - width / 2, summ[f'mean_train_{metric}'], width=width, label='train',
            yerr=summ[f'std_train_{metric}'] / root_reps, capsize=3.0)
    plt.bar(ind + width / 2, summ[f'mean_test_{metric}'], width=width, label='test',
            yerr=summ[f'std_test_{metric}'] / root_reps, capsize=3.0)

    # `len()` instead of truthiness so that numpy arrays are accepted as well.
    if baseline is not None and len(baseline) > 0:
        if colors is None:
            # Resolved lazily: the notebook's import cell binds matplotlib.colors
            # under the name `mcolorss`, so `mcolors` is not guaranteed to exist
            # when this function is defined.
            import matplotlib.colors as mcolors
            colors = list(mcolors.BASE_COLORS.keys())
        for i, level in enumerate(baseline):
            label = names[i] if names is not None and i < len(names) else None
            plt.axhline(y=level, color=colors[i % len(colors)], linestyle='-', label=label)

    plt.ylabel(metric)
    plt.legend()
    plt.xticks(ind, summ.index)