This notebook considers: Extrapolation
1. It uses __data.x1__ from data.py, which contains 23 features (initiator excluded).
2. The results show no difference from the 23 manually selected features. We do not generate new features in this notebook, but we do obtain rules for the 23 features.
3. In this notebook, we compare the prediction order (sphere, worm, vesicle, other) with (vesicle, worm, sphere, other). There is no difference between the selected variables.

In [1]:
import sys
# sys.path is a list of absolute path strings
sys.path.append('./Script')

import data1 as data
import random
from common import *
from rules import *
from realkd.patch import RuleFit
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
import numpy as np
import matplotlib.colors as mcolorss

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import ClassifierChain
from multilabel import BinaryRelevanceClassifier, ProbabilisticClassifierChain

# A single RandomState instance is shared by every estimator that accepts one.
STATE = np.random.RandomState(seed=1000)

# Label-column order used by both sklearn classifier chains.
# NOTE(review): with label columns ['sphere', 'worm', 'vesicle', 'other'] this
# would chain sphere -> vesicle -> worm -> other -- confirm this is intended.
CHAIN_ORDER = [0, 2, 1, 3]

# L1-penalised logistic regression (cross-validated) and its multi-label wrappers.
lr = LogisticRegressionCV(solver='saga', penalty='l1', random_state=STATE)
lr_ind = BinaryRelevanceClassifier(lr)
lr_chain = ClassifierChain(lr, order=list(CHAIN_ORDER))
lr_pcc = ProbabilisticClassifierChain(lr)

# Random forest and its multi-label wrappers.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, random_state=STATE)
rf_ind = BinaryRelevanceClassifier(rf)
rf_chain = ClassifierChain(rf, order=list(CHAIN_ORDER))
rf_pcc = ProbabilisticClassifierChain(rf)

# RuleFit in chain mode.
rufit_pcc = RuleFitWrapper(mode='chain')

# Display name -> estimator; insertion order fixes the order of both lists below.
estimators_by_name = {
    'LR_ind': lr_ind,
    'LR_pcc': lr_pcc,
    'LR_chain': lr_chain,
    'RanF_ind': rf_ind,
    'RanF_pcc': rf_pcc,
    'Ranf_chain': rf_chain,
    'Rufit_pcc': rufit_pcc,
}
full_names = list(estimators_by_name)
full_estimators = list(estimators_by_name.values())

In [3]:
from common import ExtrapolationExperiment, sample_size, GroupDescription
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Report the label order and the number of predictor columns before running.
print("Current Prediction Order is:", data.y.columns.tolist())
print('Num of predictors:, ', data.x1.shape[1])

# Labels equal to -1.0 are recoded to 0.0 before being handed to the experiment.
extrapolation_targets = data.y.replace(-1.0, 0.0)

# Scores collected per repetition: accuracy, base-2 log likelihood,
# sample size, and a description of the composition group.
extrapolation_scores = [
    'accuracy',
    LogLikelihoodEvaluator(base=2),
    sample_size,
    GroupDescription(data.comp_descr, 'composition'),
]

extrapolation_experiment = ExtrapolationExperiment(
    full_estimators,
    full_names,
    data.x1,
    extrapolation_targets,
    data.comp_ids.array,
    score=extrapolation_scores,
    min_test_size=20,
    file_name='Extrapolation_full',
)
extrapolation_full_phase = extrapolation_experiment.run()

Current Prediction Order is: ['sphere', 'worm', 'vesicle', 'other']
Num of predictors:,  23
Running experiment with 37 repetitions
****************************


In [5]:
# import pickle
# with open('extrapolation_full_phase.pkl', 'wb') as f:   
#     pickle.dump(extrapolation_full_phase, f)

In [13]:
# Summarise the extrapolation experiment and report log loss instead of
# log likelihood (log loss = -1 * log likelihood).
extra_no_comp_df = extrapolation_full_phase.summary()
change_columns = {"mean_train_log likelihood": 'mean_train_log loss',
                  'std_train_log likelihood': 'std_train_log loss',
                  'mean_test_log likelihood': 'mean_test_log loss',
                  'std_test_log likelihood': 'std_test_log loss'}  # change log likelihood to log loss

extra_no_comp_df = extra_no_comp_df.rename(change_columns, axis=1)
# Only the means change sign; the standard deviations are kept as they are.
extra_no_comp_df['mean_train_log loss'] = -1 * extra_no_comp_df['mean_train_log loss']
extra_no_comp_df['mean_test_log loss'] = -1 * extra_no_comp_df['mean_test_log loss']

# Keep one model per family and give it a short label. Selecting and renaming
# by label (rather than overwriting the index positionally) keeps every row
# attached to the right model whatever order summary() returns them in, and
# raises a KeyError if an expected model is missing.
model_labels = {'LR_ind': 'LR', 'RanF_pcc': 'Rf', 'Rufit_pcc': 'Rufit'}
extra_no_comp_df = extra_no_comp_df.loc[list(model_labels)].rename(index=model_labels)
extra_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_size,std_train_size,mean_test_size,std_test_size
LR,0.377717,0.011388,0.446446,0.438791,2.911227,0.01301,3.590427,3.078563,586.214286,4.863752,5.785714,4.863752
Rf,0.999573,0.000754,0.759012,0.380918,0.242205,0.004092,2.069523,4.054021,586.214286,4.863752,5.785714,4.863752
Rufit,0.958876,0.005074,0.614941,0.429592,0.367581,0.008444,2.346265,2.671447,586.214286,4.863752,5.785714,4.863752


In [14]:
# Load the GAM results and align their log-loss column names with the
# 'log loss' spelling used by the other result tables in this notebook.
gam_column_names = {
    'mean_train_logloss': 'mean_train_log loss',
    'std_train_logloss': 'std_train_log loss',
    'mean_test_logloss': 'mean_test_log loss',
    'std_test_logloss': 'std_test_log loss',
}
df_gam = (
    pd.read_csv('extra_GAM_result.csv')
    .rename(columns=gam_column_names)
    .set_index('Unnamed: 0')
)
# The first CSV column holds the row labels; drop its placeholder name.
df_gam.index.name = None
df_gam.head()

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
joint,0.638607,0.002073,0.646817,0.001785,1.05439,0.0305,3.114656,2.22833
sphere,0.858364,0.002833,0.824915,0.001583,0.351939,0.011727,0.524477,0.09637
vesicle,0.882097,0.004238,0.886735,0.002058,0.262497,0.019133,0.966372,0.414441
worm,0.816069,0.001879,0.85717,0.001868,0.403825,0.002732,0.402728,0.008308
other,0.986701,0.001306,0.964498,0.001236,0.03613,0.001374,1.221078,1.515344


In [15]:
# Take the GAM 'joint' row and relabel it 'Gam' so it can sit next to the
# other models. The GAM table has no train/test size columns, so those cells
# are NaN after concatenation.
temp = df_gam.loc[df_gam.index == 'joint'].copy()
temp.index = ['Gam']
# Drop any 'Gam' row left over from an earlier run of this cell: re-running
# would otherwise append a duplicate row, and duplicate labels break the
# later reindex.
extra_no_comp_df = pd.concat([extra_no_comp_df.drop(index='Gam', errors='ignore'), temp])
extra_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_size,std_train_size,mean_test_size,std_test_size
LR,0.377717,0.011388,0.446446,0.438791,2.911227,0.01301,3.590427,3.078563,586.214286,4.863752,5.785714,4.863752
Rf,0.999573,0.000754,0.759012,0.380918,0.242205,0.004092,2.069523,4.054021,586.214286,4.863752,5.785714,4.863752
Rufit,0.958876,0.005074,0.614941,0.429592,0.367581,0.008444,2.346265,2.671447,586.214286,4.863752,5.785714,4.863752
Gam,0.638607,0.002073,0.646817,0.001785,1.05439,0.0305,3.114656,2.22833,,,,


In [21]:
# Add error-rate columns (error = 1 - accuracy; the standard deviation is
# copied over from the accuracy column) and put the rows in LR, Gam, Rufit,
# Rf order.
extra_no_comp_df = extra_no_comp_df.assign(
    mean_train_error=1 - extra_no_comp_df['mean_train_accuracy'],
    std_train_error=extra_no_comp_df['std_train_accuracy'],
    mean_test_error=1 - extra_no_comp_df['mean_test_accuracy'],
    std_test_error=extra_no_comp_df['std_test_accuracy'],
).reindex(['LR', 'Gam', 'Rufit', 'Rf'])
extra_no_comp_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_size,std_train_size,mean_test_size,std_test_size,mean_train_error,std_train_error,mean_test_error,std_test_error
LR,0.377717,0.011388,0.446446,0.438791,2.911227,0.01301,3.590427,3.078563,586.214286,4.863752,5.785714,4.863752,0.622283,0.011388,0.553554,0.438791
Gam,0.638607,0.002073,0.646817,0.001785,1.05439,0.0305,3.114656,2.22833,,,,,0.361393,0.002073,0.353183,0.001785
Rufit,0.958876,0.005074,0.614941,0.429592,0.367581,0.008444,2.346265,2.671447,586.214286,4.863752,5.785714,4.863752,0.041124,0.005074,0.385059,0.429592
Rf,0.999573,0.000754,0.759012,0.380918,0.242205,0.004092,2.069523,4.054021,586.214286,4.863752,5.785714,4.863752,0.000427,0.000754,0.240988,0.380918


In [22]:
# Write the combined performance table (row labels included) to CSV.
extra_no_comp_df.to_csv(path_or_buf='extra_full_performance.csv')