This notebook considers: Extrapolation
1. __data.x1__ in data.py, which contains 23 features (excluding the initiator).
2. The results show no difference from the 23 manually selected features. We do not generate features in this notebook, but we do obtain rules for the 23 features.
3. In this notebook, we compare the prediction order (sphere, worm, vesicle, other) with (vesicle, worm, sphere, other). There is no difference between the selected variables.

In [1]:
import sys
# sys.path is a list of absolute path strings
# Add ./Script (relative to the working directory) to the module search path.
sys.path.append('./Script')

import data1 as data
import random
# NOTE(review): the star imports below hide where names come from. Later cells use
# `pd`, `GroupKFold` and `RuleFitWrapper` without any visible explicit import;
# presumably they are provided by these modules -- confirm and import them explicitly.
from common import *
from rules import *
from realkd.patch import RuleFit
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
import numpy as np
# NOTE(review): the alias is spelled `mcolorss` (double "s") and is not referenced
# in the visible cells.
import matplotlib.colors as mcolorss

import warnings
# Installs a global "ignore" filter, so warnings raised by later cells are not shown.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import ClassifierChain
from multilabel import BinaryRelevanceClassifier, ProbabilisticClassifierChain

# One RandomState object, seeded once; the same instance is handed to both
# stochastic estimators below.
STATE = np.random.RandomState(seed=1000)

# L1-penalised logistic regression with built-in cross-validation.
lr = LogisticRegressionCV(
    penalty='l1',
    solver='saga',
    random_state=STATE,
)

# Random forest with 100 trees.
rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=1,
    random_state=STATE,
)

# Rulefit
rufit = RuleFitWrapper()

# Display name -> estimator, in the order the experiments report them.
_named_estimators = {'LR': lr, 'Rf': rf, 'Rufit': rufit}
indi_names = list(_named_estimators.keys())
indi_estimators = list(_named_estimators.values())

In [3]:
from common import Experiment, LogLikelihoodEvaluator
from sklearn.model_selection import KFold

# Results of one experiment per target, keyed by the target's name.
extra_no_comp = {}

print('Num of predictors:, ', data.x1.shape[1])

# NOTE(review): `GroupKFold` is not imported explicitly in the visible cells;
# presumably it comes from one of the star imports -- confirm.
for y in (data.sphere, data.worm, data.vesicle, data.other):
    print('Target: ',y.name)

    # Recode labels of -1.0 as 0.0 before fitting.
    target_01 = y.replace(-1.0, 0.0)

    experiment = Experiment(
        indi_estimators,
        indi_names,
        GroupKFold(37),
        data.x1,
        target_01,
        data.comp_ids.array,
        evaluators=['accuracy', LogLikelihoodEvaluator(base=2)],
        min_test_size=20,
        file_name='Extrapolation_ind',
    )
    extra_no_comp[y.name] = experiment.run()
    print()

Num of predictors:,  23
Target:  sphere
Running experiment with 37 repetitions
****************************

Target:  worm
Running experiment with 37 repetitions
****************************

Target:  vesicle
Running experiment with 37 repetitions
****************************

Target:  other
Running experiment with 37 repetitions
****************************



In [5]:
# import pickle
# with open('extrapolation_indi_phase.pkl', 'wb') as f:   
#     pickle.dump(extra_no_comp, f)

In [15]:
# Per-target summary tables, extended with error columns derived from accuracy.
extra_no_comp_df = {}
for target_name, result in extra_no_comp.items():
    summary = result.summary()
    for split in ('train', 'test'):
        # error = 1 - accuracy; the spread is copied over unchanged.
        summary[f'mean_{split}_error'] = 1 - summary[f'mean_{split}_accuracy']
        summary[f'std_{split}_error'] = summary[f'std_{split}_accuracy']
    extra_no_comp_df[target_name] = summary

In [16]:
# Rename the "log likelihood" summary columns to "log loss" columns.
change_columns = {"mean_train_log likelihood": 'mean_train_log loss',
                  'std_train_log likelihood': 'std_train_log loss',
                  'mean_test_log likelihood': 'mean_test_log loss',
                  'std_test_log likelihood': 'std_test_log loss'}

for each in extra_no_comp_df:
    summary = extra_no_comp_df[each]

    # Only touch columns that still carry the original "log likelihood" name.
    # Re-running this cell on already-converted frames is then a no-op instead
    # of flipping the sign of the log-loss columns a second time.
    pending = {old: new for old, new in change_columns.items() if old in summary.columns}
    summary = summary.rename(columns=pending)

    # The mean columns are negated; the std columns are only renamed.
    for new_name in pending.values():
        if new_name.startswith('mean_'):
            summary[new_name] = -1 * summary[new_name]

    extra_no_comp_df[each] = summary

In [17]:
# Load the GAM results and align their column names with the "log loss" naming
# used for the other models; the first CSV column holds the row labels.
gam_column_names = {
    'mean_train_logloss': 'mean_train_log loss',
    'std_train_logloss': 'std_train_log loss',
    'mean_test_logloss': 'mean_test_log loss',
    'std_test_logloss': 'std_test_log loss',
}
df_gam = (
    pd.read_csv('extra_GAM_result.csv')
    .rename(columns=gam_column_names)
    .set_index('Unnamed: 0')
)
# Drop the placeholder index name left over from the unnamed CSV column.
df_gam.index.name = None
df_gam.head()

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
joint,0.638607,0.002073,0.646817,0.001785,1.05439,0.0305,3.114656,2.22833
sphere,0.858364,0.002833,0.824915,0.001583,0.351939,0.011727,0.524477,0.09637
vesicle,0.882097,0.004238,0.886735,0.002058,0.262497,0.019133,0.966372,0.414441
worm,0.816069,0.001879,0.85717,0.001868,0.403825,0.002732,0.402728,0.008308
other,0.986701,0.001306,0.964498,0.001236,0.03613,0.001374,1.221078,1.515344


In [18]:
# Append the GAM result for each target as a row labelled 'Gam', then
# (re)compute the error columns so the new row gets them as well.
for key in extra_no_comp_df:
    # Remove a 'Gam' row left by an earlier run of this cell; without this,
    # re-running the cell would stack duplicate 'Gam' rows.
    base = extra_no_comp_df[key].drop(index='Gam', errors='ignore')

    # Row of df_gam whose index label equals the target name, relabelled 'Gam'.
    gam_row = df_gam[df_gam.index == key].rename(index={key: 'Gam'})

    combined = pd.concat([base, gam_row])

    combined['mean_train_error'] = 1 - combined['mean_train_accuracy']
    combined['std_train_error'] = combined['std_train_accuracy']
    combined['mean_test_error'] = 1 - combined['mean_test_accuracy']
    combined['std_test_error'] = combined['std_test_accuracy']

    extra_no_comp_df[key] = combined


In [19]:
from copy import deepcopy

# Row labels of the models whose metrics are collected.
model_rows = ['LR', 'Rf', 'Rufit', 'Gam']

# Template: one empty list per metric column (columns taken from the 'sphere' table).
dic = {column: [] for column in extra_no_comp_df['sphere'].columns.tolist()}

# Independent copy of the template for every model.
model_dic = {model: deepcopy(dic) for model in model_rows}

# For each target table, append every model's value of every metric column.
for frame in extra_no_comp_df.values():
    for column in frame:
        values_by_model = frame[column]
        for model in model_rows:
            model_dic[model][column].append(values_by_model[model])

In [20]:
# Collapse each collected list to its arithmetic mean, in place.
# The std_* columns are averaged the same way, so they end up as the mean of the
# collected standard deviations rather than a pooled standard deviation.
for per_metric in model_dic.values():
    per_metric.update({metric: np.mean(values) for metric, values in per_metric.items()})

In [24]:
# One row per model, one column per metric, in a fixed presentation order.
row_order = ['LR', 'Gam', 'Rufit', 'Rf']
average_df = pd.DataFrame(model_dic).transpose().reindex(index=row_order)

# Persist the averaged table.
average_df.to_csv('extra_average_performance.csv')

In [25]:
# Display the averaged metrics table (one row per model).
average_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_error,std_train_error,mean_test_error,std_test_error
LR,0.711198,0.004448,0.74259,0.362704,0.727807,0.004611,0.897607,1.024929,0.288802,0.004448,0.25741,0.362704
Gam,0.885808,0.002564,0.883329,0.001686,0.263598,0.008742,0.778664,0.508616,0.114192,0.002564,0.116671,0.001686
Rufit,0.989188,0.002794,0.832062,0.272435,0.085698,0.005195,0.657738,0.988463,0.010812,0.002794,0.167938,0.272435
Rf,1.0,0.0,0.876031,0.237429,0.064246,0.001849,0.678348,1.96099,0.0,0.0,0.123969,0.237429
