This notebook considers: Interpolation

1. The predictors are __data.x1__ from data.py, which contains 23 features (initiator excluded).
2. The results show no difference from the 23 manually selected features. We do not generate new features in this notebook; we only learn rules over the 23 features.
3. In this notebook, we compare the prediction order (sphere, worm, vesicle, other) with (vesicle, worm, sphere, other). There is no difference in the selected variables.

In [1]:
import sys
# sys.path is a list of absolute path strings
sys.path.append('./Script')

import data1 as data
import random
from common import *
from rules import *
from realkd.patch import RuleFit
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
import numpy as np
import matplotlib.colors as mcolorss

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import ClassifierChain
from multilabel import BinaryRelevanceClassifier, ProbabilisticClassifierChain

# One RandomState instance is shared by every stochastic component below, so
# results depend on the order in which those components draw from it.
STATE = np.random.RandomState(1000)

# L1-penalised logistic regression (saga solver) with built-in CV.
lr = LogisticRegressionCV(solver='saga', penalty='l1', random_state=STATE)

# Random forest: 100 trees, leaves may hold a single sample.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, random_state=STATE)

# Rulefit
rufit = RuleFitWrapper()

# Keep each estimator next to its display name, then split the pairs into the
# two parallel lists the experiments consume.
named_estimators = [('LR', lr), ('Rf', rf), ('Rufit', rufit)]
indi_estimators = [estimator for _label, estimator in named_estimators]
indi_names = [_label for _label, estimator in named_estimators]

In [3]:
from common import Experiment, LogLikelihoodEvaluator
from sklearn.model_selection import KFold

# Experiment result per target, keyed by the target series' name.
inter_no_comp = {}

print('Num of predictors:, ', data.x1.shape[1])

targets = [data.sphere, data.worm, data.vesicle, data.other]
for y in targets:
    print('Target: ',y.name)

    # A fresh 30-fold shuffled splitter per target, all drawing from STATE.
    splitter = KFold(30, shuffle=True, random_state=STATE)
    # Labels are recoded from -1 to 0 before fitting.
    binary_target = y.replace(-1, 0)

    # NOTE(review): `groups` is passed although the splitter is a plain KFold;
    # how Experiment uses it is not visible here -- confirm.
    # NOTE(review): every target uses the same file_name; confirm Experiment
    # does not overwrite one target's output with the next.
    experiment = Experiment(
        indi_estimators,
        indi_names,
        splitter,
        data.x1,
        binary_target,
        groups=data.comp_ids.array,
        evaluators=['accuracy', LogLikelihoodEvaluator(base=2)],
        verbose=True,
        file_name='interpolation_indi',
    )
    inter_no_comp[y.name] = experiment.run()
    print()

Num of predictors:,  23
Target:  sphere
Running experiment with 30 repetitions
******************************

Target:  worm
Running experiment with 30 repetitions
******************************

Target:  vesicle
Running experiment with 30 repetitions
******************************

Target:  other
Running experiment with 30 repetitions
******************************



In [5]:
# import pickle
# with open('interpolation_indi_phase.pkl', 'wb') as f:   
#     pickle.dump(inter_no_comp, f)

In [6]:
# Summary table per target, extended with error-rate columns.
inter_no_comp_df = {}
for key, result in inter_no_comp.items():
    df = result.summary()
    for split in ('train', 'test'):
        # error = 1 - accuracy; the spread is unchanged by that transformation.
        df['mean_{}_error'.format(split)] = 1 - df['mean_{}_accuracy'.format(split)]
        df['std_{}_error'.format(split)] = df['std_{}_accuracy'.format(split)]
    inter_no_comp_df[key] = df

In [7]:
# Column renames: the evaluator reports "log likelihood"; the tables below
# report "log loss" (the negated mean log likelihood).
change_columns = {"mean_train_log likelihood": 'mean_train_log loss', 
                  'std_train_log likelihood': 'std_train_log loss',
                  'mean_test_log likelihood': 'mean_test_log loss',
                  'std_test_log likelihood': 'std_test_log loss'} # change log likelihood to log loss

for each in inter_no_comp_df:
    frame = inter_no_comp_df[each]

    # Re-execution guard: once a table carries all the log-loss columns it has
    # already been converted. Without this check a second run of the cell
    # would leave the names alone (rename is a no-op) but negate the means
    # again, silently flipping the sign back.
    if set(change_columns.values()).issubset(frame.columns):
        continue

    frame = frame.rename(columns=change_columns)
    # Only the means change sign; standard deviations are sign-invariant.
    for loss_col in ('mean_train_log loss', 'mean_test_log loss'):
        frame[loss_col] = -1 * frame[loss_col]
    inter_no_comp_df[each] = frame

In [8]:
# Load the GAM results, align the log-loss column names with the other
# tables, and index the rows by the label stored in the CSV's unnamed first
# column.
gam_column_names = {
    'mean_train_logloss': 'mean_train_log loss',
    'std_train_logloss': 'std_train_log loss',
    'mean_test_logloss': 'mean_test_log loss',
    'std_test_logloss': 'std_test_log loss',
}
df_gam = (
    pd.read_csv('inter_GAM_result.csv')
    .rename(columns=gam_column_names)
    .set_index('Unnamed: 0')
    .rename_axis(None)  # drop the 'Unnamed: 0' index label
)
df_gam.head()

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss
joint,0.642335,0.001623,0.58114,0.000786,1.049974,0.03462,1.699951,1.359898
sphere,0.862069,0.003167,0.844386,0.002025,0.350696,0.029128,0.409624,0.069212
vesicle,0.882548,0.002433,0.857368,0.001625,0.262162,0.01319,0.316949,0.038933
worm,0.816908,0.002272,0.787719,0.001638,0.4016,0.004016,0.445646,0.01036
other,0.986697,0.001604,0.971053,0.001181,0.035516,0.009613,0.527731,1.230176


In [9]:
# Append the GAM result for each target as a row labelled 'Gam', then
# recompute the error columns, which the GAM table does not provide.
# NOTE(review): the other models' log loss comes from
# LogLikelihoodEvaluator(base=2); confirm the GAM CSV uses the same log base.
for key in inter_no_comp_df:
    gam_row = df_gam[df_gam.index == key].rename(index={key: 'Gam'})

    # Drop any 'Gam' row left by an earlier execution of this cell, so
    # re-running it does not stack duplicate rows (duplicates would make
    # later ['Gam'] lookups return a Series instead of a scalar).
    base = inter_no_comp_df[key].drop(index='Gam', errors='ignore')
    combined = pd.concat([base, gam_row])

    # error = 1 - accuracy; the spread is unchanged by that transformation.
    combined['mean_train_error'] = 1 - combined['mean_train_accuracy']
    combined['std_train_error'] = combined['std_train_accuracy']
    combined['mean_test_error'] = 1 - combined['mean_test_accuracy']
    combined['std_test_error'] = combined['std_test_accuracy']

    inter_no_comp_df[key] = combined


In [10]:
from copy import deepcopy
# Template: one empty list per metric column (column set taken from the
# 'sphere' table).
dic = {col: [] for col in inter_no_comp_df['sphere'].columns.tolist()}
# Each model gets its own independent copy of the template.
model_dic = {name: deepcopy(dic) for name in ['LR', 'Rf', 'Rufit', 'Gam']}

# Gather, per model and metric, the value from every target's table.
for key, table in inter_no_comp_df.items():
    for col in table:
        for indx, collected in model_dic.items():
            collected[col].append(table[col][indx])

In [11]:
# Replace each collected list by its arithmetic mean across targets
# (applied to the std_* columns as well).
for key, per_metric in model_dic.items():
    for col in per_metric:
        per_metric[col] = np.mean(per_metric[col])

In [14]:
# Transpose so each model is a row and each averaged metric a column, with
# rows in a fixed reporting order.
row_order = ['LR', 'Gam', 'Rufit', 'Rf']
average_df = pd.DataFrame(model_dic).T.reindex(row_order)

average_df.to_csv('inter_average_performance.csv')

In [15]:
# Display the per-model metrics averaged over the four targets.
average_df

Unnamed: 0,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_log loss,std_train_log loss,mean_test_log loss,std_test_log loss,mean_train_error,std_train_error,mean_test_error,std_test_error
LR,0.713231,0.004747,0.712061,0.092212,0.727174,0.004368,0.732412,0.128536,0.286769,0.004747,0.287939,0.092212
Gam,0.887055,0.002369,0.865132,0.001617,0.262494,0.013987,0.424988,0.33717,0.112945,0.002369,0.134868,0.001617
Rufit,0.989501,0.00276,0.923553,0.058483,0.085185,0.004568,0.279032,0.207822,0.010499,0.00276,0.076447,0.058483
Rf,0.999898,0.000188,0.932851,0.05424,0.064982,0.00198,0.286925,0.336166,0.000102,0.000188,0.067149,0.05424
