In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from pyearth import Earth

from tpot import TPOTRegressor





In [2]:
df = pd.read_csv('out.csv')

def data_normalize(df):
    cols = ['train_mean', 'train_std', 'test_mean', 'test_std']
    for col in cols:
        d = df.groupby('dataset')[col].max().to_dict()
        for k, v in d.items():
            df.loc[df['dataset']==k, col] /= v
    return df


In [42]:
inputs = ['allow_linear', 
          'allow_missing',
          'check_every', 
          'enable_pruning', 
          'endspan_alpha', 
          'max_degree', 
          'max_terms', 
          'minspan_alpha', 
          'penalty', 
          'smooth']
outputs = [
    'train_mean',
    'train_std',
    'test_mean',
    'test_std'
]

In [45]:
score = defaultdict(lambda:1.)
for dataset in df['dataset'].unique():
    print(dataset)
    df_ = df[df['dataset']==dataset].copy()
    df_['train_mean'] /= df_['train_mean'].max()
    
    X, y = df_[inputs], df_[outputs]
    y = y.values
    
    reg = Earth(feature_importance_type='rss', max_degree=10, max_terms=30)
    #reg = TPOTRegressor(generations=2, population_size=2, verbosity=0)
    reg.fit(X, y)
    print(np.sqrt(((reg.predict(X) - y)**2).mean()))
    
    
    a = pd.DataFrame({'labels': reg.xlabels_, 'imp': reg.feature_importances_}).sort_values(by='imp')
    for label, imp in zip(reg.xlabels_, reg.feature_importances_):
        score[label] += imp
    print(reg.summary_feature_importances(sort_by='rss'))

data/uci_standard/ENB2012_data.csv
0.0168615975557
                   rss
check_every        0.43   
max_degree         0.43   
allow_missing      0.08   
smooth             0.02   
endspan_alpha      0.02   
minspan_alpha      0.00   
penalty            0.00   
max_terms          0.00   
enable_pruning     0.00   
allow_linear       0.00   

data/uci_standard/CBM.csv
0.176103692469
                   rss
allow_linear       0.24   
penalty            0.16   
minspan_alpha      0.14   
check_every        0.13   
enable_pruning     0.12   
max_terms          0.10   
max_degree         0.06   
smooth             0.03   
allow_missing      0.01   
endspan_alpha      0.01   

data/uci_standard/winequality-white.csv
0.0287688652577
                   rss
max_terms          0.50   
endspan_alpha      0.50   
smooth             0.00   
penalty            0.00   
minspan_alpha      0.00   
max_degree         0.00   
enable_pruning     0.00   
check_every        0.00   
allow_missing      0.00  

In [46]:
print(score)
pd.DataFrame({'name': score.keys(), 'imp': score.values()}).sort_values(by='imp', ascending=False)

defaultdict(<function <lambda> at 0x7f6167bdfcf8>, {'penalty': 2.3679856508331523, 'max_terms': 2.1829504474538473, 'enable_pruning': 1.3175516469448572, 'endspan_alpha': 2.7172496759492177, 'smooth': 1.3771122645269545, 'max_degree': 1.8765253183188806, 'minspan_alpha': 2.0235133039084254, 'check_every': 1.977674610814244, 'allow_missing': 1.229250979275371, 'allow_linear': 1.9301861019750508})


Unnamed: 0,imp,name
3,2.71725,endspan_alpha
0,2.367986,penalty
1,2.18295,max_terms
6,2.023513,minspan_alpha
7,1.977675,check_every
9,1.930186,allow_linear
5,1.876525,max_degree
4,1.377112,smooth
2,1.317552,enable_pruning
8,1.229251,allow_missing


In [16]:
from lightjob.cli import load_db
db=load_db()
j=db.get_job_by_summary('ebfc21b0a990da74650221bc557b50f5')
np.std(np.sqrt(j['content']['result']['score_test']))

0.013437409728650677