In [33]:
# Echo the module docstring at the top of the run.
print(__doc__)

import time
import numpy as np 
import matplotlib.pyplot as plt 
from mpl_toolkits.axes_grid1.parasite_axes import host_subplot
from mpl_toolkits.axisartist.axislines import Axes
from scipy.sparse.csr import csr_matrix

from sklearn import datasets
from sklearn.utils import shuffle  # shuffle arrays or sparse matrices in a consistent way
from sklearn.metrics import mean_squared_error  #
from sklearn.svm.classes import NuSVR
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import hamming_loss


    def generate_data(case, sparse=False):
        """Load a dataset, shuffle it and split it 80/20 into train and test sets.

        Parameters
        ----------
        case : str
            ``'regression'`` loads ``datasets.load_boston()``;
            ``'classification'`` loads
            ``datasets.fetch_20newsgroups_vectorized(subset='all')``.
        sparse : bool, default=False
            When true the feature matrices are wrapped with ``csr_matrix``,
            otherwise with ``np.array``.

        Returns
        -------
        dict
            Mapping with the keys ``'x_train'``, ``'x_test'``, ``'y_train'``
            and ``'y_test'``.

        Raises
        ------
        ValueError
            If ``case`` is neither ``'regression'`` nor ``'classification'``.
        """
        if case == 'regression':
            bunch = datasets.load_boston()
        elif case == 'classification':
            bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
        else:
            # Fail early with a clear message instead of an AttributeError on None.
            raise ValueError(
                "case must be 'regression' or 'classification', got %r" % (case,))

        x, y = shuffle(bunch.data, bunch.target)

        # The first 80% of the shuffled samples are used for training,
        # the remaining 20% for testing.
        offset = int(x.shape[0] * 0.8)
        x_train, y_train = x[:offset], y[:offset]
        x_test, y_test = x[offset:], y[offset:]

        if sparse:
            x_train = csr_matrix(x_train)
            x_test = csr_matrix(x_test)
        else:
            x_train = np.array(x_train)
            x_test = np.array(x_test)
        y_test = np.array(y_test)
        y_train = np.array(y_train)

        data = {'x_train': x_train, 'x_test': x_test,
                'y_train': y_train, 'y_test': y_test}
        return data


    def benchmark_influence(conf):
        """Benchmark how one hyper-parameter affects complexity, error and latency.

        For every value in ``conf['changing_param_values']`` an estimator is
        built from ``conf['tuned_params']`` (with ``conf['changing_param']``
        set to that value), fitted on the training data, passed to
        ``conf['postfit_hook']`` and then timed over ``conf['n_samples']``
        calls to ``predict`` on the test data.

        Note that ``conf['tuned_params']`` is updated in place on each
        iteration.

        Parameters
        ----------
        conf : dict
            Benchmark configuration. Keys read here: ``'changing_param'``,
            ``'changing_param_values'``, ``'tuned_params'``, ``'estimator'``,
            ``'data'`` (with ``'x_train'``, ``'y_train'``, ``'x_test'``,
            ``'y_test'``), ``'postfit_hook'``, ``'complexity_computer'``,
            ``'prediction_performance_computer'``,
            ``'prediction_performance_label'`` and ``'n_samples'``.

        Returns
        -------
        tuple of (list, list, list)
            ``(prediction_powers, prediction_times, complexities)``, each with
            one entry per value of the changing parameter.
        """
        prediction_times = []
        prediction_powers = []
        complexities = []
        for param_value in conf['changing_param_values']:
            conf['tuned_params'][conf['changing_param']] = param_value
            estimator = conf['estimator'](**conf['tuned_params'])
            print("Benchmarking %s" % estimator)
            estimator.fit(conf['data']['x_train'], conf['data']['y_train'])
            conf['postfit_hook'](estimator)
            complexity = conf['complexity_computer'](estimator)
            complexities.append(complexity)

            # Time the prediction step for *this* estimator; everything below
            # must stay inside the parameter loop so each value gets its own
            # latency and score.
            start_time = time.perf_counter()
            for _ in range(conf['n_samples']):
                y_pred = estimator.predict(conf['data']['x_test'])
            # Average latency of a single predict() call.
            elapsed_time = (time.perf_counter() - start_time) / float(conf['n_samples'])
            prediction_times.append(elapsed_time)

            pred_score = conf['prediction_performance_computer'](
                conf['data']['y_test'], y_pred)
            prediction_powers.append(pred_score)
            print("Complexity:%d | %s:%.4f | Pred. Time:%fs\n" % (
                complexity, conf['prediction_performance_label'],
                pred_score, elapsed_time))
        return prediction_powers, prediction_times, complexities
    
    def plot_influence(conf, mse_values, prediction_times, complexities):
        """Plot prediction error and prediction latency against model complexity.

        The error curve is drawn on the left y-axis and the latency curve on a
        twin right y-axis; both share the complexity x-axis. The figure is
        shown with ``plt.show()``.

        Parameters
        ----------
        conf : dict
            Configuration; the keys ``'complexity_label'``,
            ``'prediction_performance_label'`` and ``'estimator'`` are read.
        mse_values : sequence
            Values plotted on the left axis (labelled "prediction error").
        prediction_times : sequence
            Values plotted on the right axis (labelled "latency").
        complexities : sequence
            X-axis values shared by both curves.
        """
        plt.figure(figsize=(12, 6))
        host = host_subplot(111, axes_class=Axes)
        plt.subplots_adjust(right=0.75)
        # Second y-axis sharing the same x-axis, used for the latency curve.
        par1 = host.twinx()

        host.set_xlabel('Model Complexity(%s)' % conf['complexity_label'])
        y1_label = conf['prediction_performance_label']
        y2_label = "Time(s)"
        host.set_ylabel(y1_label)
        par1.set_ylabel(y2_label)

        p1, = host.plot(complexities, mse_values, 'b-', label="prediction error")
        p2, = par1.plot(complexities, prediction_times, 'r-', label="latency")
        host.legend(loc='upper right')

        # Colour each axis label like the curve it belongs to.
        host.axis["left"].label.set_color(p1.get_color())
        host.axis["right"].label.set_color(p2.get_color())

        # conf['estimator'] is the estimator class, so __name__ is its class name.
        plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
        plt.show()
        
    def _count_nonzero_coefficients(estimator):
        """Return the number of non-zero entries in ``estimator.coef_``.

        ``estimator.coef_`` must provide ``toarray()``; the result of that call
        is what gets counted.
        """
        return np.count_nonzero(estimator.coef_.toarray())
    
    # Load each dataset once; the two regression configurations share one split.
    regression_data = generate_data('regression')
    classification_data = generate_data('classification', sparse=True)

    # One entry per estimator to benchmark. Each entry names the parameter that
    # is varied ('changing_param'), the values it takes, how model complexity
    # and prediction performance are measured, and how many predict() calls
    # are averaged for the latency figure ('n_samples').
    configurations = [
        {'estimator': SGDClassifier,
         'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001,
                          'loss': 'modified_huber', 'fit_intercept': True},
         'changing_param': 'l1_ratio',
         'changing_param_values': [0.25, 0.5, 0.75, 0.9],
         'complexity_label': 'non_zero coefficients',
         'complexity_computer': _count_nonzero_coefficients,
         'prediction_performance_computer': hamming_loss,
         'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
         # sparsify() converts coef_ to a sparse matrix, which
         # _count_nonzero_coefficients relies on (it calls coef_.toarray()).
         'postfit_hook': lambda x: x.sparsify(),
         'data': classification_data,
         'n_samples': 30},
        {'estimator': NuSVR,
         # NuSVR's penalty parameter is the upper-case keyword ``C``.
         'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
         'changing_param': 'nu',
         'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
         'complexity_label': 'n_support_vectors',
         'complexity_computer': lambda x: len(x.support_vectors_),
         'data': regression_data,
         'postfit_hook': lambda x: x,
         'prediction_performance_computer': mean_squared_error,
         'prediction_performance_label': 'MSE',
         'n_samples': 30},
        {'estimator': GradientBoostingRegressor,
         'tuned_params': {'loss': 'ls'},
         'changing_param': 'n_estimators',
         'changing_param_values': [10, 50, 100, 200, 500],
         'complexity_label': 'n_trees',
         # Complexity is the configured number of boosting stages.
         'complexity_computer': lambda x: x.n_estimators,
         'data': regression_data,
         'postfit_hook': lambda x: x,
         'prediction_performance_computer': mean_squared_error,
         'prediction_performance_label': 'MSE',
         'n_samples': 30},
    ]

    # Benchmark every configuration and plot its complexity/error/latency curves.
    for conf in configurations:
        prediction_performances, prediction_times, complexities = benchmark_influence(conf)
        plot_influence(conf, prediction_performances, prediction_times, complexities)
        

Automatically created module for IPython interactive environment
