# Model exploration

## Todo

- add more metrics
    - mutual info score
- multivariate output
- table of results
- Rhys: Compare the functional form of empirical models to that of LSMs, see where they differ
    - multivariate functional form
- 


In [None]:
import numpy as np
import pylab as pl
import xray
import pandas as pd
import tables
import os, sys

from numbers import Number
from collections import OrderedDict

In [None]:
import pals_utils as pu
from pals_utils.stats import metrics

In [None]:
# Keep DataFrame reprs short in notebook output: show at most 8 rows.
pd.set_option('display.max_rows', 8)

In [None]:
# IPython magic: load the pylab (numpy + matplotlib) names and draw figures inline.
%pylab inline
# Wide, short default figure size (width, height) for the plots in this notebook.
pl.rcParams['figure.figsize'] = (12.0, 3)
from IPython.display import display, HTML

In [None]:
#import mpld3
#mpld3.enable_notebook()

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split

In [None]:
from sklearn.linear_model import LinearRegression, Perceptron, SGDRegressor, LogisticRegression, PassiveAggressiveRegressor
from sklearn.svm import SVR, NuSVR  #, LinearSVR
# from sklearn.neural_network import MultilayerPerceptronRegressor # This is from a pull request: https://github.com/scikit-learn/scikit-learn/pull/3939
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Meteorological variables used as model inputs (the default X of test_pipeline).
met_vars = ['SWdown', 'Tair', 'LWdown', 'Wind', 'Rainf', 'PSurf', 'Qair']
met_data = xray.open_dataset('/home/naught101/phd/data/PALS/datasets/met/TumbaFluxnet.1.4_met.nc')
# Flatten to a DataFrame, move the x/y/z index levels into columns, then keep
# only the met variables. `.loc` replaces the deprecated `.ix` indexer; for a
# list of column labels the selection is the same.
met_df = met_data.to_dataframe().reset_index(['x', 'y', 'z']).loc[:, met_vars]

# Flux variables available as prediction targets (the default Y of test_pipeline).
flux_vars = ['Qh', 'Qle', 'Rnet', 'NEE']
flux_data = xray.open_dataset('/home/naught101/phd/data/PALS/datasets/flux/TumbaFluxnet.1.4_flux.nc')
# Same flattening as above; the flux file is only reset on x/y.
flux_df = flux_data.to_dataframe().reset_index(['x', 'y']).loc[:, flux_vars]


In [None]:
import time

def timeit(f):
    """Decorate ``f`` so that every call returns ``(result, elapsed_seconds)``.

    Parameters
    ----------
    f : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same call signature as ``f`` that returns a
        2-tuple: the original return value and the elapsed time of the call
        in seconds (float).
    """
    # Imported here so this cell only relies on the `import time` next to it.
    from functools import wraps

    @wraps(f)  # keep f's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        # perf_counter is monotonic, so the difference cannot go negative if
        # the system clock is adjusted during the call.
        start = time.perf_counter()
        result = f(*args, **kw)
        elapsed = time.perf_counter() - start
        return (result, elapsed)
    return timed

In [None]:
@timeit
def fit_pipeline(pipe, X, Y):
    """Fit ``pipe`` on ``X``/``Y`` in place.

    Because of ``@timeit`` the caller receives ``(None, elapsed_seconds)``;
    the fitted state lives on ``pipe`` itself.
    """
    pipe.fit(X, Y)
    return None
    
    
@timeit
def get_pipeline_prediction(pipe, X):
    """Predict with ``pipe`` on ``X``.

    Because of ``@timeit`` the caller receives
    ``(predictions, elapsed_seconds)``.
    """
    predictions = pipe.predict(X)
    return predictions


def get_pipeline_name(pipe, suffix=None):
    """Build a display/cache name for a pipeline from its step names.

    Parameters
    ----------
    pipe : object with a ``steps`` attribute
        ``steps`` is read as a sequence of ``(name, estimator)`` pairs, as on
        a scikit-learn ``Pipeline``.
    suffix : str, optional
        Extra label appended after the step names, used to tell apart
        pipelines that have the same steps but different parameters.

    Returns
    -------
    str
        The step names (and the suffix, if given) joined with ``', '``.
    """
    # Read the names from the ordered `steps` list rather than the
    # `named_steps` mapping: the result is used as a cache key, so it must
    # follow pipeline order and be identical from one session to the next.
    step_names = [step_name for step_name, _ in pipe.steps]
    if suffix is not None:
        step_names.append(suffix)
    return ', '.join(step_names)

In [None]:
def plot_tests_data(Y_pred, Y_validate, y_var):
    """Show three diagnostic plots comparing observations and predictions.

    Parameters
    ----------
    Y_pred : 1-D array-like
        Model predictions.
    Y_validate : 1-D array-like
        Observed values, same length as ``Y_pred``.
    y_var : str
        Variable name; used to label the two series ``<y_var>_obs`` and
        ``<y_var>_pred``.

    The row arithmetic below uses 48 rows per day.
    NOTE(review): that presumes half-hourly data - confirm for other datasets.
    """
    plot_data = pd.DataFrame({y_var+'_obs': Y_validate, y_var+'_pred': Y_pred})

    def _show(frame):
        # One figure per call, always labelled with the obs/pred column names.
        pl.plot(frame)
        pl.legend(plot_data.columns)
        pl.show()

    # Raw values for seven days' worth of rows (days 70 to 76 of the series).
    _show(plot_data[(70*48):(77*48)])

    # Fortnightly rolling mean. `pd.rolling_mean` no longer exists in current
    # pandas; use the `.rolling()` method when available and fall back to the
    # old function on pandas versions that predate it.
    window = 14*48
    if hasattr(plot_data, 'rolling'):
        smoothed = plot_data.rolling(window=window).mean()
    else:
        smoothed = pd.rolling_mean(plot_data, window=window)
    _show(smoothed)

    # Mean daily cycle: average all rows that share the same position within
    # a 48-row day.
    _show(plot_data.groupby(np.mod(plot_data.index, 48)).mean())

In [None]:
def run_metrics(Y_pred, Y_validate, metrics):
    """Evaluate every metric on a prediction/observation pair.

    ``metrics`` is a mapping of label -> callable; each callable is invoked as
    ``metric(Y_pred, Y_validate)``. The results are returned in an
    ``OrderedDict`` keyed by label, in the iteration order of ``metrics``.
    """
    return OrderedDict(
        (label, metric_fn(Y_pred, Y_validate))
        for (label, metric_fn) in metrics.items()
    )

In [None]:
# Make sure the cache directory exists. exist_ok=True makes this a no-op when
# the directory is already there, without a separate existence check that
# could race with another process creating it.
os.makedirs('cache', exist_ok=True)
# On-disk store for per-model predictions and the metrics table.
cache = pd.HDFStore('cache/cache.hdf5')


In [None]:
def test_pipeline(pipe, X=met_df, Y=flux_df, y_var='Qh', name=None, plot=False, cache=cache, clear_cache=False):
    """Top-level pipeline fitter and tester.

    Fits ``pipe`` on the first 70% of the rows, predicts the remaining rows,
    runs the metrics, stores predictions and metrics in ``cache`` and
    optionally prints the metrics and shows diagnostic plots.

    Parameters
    ----------
    pipe : estimator/pipeline with ``fit`` and ``predict``
    X : DataFrame or array-like of predictors (default: module-level ``met_df``)
    Y : DataFrame of targets (default: module-level ``flux_df``)
    y_var : str
        Column of ``Y`` to predict.
    name : str, optional
        Row label / cache key for this run; defaults to
        ``get_pipeline_name(pipe)``.
    plot : bool
        Print this run's metrics and call ``plot_tests_data``.
    cache : HDFStore-like
        Store holding ``metric_data`` and ``predictions/<name>``.
    clear_cache : bool
        Ignore cached metrics and predictions. The metrics table is then
        rebuilt from empty and overwrites the stored one.

    Returns
    -------
    None

    Notes
    -----
    The defaults for ``X``, ``Y`` and ``cache`` are bound when this cell is
    executed; re-run the cell after reloading the data or reopening the cache.

    NOTE(review): cache entries are keyed by ``name`` only, so a run with a
    different ``X`` or ``y_var`` but the same name reuses the earlier results.
    """

    if name is None:
        name = get_pipeline_name(pipe)

    if 'metric_data' in cache and not clear_cache:
        if name in cache.metric_data.index:
            print("Metrics already calculated for %s, skipping." % name)
            return
        metric_data = cache.metric_data
    else:
        metric_data = pd.DataFrame()

    Y = np.array(Y[y_var])

    # Ordered split (no shuffling): first 70% of the rows train, the rest validate.
    train_len = (7*len(X)//10)

    # X_train, X_validate, Y_train, Y_validate = train_test_split(X, Y, train_size=0.7, random_state=0)
    X_train = X[:train_len]
    X_validate = X[train_len:]
    Y_train = Y[:train_len]
    Y_validate = Y[train_len:]

    prediction_key = 'predictions/' + name
    if prediction_key in cache and not clear_cache:
        print('prediction already run for %s, skipping fit and predict' % name)
        # Read back from the same key used by cache.put() below. Predictions
        # are stored as a single-column DataFrame, so take that column to get
        # a 1-D array again.
        Y_pred = np.array(cache[prediction_key])[:, 0]
    else:
        # Fit model (fit_pipeline returns (None, elapsed_seconds))
        metric_data.loc[name, 't_fit'] = fit_pipeline(pipe, X_train, Y_train)[1]

        # Run model
        (Y_pred, t_pred) = get_pipeline_prediction(pipe, X_validate)
        metric_data.loc[name, 't_pred'] = t_pred
        # Some sklearn models return vector (n,) inputs as 2D arrays (n,1)
        if len(Y_pred.shape) > 1:
            Y_pred = Y_pred[:, 0]

        cache.put(prediction_key, pd.DataFrame(Y_pred))

    for k, v in run_metrics(Y_pred, Y_validate, metrics).items():
        metric_data.loc[name, k] = v
    cache['metric_data'] = metric_data
    cache.flush()

    # Plotting
    if plot:
        # Print only this run's row of the metrics table, one metric per line.
        for (k, v) in metric_data.loc[name].items():
            print('{:>10}:'.format(k), '{:.3f}'.format(v) if isinstance(v, Number) else v)

        plot_tests_data(Y_pred, Y_validate, y_var)
    

## Linear regression

- insensitive to scaling or PCA

In [None]:
# Baseline model. clear_cache=True makes test_pipeline ignore cached results
# and start from an empty metrics table, which then replaces the stored one.
pipe = make_pipeline(LinearRegression())
test_pipeline(pipe, clear_cache=True)

In [None]:
#pipe = make_pipeline(StandardScaler(), LinearRegression())
#test_pipeline(pipe)

In [None]:
#pipe = make_pipeline(PCA(), LinearRegression())
#test_pipeline(pipe)

In [None]:
#pipe = make_pipeline(StandardScaler(), PCA(), LinearRegression())
#test_pipeline(pipe)

## Polynomial regression

- Only a slight improvement
    - because non-linearities are localised?

In [None]:
# Degree-2 polynomial features. Both polynomial pipelines have the same step
# names, so a suffix is added to give each its own cache entry.
pipe = make_pipeline(PolynomialFeatures(2), LinearRegression())
test_pipeline(pipe, name=get_pipeline_name(pipe, 'poly2'))

In [None]:
# Degree-5 polynomial features.
pipe = make_pipeline(PolynomialFeatures(5), LinearRegression())
test_pipeline(pipe, name=get_pipeline_name(pipe, 'poly5'))

In [None]:
# Add the row-to-row difference of every met variable as extra predictors.
# dropna() removes rows containing NaN, including the first row left by diff().
met_df_with_lag = pd.concat([met_df, met_df.diff()], axis=1).dropna()
met_df_with_lag.shape

In [None]:
# Rank of the first 40000 rows of the augmented predictor matrix.
np.linalg.matrix_rank(np.array(met_df_with_lag[:40000]))

In [None]:
flux_df.shape

In [None]:
flux_df[1:40001].shape

In [None]:
# Y starts one row later than X.
# NOTE(review): presumably this realigns Y with X after dropna() removed the
# first row - confirm no other rows were dropped.
pipe = make_pipeline(LinearRegression())
test_pipeline(pipe, X=met_df_with_lag[:40000], Y=flux_df[1:40001], name=get_pipeline_name(pipe, 'lag1'))

## SGD

- very sensitive to scaling. Not sensitive to PCA

In [None]:
#pipe = make_pipeline(SGDRegressor())
#test_pipeline(pipe)

In [None]:
# SGD on standardised inputs (the unscaled variants are commented out nearby).
pipe = make_pipeline(StandardScaler(), SGDRegressor())
test_pipeline(pipe)

In [None]:
#pipe = make_pipeline(PCA(), SGDRegressor())
#test_pipeline(pipe)

In [None]:
#pipe = make_pipeline(StandardScaler(), PCA(), SGDRegressor())
#test_pipeline(pipe)

In [None]:
#test_model("LogisticRegression", LogisticRegression())

In [None]:
#test_model("PassiveAggressiveRegressor", PassiveAggressiveRegressor())

## Support Vector Machines

- Sensitive to scaling, not to PCA

In [None]:
#pipe = make_pipeline(SVR())
#test_pipeline(pipe)

In [None]:
# SVR with its default kernel on standardised inputs.
pipe = make_pipeline(StandardScaler(), SVR())
test_pipeline(pipe)

In [None]:
#pipe = make_pipeline(StandardScaler(), PCA(), SVR())
#test_pipeline(pipe)

In [None]:
# SVR with a polynomial kernel; the suffix keeps its cache entry separate from
# the default-kernel SVR pipeline, which has the same step names.
pipe = make_pipeline(StandardScaler(), SVR(kernel='poly'))
#
test_pipeline(pipe, name=get_pipeline_name(pipe, 'polykernel'))

## Multilayer Perceptron

In [None]:
pipe = make_pipeline(MultilayerPerceptronRegressor())
test_pipeline(pipe)

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor())
test_pipeline(pipe)  

In [None]:
pipe = make_pipeline(PCA(), MultilayerPerceptronRegressor())
test_pipeline(pipe)  

In [None]:
pipe = make_pipeline(StandardScaler(), PCA(), MultilayerPerceptronRegressor())
test_pipeline(pipe)           

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(activation='logistic'))
test_pipeline(pipe, get_pipeline_name(pipe, 'logisitic'))

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(20,20,20,)))
test_pipeline(pipe, get_pipeline_name(pipe, "[20,20,20]"))

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(10,10,)))
test_pipeline(pipe, get_pipeline_name(pipe, "[10,10]"))

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(10,30,)))
test_pipeline(pipe, get_pipeline_name(pipe, "[10,30]"))

In [None]:
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(20,20,)))
test_pipeline(pipe, get_pipeline_name(pipe, "[20,20]"))

## K-nearest neighbours 

- Not sensitive to scaling or PCA

In [None]:
# K-nearest neighbours with default settings: raw, standardised and PCA inputs.
pipe = make_pipeline(KNeighborsRegressor())
test_pipeline(pipe)

In [None]:
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
test_pipeline(pipe)

In [None]:
pipe = make_pipeline(PCA(), KNeighborsRegressor())
test_pipeline(pipe)

In [None]:
# Same step names as the scaled default KNN pipeline, so a distinct name is
# required. It must be passed as `name=`: the second positional parameter of
# test_pipeline is X, not name.
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=1000))
test_pipeline(pipe, name=get_pipeline_name(pipe, "1000 neighbours"))

## Decision Trees

In [None]:
# Single decision tree on the raw inputs.
pipe = make_pipeline(DecisionTreeRegressor())
test_pipeline(pipe)

In [None]:
# Extra-trees ensemble on the raw inputs.
pipe = make_pipeline(ExtraTreesRegressor())
test_pipeline(pipe)

In [None]:
# Extra-trees ensemble on standardised, PCA-rotated inputs.
pipe = make_pipeline(StandardScaler(), PCA(), ExtraTreesRegressor())
test_pipeline(pipe)

# Metric results

In [None]:
# Metrics table written by test_pipeline (one row per model name).
cache.metric_data

In [None]:
# Min-max scale every metric column across models: subtract the column
# minimum, then divide by the resulting column maximum. A column that is
# constant across models becomes NaN (0/0).
normed_metrics = cache.metric_data - cache.metric_data.min()
normed_metrics /= normed_metrics.max()

In [None]:
normed_metrics.columns

In [None]:
# Bar charts of the scaled metrics, one group of bars per model.
normed_metrics[['corr', 'nme', 'mbe', 'sd_diff']].plot(kind='bar')

In [None]:
normed_metrics[['extreme_5','extreme_95']].plot(kind='bar')

In [None]:
# Scratch cells from here on: object inspection and hashing experiments.
cache

In [None]:
?cache

In [None]:
pipe

In [None]:
?pd.DataFrame.values

In [None]:
?base_repr(

In [None]:
np.unsignedinteger(1)

In [None]:
hash(KNeighborsRegressor(n_neighbors=1000))


In [None]:
hash(object())

In [None]:
import hashlib

In [None]:
# NOTE(review): `joblib` is only imported in a later cell; run that cell first.
joblib.hash(str(pipe.get_params()))

In [None]:
# NOTE(review): requires the current `pipe` to contain a 'linearregression'
# step, and score() is called without arguments here - confirm intent.
l=pipe.named_steps['linearregression']
l.score()

In [None]:
import joblib

In [None]:
a = np.array([321,12.3,1,1.4,1])

In [None]:
# NOTE(review): numpy arrays are expected to be unhashable, so this likely
# raises TypeError - confirm.
hash(a)