In [None]:
import numpy as np
import pylab as pl
import xray
import pandas as pd

In [None]:
# Keep DataFrame reprs short: show at most 8 rows before truncating.
pd.set_option('display.max_rows', 8)

In [None]:
%pylab inline
rcParams['figure.figsize'] = (14.0, 3.0)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

In [None]:
from sklearn.linear_model import LinearRegression, Perceptron, SGDRegressor, LogisticRegression, PassiveAggressiveRegressor
from sklearn.svm import SVR, NuSVR  #, LinearSVR
from sklearn.neural_network import MultilayerPerceptronRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Columns kept from the met file; `met_df` is used as the predictor matrix
# by test_pipeline.
# NOTE(review): both paths below are absolute and machine-specific.
met_vars = ['SWdown', 'Tair', 'LWdown', 'Wind', 'Rainf', 'PSurf', 'Qair']
met_data = xray.open_dataset('/home/naught101/phd/data/PALS/datasets/met/TumbaFluxnet.1.4_met.nc')
# Move the x/y/z index levels into columns, then select the met columns by
# label. `.loc` replaces the deprecated `.ix` indexer.
met_df = met_data.to_dataframe().reset_index(['x','y','z']).loc[:, met_vars]

# Columns kept from the flux file; test_pipeline picks its target from these.
flux_vars = ['Qh', 'Qle', 'Rnet', 'NEE']
flux_data = xray.open_dataset('/home/naught101/phd/data/PALS/datasets/flux/TumbaFluxnet.1.4_flux.nc')
flux_df = flux_data.to_dataframe().reset_index(['x','y']).loc[:, flux_vars]


In [None]:
# Peek at the first two rows of the flux table.
flux_df.head(2)

In [None]:
# Peek at the first two rows of the met table.
met_df.head(2)

In [None]:
import time

def timeit(f):
    """Decorator that prints how long each successful call to ``f`` took.

    The wrapped function's return value is passed through unchanged. The
    timing line is printed as ``<f.__name__> took: <seconds> sec`` after the
    call returns; if ``f`` raises, the exception propagates and nothing is
    printed.

    Parameters
    ----------
    f : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same call signature and metadata as ``f``.
    """
    # Local import so this definition does not depend on another cell.
    from functools import wraps

    @wraps(f)  # keep f's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        # perf_counter is monotonic, so unlike time.time() it is not affected
        # by system clock adjustments.
        ts = time.perf_counter()
        result = f(*args, **kw)
        te = time.perf_counter()
        print(f.__name__, 'took: {:2.4f} sec'.format(te-ts))
        return result
    return timed

In [None]:
@timeit
def fit_pipeline(pipe, X, Y):
    """Fit ``pipe`` in place on inputs ``X`` and targets ``Y``.

    The ``timeit`` decorator prints the elapsed time of the fit. The value
    returned by ``pipe.fit`` is discarded, so this function returns ``None``.
    """
    pipe.fit(X, Y)
    return None
    
    
@timeit
def get_pipeline_prediction(pipe, X):
    """Return ``pipe.predict(X)``; ``timeit`` prints how long it took."""
    predictions = pipe.predict(X)
    return predictions

In [None]:
def test_pipeline(name, pipe, y_var='Qh'):
    """Fit ``pipe`` on the first 70% of rows and evaluate it on the rest.

    Predictors are the module-level ``met_df``; the target is the column
    ``y_var`` of the module-level ``flux_df``. The split is by row order,
    with no shuffling. The function prints the pipeline steps, the fit and
    predict timings (via ``fit_pipeline`` / ``get_pipeline_prediction``), the
    prediction shape and the validation RMSE. It then plots observed against
    predicted values for validation rows 1..349.

    Parameters
    ----------
    name : str
        Label printed at the top of the report.
    pipe : sklearn Pipeline
        Unfitted pipeline; it is fitted in place.
    y_var : str, optional
        Name of the ``flux_df`` column to predict. Defaults to ``'Qh'``.

    Returns
    -------
    None
    """
    X = met_df
    Y = np.array(flux_df[y_var])

    # Integer arithmetic keeps the split point exact.
    train_len = (7 * len(X)) // 10

    X_train, X_validate = X[:train_len], X[train_len:]
    Y_train, Y_validate = Y[:train_len], Y[train_len:]

    print(name)
    for step_name, step in pipe.steps:
        print('\t', step_name, ': ', step)
    print('---')
    fit_pipeline(pipe, X_train, Y_train)
    Y_pred = get_pipeline_prediction(pipe, X_validate)
    print('---')
    print(Y_pred.shape)
    if Y_pred.ndim > 1:
        # Some estimators return a 2-D array; keep only the first column.
        Y_pred = Y_pred[:, 0]

    # Use the explicit numpy namespace rather than the bare sqrt/mean names
    # that only exist after `%pylab` has run.
    rmse = np.sqrt(np.mean((Y_pred - Y_validate) ** 2))
    print('RMSE: {:.2f}'.format(rmse))

    # Plot only a short window so individual time steps remain visible.
    plot_data = pd.DataFrame({y_var + '_obs': Y_validate[1:350],
                              y_var + '_pred': Y_pred[1:350]})
    pl.plot(plot_data)
    pl.legend(plot_data.columns)
    pl.show()
    

## Linear regression

In [None]:
# Baseline: LinearRegression on the raw inputs, no preprocessing step.
test_pipeline(
    name="LinearRegression",
    pipe=make_pipeline(LinearRegression()),
)

In [None]:
# LinearRegression on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)
test_pipeline(name="LinearRegression", pipe=pipe)

In [None]:
# LinearRegression on PCA-transformed inputs.
pipe = make_pipeline(
    PCA(),
    LinearRegression(),
)
test_pipeline(name="LinearRegression", pipe=pipe)

In [None]:
# LinearRegression on inputs that are standardised and then PCA-transformed.
pipe = make_pipeline(
    StandardScaler(),
    PCA(),
    LinearRegression(),
)
test_pipeline(name="LinearRegression", pipe=pipe)

## Stochastic gradient descent (SGD)

In [None]:
# SGDRegressor on the raw inputs, no preprocessing step.
pipe = make_pipeline(
    SGDRegressor(),
)
test_pipeline(name="SGDRegressor", pipe=pipe)

In [None]:
# SGDRegressor on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    SGDRegressor(),
)
test_pipeline(name="SGDRegressor", pipe=pipe)

In [None]:
# SGDRegressor on PCA-transformed inputs.
pipe = make_pipeline(
    PCA(),
    SGDRegressor(),
)
test_pipeline(name="SGDRegressor", pipe=pipe)

In [None]:
# SGDRegressor on inputs that are standardised and then PCA-transformed.
pipe = make_pipeline(
    StandardScaler(),
    PCA(),
    SGDRegressor(),
)
test_pipeline(name="SGDRegressor", pipe=pipe)

In [None]:
# NOTE(review): disabled experiment. `test_model` is not defined in the visible
# part of this notebook (the helper defined here is `test_pipeline(name, pipe)`),
# so uncommenting this line as-is would presumably raise a NameError.
#test_model("LogisticRegression", LogisticRegression())

In [None]:
# NOTE(review): disabled experiment. `test_model` is not defined in the visible
# part of this notebook (the helper defined here is `test_pipeline(name, pipe)`),
# so uncommenting this line as-is would presumably raise a NameError.
#test_model("PassiveAggressiveRegressor", PassiveAggressiveRegressor())

## Support Vector Machines

In [None]:
# SVR() is constructed with its default kernel, which in scikit-learn is the
# RBF kernel, not a linear one. The label now says so.
pipe = make_pipeline(StandardScaler(), SVR())
test_pipeline("SVR - RBF kernel (default)", pipe)

In [None]:
# Same default-kernel SVR, with a PCA step after standardisation. The default
# kernel in scikit-learn is RBF, so the label says RBF rather than "linear".
pipe = make_pipeline(StandardScaler(), PCA(), SVR())
test_pipeline("SVR - RBF kernel (default)", pipe)

In [None]:
# SVR with a polynomial kernel on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    SVR(kernel='poly'),
)
test_pipeline(name="SVR - poly", pipe=pipe)

## Multilayer Perceptron

In [None]:
# Default multilayer perceptron on the raw inputs, no preprocessing step.
pipe = make_pipeline(
    MultilayerPerceptronRegressor(),
)
test_pipeline(name="MultilayerPerceptronRegressor - default", pipe=pipe)

In [None]:
# Default multilayer perceptron on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    MultilayerPerceptronRegressor(),
)
test_pipeline(name="MultilayerPerceptronRegressor - default", pipe=pipe)

In [None]:
# Default multilayer perceptron on PCA-transformed inputs.
pipe = make_pipeline(
    PCA(),
    MultilayerPerceptronRegressor(),
)
test_pipeline(name="MultilayerPerceptronRegressor - default", pipe=pipe)

In [None]:
# Default multilayer perceptron on standardised, then PCA-transformed inputs.
pipe = make_pipeline(
    StandardScaler(),
    PCA(),
    MultilayerPerceptronRegressor(),
)
test_pipeline(name="MultilayerPerceptronRegressor - default", pipe=pipe)

In [None]:
# Multilayer perceptron with the logistic activation, on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    MultilayerPerceptronRegressor(activation='logistic'),
)
test_pipeline(name="MultilayerPerceptronRegressor- logistic", pipe=pipe)

In [None]:
# Three hidden layers of 20 units each, on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    MultilayerPerceptronRegressor(hidden_layer_sizes=(20, 20, 20)),
)
test_pipeline(name="MultilayerPerceptronRegressor - 3 hidden layer", pipe=pipe)

In [None]:
# Two hidden layers of 10 units each, on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    MultilayerPerceptronRegressor(hidden_layer_sizes=(10, 10)),
)
test_pipeline(name="MultilayerPerceptronRegressor - 2 small hidden layer", pipe=pipe)

In [None]:
# Two hidden layers of 10 and 30 units, on standardised inputs. The label
# carries the layer sizes so this run can be told apart from the other
# two-layer runs in the printed output.
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(10,30,)))
test_pipeline("MultilayerPerceptronRegressor - 2 hidden layers (10, 30)", pipe)

In [None]:
# Two hidden layers of 20 units each, on standardised inputs. The label
# carries the layer sizes so this run can be told apart from the other
# two-layer runs in the printed output.
pipe = make_pipeline(StandardScaler(), MultilayerPerceptronRegressor(hidden_layer_sizes=(20,20,)))
test_pipeline("MultilayerPerceptronRegressor - 2 hidden layers (20, 20)", pipe)

## K-nearest neighbours 

In [None]:
# K-nearest-neighbours regression with default settings, on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(),
)
test_pipeline(name="KNeighborsRegressor", pipe=pipe)

In [None]:
# K-nearest-neighbours regression over 1000 neighbours, on standardised inputs.
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=1000),
)
test_pipeline(name="KNeighborsRegressor - 1000 neighbours", pipe=pipe)

## Decision Trees

In [None]:
# Single decision tree on the raw inputs, no preprocessing step.
pipe = make_pipeline(
    DecisionTreeRegressor(),
)
test_pipeline(name="DecisionTreeRegressor", pipe=pipe)

In [None]:
# Extra-trees ensemble on the raw inputs, no preprocessing step.
pipe = make_pipeline(
    ExtraTreesRegressor(),
)
test_pipeline(name="ExtraTreesRegressor", pipe=pipe)

In [None]:
import math
import numpy
import scipy
from scipy.stats import gaussian_kde
from scipy.integrate import dblquad

def mutual_info(x,y):
    """Estimate the mutual information of two samples using Gaussian KDEs.

    The marginal densities of ``x`` and ``y`` and their joint density are each
    estimated with ``scipy.stats.gaussian_kde``. The quantity
    ``p(x, y) * log(p(x, y) / (p(x) * p(y)))`` is then integrated over the
    whole plane with ``scipy.integrate.dblquad``.

    Parameters
    ----------
    x, y : sequence of float
        Two collections of floating point values with the same length.

    Returns
    -------
    float
        The estimated mutual information (natural-log units). The value is
        also printed as ``minfo_xy =  <value>``.

    Notes
    -----
    Every integrand evaluation calls all three KDEs, so this can be slow for
    long inputs.
    """
    # Constants
    MIN_DOUBLE = 4.9406564584124654e-324
                        # The smallest positive Float64; used as a floor so the
                        #  logarithm never sees 0 when a density underflows.
    INF = float('inf')  # The floating-point representation for "infinity"

    # Kernel estimation
    gkde_x = gaussian_kde(x)
    gkde_y = gaussian_kde(y)
    gkde_xy = gaussian_kde([x,y])

    def integrand(a, b):
        """Return p(a, b) * log(p(a, b) / (p_x(a) * p_y(b))) as a float."""
        joint = gkde_xy([a, b]).item()
        if joint <= 0.0:
            # The limit of p*log(p/q) as p -> 0 is 0. Evaluating it directly
            # would give 0 * log(0/0) = NaN far out in the tails.
            return 0.0
        marginals = max(gkde_x(a).item() * gkde_y(b).item(), MIN_DOUBLE)
        # A difference of logs avoids overflow in joint / marginals when the
        # product of the marginals is tiny.
        return joint * (math.log(joint) - math.log(marginals))

    # Compute MI(X,Y). dblquad calls integrand(inner, outer); both variables
    # run over (-inf, inf) so the whole plane is covered. Starting the inner
    # integral at 0 would drop the half-plane where the x-variable is negative.
    (minfo_xy, err_xy) = dblquad(integrand, -INF, INF, lambda a: -INF, lambda a: INF)

    print('minfo_xy = ', minfo_xy)
    return minfo_xy


In [None]:
# Mutual information between SWdown (met) and Qh (flux).
mutual_info(met_df['SWdown'], flux_df['Qh'])

In [None]:
# Pairwise Pearson correlation between the met columns.
met_df.corr(method='pearson')

In [None]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
import numpy as np

# Toy experiment on iris: does standardising the inputs change the RMSE of a
# linear regression? A seeded local generator makes the random corruption,
# and therefore both printed RMSEs, reproducible across runs.
rng = np.random.RandomState(0)

iris = datasets.load_iris()
# Multiply column 1 of the first 30 rows by uniform random factors in [0, 1).
iris.data[0:30, 1] = rng.rand(30) * iris.data[0:30, 1]
# Predict column 3 from columns 0-1; train on rows 0-99, test on rows 100-149.
X_train = iris.data[0:100, :2]
Y_train = iris.data[0:100, 3]
X_test = iris.data[100:150, :2]
Y_test = iris.data[100:150, 3]

scaler = StandardScaler()
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. Confirm it is still accepted by the installed version.
model = LinearRegression(normalize=True)

# Fit and score on the raw features.
model.fit(X_train, Y_train)
pred = model.predict(X_test)

print('RMSE    raw: ', (np.mean((pred-Y_test)**2))**0.5)

# Refit on standardised features; the scaler is fitted on the training rows only.
model.fit(scaler.fit_transform(X_train), Y_train)
pred = model.predict(scaler.transform(X_test))

print('RMSE scaled: ', (np.mean((pred-Y_test)**2))**0.5)


In [None]:
# Predictions against observations for the iris test rows. The call is
# qualified with `pl` (pylab, imported at the top) so it does not depend on
# the bare `plot` name that `%pylab` injects.
pl.plot(pred, Y_test)