**Objective**

* Learn how to structure the experiments
* Hyper-parameter tuning for the model

In [1]:
import pandas as pd
import numpy as np
import os, sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.externals import joblib

from scipy.optimize import nnls
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

# Root of the local project checkout; the data files, the submission output and
# the project's `src` directory are all addressed relative to this path.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Put the project's `src` directory on the import path so that the
# project-local `models` package imported below can be found.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(2)

from models import cross_validation, eval_metric

In [2]:
# load files: raw training table, sorted test table and the sample submission
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, relative_path))
    for relative_path in ('data/raw/training.csv',
                          'data/raw/sorted_test.csv',
                          'data/raw/sample_submission.csv')
)

In [3]:
# load a dataset

# dataset_1: MIR features (train part first, then the matching test part)
train_1, test_1 = (
    joblib.load(os.path.join(basepath, relative_path))
    for relative_path in ('data/processed/dataset_1/train',
                          'data/processed/dataset_1/test')
)

In [4]:
# define target variables

# One column of the raw training table per target, in the order Ca, P, Sand, SOC, pH.
y_Ca, y_P, y_Sand, y_SOC, y_pH = (
    train[target_name] for target_name in ('Ca', 'P', 'Sand', 'SOC', 'pH')
)

**Split the dataset into a training set and a test set.**

In [5]:
# Hold-out split settings, forwarded as keyword arguments to split_dataset.
params = dict(test_size=0.2, random_state=4)

# Row selectors for the training part and the hold-out part; they are used
# with `.iloc` further down.
itrain, itest = cross_validation.split_dataset(len(train_1), **params)

In [6]:
def get_Xs(X, itrain, itest):
    """
    Split a feature table into its training rows and its hold-out rows.

    X      : object supporting positional `.iloc` indexing
    itrain : positional selector for the training rows
    itest  : positional selector for the hold-out rows

    Returns the pair (X_train, X_test).
    """
    return tuple(X.iloc[rows] for rows in (itrain, itest))
    
def get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest):
    """
    Split every target into its training rows and its hold-out rows.

    y_Ca, y_P, y_Sand, y_SOC, y_pH : objects supporting positional `.iloc` indexing
    itrain : positional selector for the training rows
    itest  : positional selector for the hold-out rows

    Returns a pair of lists (training parts, hold-out parts); both lists keep
    the fixed order Ca, P, Sand, SOC, pH.
    """
    ordered_targets = (y_Ca, y_P, y_Sand, y_SOC, y_pH)

    train_parts = [target.iloc[itrain] for target in ordered_targets]
    test_parts = [target.iloc[itest] for target in ordered_targets]

    return (train_parts, test_parts)

# Feature rows for the training part and the hold-out part.
X_train_1, X_test_1 = get_Xs(X=train_1, itrain=itrain, itest=itest)

# Targets split with the same selectors; both lists follow the order
# Ca, P, Sand, SOC, pH.
y_trains, y_tests = get_Ys(y_Ca=y_Ca, y_P=y_P, y_Sand=y_Sand, y_SOC=y_SOC, y_pH=y_pH,
                           itrain=itrain, itest=itest)

(y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH) = y_trains
(y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH) = y_tests

**List of models.**

In [7]:
def instantiate_models(targets, **params):
    """
    Build one unfitted PCA -> scaler -> SVR pipeline per target.

    targets : sized collection of target names; only its length is used, so
              every target receives an identically configured pipeline
    params  : must provide 'n_components' (for the PCA step) and 'kernel'
              (for the SVR step)

    Returns a list with len(targets) separate Pipeline objects.
    """
    def build_pipeline():
        # Fresh estimator objects on every call so pipelines never share steps.
        steps = [
            ('pca', RandomizedPCA(n_components=params['n_components'], whiten=True, random_state=11)),
            ('scaler', StandardScaler()),
            ('model', SVR(kernel=params['kernel'])),
        ]
        return Pipeline(steps)

    return [build_pipeline() for _ in range(len(targets))]

In [10]:
# rbf kernel

# Settings read by instantiate_models: PCA size and SVR kernel.
params = dict(n_components=100, kernel='rbf')

pipelines_rbf = instantiate_models(['Ca', 'P', 'Sand', 'SOC', 'pH'], **params)

In [11]:
# linear kernel

# Settings read by instantiate_models: PCA size and SVR kernel.
params = dict(n_components=100, kernel='linear')

pipelines_linear = instantiate_models(['Ca', 'P', 'Sand', 'SOC', 'pH'], **params)

In [12]:
# polynomial kernel

# Settings read by instantiate_models: PCA size and SVR kernel.
params = dict(n_components=100, kernel='poly')

pipelines_poly = instantiate_models(['Ca', 'P', 'Sand', 'SOC', 'pH'], **params)

In [19]:
print('============ Rbf Kernel')
print('Target: Ca\n')
# Cross-validate only the first rbf pipeline, against the Ca training target.
cv_score = cross_validation.cv_scheme(
    [pipelines_rbf[0]],
    [X_train_1],
    [y_train_Ca]
)
print('=============================')
print('Mean CV score: %f' % (cv_score,))

Target: Ca


MCRMSE score: 0.261146


MCRMSE score: 0.771985


MCRMSE score: 0.909863


MCRMSE score: 1.030925


MCRMSE score: 1.068632


MCRMSE score: 0.392604


MCRMSE score: 0.281089


MCRMSE score: 0.536600


MCRMSE score: 0.447675


MCRMSE score: 0.710902

Mean CV score: 0.641142


In [20]:
print('============ Linear Kernel')
print('Target: Ca\n')
# Cross-validate only the first linear pipeline, against the Ca training target.
cv_score = cross_validation.cv_scheme(
    [pipelines_linear[0]],
    [X_train_1],
    [y_train_Ca]
)
print('=============================')
print('Mean CV score: %f' % (cv_score,))

Target: Ca


MCRMSE score: 0.149296


MCRMSE score: 0.296696


MCRMSE score: 0.498995


MCRMSE score: 0.569838


MCRMSE score: 0.497455


MCRMSE score: 0.419168


MCRMSE score: 0.162333


MCRMSE score: 0.291380


MCRMSE score: 0.316343


MCRMSE score: 0.391761

Mean CV score: 0.359327


In [21]:
print('============ Poly Kernel')
print('Target: Ca\n')
# Cross-validate only the first poly pipeline, against the Ca training target.
cv_score = cross_validation.cv_scheme(
    [pipelines_poly[0]],
    [X_train_1],
    [y_train_Ca]
)
print('=============================')
print('Mean CV score: %f' % (cv_score,))

Target: Ca


MCRMSE score: 0.408076


MCRMSE score: 0.541938


MCRMSE score: 0.624262


MCRMSE score: 1.534261


MCRMSE score: 5.572011


MCRMSE score: 0.549818


MCRMSE score: 0.415179


MCRMSE score: 0.792534


MCRMSE score: 0.753022


MCRMSE score: 0.377009

Mean CV score: 1.156811


In [23]:
def train_model(pipeline, X, y):
    """
    Fit a single pipeline on its features and target.

    pipeline : object exposing `fit(X, y)`
    X, y     : features and target handed to `fit` unchanged

    Returns whatever `pipeline.fit(X, y)` returns.
    """
    fitted = pipeline.fit(X, y)
    return fitted

def train_pipelines(pipelines, Xs, ys):
    """
    Fit each pipeline on the features and target found at the same position.

    pipelines : list of pipelines; each slot is overwritten with the result
                of train_model, so the list is modified in place
    Xs, ys    : indexable collections; Xs[i] and ys[i] are used for pipelines[i]

    Returns the same `pipelines` list that was passed in.
    """
    for position, pipeline in enumerate(pipelines):
        pipelines[position] = train_model(pipeline, Xs[position], ys[position])

    return pipelines

def predictions(pipelines, Xtest):
    """
    Collect the predictions of every pipeline into one NumPy array.

    pipelines : sequence of objects exposing `predict`
    Xtest     : indexable collection; Xtest[i] is passed to pipelines[i]

    Returns np.array of the per-pipeline outputs, in pipeline order.
    """
    per_pipeline = []
    for position, pipeline in enumerate(pipelines):
        per_pipeline.append(pipeline.predict(Xtest[position]))

    return np.array(per_pipeline)

In [24]:
def correlate_predictions(y_1, y_2, labels):
    """
    Print, target by target, the Pearson correlation between two sets of
    predictions.

    y_1, y_2 : collections indexed by position; y_1[i] and y_2[i] hold the
               predictions belonging to labels[i]
    labels   : target names used in the printed lines

    Nothing is returned; one line per label is written to stdout.
    """
    for position, label in enumerate(labels):
        # pearsonr gives back the coefficient first; the rest is ignored here.
        coefficient = pearsonr(y_1[position], y_2[position])[0]
        print('For %s correlation coefficient is: %f'%(label, coefficient))

**Train SVR models with linear, rbf and poly kernels for each target variable on the dataset.**

In [25]:
# rbf kernel
# Every target is fitted on the same training features, so the feature list
# is five references to X_train_1.
pipelines_rbf_trained = train_pipelines(
    pipelines_rbf,
    [X_train_1] * 5,
    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH]
)

# Hold-out predictions, one entry per target in the order used above.
pipelines_rbf_predictions = predictions(pipelines_rbf_trained, [X_test_1] * 5)

In [26]:
# linear kernel
# Every target is fitted on the same training features, so the feature list
# is five references to X_train_1.
pipelines_linear_trained = train_pipelines(
    pipelines_linear,
    [X_train_1] * 5,
    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH]
)

# Hold-out predictions, one entry per target in the order used above.
pipelines_linear_predictions = predictions(pipelines_linear_trained, [X_test_1] * 5)

In [27]:
# poly kernel
# Every target is fitted on the same training features, so the feature list
# is five references to X_train_1.
pipelines_poly_trained = train_pipelines(
    pipelines_poly,
    [X_train_1] * 5,
    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH]
)

# Hold-out predictions, one entry per target in the order used above.
pipelines_poly_predictions = predictions(pipelines_poly_trained, [X_test_1] * 5)

**Check how strongly the predictions of the different models are correlated.**

In [29]:
# rbf vs. linear hold-out predictions, target by target
correlate_predictions(y_1=pipelines_rbf_predictions,
                      y_2=pipelines_linear_predictions,
                      labels=['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.752028
For P correlation coefficient is: 0.746886
For Sand correlation coefficient is: 0.951844
For SOC correlation coefficient is: 0.874771
For pH correlation coefficient is: 0.887052


In [30]:
# rbf vs. poly hold-out predictions, target by target
correlate_predictions(y_1=pipelines_rbf_predictions,
                      y_2=pipelines_poly_predictions,
                      labels=['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.675932
For P correlation coefficient is: 0.821524
For Sand correlation coefficient is: 0.841555
For SOC correlation coefficient is: 0.801434
For pH correlation coefficient is: 0.753194


In [31]:
# linear vs. poly hold-out predictions, target by target
correlate_predictions(y_1=pipelines_linear_predictions,
                      y_2=pipelines_poly_predictions,
                      labels=['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.881251
For P correlation coefficient is: 0.600292
For Sand correlation coefficient is: 0.837486
For SOC correlation coefficient is: 0.889768
For pH correlation coefficient is: 0.843601


**Performance of the individual models on the test set.**

In [36]:
# rbf kernel
# y_tests is the hold-out target list [Ca, P, Sand, SOC, pH] produced by get_Ys.
print('MCRMSE: %f'%(eval_metric.mcrmse(y_tests, pipelines_rbf_predictions)))

MCRMSE: 0.611984


In [37]:
# linear kernel
# y_tests is the hold-out target list [Ca, P, Sand, SOC, pH] produced by get_Ys.
print('MCRMSE: %f'%(eval_metric.mcrmse(y_tests, pipelines_linear_predictions)))

MCRMSE: 0.424516


In [38]:
# poly kernel
# y_tests is the hold-out target list [Ca, P, Sand, SOC, pH] produced by get_Ys.
print('MCRMSE: %f'%(eval_metric.mcrmse(y_tests, pipelines_poly_predictions)))

MCRMSE: 0.628008


**Choose weights.**

In [46]:
def weight_selected(data, labels):
    """
    Solve a non-negative least-squares problem for per-column weights.

    data   : 2-D array whose first len(labels) rows are used as the design matrix
    labels : target values to approximate

    Returns the weight vector found by scipy's nnls (the residual norm that
    nnls also reports is discarded).
    """
    n_rows = len(labels)
    solution = nnls(data[:n_rows], labels)
    return solution[0]

In [47]:
# For each target (0=Ca, 1=P, 2=Sand, 3=SOC, 4=pH) put the rbf, linear and
# poly predictions side by side: one row per sample, one column per kernel.
preds_Ca, preds_P, preds_Sand, preds_SOC, preds_pH = (
    np.vstack([pipelines_rbf_predictions[target_idx],
               pipelines_linear_predictions[target_idx],
               pipelines_poly_predictions[target_idx]]).T
    for target_idx in range(5)
)

In [48]:
# One non-negative weight per kernel (rbf, linear, poly) for every target.
# NOTE(review): the weights are fitted against the hold-out targets, the same
# targets the "after balancing" score is computed on, so that score is
# likely optimistic.
weights_Ca, weights_P, weights_Sand, weights_SOC, weights_pH = (
    weight_selected(stacked, target)
    for stacked, target in zip(
        (preds_Ca, preds_P, preds_Sand, preds_SOC, preds_pH),
        (y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH))
)

In [49]:
# Per target: keep only the kernels whose weight is positive and take the
# plain (unweighted) mean of their predictions.
(balanced_pred_Ca, balanced_pred_P, balanced_pred_Sand,
 balanced_pred_SOC, balanced_pred_pH) = (
    stacked[:, weights > 0].mean(axis=1)[:len(target)]
    for stacked, weights, target in zip(
        (preds_Ca, preds_P, preds_Sand, preds_SOC, preds_pH),
        (weights_Ca, weights_P, weights_Sand, weights_SOC, weights_pH),
        (y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH))
)

In [50]:
# Hold-out score of the averaged predictions; y_tests is the hold-out target
# list [Ca, P, Sand, SOC, pH] produced by get_Ys.
print('MCRMSE after balancing: %f' %(eval_metric.mcrmse(
    y_tests,
    [balanced_pred_Ca, balanced_pred_P, balanced_pred_Sand, balanced_pred_SOC, balanced_pred_pH])))

MCRMSE after balancing: 0.468917


**Train on the full dataset.**

In [51]:
# rbf kernel
# Re-uses the pipeline objects in pipelines_rbf (train_pipelines updates that
# list in place), this time fitting on the full training data.
pipelines_rbf_trained = train_pipelines(
    pipelines_rbf,
    [train_1] * 5,
    [y_Ca, y_P, y_Sand, y_SOC, y_pH]
)

# From here on this name holds predictions for test_1, not for the hold-out split.
pipelines_rbf_predictions = predictions(pipelines_rbf_trained, [test_1] * 5)

In [52]:
# linear kernel
# Re-uses the pipeline objects in pipelines_linear (train_pipelines updates
# that list in place), this time fitting on the full training data.
pipelines_linear_trained = train_pipelines(
    pipelines_linear,
    [train_1] * 5,
    [y_Ca, y_P, y_Sand, y_SOC, y_pH]
)

# From here on this name holds predictions for test_1, not for the hold-out split.
pipelines_linear_predictions = predictions(pipelines_linear_trained, [test_1] * 5)

In [53]:
# poly kernel
# Re-uses the pipeline objects in pipelines_poly (train_pipelines updates that
# list in place), this time fitting on the full training data.
pipelines_poly_trained = train_pipelines(
    pipelines_poly,
    [train_1] * 5,
    [y_Ca, y_P, y_Sand, y_SOC, y_pH]
)

# From here on this name holds predictions for test_1, not for the hold-out split.
pipelines_poly_predictions = predictions(pipelines_poly_trained, [test_1] * 5)

In [64]:
# For each target (0=Ca, 1=P, 2=Sand, 3=SOC, 4=pH) put the rbf, linear and
# poly predictions side by side: one row per sample, one column per kernel.
preds_Ca, preds_P, preds_Sand, preds_SOC, preds_pH = (
    np.vstack([pipelines_rbf_predictions[target_idx],
               pipelines_linear_predictions[target_idx],
               pipelines_poly_predictions[target_idx]]).T
    for target_idx in range(5)
)

In [65]:
# Per target: keep only the kernels whose NNLS weight is positive and take the
# plain (unweighted) mean of their predictions.
# These rows are predictions for test_1, so they are no longer cut to the
# length of the training targets (the previous `[:len(y_Ca)]` slice). That
# slice would have silently dropped rows whenever the test set is longer than
# the training set; the result is unchanged when it is not.
balanced_pred_Ca   = preds_Ca[:, weights_Ca > 0].mean(axis=1)
balanced_pred_P    = preds_P[:, weights_P > 0].mean(axis=1)
balanced_pred_Sand = preds_Sand[:, weights_Sand > 0].mean(axis=1)
balanced_pred_SOC  = preds_SOC[:, weights_SOC > 0].mean(axis=1)
balanced_pred_pH   = preds_pH[:, weights_pH > 0].mean(axis=1)

In [67]:
# Overwrite the five target columns of the sample submission with the
# averaged predictions (assigned left to right: Ca, P, pH, SOC, Sand).
(sample_sub['Ca'],
 sample_sub['P'],
 sample_sub['pH'],
 sample_sub['SOC'],
 sample_sub['Sand']) = (balanced_pred_Ca,
                        balanced_pred_P,
                        balanced_pred_pH,
                        balanced_pred_SOC,
                        balanced_pred_Sand)

**Public leaderboard score: 0.61319, private leaderboard score: 0.53293**

In [68]:
# Write the filled-in submission table under the project's submissions folder,
# without the DataFrame index column.
sample_sub.to_csv(
    path_or_buf=os.path.join(basepath, 'submissions/dataset_1_svr.csv'),
    index=False
)