**Objective**

* Learn how to structure the experiments.
* Hyper-parameter tuning for the model

In [1]:
import os, sys

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.externals import joblib

from scipy.optimize import nnls
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2)

from models import cross_validation, eval_metric

In [2]:
# Read the raw competition CSVs (training set, test set, submission template).
training_csv = os.path.join(basepath, 'data/raw/training.csv')
sorted_test_csv = os.path.join(basepath, 'data/raw/sorted_test.csv')
sample_submission_csv = os.path.join(basepath, 'data/raw/sample_submission.csv')

train = pd.read_csv(training_csv)
test = pd.read_csv(sorted_test_csv)
sample_sub = pd.read_csv(sample_submission_csv)

In [3]:
# Load the pre-processed feature matrices for dataset 1 (MIR features).
train_1_path = os.path.join(basepath, 'data/processed/dataset_1/train')
test_1_path = os.path.join(basepath, 'data/processed/dataset_1/test')

train_1 = joblib.load(train_1_path)
test_1 = joblib.load(test_1_path)

In [4]:
# Pull the five target columns out of the raw training frame.
y_Ca, y_P, y_Sand, y_SOC, y_pH = [
    train[target_name] for target_name in ('Ca', 'P', 'Sand', 'SOC', 'pH')
]

**Split the dataset into training and test sets.**

In [5]:
# Settings forwarded to the project's splitter; test_size is presumably the
# hold-out fraction -- confirm against cross_validation.split_dataset.
params = dict(test_size=0.2, random_state=4)

n_samples = len(train_1)
itrain, itest = cross_validation.split_dataset(n_samples, **params)

In [6]:
def get_Xs(X, itrain, itest):
    """Slice a feature frame positionally into training and hold-out parts.

    Parameters
    ----------
    X : object supporting ``.iloc`` (e.g. a pandas DataFrame)
    itrain : positional indices selecting the training rows
    itest : positional indices selecting the hold-out rows

    Returns
    -------
    tuple
        ``(X.iloc[itrain], X.iloc[itest])``
    """
    return X.iloc[itrain], X.iloc[itest]
    
def get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest):
    """Apply the same positional split to each of the five target series.

    Parameters
    ----------
    y_Ca, y_P, y_Sand, y_SOC, y_pH : objects supporting ``.iloc``
    itrain : positional indices selecting the training rows
    itest : positional indices selecting the hold-out rows

    Returns
    -------
    tuple of two lists
        ``(train_parts, test_parts)``, each ordered Ca, P, Sand, SOC, pH.
    """
    ordered_targets = (y_Ca, y_P, y_Sand, y_SOC, y_pH)

    train_parts = [target.iloc[itrain] for target in ordered_targets]
    test_parts = [target.iloc[itest] for target in ordered_targets]

    return train_parts, test_parts

# Apply the index split to the dataset-1 features and to every target.
X_train_1, X_test_1 = get_Xs(X=train_1, itrain=itrain, itest=itest)

y_trains, y_tests = get_Ys(y_Ca=y_Ca, y_P=y_P, y_Sand=y_Sand, y_SOC=y_SOC, y_pH=y_pH,
                           itrain=itrain, itest=itest)

# Both lists are ordered Ca, P, Sand, SOC, pH.
(y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH) = y_trains
(y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH) = y_tests

**List of models.**

In [9]:
def instantiate_models(targets, **params):
    """Build one unfitted PCA -> scaler -> SVR pipeline per entry in ``targets``.

    Parameters
    ----------
    targets : sized sequence
        Only its length is used; every target gets an identically
        configured pipeline.
    **params
        Must provide ``n_components`` (passed to ``RandomizedPCA``) and
        ``kernel`` (passed to ``SVR``).

    Returns
    -------
    list of Pipeline
        ``len(targets)`` independent, unfitted pipelines.

    Raises
    ------
    KeyError
        If ``n_components`` or ``kernel`` is missing and ``targets`` is
        non-empty.

    NOTE(review): the whitening PCA runs *before* StandardScaler here, whereas
    the hand-built pipelines in the cross-validation cells scale first --
    confirm this ordering is intentional.
    """
    def _new_pipeline():
        steps = [
            ('pca', RandomizedPCA(n_components=params['n_components'], whiten=True, random_state=11)),
            ('scaler', StandardScaler()),
            ('model', SVR(kernel=params['kernel'])),
        ]
        return Pipeline(steps)

    return [_new_pipeline() for _ in range(len(targets))]

In [10]:
# RBF-kernel SVR variant: one pipeline per target.
params = dict(n_components=100, kernel='rbf')

rbf_targets = ['Ca', 'P', 'Sand', 'SOC', 'pH']
pipelines = instantiate_models(rbf_targets, **params)

In [11]:
# Linear-kernel SVR variant: one pipeline per target.
params = dict(n_components=100, kernel='linear')

linear_targets = ['Ca', 'P', 'Sand', 'SOC', 'pH']
pipelines_linear = instantiate_models(linear_targets, **params)

In [12]:
# Polynomial-kernel SVR variant: one pipeline per target.
params = dict(n_components=100, kernel='poly')

poly_targets = ['Ca', 'P', 'Sand', 'SOC', 'pH']
pipelines_poly = instantiate_models(poly_targets, **params)

In [15]:
# set up local cross validation scheme

# Standardise -> whitened randomized PCA (100 components) -> linear-kernel SVR.
linear_svr_steps = [
    ('scaler', StandardScaler()),
    ('pca', RandomizedPCA(n_components=100, whiten=True, random_state=11)),
    ('model', SVR(kernel='linear')),
]
pipelines = [Pipeline(linear_svr_steps)]

In [16]:
# Cross-validate the current `pipelines` against the Ca target.
print('Target: Ca\n')
cv_features, cv_targets = [X_train_1], [y_train_Ca]
cv_score = cross_validation.cv_scheme(pipelines, cv_features, cv_targets)
print('=============================')
print('Mean CV score: %f' % cv_score)

Target: Ca


MCRMSE score: 0.159422


MCRMSE score: 0.318131


MCRMSE score: 0.498546


MCRMSE score: 0.492167


MCRMSE score: 0.617636


MCRMSE score: 0.391185


MCRMSE score: 0.172568


MCRMSE score: 0.267217


MCRMSE score: 0.328175


MCRMSE score: 0.408929

Mean CV score: 0.365398


In [61]:
# Min-max scaling -> whitened PCA (100 components) -> linear-kernel SVR with C=10.
# PCA comes from the notebook's top import cell; without that import this cell
# raises NameError on a fresh kernel.
minmax_pca_svr_steps = [
    ('scaler', MinMaxScaler()),
    ('pca', PCA(n_components=100, whiten=True)),
    ('model', SVR(C=10., kernel='linear')),
]
pipelines = [Pipeline(minmax_pca_svr_steps)]

In [None]:
# Cross-validate the current `pipelines` against the P target.
print('Target: P\n')
cv_features, cv_targets = [X_train_1], [y_train_P]
cv_score = cross_validation.cv_scheme(pipelines, cv_features, cv_targets)
print('=============================')
print('Mean CV score: %f' % cv_score)

Target: P


MCRMSE score: 1.507841




In [24]:
# Standardise -> whitened randomized PCA (100 components) -> linear-kernel SVR.
linear_svr_steps = [
    ('scaler', StandardScaler()),
    ('pca', RandomizedPCA(n_components=100, whiten=True, random_state=11)),
    ('model', SVR(kernel='linear')),
]
pipelines = [Pipeline(linear_svr_steps)]

In [25]:
# Cross-validate the current `pipelines` against the Sand target.
print('Target: Sand\n')
cv_features, cv_targets = [X_train_1], [y_train_Sand]
cv_score = cross_validation.cv_scheme(pipelines, cv_features, cv_targets)
print('=============================')
print('Mean CV score: %f' % cv_score)

Target: Sand


MCRMSE score: 0.354989


MCRMSE score: 0.319652


MCRMSE score: 0.470740


MCRMSE score: 0.427449


MCRMSE score: 0.455251


MCRMSE score: 0.415363


MCRMSE score: 0.362000


MCRMSE score: 0.342332


MCRMSE score: 0.406449


MCRMSE score: 0.350251

Mean CV score: 0.390447


In [26]:
# Cross-validate the current `pipelines` against the SOC target.
print('Target: SOC\n')
cv_features, cv_targets = [X_train_1], [y_train_SOC]
cv_score = cross_validation.cv_scheme(pipelines, cv_features, cv_targets)
print('=============================')
print('Mean CV score: %f' % cv_score)

Target: SOC


MCRMSE score: 0.415484


MCRMSE score: 0.224141


MCRMSE score: 0.295390


MCRMSE score: 0.297568


MCRMSE score: 0.414757


MCRMSE score: 0.398907


MCRMSE score: 0.323590


MCRMSE score: 0.304843


MCRMSE score: 0.609300


MCRMSE score: 0.328634

Mean CV score: 0.361261


In [27]:
# Standardise -> whitened randomized PCA (100 components) -> linear-kernel SVR.
linear_svr_steps = [
    ('scaler', StandardScaler()),
    ('pca', RandomizedPCA(n_components=100, whiten=True, random_state=11)),
    ('model', SVR(kernel='linear')),
]
pipelines = [Pipeline(linear_svr_steps)]

In [28]:
# Cross-validate the current `pipelines` against the pH target.
print('Target: pH\n')
cv_features, cv_targets = [X_train_1], [y_train_pH]
cv_score = cross_validation.cv_scheme(pipelines, cv_features, cv_targets)
print('=============================')
print('Mean CV score: %f' % cv_score)

Target: pH


MCRMSE score: 0.335811


MCRMSE score: 0.316056


MCRMSE score: 0.393786


MCRMSE score: 0.392708


MCRMSE score: 0.445078


MCRMSE score: 0.371086


MCRMSE score: 0.399387


MCRMSE score: 0.366781


MCRMSE score: 0.394995


MCRMSE score: 0.467595

Mean CV score: 0.388328


In [21]:
def train_model(pipeline, X, y):
    """Fit ``pipeline`` on ``(X, y)`` and hand back whatever ``fit`` returns.

    Parameters
    ----------
    pipeline : object exposing ``fit(X, y)``
    X : training features
    y : training target

    Returns
    -------
    The return value of ``pipeline.fit(X, y)``.
    """
    fitted = pipeline.fit(X, y)
    return fitted

def train_pipelines(pipelines, Xs, ys):
    """Fit each pipeline on the feature/target pair at the same position.

    The input list is updated in place (slot ``i`` is replaced by the result
    of ``train_model``) and the same list object is returned. Only the first
    ``len(pipelines)`` entries of ``Xs`` and ``ys`` are used.
    """
    for position, pipeline in enumerate(pipelines):
        pipelines[position] = train_model(pipeline, Xs[position], ys[position])

    return pipelines

def predictions(pipelines, Xtest):
    """Collect ``pipelines[i].predict(Xtest[i])`` for every pipeline.

    Returns
    -------
    numpy.ndarray
        The per-pipeline predictions stacked with ``np.array``, in pipeline
        order.
    """
    collected = []
    for position, pipeline in enumerate(pipelines):
        collected.append(pipeline.predict(Xtest[position]))

    return np.array(collected)
                               
# `pipelines` was rebound in the cross-validation cells above to a one-element
# list, so training it against five targets would fit a single model and the
# five-label correlation cells below would fail. Build the five RBF pipelines
# explicitly instead of relying on leftover kernel state.
rbf_target_names = ['Ca', 'P', 'Sand', 'SOC', 'pH']
pipelines_rbf = instantiate_models(rbf_target_names, n_components=100, kernel='rbf')

pipelines_trained = train_pipelines(pipelines_rbf,
                                    [X_train_1] * len(rbf_target_names),
                                    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH])

# One row of hold-out predictions per target, in the order of rbf_target_names.
pipelines_predictions = predictions(pipelines_trained, [X_test_1] * len(rbf_target_names))

In [None]:
def correlate_predictions(y_1, y_2, labels):
    """Print the Pearson correlation between two sets of predictions, per label.

    Parameters
    ----------
    y_1, y_2 : indexable collections of predictions
        Entry ``i`` of each is compared for ``labels[i]``.
    labels : sequence of str
        Target names used in the printed lines.

    Returns
    -------
    None
    """
    for position, label in enumerate(labels):
        coefficient = pearsonr(y_1[position], y_2[position])[0]
        print('For %s correlation coefficient is: %f'%(label, coefficient))

In [32]:
pipelines_linear_trained = train_pipelines(pipelines_linear, [X_train_1, X_train_1, X_train_1, X_train_1, X_train_1],
                                    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH])

pipelines_linear_predictions = predictions(pipelines_linear_trained, [X_test_1, X_test_1, X_test_1, X_test_1, X_test_1])

In [40]:
pipelines_poly_trained = train_pipelines(pipelines_poly, [X_train_1, X_train_1, X_train_1, X_train_1, X_train_1],
                                    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH])

pipelines_poly_predictions = predictions(pipelines_poly_trained, [X_test_1, X_test_1, X_test_1, X_test_1, X_test_1])

In [41]:
correlate_predictions(pipelines_predictions, pipelines_linear_predictions, ['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.752028
For P correlation coefficient is: 0.746886
For Sand correlation coefficient is: 0.951844
For SOC correlation coefficient is: 0.874771
For pH correlation coefficient is: 0.887052


In [42]:
correlate_predictions(pipelines_predictions, pipelines_poly_predictions, ['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.675932
For P correlation coefficient is: 0.821524
For Sand correlation coefficient is: 0.841555
For SOC correlation coefficient is: 0.801434
For pH correlation coefficient is: 0.753194


In [43]:
correlate_predictions(pipelines_linear_predictions, pipelines_poly_predictions, ['Ca', 'P', 'Sand', 'SOC', 'pH'])

For Ca correlation coefficient is: 0.881251
For P correlation coefficient is: 0.600292
For Sand correlation coefficient is: 0.837486
For SOC correlation coefficient is: 0.889768
For pH correlation coefficient is: 0.843601


In [7]:
def weight_selected(data, labels):
    """Solve a non-negative least-squares fit of ``labels`` on ``data``.

    Only the first ``len(labels)`` rows of ``data`` are used; those rows are
    assumed to line up with ``labels`` -- verify against the caller.

    Returns
    -------
    The solution vector returned by ``scipy.optimize.nnls`` (the residual
    norm is discarded).
    """
    n_rows = len(labels)
    solution = nnls(data[:n_rows], labels)
    return solution[0]

In [9]:
# Stack the three per-dataset predictions for each target and transpose.
# NOTE(review): the y_dataset_* arrays are only assigned in a later cell of
# this notebook -- confirm they exist before this cell runs.
preds_Ca = np.transpose(np.vstack((y_dataset_1_Ca, y_dataset_2_Ca, y_dataset_3_Ca)))
preds_P = np.transpose(np.vstack((y_dataset_1_P, y_dataset_2_P, y_dataset_3_P)))
preds_Sand = np.transpose(np.vstack((y_dataset_1_Sand, y_dataset_2_Sand, y_dataset_3_Sand)))
preds_SOC = np.transpose(np.vstack((y_dataset_1_SOC, y_dataset_2_SOC, y_dataset_3_SOC)))
preds_pH = np.transpose(np.vstack((y_dataset_1_pH, y_dataset_2_pH, y_dataset_3_pH)))

In [13]:
# Non-negative least-squares weights of the stacked predictions against the
# hold-out targets, one weight vector per target.
weights_Ca, weights_P, weights_Sand, weights_SOC, weights_pH = [
    weight_selected(stacked, truth)
    for stacked, truth in (
        (preds_Ca, y_test_Ca),
        (preds_P, y_test_P),
        (preds_Sand, y_test_Sand),
        (preds_SOC, y_test_SOC),
        (preds_pH, y_test_pH),
    )
]

In [14]:
# Keep only the columns whose NNLS weight is positive, average them row-wise,
# then trim to the length of the matching hold-out target.
keep_Ca = weights_Ca > 0
keep_P = weights_P > 0
keep_Sand = weights_Sand > 0
keep_SOC = weights_SOC > 0
keep_pH = weights_pH > 0

balanced_pred_Ca = np.mean(preds_Ca[:, keep_Ca], axis=1)[:len(y_test_Ca)]
balanced_pred_P = np.mean(preds_P[:, keep_P], axis=1)[:len(y_test_P)]
balanced_pred_Sand = np.mean(preds_Sand[:, keep_Sand], axis=1)[:len(y_test_Sand)]
balanced_pred_SOC = np.mean(preds_SOC[:, keep_SOC], axis=1)[:len(y_test_SOC)]
balanced_pred_pH = np.mean(preds_pH[:, keep_pH], axis=1)[:len(y_test_pH)]

In [15]:
# Score the blended predictions against the hold-out targets.
holdout_truth = [y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH]
holdout_blend = [balanced_pred_Ca, balanced_pred_P, balanced_pred_Sand,
                 balanced_pred_SOC, balanced_pred_pH]

balanced_score = eval_metric.mcrmse(holdout_truth, holdout_blend)
print('MCRMSE after balancing: %f' % balanced_score)

MCRMSE after balancing: 0.412842


**Training on the full dataset.**

In [16]:
# Refit every pipeline on the complete training data: five targets
# (Ca, P, Sand, SOC, pH) for each of the three datasets.
# NOTE(review): pipeline_1..pipeline_15, train_2 and train_3 are not defined in
# any visible cell of this notebook -- confirm they are created before this
# cell runs.

# Dataset 1
pipeline_1.fit(train_1, y_Ca)
pipeline_2.fit(train_1, y_P)
pipeline_3.fit(train_1, y_Sand)
pipeline_4.fit(train_1, y_SOC)
pipeline_5.fit(train_1, y_pH)

# Dataset 2
pipeline_6.fit(train_2, y_Ca)
pipeline_7.fit(train_2, y_P)
pipeline_8.fit(train_2, y_Sand)
pipeline_9.fit(train_2, y_SOC)
pipeline_10.fit(train_2, y_pH)

# Dataset 3
pipeline_11.fit(train_3, y_Ca)
pipeline_12.fit(train_3, y_P)
pipeline_13.fit(train_3, y_Sand)
pipeline_14.fit(train_3, y_SOC)
pipeline_15.fit(train_3, y_pH)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [18]:
# Persist each fitted pipeline under
# data/processed/pipeline_full_<target>/dataset_<n>/model/<target>.
# NOTE(review): the target directories are not created here -- presumably they
# already exist; verify before running on a fresh checkout.

# Dataset 1
joblib.dump(pipeline_1, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_1/model/Ca'))
joblib.dump(pipeline_2, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_1/model/P'))
joblib.dump(pipeline_3, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_1/model/Sand'))
joblib.dump(pipeline_4, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_1/model/SOC'))
joblib.dump(pipeline_5, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_1/model/pH'))

# Dataset 2
joblib.dump(pipeline_6, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_2/model/Ca'))
joblib.dump(pipeline_7, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_2/model/P'))
joblib.dump(pipeline_8, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_2/model/Sand'))
joblib.dump(pipeline_9, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_2/model/SOC'))
joblib.dump(pipeline_10, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_2/model/pH'))

# Dataset 3
joblib.dump(pipeline_11, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_3/model/Ca'))
joblib.dump(pipeline_12, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_3/model/P'))
joblib.dump(pipeline_13, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_3/model/Sand'))
joblib.dump(pipeline_14, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_3/model/SOC'))
joblib.dump(pipeline_15, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_3/model/pH'))

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_02.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_03.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_04.npy']

In [19]:
# Predict every target on each dataset's test features; within a dataset the
# pipelines are ordered Ca, P, Sand, SOC, pH.
(y_dataset_1_Ca, y_dataset_1_P, y_dataset_1_Sand,
 y_dataset_1_SOC, y_dataset_1_pH) = [
    fitted.predict(test_1)
    for fitted in (pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5)
]

(y_dataset_2_Ca, y_dataset_2_P, y_dataset_2_Sand,
 y_dataset_2_SOC, y_dataset_2_pH) = [
    fitted.predict(test_2)
    for fitted in (pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10)
]

(y_dataset_3_Ca, y_dataset_3_P, y_dataset_3_Sand,
 y_dataset_3_SOC, y_dataset_3_pH) = [
    fitted.predict(test_3)
    for fitted in (pipeline_11, pipeline_12, pipeline_13, pipeline_14, pipeline_15)
]

In [20]:
# Stack the three per-dataset test predictions for each target and transpose.
preds_Ca = np.transpose(np.vstack((y_dataset_1_Ca, y_dataset_2_Ca, y_dataset_3_Ca)))
preds_P = np.transpose(np.vstack((y_dataset_1_P, y_dataset_2_P, y_dataset_3_P)))
preds_Sand = np.transpose(np.vstack((y_dataset_1_Sand, y_dataset_2_Sand, y_dataset_3_Sand)))
preds_SOC = np.transpose(np.vstack((y_dataset_1_SOC, y_dataset_2_SOC, y_dataset_3_SOC)))
preds_pH = np.transpose(np.vstack((y_dataset_1_pH, y_dataset_2_pH, y_dataset_3_pH)))

In [24]:
# Average, row by row, the test-set predictions of the models whose NNLS weight
# is positive. The previous `[:len(y_Ca)]` slice used the *training* target
# length on test-set predictions: it was either a no-op or would silently drop
# submission rows, so it has been removed.
balanced_pred_Ca = preds_Ca[:, weights_Ca > 0].mean(axis=1)
balanced_pred_P = preds_P[:, weights_P > 0].mean(axis=1)
balanced_pred_Sand = preds_Sand[:, weights_Sand > 0].mean(axis=1)
balanced_pred_SOC = preds_SOC[:, weights_SOC > 0].mean(axis=1)
balanced_pred_pH = preds_pH[:, weights_pH > 0].mean(axis=1)

In [25]:
# Write the blended predictions into the matching submission columns.
for submission_column, blended_values in (
        ('Ca', balanced_pred_Ca),
        ('P', balanced_pred_P),
        ('pH', balanced_pred_pH),
        ('SOC', balanced_pred_SOC),
        ('Sand', balanced_pred_Sand)):
    sample_sub[submission_column] = blended_values

**Public Leaderboard Score: 0.49149, Private Leaderboard Score: 0.53293**

In [26]:
# Save the submission without the DataFrame index column.
submission_path = os.path.join(basepath, 'submissions/average_models.csv')
sample_sub.to_csv(submission_path, index=False)