**Objective**

* Learn to create an experiment for different types of datasets and classifiers.
* Learn to do blending and multi-stage prediction.


In [2]:
import pandas as pd
import numpy as np
import os, sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

# Project root, resolved under the current user's home directory.
# NOTE(review): this location is machine-specific -- adjust it (or make it
# configurable) before running the notebook elsewhere.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Put <basepath>/src on the import path; presumably this is where the
# `models` and `helper` packages imported below live.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random generator.
np.random.seed(2)

from models import cross_validation, eval_metric, models_definition, find_weights
from helper import utils

In [3]:
dataset_name = 'dataset_2'

In [4]:
# Read the raw CSV files from <basepath>/data/raw: the training set, the
# (sorted) test set and the sample submission file.
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

**The processed datasets are loaded into memory.**

In [5]:
# load a dataset
def load_dataset(train_filepath, test_filepath):
    """Load a persisted train/test pair with joblib.

    Both arguments are paths relative to the notebook-level ``basepath``;
    each one is joined onto it before being handed to ``joblib.load``.

    Returns
    -------
    tuple
        ``(train_, test_)`` -- the object loaded from ``train_filepath``
        followed by the object loaded from ``test_filepath``.
    """
    loaded = [
        joblib.load(os.path.join(basepath, relative_path))
        for relative_path in (train_filepath, test_filepath)
    ]
    return loaded[0], loaded[1]

In [7]:
labels  = ['Ca', 'P', 'Sand', 'SOC', 'pH']

# trains_[k] / tests_[k] hold the processed data for labels[k].
trains_ = []
tests_  = []

for target in labels:
    # every target has its own processed train/test files under the chosen dataset
    train_filepath = 'data/processed/%s/%s/train/train'%(dataset_name, target)
    test_filepath  = 'data/processed/%s/%s/test/test'%(dataset_name, target)

    train_, test_ = load_dataset(train_filepath, test_filepath)

    trains_.append(train_)
    tests_.append(test_)

In [8]:
# define target variables

def define_target_variables(train):
    """Pull the five target variables out of ``train``.

    Parameters
    ----------
    train : object
        Anything exposing ``Ca``, ``P``, ``Sand``, ``SOC`` and ``pH`` as
        attributes (the notebook passes the raw training DataFrame).

    Returns
    -------
    tuple
        ``(y_Ca, y_P, y_Sand, y_SOC, y_pH)``, in exactly that order.
    """
    target_names = ('Ca', 'P', 'Sand', 'SOC', 'pH')
    return tuple(getattr(train, name) for name in target_names)

# Target columns are taken from the raw training frame read from training.csv.
y_Ca, y_P, y_Sand, y_SOC, y_pH = define_target_variables(train)

**Split the datasets into training and test sets.**

In [9]:
# Get the row indices of the training part and the held-out part.
# NOTE(review): `split_dataset` is project code -- presumably `test_size` is the
# held-out fraction and `random_state` seeds the shuffle; confirm in
# models/cross_validation.

params = dict(test_size=0.2, random_state=4)

itrain, itest = cross_validation.split_dataset(len(train), **params)

In [10]:
# Apply the same index split to each target's processed dataset
# (trains_ is ordered Ca, P, Sand, SOC, pH).
X_trains = []
X_tests = []

for _dataset in trains_[:5]:
    _X_train, _X_test = utils.get_Xs(_dataset, itrain, itest)
    X_trains.append(_X_train)
    X_tests.append(_X_test)

# Per-target aliases, kept for the cells that refer to them by name.
X_train_Ca, X_train_P, X_train_Sand, X_train_SOC, X_train_pH = X_trains
X_test_Ca, X_test_P, X_test_Sand, X_test_SOC, X_test_pH = X_tests

# Targets are split with the very same indices.
y_trains, y_tests  = utils.get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest)

y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH = y_trains
y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH = y_tests

**List of models.**

In [77]:
# Fetch the model definitions for the dataset selected in `dataset_name`
# (instead of repeating the literal), so the models always match the data
# that was loaded with that same setting.
models = models_definition.get_models_by_dataset(dataset_name)

**Train models**

In [None]:
labels      = ['Ca', 'P', 'Sand', 'SOC', 'pH']
model_names = ['rbf', 'linear', 'poly']

# test_preds[i, j] stores the held-out predictions of model j for target i;
# the object dtype lets every cell hold a whole prediction array.
test_preds = np.empty((len(labels), len(model_names)), dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # `models[j]` is paired with `model_names[j]` -- assumes both are in the
        # same order; TODO confirm against models_definition.
        # `dataset_name` is passed (not a literal) so training stays in sync
        # with the cells that load data and models through that setting.
        # NOTE(review): judging by the recorded output, train_model also
        # persists the fitted model -- confirm in helper/utils.
        model = utils.train_model(models[j], X_trains[i], y_trains[i], dataset_name, label, model_name)
        test_preds[i, j] = utils.predict_targets(model, X_tests[i])

Model saved successfully


**Load and predict.**

In [12]:
labels      = ['Ca', 'P', 'Sand', 'SOC', 'pH']
model_names = ['rbf', 'linear', 'poly']

# One cell per (target, model) pair; each cell holds a full prediction array.
test_preds = np.empty((len(labels), len(model_names)), dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # stored models live in a folder named after the model, file of the same name
        model_path = os.path.join(basepath, 'data/processed/%s/%s/models/%s/%s'%(dataset_name, label, model_name, model_name))
        model = joblib.load(model_path)
        test_preds[i, j] = utils.predict_targets(model, X_tests[i])

In [13]:
# All five targets are passed to mcrmse here, so the score is not specific to
# Ca -- the label says so. Column 1 of test_preds is the 'linear' model
# (see `model_names`).
print('MCRMSE for linear kernel across all targets: ',
      eval_metric.mcrmse([y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH],
                         [preds_for_target[1] for preds_for_target in test_preds]))

MCRMSE for linear kernel for Ca:  0.401379293931


In [14]:
# All five targets are passed to mcrmse here, so the score is not specific to
# Ca -- the label says so. Column 0 of test_preds is the 'rbf' model
# (see `model_names`).
print('MCRMSE for rbf kernel across all targets: ',
      eval_metric.mcrmse([y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH],
                         [preds_for_target[0] for preds_for_target in test_preds]))

MCRMSE for rbf kernel for Ca:  0.61664174585


In [15]:
# All five targets are passed to mcrmse here, so the score is not specific to
# Ca -- the label says so. Column 2 of test_preds is the 'poly' model
# (see `model_names`).
print('MCRMSE for poly kernel across all targets: ',
      eval_metric.mcrmse([y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH],
                         [preds_for_target[2] for preds_for_target in test_preds]))

MCRMSE for poly kernel for Ca:  0.680766693543


In [16]:
n_labels = 5

y_true = [y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH]

# For every target, find_weights.find gets the true values plus that target's
# row of per-model predictions and hands back a (weight, blended prediction) pair.
_fits = [find_weights.find(y_true[k], test_preds[k]) for k in range(n_labels)]

weights = [fitted_weight for fitted_weight, _ in _fits]
balanced_preds = [blended for _, blended in _fits]

In [17]:
labels      = ['Ca', 'P', 'Sand', 'SOC', 'pH']
model_names = ['rbf', 'linear', 'poly']

for i, label in enumerate(labels):
    # score of every single model on this target alone
    for j, model_name in enumerate(model_names):
        score = eval_metric.mcrmse([y_true[i]], [test_preds[i][j]])
        print('MCRMSE for model: %s for target: %s is %f'%(model_name, label, score))

    # then the blend built from the weights found for this target
    print('\nWeights assigned to the predictions are: %s'%weights[i])
    balanced_score = eval_metric.mcrmse([y_tests[i]], [balanced_preds[i]])
    print('MCRMSE for target: %s for balanced predictions is %f'%(label, balanced_score))
    print('\n==========================\n')

MCRMSE for model: rbf for target: Ca is 0.929916
MCRMSE for model: linear for target: Ca is 0.350117
MCRMSE for model: poly for target: Ca is 0.830624

Weights assigned to the predictions are: [ 0.          1.19156338  0.        ]
MCRMSE for target: Ca for balanced predictions is 0.350117


MCRMSE for model: rbf for target: P is 0.707813
MCRMSE for model: linear for target: P is 0.741905
MCRMSE for model: poly for target: P is 0.721823

Weights assigned to the predictions are: [ 1.2798643   0.          0.43037823]
MCRMSE for target: P for balanced predictions is 0.711938


MCRMSE for model: rbf for target: Sand is 0.361651
MCRMSE for model: linear for target: Sand is 0.318861
MCRMSE for model: poly for target: Sand is 0.620944

Weights assigned to the predictions are: [ 0.48101993  0.533108    0.10877343]
MCRMSE for target: Sand for balanced predictions is 0.362202


MCRMSE for model: rbf for target: SOC is 0.610732
MCRMSE for model: linear for target: SOC is 0.255781
MCRMSE for model: