**Objective**

* Learn how to set up an experiment for different types of datasets and classifiers.
* Learn how to do blending and multi-stage prediction.

**Blending for each and every target.**

In [1]:
import pandas as pd
import numpy as np
import os, sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.externals import joblib

import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries imported in this cell).
warnings.filterwarnings('ignore')

# Root directory of the project, resolved relative to the current user's home.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Put <basepath>/src on the import path; presumably this is where the
# `models` and `helper` packages imported below live.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(2)

# Project-local modules. These imports must stay after the sys.path.append above.
from models import cross_validation, eval_metric, models_definition, find_weights
from helper import utils

In [2]:
# Read the three raw CSV files (training set, sorted test set, sample
# submission) from under basepath, in that order.
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, relative_path))
    for relative_path in (
        'data/raw/training.csv',
        'data/raw/sorted_test.csv',
        'data/raw/sample_submission.csv',
    )
)

**The dataset will be loaded into memory.**

In [3]:
# load a dataset
def load_dataset(train_filepath, test_filepath):
    """Load the serialized train and test datasets with joblib.

    Both arguments are paths relative to the notebook-level ``basepath``;
    each one is joined onto ``basepath`` and passed to ``joblib.load``.

    Returns the tuple ``(train_, test_)``.
    """
    train_, test_ = (
        joblib.load(os.path.join(basepath, relative_path))
        for relative_path in (train_filepath, test_filepath)
    )
    return train_, test_

# Choose which processed dataset to work with (paths are relative to basepath)
# and load its train / test parts.
train_filepath, test_filepath = (
    'data/processed/dataset_1/train/train',
    'data/processed/dataset_1/test/test',
)

train_, test_ = load_dataset(train_filepath, test_filepath)

In [4]:
# define target variables

def define_target_variables(train):
    """Return the five target columns of ``train`` as a tuple.

    Each target is read as an attribute of ``train``; the result is always
    ordered (Ca, P, Sand, SOC, pH).
    """
    target_names = ('Ca', 'P', 'Sand', 'SOC', 'pH')
    return tuple(getattr(train, name) for name in target_names)

# Pull the five targets out of the raw training frame `train`.
# NOTE(review): the features used later come from the processed `train_`, and the
# split indices are derived from len(train_) -- this assumes `train` and `train_`
# have the same rows in the same order; confirm.
y_Ca, y_P, y_Sand, y_SOC, y_pH = define_target_variables(train)

**Split the dataset into training and test sets.**

In [5]:
# Compute the row indices of the training part and the held-out part.
# `params` is forwarded as keyword arguments to cross_validation.split_dataset.
params = dict(test_size=0.2, random_state=4)

itrain, itest = cross_validation.split_dataset(len(train_), **params)

In [6]:
# Features: split the processed dataset with the indices computed earlier.
X_train, X_test = utils.get_Xs(train_, itrain, itest)

# Targets: split all five target series with the same indices.
y_trains, y_tests = utils.get_Ys(
    y_Ca, y_P, y_Sand, y_SOC, y_pH,
    itrain, itest
)

# Unpack the per-target series; the order is Ca, P, Sand, SOC, pH.
(y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH) = y_trains
(y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH) = y_tests

**List of models.**

In [7]:
# Fetch the model definitions registered under the key 'dataset_3'.
# NOTE(review): the features loaded earlier come from 'data/processed/dataset_1/...',
# while the models requested here are those for 'dataset_3' -- confirm that this
# pairing is intentional.
models = models_definition.get_models_by_dataset('dataset_3')

In [9]:
# Note: training the different models on a given dataset takes a considerable
# amount of time.
# The order is not stated explicitly in any routine, but remember that the
# target labels are always ordered [Ca, P, Sand, SOC, pH].
# The same training features are supplied once per target.
trained_models = utils.train_models(
    models,
    [X_train] * 5,
    [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH],
)

In [10]:
# Predict every target on the held-out features; the same X_test is supplied
# once per target.
predictions = utils.predict_targets(trained_models, [X_test] * 5)

In [11]:
n_labels = 5

# For each target, hand the held-out truth and that target's predictions to
# find_weights.find, which returns a (weight, balanced prediction) pair.
# NOTE(review): the weights are searched against y_tests and the balanced
# predictions are later scored against the same y_tests, so the blended score is
# presumably optimistic -- confirm whether a separate validation split is wanted.
blend_results = [
    find_weights.find(y_tests[target_idx], predictions[target_idx])
    for target_idx in range(n_labels)
]

weights = [weight for weight, balanced_pred in blend_results]
balanced_preds = [balanced_pred for weight, balanced_pred in blend_results]

In [12]:
labels      = ['Ca', 'P', 'Sand', 'SOC', 'pH']
model_names = ['rbf', 'linear', 'poly']

# Report, per target, the hold-out MCRMSE of every individual model followed by
# the blend weights and the MCRMSE of the balanced (blended) prediction.
# NOTE(review): re-run this cell to refresh the recorded output after this change.
for i in range(len(labels)):
    for j in range(len(model_names)):
        # Fix: the single-model prediction is now wrapped in a one-element list,
        # matching how the target and the balanced prediction are passed to
        # eval_metric.mcrmse further down. Previously it was passed bare, so the
        # per-model call did not use the same argument form as the blended call.
        score = eval_metric.mcrmse([y_tests[i]], [predictions[i][j]])
        print('MCRMSE for model: %s for target: %s is %f'%(model_names[j], labels[i], score))

    print('\nWeights assigned to the predictions are: %s'%weights[i])
    balanced_score = eval_metric.mcrmse([y_tests[i]], [balanced_preds[i]])
    print('MCRMSE for target: %s for balanced predictions is %f'%(labels[i], balanced_score))
    print('\n==========================\n')

MCRMSE for model: rbf for target: Ca is 1.174373
MCRMSE for model: linear for target: Ca is 1.186922
MCRMSE for model: poly for target: Ca is 1.175080

Weights assigned to the predictions are: [ 0.          0.88535948  0.        ]
MCRMSE for target: Ca for balanced predictions is 0.936562


MCRMSE for model: rbf for target: P is 0.794164
MCRMSE for model: linear for target: P is 0.786300
MCRMSE for model: poly for target: P is 0.796661

Weights assigned to the predictions are: [ 0.1688672   0.01180847  0.05437928]
MCRMSE for target: P for balanced predictions is 0.879014


MCRMSE for model: rbf for target: Sand is 0.987496
MCRMSE for model: linear for target: Sand is 1.007229
MCRMSE for model: poly for target: Sand is 0.988067

Weights assigned to the predictions are: [ 0.          0.          0.05468524]
MCRMSE for target: Sand for balanced predictions is 1.033417


MCRMSE for model: rbf for target: SOC is 1.052535
MCRMSE for model: linear for target: SOC is 1.068493
MCRMSE for model: