** Objective **

* Learn to create an experiment for different types of datasets and classifiers ?
* Learn to do blending and multi-stage prediction ?

** Blending for each and every target. **

In [1]:
import pandas as pd
import numpy as np
import os, sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2)

from models import cross_validation, eval_metric, models_definition, find_weights

In [2]:
# load files
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

** We have created 4 different types of dataset and a feature relevance data frame from which we can create a new dataframe, let us write a utility that would load a dataset given filepath. **

In [3]:
# load a dataset
def load_dataset(train_filepath, test_filepath):
    train_    = joblib.load(os.path.join(basepath, train_filepath))
    test_     = joblib.load(os.path.join(basepath, test_filepath))
    
    return train_, test_

# let's load a dataset
train_filepath = 'data/processed/dataset_1/train'
test_filepath  = 'data/processed/dataset_1/test'

train_, test_  = load_dataset(train_filepath, test_filepath)

In [4]:
# define target variables

def define_target_variables(train):    
    y_Ca    = train.Ca
    y_P     = train.P
    y_Sand  = train.Sand
    y_SOC   = train.SOC
    y_pH    = train.pH
    
    return y_Ca, y_P, y_Sand, y_SOC, y_pH

y_Ca, y_P, y_Sand, y_SOC, y_pH = define_target_variables(train)

** Split datasets into training and test set. **

In [5]:
# lets get the train and test indices

params = {
    'test_size' : 0.2,
    'random_state' : 4
}

itrain, itest = cross_validation.split_dataset(len(train_), **params)

In [6]:
def get_Xs(X, itrain, itest):
    X_train = X.iloc[itrain]
    X_test  = X.iloc[itest]
    
    return X_train, X_test
    
def get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest):
    y_train_Ca = y_Ca.iloc[itrain]
    y_test_Ca  = y_Ca.iloc[itest]
    
    y_train_P  = y_P.iloc[itrain]
    y_test_P  = y_P.iloc[itest]
    
    y_train_Sand  = y_Sand.iloc[itrain]
    y_test_Sand  = y_Sand.iloc[itest]
    
    y_train_SOC  = y_SOC.iloc[itrain]
    y_test_SOC  = y_SOC.iloc[itest]
    
    y_train_pH  = y_pH.iloc[itrain]
    y_test_pH  = y_pH.iloc[itest]
    
    
    return ([y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH],
            [y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH])

X_train, X_test    = get_Xs(train_, itrain, itest) # split the dataset into training and test set.

y_trains, y_tests = get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest)

y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH = y_trains
y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH = y_tests

** List of Models. **

In [7]:
models = models_definition.get_models_by_dataset('dataset_1')

In [8]:
# train models for all of the target varibles

def train_models(models, Xs, ys):
    """
    models : List of models that should be trained on a given (X, y)
    Xs     : List of feature set for all of the target variables
    ys     : List of the target variables
    """
    
    n_target = len(ys)
    n_models = len(models)
    
    trained_models = np.empty((n_target, n_models), dtype=Pipeline)
    
    for i in range(n_target):
        for j in range(n_models):
            trained_models[i, j] = models[j].fit(Xs[i], ys[i])

    return trained_models


def predict_targets(trained_models, Xs):
    """
    trained_models : List of the trained models
    Xs             : Held out examples for each of the target variables
    """
    
    n_target = len(Xs)
    n_models = len(trained_models[0])
    
    predictions = np.empty((n_target, n_models), dtype=np.ndarray)
    
    for i in range(n_target):
        for j in range(n_models):
            predictions[i, j] = trained_models[i, j].predict(Xs[i])
        
    return predictions

In [9]:
# Note: This takes considerable amount of time to train different models on a given dataset.
# we are not explicitly stating the orer in any routine
# but remember that the order of target labels are [Ca, P, Sand, SOC, pH]

trained_models = train_models(models, [X_train, X_train, X_train, X_train, X_train], \
                                      [y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH])

In [10]:
# predictions
predictions    = predict_targets(trained_models, [X_test, X_test, X_test, X_test, X_test]) 

In [11]:
n_labels = 5

weights = []
balanced_preds = []

for i in range(n_labels):
    weight, balanced_pred = find_weights.find(y_tests[i], predictions[i])
    
    weights.append(weight)
    balanced_preds.append(balanced_pred)

In [16]:
labels = ['Ca', 'P', 'Sand', 'SOC', 'pH']
model_names = ['rbf', 'linear', 'poly']

for i in range(len(labels)):
    for j in range(len(model_names)):
        score = eval_metric.mcrmse([y_tests[i]], predictions[i][j])
        print('MCRMSE for model: %s for target: %s is %f'%(model_names[j], labels[i], score))
    
    print('\nWeights assigned to the predictions are: %s'%weights[i])
    print('MCRMSE for target: %s for balanced predictions is %f'%(labels[i], eval_metric.mcrmse([y_tests[i]], [balanced_preds[i]])))
    print('\n==========================\n')

MCRMSE for model: rbf for target: Ca is 1.179297
MCRMSE for model: linear for target: Ca is 1.206549
MCRMSE for model: poly for target: Ca is 1.177072

Weights assigned to the predictions are: [ 0.          0.88179231  0.        ]
MCRMSE for target: Ca for balanced predictions is 0.941775


MCRMSE for model: rbf for target: P is 0.808535
MCRMSE for model: linear for target: P is 0.804593
MCRMSE for model: poly for target: P is 0.802667

Weights assigned to the predictions are: [ 0.16906729  0.          0.        ]
MCRMSE for target: P for balanced predictions is 0.920405


MCRMSE for model: rbf for target: Sand is 0.992012
MCRMSE for model: linear for target: Sand is 1.032291
MCRMSE for model: poly for target: Sand is 0.989859

Weights assigned to the predictions are: [ 0.          0.          0.05225188]
MCRMSE for target: Sand for balanced predictions is 1.116491


MCRMSE for model: rbf for target: SOC is 1.057484
MCRMSE for model: linear for target: SOC is 1.091076
MCRMSE for model: