**Objectives**

* Find balanced predictions across different datasets.
* Blend different models trained across different target variables.

In [1]:
import numpy as np
import pandas as pd
import os
import sys

# Root of the project checkout; every data, model and submission path used
# in this notebook is resolved relative to this directory.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Put <basepath>/src on the import path before the project-local imports
# (helper, models) further down.
sys.path.append(os.path.join(basepath, 'src'))

# scikit-learn's vendored joblib copy was deprecated in 0.21 and removed in
# 0.23. Keep the original import first so older environments behave exactly
# as before, and fall back to the standalone joblib package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(5)

from helper import utils
from models import eval_metric, cross_validation, find_weights

In [2]:
# Read the three raw CSV files (training data, sorted test data and the
# sample submission) from under basepath.
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, relative_path))
    for relative_path in (
        'data/raw/training.csv',
        'data/raw/sorted_test.csv',
        'data/raw/sample_submission.csv',
    )
)

In [3]:
# Settings forwarded as keyword arguments to cross_validation.split_dataset.
params = dict(test_size=0.2, random_state=4)

# One target vector per soil property, taken from the training frame.
(y_Ca, y_P, y_Sand, y_SOC, y_pH) = utils.define_target_variables(train)

# Split the training rows into a fitting part (itrain) and a hold-out part
# (itest); the same split is reused for every dataset and target.
itrain, itest = cross_validation.split_dataset(len(train), **params)

In [3]:
# Target names; their order matches the positional indexing used for the
# per-target lists (index 0 is Ca, index 4 is pH).
labels = [
    'Ca',
    'P',
    'Sand',
    'SOC',
    'pH',
]

In [4]:
# Load the (trains, tests) pair of every dataset, in order dataset_1..dataset_5.
(
    (trains_1, tests_1),
    (trains_2, tests_2),
    (trains_3, tests_3),
    (trains_4, tests_4),
    (trains_5, tests_5),
) = (
    utils.load_datasets(dataset_name, labels)
    for dataset_name in (
        'dataset_1',
        'dataset_2',
        'dataset_3',
        'dataset_4',
        'dataset_5',
    )
)

In [6]:
# Apply the itrain / itest split to all five target vectors at once.
_target_vectors = (y_Ca, y_P, y_Sand, y_SOC, y_pH)
y_trains, y_tests = utils.get_Ys(*(_target_vectors + (itrain, itest)))

# Expose each per-target piece under its own name as well.
(y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH) = y_trains
(y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH) = y_tests


###############################

# Dataset 1: apply the shared itrain / itest split to each of the five
# per-target feature sets (positions 0..4 = Ca, P, Sand, SOC, pH).
X_trains_1, X_tests_1 = [], []
for target_idx in range(5):
    X_fit, X_holdout = utils.get_Xs(trains_1[target_idx], itrain, itest)
    X_trains_1.append(X_fit)
    X_tests_1.append(X_holdout)

# Individual names for each target's split.
X_train_1_Ca, X_train_1_P, X_train_1_Sand, X_train_1_SOC, X_train_1_pH = X_trains_1
X_test_1_Ca, X_test_1_P, X_test_1_Sand, X_test_1_SOC, X_test_1_pH = X_tests_1

###############################

# Dataset 2: apply the shared itrain / itest split to each of the five
# per-target feature sets (positions 0..4 = Ca, P, Sand, SOC, pH).
X_trains_2, X_tests_2 = [], []
for target_idx in range(5):
    X_fit, X_holdout = utils.get_Xs(trains_2[target_idx], itrain, itest)
    X_trains_2.append(X_fit)
    X_tests_2.append(X_holdout)

# Individual names for each target's split.
X_train_2_Ca, X_train_2_P, X_train_2_Sand, X_train_2_SOC, X_train_2_pH = X_trains_2
X_test_2_Ca, X_test_2_P, X_test_2_Sand, X_test_2_SOC, X_test_2_pH = X_tests_2

###############################

# Dataset 3: apply the shared itrain / itest split to each of the five
# per-target feature sets (positions 0..4 = Ca, P, Sand, SOC, pH).
X_trains_3, X_tests_3 = [], []
for target_idx in range(5):
    X_fit, X_holdout = utils.get_Xs(trains_3[target_idx], itrain, itest)
    X_trains_3.append(X_fit)
    X_tests_3.append(X_holdout)

# Individual names for each target's split.
X_train_3_Ca, X_train_3_P, X_train_3_Sand, X_train_3_SOC, X_train_3_pH = X_trains_3
X_test_3_Ca, X_test_3_P, X_test_3_Sand, X_test_3_SOC, X_test_3_pH = X_tests_3

# ###############################

# Dataset 4: apply the shared itrain / itest split to each of the five
# per-target feature sets (positions 0..4 = Ca, P, Sand, SOC, pH).
X_trains_4, X_tests_4 = [], []
for target_idx in range(5):
    X_fit, X_holdout = utils.get_Xs(trains_4[target_idx], itrain, itest)
    X_trains_4.append(X_fit)
    X_tests_4.append(X_holdout)

# Individual names for each target's split.
X_train_4_Ca, X_train_4_P, X_train_4_Sand, X_train_4_SOC, X_train_4_pH = X_trains_4
X_test_4_Ca, X_test_4_P, X_test_4_Sand, X_test_4_SOC, X_test_4_pH = X_tests_4

# ###############################

# Dataset 5: apply the shared itrain / itest split to each of the five
# per-target feature sets (positions 0..4 = Ca, P, Sand, SOC, pH).
X_trains_5, X_tests_5 = [], []
for target_idx in range(5):
    X_fit, X_holdout = utils.get_Xs(trains_5[target_idx], itrain, itest)
    X_trains_5.append(X_fit)
    X_tests_5.append(X_holdout)

# Individual names for each target's split.
X_train_5_Ca, X_train_5_P, X_train_5_Sand, X_train_5_SOC, X_train_5_pH = X_trains_5
X_test_5_Ca, X_test_5_P, X_test_5_Sand, X_test_5_SOC, X_test_5_pH = X_tests_5


In [5]:
# Load every saved dataset_1 model and predict on tests_1.
# The result grid is indexed [target, model]; each cell holds one
# prediction array, hence the object-like dtype.
model_names = ['rbf', 'linear', 'poly']

grid_shape = (len(labels), len(model_names))
test_preds_dataset_1 = np.empty(grid_shape, dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # Models live at data/processed/<dataset>/<target>/models/<name>/<name>.
        model_path = os.path.join(
            basepath,
            'data/processed/%s/%s/models/%s/%s'%('dataset_1', label, model_name, model_name)
        )
        model = joblib.load(model_path)
        test_preds_dataset_1[i, j] = utils.predict_targets(model, tests_1[i])

In [6]:
# Load every saved dataset_2 model and predict on tests_2.
# The result grid is indexed [target, model]; each cell holds one
# prediction array, hence the object-like dtype.
model_names = ['rbf', 'linear', 'poly']

grid_shape = (len(labels), len(model_names))
test_preds_dataset_2 = np.empty(grid_shape, dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # Models live at data/processed/<dataset>/<target>/models/<name>/<name>.
        model_path = os.path.join(
            basepath,
            'data/processed/%s/%s/models/%s/%s'%('dataset_2', label, model_name, model_name)
        )
        model = joblib.load(model_path)
        test_preds_dataset_2[i, j] = utils.predict_targets(model, tests_2[i])

In [7]:
# Load every saved dataset_3 model and predict on tests_3.
# The result grid is indexed [target, model]; each cell holds one
# prediction array, hence the object-like dtype.
model_names = ['rbf', 'linear', 'poly']

grid_shape = (len(labels), len(model_names))
test_preds_dataset_3 = np.empty(grid_shape, dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # Models live at data/processed/<dataset>/<target>/models/<name>/<name>.
        model_path = os.path.join(
            basepath,
            'data/processed/%s/%s/models/%s/%s'%('dataset_3', label, model_name, model_name)
        )
        model = joblib.load(model_path)
        test_preds_dataset_3[i, j] = utils.predict_targets(model, tests_3[i])

In [8]:
# Load every saved dataset_4 model and predict on tests_4.
# The result grid is indexed [target, model]; each cell holds one
# prediction array, hence the object-like dtype.
model_names = ['rbf', 'linear', 'poly']

grid_shape = (len(labels), len(model_names))
test_preds_dataset_4 = np.empty(grid_shape, dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # Models live at data/processed/<dataset>/<target>/models/<name>/<name>.
        model_path = os.path.join(
            basepath,
            'data/processed/%s/%s/models/%s/%s'%('dataset_4', label, model_name, model_name)
        )
        model = joblib.load(model_path)
        test_preds_dataset_4[i, j] = utils.predict_targets(model, tests_4[i])

In [9]:
# Load every saved dataset_5 model and predict on tests_5.
# The result grid is indexed [target, model]; each cell holds one
# prediction array, hence the object-like dtype.
model_names = ['rbf', 'linear', 'poly']

grid_shape = (len(labels), len(model_names))
test_preds_dataset_5 = np.empty(grid_shape, dtype=np.ndarray)

for i, label in enumerate(labels):
    for j, model_name in enumerate(model_names):
        # Models live at data/processed/<dataset>/<target>/models/<name>/<name>.
        model_path = os.path.join(
            basepath,
            'data/processed/%s/%s/models/%s/%s'%('dataset_5', label, model_name, model_name)
        )
        model = joblib.load(model_path)
        test_preds_dataset_5[i, j] = utils.predict_targets(model, tests_5[i])

In [10]:
# For each target, gather the predictions of every (dataset, model) pair:
# dataset 1's three models first, then dataset 2's, and so on (15 entries).
# Each per-target list should have shape of (n_diff_models, len(test)).
_preds_by_dataset = (
    test_preds_dataset_1,
    test_preds_dataset_2,
    test_preds_dataset_3,
    test_preds_dataset_4,
    test_preds_dataset_5,
)

predictions_all_targets = [
    [
        dataset_preds[target_idx][model_idx]
        for dataset_preds in _preds_by_dataset
        for model_idx in range(3)
    ]
    for target_idx in range(5)
]

# Per-target names refer to the very same list objects held above.
(
    predictions_Ca,
    predictions_P,
    predictions_Sand,
    predictions_SOC,
    predictions_pH,
) = predictions_all_targets

In [13]:
# Search, per target, for the blending weights over all candidate predictions
# and report the resulting MCRMSE.
# NOTE(review): predictions_all_targets is built from tests_N (the second
# value returned by utils.load_datasets) while y_tests comes from the itest
# hold-out split - confirm both refer to the same rows before trusting
# these scores.
balanced_preds = []
weights = []

for i in range(5):
    y_true = y_tests[i]
    weight, balanced_pred = find_weights.find(y_true, predictions_all_targets[i])
    weights.append(weight)

    # Single-target score: wrap in lists because mcrmse takes sequences.
    target_score = eval_metric.mcrmse([y_tests[i]], [balanced_pred])
    print('MCRMSE for index:%d is: %f'%(i+1, target_score))
    balanced_preds.append(balanced_pred)

print('\n=================================')
overall_score = eval_metric.mcrmse(y_tests, balanced_preds)
print('MCRMSE for all of the targets: ', overall_score)

MCRMSE for index:1 is: 0.492119
MCRMSE for index:2 is: 0.710202
MCRMSE for index:3 is: 0.362098
MCRMSE for index:4 is: 0.437219
MCRMSE for index:5 is: 0.422529

MCRMSE for all of the targets:  0.484833185494


In [14]:
# Persist the per-target blending weights; a later cell reloads them from
# this same path.
weights_path = os.path.join(basepath, 'data/interim/weights/weights')
joblib.dump(weights, weights_path)

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights_02.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights_03.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights_04.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/interim/weights/weights_05.npy']

In [11]:
# Reload the previously saved per-target blending weights.
weights_path = os.path.join(basepath, 'data/interim/weights/weights')
weights = joblib.load(weights_path)

In [12]:
# Stack each target's list of candidate predictions via
# find_weights.stack_predictions, in the order Ca, P, Sand, SOC, pH.
(
    predictions_Ca_stacked,
    predictions_P_stacked,
    predictions_Sand_stacked,
    predictions_SOC_stacked,
    predictions_pH_stacked,
) = (
    find_weights.stack_predictions(target_predictions)
    for target_predictions in (
        predictions_Ca,
        predictions_P,
        predictions_Sand,
        predictions_SOC,
        predictions_pH,
    )
)

In [13]:
# Blend each target's stacked predictions with that target's weights.
# NOTE(review): tests_1[0] is passed as the first argument for every target
# (not tests_1[k]) - confirm this is intentional.
(
    final_preds_Ca,
    final_preds_P,
    final_preds_Sand,
    final_preds_SOC,
    final_preds_pH,
) = (
    find_weights.balance_predictions(tests_1[0], stacked, weights[target_idx])
    for target_idx, stacked in enumerate((
        predictions_Ca_stacked,
        predictions_P_stacked,
        predictions_Sand_stacked,
        predictions_SOC_stacked,
        predictions_pH_stacked,
    ))
)

In [14]:
# Write the blended predictions into the matching submission columns.
for column, column_values in (
    ('Ca', final_preds_Ca),
    ('P', final_preds_P),
    ('Sand', final_preds_Sand),
    ('SOC', final_preds_SOC),
    ('pH', final_preds_pH),
):
    sample_sub[column] = column_values

In [15]:
# Save the filled-in submission frame as CSV text, without the index column.
submission_path = os.path.join(basepath, 'submissions/5_datasets_only_SVR_regularized')
sample_sub.to_csv(submission_path, index=False)