**Objective**

* How should the experiments be structured?
* How can models trained on different datasets be averaged, including a model only when its predictions are not strongly correlated with those of the others?
* How should weights be assigned to the different models when averaging?

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
sns.set_context('poster')

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.externals import joblib

from scipy.optimize import nnls

import warnings
warnings.filterwarnings('ignore')

# Root of the project checkout. The location is hard-coded relative to the
# user's home directory, so adjust it before running the notebook elsewhere.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Make <basepath>/src importable; presumably the `data` and `models` packages
# imported right after this live there -- TODO confirm.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random generator.
np.random.seed(2)

from data import make_dataset, spectral_band_aggregated
from models import cross_validation, eval_metric

In [2]:
# load files
# Read the three raw CSV files found under <basepath>/data/raw.
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
# `sample_sub` is later filled with predictions and written out as the submission.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
# create different datasets
# Three feature-set builders over the same raw train/test frames.
# NOTE(review): what each `Data` class does is defined in the project modules
# `make_dataset` / `spectral_band_aggregated` -- not visible here.
d1 = make_dataset.Data(train, test)
# Same builder as d1 but with `remove_CO2_features=True`.
d2 = make_dataset.Data(train, test, remove_CO2_features=True)
d3 = spectral_band_aggregated.Data(train, test)

In [4]:
# Each builder returns a (train, test) pair of prepared feature tables.
# The train tables are later indexed with `.iloc`, so presumably they are
# pandas objects row-aligned with `train` -- TODO confirm.
train_1, test_1 = d1.prepare()
train_2, test_2 = d2.prepare()
train_3, test_3 = d3.prepare()

In [5]:
# Pull the five target columns out of the raw training frame.
y_Ca = train['Ca']
y_P = train['P']
y_Sand = train['Sand']
y_SOC = train['SOC']
y_pH = train['pH']

**Split each dataset into a training set and a hold-out test set.**

In [11]:
# Settings forwarded to the project's split helper.
params = dict(test_size=0.2, random_state=4)

# One set of row positions, computed from the length of dataset 1 and reused
# for every dataset and every target further down.
itrain, itest = cross_validation.split_dataset(len(train_1), **params)

In [12]:
def get_Xs(X, itrain, itest):
    """Split a feature table by row position.

    X must support positional indexing through ``.iloc``; ``itrain`` and
    ``itest`` are the row positions for the two parts.

    Returns the pair ``(X_train, X_test)``.
    """
    return X.iloc[itrain], X.iloc[itest]
    
def get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest):
    """Split the five target vectors by row position.

    Every target must support positional indexing through ``.iloc``.

    Returns a 2-tuple ``(train_parts, test_parts)``; each element is a list
    holding the selected rows of the targets in the order
    Ca, P, Sand, SOC, pH.
    """
    targets = (y_Ca, y_P, y_Sand, y_SOC, y_pH)

    train_parts = [target.iloc[itrain] for target in targets]
    test_parts = [target.iloc[itest] for target in targets]

    return (train_parts, test_parts)

# Apply the same train/hold-out row positions to each of the three feature sets.
(X_train_1, X_test_1) = get_Xs(X=train_1, itrain=itrain, itest=itest)
(X_train_2, X_test_2) = get_Xs(X=train_2, itrain=itrain, itest=itest)
(X_train_3, X_test_3) = get_Xs(X=train_3, itrain=itrain, itest=itest)

# Split all five targets with those positions as well.
(y_trains, y_tests) = get_Ys(y_Ca=y_Ca, y_P=y_P, y_Sand=y_Sand, y_SOC=y_SOC, y_pH=y_pH,
                             itrain=itrain, itest=itest)

# Unpack the per-target pieces; the order is Ca, P, Sand, SOC, pH.
(y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH) = y_trains
(y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH) = y_tests

**List of models.**

In [6]:
# different models
#
# Fifteen independent, unfitted pipelines: one per (dataset, target) pair.
# They are built by two small factories rather than fifteen copy-pasted
# literals, so a change to a model recipe is made in exactly one place.


def make_svr_pipeline(n_components=100):
    """Return a new, unfitted standardise -> PCA -> linear-SVR pipeline.

    n_components: number of principal components kept by the PCA step
    (100 by default, the value used for every SVR pipeline in this notebook).
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('model', SVR(kernel='linear'))
    ])


def make_ridge_pipeline():
    """Return a new, unfitted standardise -> Ridge pipeline (default Ridge settings)."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge())
    ])


# Each name gets its own Pipeline object (never a shared reference), because
# every pipeline is fitted separately later on.

# pipelines 1-5: SVR recipe, fitted on dataset 1 (targets Ca, P, Sand, SOC, pH)
pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5 = (
    make_svr_pipeline() for _ in range(5)
)

# pipelines 6-10: SVR recipe, fitted on dataset 2 (same target order)
pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10 = (
    make_svr_pipeline() for _ in range(5)
)

# pipelines 11-15: Ridge recipe, fitted on dataset 3 (same target order)
pipeline_11, pipeline_12, pipeline_13, pipeline_14, pipeline_15 = (
    make_ridge_pipeline() for _ in range(5)
)

**Fit models.**

In [None]:
# Fit every pipeline on the training part of the split only, so the hold-out
# rows stay unseen.
# Dataset 1 features -> pipelines 1-5 (targets Ca, P, Sand, SOC, pH).
pipeline_1.fit(X_train_1, y_train_Ca)
pipeline_2.fit(X_train_1, y_train_P)
pipeline_3.fit(X_train_1, y_train_Sand)
pipeline_4.fit(X_train_1, y_train_SOC)
pipeline_5.fit(X_train_1, y_train_pH)

# Dataset 2 features -> pipelines 6-10 (same target order).
pipeline_6.fit(X_train_2, y_train_Ca)
pipeline_7.fit(X_train_2, y_train_P)
pipeline_8.fit(X_train_2, y_train_Sand)
pipeline_9.fit(X_train_2, y_train_SOC)
pipeline_10.fit(X_train_2, y_train_pH)

# Dataset 3 features -> pipelines 11-15 (same target order).
pipeline_11.fit(X_train_3, y_train_Ca)
pipeline_12.fit(X_train_3, y_train_P)
pipeline_13.fit(X_train_3, y_train_Sand)
pipeline_14.fit(X_train_3, y_train_SOC)
pipeline_15.fit(X_train_3, y_train_pH)

In [None]:
# Persist the split-fitted pipelines with joblib, one file per
# (target, dataset) under data/processed/pipeline_train_<target>/dataset_<k>/model/.
# NOTE(review): the destination directories presumably have to exist already --
# nothing in this notebook creates them.
joblib.dump(pipeline_1, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_1/model/Ca'))
joblib.dump(pipeline_2, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_1/model/P'))
joblib.dump(pipeline_3, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_1/model/Sand'))
joblib.dump(pipeline_4, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_1/model/SOC'))
joblib.dump(pipeline_5, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_1/model/pH'))

joblib.dump(pipeline_6, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_2/model/Ca'))
joblib.dump(pipeline_7, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_2/model/P'))
joblib.dump(pipeline_8, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_2/model/Sand'))
joblib.dump(pipeline_9, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_2/model/SOC'))
joblib.dump(pipeline_10, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_2/model/pH'))

joblib.dump(pipeline_11, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_3/model/Ca'))
joblib.dump(pipeline_12, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_3/model/P'))
joblib.dump(pipeline_13, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_3/model/Sand'))
joblib.dump(pipeline_14, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_3/model/SOC'))
joblib.dump(pipeline_15, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_3/model/pH'))

In [None]:
# Hold-out predictions, one array per (dataset, target). Inside each group the
# models are listed in target order: Ca, P, Sand, SOC, pH.
y_dataset_1_Ca, y_dataset_1_P, y_dataset_1_Sand, y_dataset_1_SOC, y_dataset_1_pH = (
    model.predict(X_test_1)
    for model in (pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5)
)

y_dataset_2_Ca, y_dataset_2_P, y_dataset_2_Sand, y_dataset_2_SOC, y_dataset_2_pH = (
    model.predict(X_test_2)
    for model in (pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10)
)

y_dataset_3_Ca, y_dataset_3_P, y_dataset_3_Sand, y_dataset_3_SOC, y_dataset_3_pH = (
    model.predict(X_test_3)
    for model in (pipeline_11, pipeline_12, pipeline_13, pipeline_14, pipeline_15)
)

In [None]:
# Cache the hold-out predictions with joblib, one file per (target, dataset)
# under data/processed/pipeline_train_<target>/dataset_<k>/predictions/.
# The cell that loads predictions from disk reads these same paths back.
joblib.dump(y_dataset_1_Ca, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_1/predictions/Ca'))
joblib.dump(y_dataset_1_P, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_1/predictions/P'))
joblib.dump(y_dataset_1_Sand, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_1/predictions/Sand'))
joblib.dump(y_dataset_1_SOC, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_1/predictions/SOC'))
joblib.dump(y_dataset_1_pH, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_1/predictions/pH'))

joblib.dump(y_dataset_2_Ca, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_2/predictions/Ca'))
joblib.dump(y_dataset_2_P, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_2/predictions/P'))
joblib.dump(y_dataset_2_Sand, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_2/predictions/Sand'))
joblib.dump(y_dataset_2_SOC, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_2/predictions/SOC'))
joblib.dump(y_dataset_2_pH, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_2/predictions/pH'))

joblib.dump(y_dataset_3_Ca, os.path.join(basepath, 'data/processed/pipeline_train_Ca/dataset_3/predictions/Ca'))
joblib.dump(y_dataset_3_P, os.path.join(basepath, 'data/processed/pipeline_train_P/dataset_3/predictions/P'))
joblib.dump(y_dataset_3_Sand, os.path.join(basepath, 'data/processed/pipeline_train_Sand/dataset_3/predictions/Sand'))
joblib.dump(y_dataset_3_SOC, os.path.join(basepath, 'data/processed/pipeline_train_SOC/dataset_3/predictions/SOC'))
joblib.dump(y_dataset_3_pH, os.path.join(basepath, 'data/processed/pipeline_train_pH/dataset_3/predictions/pH'))

In [8]:
# Read the cached hold-out predictions back from disk, so the slow fit/predict
# cells do not have to be re-run.

def _load_cached(relative_path):
    """Load a joblib artifact stored at ``relative_path`` under ``basepath``."""
    return joblib.load(os.path.join(basepath, relative_path))

# dataset 1
y_dataset_1_Ca = _load_cached('data/processed/pipeline_train_Ca/dataset_1/predictions/Ca')
y_dataset_1_P = _load_cached('data/processed/pipeline_train_P/dataset_1/predictions/P')
y_dataset_1_Sand = _load_cached('data/processed/pipeline_train_Sand/dataset_1/predictions/Sand')
y_dataset_1_SOC = _load_cached('data/processed/pipeline_train_SOC/dataset_1/predictions/SOC')
y_dataset_1_pH = _load_cached('data/processed/pipeline_train_pH/dataset_1/predictions/pH')

# dataset 2
y_dataset_2_Ca = _load_cached('data/processed/pipeline_train_Ca/dataset_2/predictions/Ca')
y_dataset_2_P = _load_cached('data/processed/pipeline_train_P/dataset_2/predictions/P')
y_dataset_2_Sand = _load_cached('data/processed/pipeline_train_Sand/dataset_2/predictions/Sand')
y_dataset_2_SOC = _load_cached('data/processed/pipeline_train_SOC/dataset_2/predictions/SOC')
y_dataset_2_pH = _load_cached('data/processed/pipeline_train_pH/dataset_2/predictions/pH')

# dataset 3
y_dataset_3_Ca = _load_cached('data/processed/pipeline_train_Ca/dataset_3/predictions/Ca')
y_dataset_3_P = _load_cached('data/processed/pipeline_train_P/dataset_3/predictions/P')
y_dataset_3_Sand = _load_cached('data/processed/pipeline_train_Sand/dataset_3/predictions/Sand')
y_dataset_3_SOC = _load_cached('data/processed/pipeline_train_SOC/dataset_3/predictions/SOC')
y_dataset_3_pH = _load_cached('data/processed/pipeline_train_pH/dataset_3/predictions/pH')

In [7]:
def weight_selected(data, labels):
    """Fit non-negative least-squares weights for the columns of ``data``.

    Only the first ``len(labels)`` rows of ``data`` are used. The return
    value is the NNLS solution vector (one weight per column); the residual
    norm reported by the solver is discarded.
    """
    n_rows = len(labels)
    solution = nnls(data[:n_rows], labels)
    return solution[0]

In [9]:
# One matrix per target: rows are samples, columns are the models built on
# dataset 1, dataset 2 and dataset 3 (in that order).
preds_Ca = np.vstack((y_dataset_1_Ca, y_dataset_2_Ca, y_dataset_3_Ca)).transpose()
preds_P = np.vstack((y_dataset_1_P, y_dataset_2_P, y_dataset_3_P)).transpose()
preds_Sand = np.vstack((y_dataset_1_Sand, y_dataset_2_Sand, y_dataset_3_Sand)).transpose()
preds_SOC = np.vstack((y_dataset_1_SOC, y_dataset_2_SOC, y_dataset_3_SOC)).transpose()
preds_pH = np.vstack((y_dataset_1_pH, y_dataset_2_pH, y_dataset_3_pH)).transpose()

In [13]:
# Per-target NNLS weights of the three models, estimated against the hold-out
# targets. Later cells only use them to decide which models enter the average.
weights_Ca = weight_selected(data=preds_Ca, labels=y_test_Ca)
weights_P = weight_selected(data=preds_P, labels=y_test_P)
weights_Sand = weight_selected(data=preds_Sand, labels=y_test_Sand)
weights_SOC = weight_selected(data=preds_SOC, labels=y_test_SOC)
weights_pH = weight_selected(data=preds_pH, labels=y_test_pH)

In [14]:
# Plain (unweighted) mean over the models whose NNLS weight is strictly
# positive, trimmed to the number of hold-out targets.
balanced_pred_Ca = np.mean(preds_Ca[:, weights_Ca > 0], axis=1)[:len(y_test_Ca)]
balanced_pred_P = np.mean(preds_P[:, weights_P > 0], axis=1)[:len(y_test_P)]
balanced_pred_Sand = np.mean(preds_Sand[:, weights_Sand > 0], axis=1)[:len(y_test_Sand)]
balanced_pred_SOC = np.mean(preds_SOC[:, weights_SOC > 0], axis=1)[:len(y_test_SOC)]
balanced_pred_pH = np.mean(preds_pH[:, weights_pH > 0], axis=1)[:len(y_test_pH)]

In [15]:
# Score the blended hold-out predictions with the project's MCRMSE metric.
# The model selection used these same hold-out targets, so the figure is
# likely optimistic.
holdout_truth = [y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH]
holdout_blend = [balanced_pred_Ca, balanced_pred_P, balanced_pred_Sand,
                 balanced_pred_SOC, balanced_pred_pH]
holdout_score = eval_metric.mcrmse(holdout_truth, holdout_blend)
print('MCRMSE after balancing: %f' % holdout_score)

MCRMSE after balancing: 0.412842


**Training on the full dataset.**

In [16]:
# Refit the SAME pipeline objects in place on every training row. After this
# cell the pipelines no longer hold the fits made on the training split.
# Dataset 1 features -> pipelines 1-5 (targets Ca, P, Sand, SOC, pH).
pipeline_1.fit(train_1, y_Ca)
pipeline_2.fit(train_1, y_P)
pipeline_3.fit(train_1, y_Sand)
pipeline_4.fit(train_1, y_SOC)
pipeline_5.fit(train_1, y_pH)

# Dataset 2 features -> pipelines 6-10 (same target order).
pipeline_6.fit(train_2, y_Ca)
pipeline_7.fit(train_2, y_P)
pipeline_8.fit(train_2, y_Sand)
pipeline_9.fit(train_2, y_SOC)
pipeline_10.fit(train_2, y_pH)

# Dataset 3 features -> pipelines 11-15 (same target order).
pipeline_11.fit(train_3, y_Ca)
pipeline_12.fit(train_3, y_P)
pipeline_13.fit(train_3, y_Sand)
pipeline_14.fit(train_3, y_SOC)
pipeline_15.fit(train_3, y_pH)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [18]:
# Persist the pipelines refitted on the full training data, one file per
# (target, dataset) under data/processed/pipeline_full_<target>/dataset_<k>/model/.
# These paths are distinct from the pipeline_train_* ones used for the split fits.
joblib.dump(pipeline_1, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_1/model/Ca'))
joblib.dump(pipeline_2, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_1/model/P'))
joblib.dump(pipeline_3, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_1/model/Sand'))
joblib.dump(pipeline_4, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_1/model/SOC'))
joblib.dump(pipeline_5, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_1/model/pH'))

joblib.dump(pipeline_6, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_2/model/Ca'))
joblib.dump(pipeline_7, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_2/model/P'))
joblib.dump(pipeline_8, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_2/model/Sand'))
joblib.dump(pipeline_9, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_2/model/SOC'))
joblib.dump(pipeline_10, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_2/model/pH'))

joblib.dump(pipeline_11, os.path.join(basepath, 'data/processed/pipeline_full_Ca/dataset_3/model/Ca'))
joblib.dump(pipeline_12, os.path.join(basepath, 'data/processed/pipeline_full_P/dataset_3/model/P'))
joblib.dump(pipeline_13, os.path.join(basepath, 'data/processed/pipeline_full_Sand/dataset_3/model/Sand'))
joblib.dump(pipeline_14, os.path.join(basepath, 'data/processed/pipeline_full_SOC/dataset_3/model/SOC'))
joblib.dump(pipeline_15, os.path.join(basepath, 'data/processed/pipeline_full_pH/dataset_3/model/pH'))

['/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_01.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_02.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_03.npy',
 '/home/abhishek/Desktop/src/African_Soil_Property_Prediction/data/processed/pipeline_full_pH/dataset_3/model/pH_04.npy']

In [19]:
# Predictions on the competition test features, one array per (dataset, target).
# These reuse the y_dataset_* names, replacing the hold-out predictions.
# Models inside each group are in target order: Ca, P, Sand, SOC, pH.
y_dataset_1_Ca, y_dataset_1_P, y_dataset_1_Sand, y_dataset_1_SOC, y_dataset_1_pH = (
    model.predict(test_1)
    for model in (pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5)
)

y_dataset_2_Ca, y_dataset_2_P, y_dataset_2_Sand, y_dataset_2_SOC, y_dataset_2_pH = (
    model.predict(test_2)
    for model in (pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10)
)

y_dataset_3_Ca, y_dataset_3_P, y_dataset_3_Sand, y_dataset_3_SOC, y_dataset_3_pH = (
    model.predict(test_3)
    for model in (pipeline_11, pipeline_12, pipeline_13, pipeline_14, pipeline_15)
)

In [20]:
# Rebuild the per-target prediction matrices from the test-set predictions:
# rows are test samples, columns are the dataset 1 / 2 / 3 models.
preds_Ca = np.vstack((y_dataset_1_Ca, y_dataset_2_Ca, y_dataset_3_Ca)).transpose()
preds_P = np.vstack((y_dataset_1_P, y_dataset_2_P, y_dataset_3_P)).transpose()
preds_Sand = np.vstack((y_dataset_1_Sand, y_dataset_2_Sand, y_dataset_3_Sand)).transpose()
preds_SOC = np.vstack((y_dataset_1_SOC, y_dataset_2_SOC, y_dataset_3_SOC)).transpose()
preds_pH = np.vstack((y_dataset_1_pH, y_dataset_2_pH, y_dataset_3_pH)).transpose()

In [24]:
# Per target, average the test-set predictions of the models that received a
# strictly positive NNLS weight on the hold-out split.
# Every test row is kept. These are predictions for the test features, so
# their length has nothing to do with the training targets; trimming them to
# len(y_<target>) would silently drop rows whenever the test set is longer
# than the training set.
balanced_pred_Ca = preds_Ca[:, weights_Ca > 0].mean(axis=1)
balanced_pred_P = preds_P[:, weights_P > 0].mean(axis=1)
balanced_pred_Sand = preds_Sand[:, weights_Sand > 0].mean(axis=1)
balanced_pred_SOC = preds_SOC[:, weights_SOC > 0].mean(axis=1)
balanced_pred_pH = preds_pH[:, weights_pH > 0].mean(axis=1)

In [25]:
# Write the blended test predictions into the submission frame, one column
# per target.
for target_name, blended in (
    ('Ca', balanced_pred_Ca),
    ('P', balanced_pred_P),
    ('pH', balanced_pred_pH),
    ('SOC', balanced_pred_SOC),
    ('Sand', balanced_pred_Sand),
):
    sample_sub[target_name] = blended

**Public Leaderboard Score: 0.49149, Private Leaderboard Score: 0.53293**

In [26]:
# Save the filled-in submission frame as CSV, without the index column.
submission_path = os.path.join(basepath, 'submissions/average_models.csv')
sample_sub.to_csv(submission_path, index=False)