**Objective**

* Learn how to structure the experiments.
* Learn how to average models trained on different datasets, and to do so only when their predictions are not strongly correlated.
* Learn how to assign weights to the different models when averaging.

In [75]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
sns.set_context('poster')

from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR

import warnings
# Silence every warning for the whole session.
# NOTE(review): this also hides library deprecation/convergence warnings;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Project root, resolved under the current user's home directory.
# NOTE(review): machine-specific location; adjust when running elsewhere.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Make <basepath>/src importable; presumably this is where the `data` and
# `models` packages imported below live (TODO confirm).
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(2)

from data import make_dataset, spectral_band_aggregated
from models import cross_validation, eval_metric

In [2]:
# load files
# All three CSVs are read from <basepath>/data/raw.
# `train` also supplies the target columns used later (Ca, P, Sand, SOC, pH).
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
# Loaded for completeness; not referenced again in the cells shown here.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
# create different datasets
# Three alternative views built from the same raw train/test frames. The Data
# classes come from the project's `data` package; what each one computes is
# not visible from this notebook.
d1 = make_dataset.Data(train, test)
# Same class as d1 but with remove_CO2_features=True; presumably drops the
# CO2-related spectral columns (TODO confirm against make_dataset).
d2 = make_dataset.Data(train, test, remove_CO2_features=True)
# Built by the spectral_band_aggregated module instead of make_dataset.
d3 = spectral_band_aggregated.Data(train, test)

In [4]:
# prepare() yields a (train, test) pair for each dataset variant.
# NOTE(review): later cells split these frames by row position and pair them
# with targets taken from the raw `train` frame, so prepare() is assumed to
# preserve the row order and row count of `train` (TODO confirm).
train_1, test_1 = d1.prepare()
train_2, test_2 = d2.prepare()
train_3, test_3 = d3.prepare()

In [5]:
# Pull the five target columns out of the raw training frame in one pass.
y_Ca, y_P, y_Sand, y_SOC, y_pH = (
    train[target] for target in ('Ca', 'P', 'Sand', 'SOC', 'pH')
)

**Split the datasets into training and test sets.**

In [6]:
# Settings forwarded as keyword arguments to the project's split helper.
# NOTE(review): the name `params` is rebound to an unrelated dict in the final
# cell of this notebook, so re-running this cell out of order changes its meaning.
params = {
    'test_size' : 0.2,
    'random_state' : 4
}

# Split on the number of rows only; the two results are used below as
# positional (.iloc) row indexers shared by all three datasets and all targets.
# Presumably a random 80/20 hold-out split (TODO confirm in cross_validation).
itrain, itest = cross_validation.split_dataset(len(train_1), **params)

In [12]:
def get_Xs(X, itrain, itest):
    """Split a feature frame by row position.

    X      : object supporting positional `.iloc` indexing.
    itrain : row positions of the training part.
    itest  : row positions of the held-out part.

    Returns the pair (training rows, held-out rows).
    """
    return X.iloc[itrain], X.iloc[itest]
    
def get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest):
    """Split the five target series with one shared set of row positions.

    Each target must support positional `.iloc` indexing.

    Returns a pair of lists (training parts, held-out parts); both lists are
    ordered Ca, P, Sand, SOC, pH.
    """
    targets = (y_Ca, y_P, y_Sand, y_SOC, y_pH)

    train_parts = [target.iloc[itrain] for target in targets]
    test_parts = [target.iloc[itest] for target in targets]

    return (train_parts, test_parts)

# Apply one shared row split to each of the three feature frames.
(X_train_1, X_test_1), (X_train_2, X_test_2), (X_train_3, X_test_3) = [
    get_Xs(features, itrain, itest) for features in (train_1, train_2, train_3)
]

# Split the five targets with the same row positions.
y_trains, y_tests = get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest)

# Unpack both lists (order: Ca, P, Sand, SOC, pH) into individually named series.
(
    (y_train_Ca, y_train_P, y_train_Sand, y_train_SOC, y_train_pH),
    (y_test_Ca, y_test_P, y_test_Sand, y_test_SOC, y_test_pH),
) = (y_trains, y_tests)

**Model library.**

In [39]:
class ModelLibrary:
    """A set of five models, one per target, associated with a named dataset.

    pipelines    : indexable collection of models ordered Ca, P, Sand, SOC, pH.
    dataset_name : name of the dataset these models belong to.
    """

    def __init__(self, pipelines, dataset_name):
        self.dataset_name = dataset_name
        self.pipelines = pipelines

        # Position of each target's model inside `pipelines`.
        self.index_dict = {
            label: position
            for position, label in enumerate(('Ca', 'P', 'Sand', 'SOC', 'pH'))
        }

    def map_indexes_by_label(self, label):
        """Return the position in `pipelines` for `label` (KeyError if unknown)."""
        position = self.index_dict[label]
        return position

    def get_model_by_label(self, label):
        """Return the model stored for the target named `label`."""
        return self.pipelines[self.map_indexes_by_label(label)]

In [77]:
# different models

def make_svr_pipeline(n_components=100):
    """Return a fresh, unfitted pipeline: standard scaling -> PCA -> linear-kernel SVR.

    n_components : number of principal components kept by the PCA step.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('model', SVR(kernel='linear'))
    ])


def make_ridge_pipeline():
    """Return a fresh, unfitted pipeline: standard scaling -> Ridge regression."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge())
    ])


# Five independent SVR pipelines and five independent Ridge pipelines. Every
# name is bound to its own estimator object so that fitting one never touches
# another.
pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5 = (
    make_svr_pipeline() for _ in range(5)
)

pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10 = (
    make_ridge_pipeline() for _ in range(5)
)

In [78]:
# Every library receives its own unfitted copies of the template pipelines.
# Handing the same Pipeline objects to both dataset_1 and dataset_2 would let a
# fit on one dataset overwrite the models fitted on the other, leaving the
# dataset_1 library holding dataset_2 models after training.
ml_dataset_1 = ModelLibrary(
    [clone(p) for p in (pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5)],
    'dataset_1'
)
ml_dataset_2 = ModelLibrary(
    [clone(p) for p in (pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5)],
    'dataset_2'
)
ml_dataset_3 = ModelLibrary(
    [clone(p) for p in (pipeline_6, pipeline_7, pipeline_8, pipeline_9, pipeline_10)],
    'dataset_3'
)

In [79]:
class ModelAveraging:
    """Fit per-target models on several datasets and score their hold-out predictions.

    Required keyword arguments (a missing one raises KeyError):
      ml_libraries          : re-iterable collection of objects exposing
                              `dataset_name` and `get_model_by_label(label)`.
      X_train_k, X_test_k   : feature splits for k in 1, 2, 3.
      y_train_<label>,
      y_test_<label>        : target splits for each label in LABELS.
    """

    # Single source of truth for iteration order; the order of LABELS is also
    # the order in which targets are handed to the evaluation metric.
    DATASETS = ('dataset_1', 'dataset_2', 'dataset_3')
    LABELS = ('Ca', 'P', 'Sand', 'SOC', 'pH')

    def __init__(self, **params):
        self.ml_libraries = params['ml_libraries']

        self.datasets = {
            'dataset_1': (params['X_train_1'], params['X_test_1']),
            'dataset_2': (params['X_train_2'], params['X_test_2']),
            'dataset_3': (params['X_train_3'], params['X_test_3'])
        }

        self.labels = {
            'Ca': (params['y_train_Ca'], params['y_test_Ca']),
            'P': (params['y_train_P'], params['y_test_P']),
            'Sand': (params['y_train_Sand'], params['y_test_Sand']),
            'SOC': (params['y_train_SOC'], params['y_test_SOC']),
            'pH': (params['y_train_pH'], params['y_test_pH'])
        }

        # Filled by get_predictions(); defined here so get_mcrmse() can detect
        # that nothing has been computed yet instead of hitting AttributeError.
        self.predictions = {}

    def get_predictions(self):
        """
        For every dataset and every label, fit the matching library's model on
        the training split and predict the held-out split.

        Returns (and stores in `self.predictions`) a nested dict
        {dataset_name: {label: predictions}}. A dataset with no matching
        library maps to an empty dict.
        """
        predictions = {}

        for dataset_name in self.DATASETS:
            predictions[dataset_name] = {}
            Xtr, Xte = self.datasets[dataset_name]

            # The set of libraries for a dataset does not depend on the label,
            # so select them once per dataset rather than once per label.
            libraries = [ml for ml in self.ml_libraries
                         if ml.dataset_name == dataset_name]

            for label in self.LABELS:
                ytr = self.labels[label][0]

                # If several libraries name the same dataset, each is fitted
                # and the last one's predictions are kept.
                for ml in libraries:
                    model = ml.get_model_by_label(label)

                    model.fit(Xtr, ytr)
                    predictions[dataset_name][label] = model.predict(Xte)

        self.predictions = predictions

        return self.predictions

    def get_mcrmse(self):
        """
        Print the MCRMSE of the stored predictions for every dataset.

        Raises RuntimeError when get_predictions() has not been run yet.
        Datasets that lack predictions for some label are reported as
        unavailable instead of aborting the whole report.
        """
        if not self.predictions:
            raise RuntimeError('No predictions available: call get_predictions() first.')

        y_test = [self.labels[label][1] for label in self.LABELS]

        for k, v in self.predictions.items():
            missing = [label for label in self.LABELS if label not in v]

            print('For: %s'%k)
            if missing:
                print('MCRMSE: not available (no predictions for %s)' % ', '.join(missing))
            else:
                y_pred = [v[label] for label in self.LABELS]
                print('MCRMSE: %f'%eval_metric.mcrmse(y_test, y_pred))
            print('\n')

In [None]:
# Bundle the model libraries, the three feature splits and the five target
# splits under the keyword names ModelAveraging expects.
# Note: this rebinds `params`, which earlier held the split settings.
params = dict(
    ml_libraries=[ml_dataset_1, ml_dataset_2, ml_dataset_3],
    X_train_1=X_train_1,
    X_test_1=X_test_1,
    X_train_2=X_train_2,
    X_test_2=X_test_2,
    X_train_3=X_train_3,
    X_test_3=X_test_3,
    y_train_Ca=y_train_Ca,
    y_test_Ca=y_test_Ca,
    y_train_P=y_train_P,
    y_test_P=y_test_P,
    y_train_Sand=y_train_Sand,
    y_test_Sand=y_test_Sand,
    y_train_SOC=y_train_SOC,
    y_test_SOC=y_test_SOC,
    y_train_pH=y_train_pH,
    y_test_pH=y_test_pH,
)

# Fit every model, keep the hold-out predictions, then print one score per dataset.
ma = ModelAveraging(**params)
predictions = ma.get_predictions()
ma.get_mcrmse()