**Playground.**

* Test various pipelines on the different prepared datasets.


**Notes**

* SVR does not work on the transformed (t-SNE) dataset.
* The random forest regressor works well on the transformed dataset.
* What if we remove features with low standard deviation? (Yes, this is a sensible idea.)

In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.decomposition import RandomizedPCA
from sklearn.externals import joblib

# Plot styling: white grid background with the large 'poster' scaling context.
sns.set_style('whitegrid')
sns.set_context('poster')

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries imported above.
import warnings
warnings.filterwarnings('ignore')

# Project root. NOTE(review): hard-coded to a location under the user's home
# directory; the notebook will not run elsewhere without editing this path.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Put <basepath>/src on the import path before the project imports below
# (presumably where the `models` and `helper` packages live).
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(0)

# Project-local modules; these imports must stay after the sys.path.append above.
from models import eval_metric, cross_validation, models_definition
from helper import utils

In [4]:
# Load the raw CSV files from <basepath>/data/raw:
#   train      - training.csv (later passed to utils.define_target_variables)
#   test       - sorted_test.csv
#   sample_sub - sample_submission.csv
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [5]:
# Target-variable labels, used below to load one prepared dataset per target.
# NOTE(review): presumably Ca, P, Sand, SOC and pH in that order - confirm in helper.utils.
labels = utils.get_labels()

In [6]:
# Name of the prepared dataset variant to evaluate in this notebook.
dataset_name = 'dataset_6'

# Load training and test sets for every target variable.
# `trains` is indexed by position later on (trains[0] ... trains[4]), one entry per target.
trains, tests = utils.load_datasets(dataset_name, labels)

In [7]:
# Extract the five targets (Ca, P, Sand, SOC, pH) from the raw training data.
y_Ca, y_P, y_Sand, y_SOC, y_pH = utils.define_target_variables(train)

**Cross-validation.**

In [8]:
def cross_validate(dataset_name, X_trains, y_Ca, y_P, y_Sand, y_SOC, y_pH):
    """Cross-validate every model defined for a dataset and print its mean score.

    A fixed 80/20 split (random_state=3) is drawn first; only the training
    side of that split is passed to the cross-validation scheme.

    Parameters
    ----------
    dataset_name : str
        Name of the prepared dataset; selects the models via
        ``models_definition.get_models_by_dataset`` and the directory whose
        entries are used as model names.
    X_trains : sequence
        Per-target feature sets, in the same order as the target arguments
        (Ca, P, Sand, SOC, pH).
    y_Ca, y_P, y_Sand, y_SOC, y_pH
        Target values for each of the five targets.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    split_params = {
        'test_size': 0.2,
        'random_state': 3
        }

    # Size the split from the targets passed in rather than from the
    # notebook-global `train`, so the function depends only on its arguments.
    # NOTE(review): assumes each target has one entry per training row - confirm.
    itrain, itest = cross_validation.split_dataset(len(y_Ca), **split_params)
    models = models_definition.get_models_by_dataset(dataset_name)

    # os.listdir returns entries in arbitrary order; sort them so that the
    # printed labels are reproducible between runs.
    # NOTE(review): pairing models with directory entries by position assumes
    # get_models_by_dataset returns models in the same (sorted) order - verify.
    models_dir = os.path.join(basepath, 'data/processed/%s/%s/models'%(dataset_name, 'Ca'))
    model_names = sorted(os.listdir(models_dir))

    # Keep only the training side of the split for each target's features;
    # the held-out side is not used by this function.
    X_train_splits = []
    for X_target in X_trains:
        X_split_train, _ = utils.get_Xs(X_target, itrain, itest)
        X_train_splits.append(X_split_train)

    y_trains, _ = utils.get_Ys(y_Ca, y_P, y_Sand, y_SOC, y_pH, itrain, itest)

    for i, model in enumerate(models):
        # Fall back to a positional label instead of raising IndexError when
        # the models directory has fewer entries than there are models.
        name = model_names[i] if i < len(model_names) else 'model_%d'%i
        print('Model: %s\n'%name)
        cv_score = cross_validation.cv_scheme(model, X_train_splits, y_trains)
        print('Mean cross validations score: %f\n'%cv_score)

In [9]:
# Pass `dataset_name` (the same value used to load `trains`) instead of a
# repeated literal, so the evaluated models always match the loaded dataset.
cross_validate(dataset_name, trains, y_Ca, y_P, y_Sand, y_SOC, y_pH)

Model: gbr

Fold: 0
MCRMSE score for label: Ca -> 0.706801
MCRMSE score for label: P -> 1.463820
MCRMSE score for label: Sand -> 0.678648
MCRMSE score for label: SOC -> 0.800042
MCRMSE score for label: pH -> 0.651900

MCRMSE score: 0.860242

Fold: 1
MCRMSE score for label: Ca -> 1.060429
MCRMSE score for label: P -> 1.300924
MCRMSE score for label: Sand -> 0.673482
MCRMSE score for label: SOC -> 0.518281
MCRMSE score for label: pH -> 0.627801

MCRMSE score: 0.836184

Fold: 2
MCRMSE score for label: Ca -> 0.881758
MCRMSE score for label: P -> 0.808320
MCRMSE score for label: Sand -> 0.567008
MCRMSE score for label: SOC -> 0.647887
MCRMSE score for label: pH -> 0.642886

MCRMSE score: 0.709572

Fold: 3
MCRMSE score for label: Ca -> 0.586624
MCRMSE score for label: P -> 1.481323
MCRMSE score for label: Sand -> 0.521044
MCRMSE score for label: SOC -> 0.634650
MCRMSE score for label: pH -> 0.508060

MCRMSE score: 0.746340

Fold: 4
MCRMSE score for label: Ca -> 0.917614
MCRMSE score for labe

In [12]:
# Inspect the feature columns of the first per-target training set
# (index 0 is the entry cross_validate uses for Ca).
trains[0].columns

Index(['2000_0', '2000_1', '4000_0', '4000_1', '3000_0', '3000_1', '1000_0',
       '1000_1', '<1000_0', '<1000_1'],
      dtype='object')