**Objective: Create a model that will serve as the benchmark for the rest of the sessions.**

* Use the feature relevance data frame prepared in the data exploration phase to select features for each target variable.
* See which of the target variables contribute most to the MCRMSE score.
* Set up a solid cross-validation scheme.
* Verify that gains in cross-validation translate to the public leaderboard.

In [74]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.decomposition import RandomizedPCA
from sklearn.externals import joblib

import xgboost as xgb

# Plot styling: white grid background, with seaborn's 'poster' context for sizing.
sns.set_style('whitegrid')
sns.set_context('poster')

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — confirm that is intended.
warnings.filterwarnings('ignore')

# Project root; '~' is expanded to the current user's home directory.
# NOTE(review): the location is hard-coded to one machine layout — adjust it
# when running the notebook elsewhere.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Make the project's `src` directory importable (presumably where the `models`
# package imported right after this lives).
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(0)

from models import eval_metric, cross_validation

In [2]:
# Read the raw CSV files under the project root: the training data, the test
# set and the sample submission.
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, csv_path))
    for csv_path in (
        'data/raw/training.csv',
        'data/raw/sorted_test.csv',
        'data/raw/sample_submission.csv',
    )
)

In [3]:
class Data:
    """Hold the train and test frames so they can be preprocessed together.

    The combined (train stacked on test) frame lives in ``self.data``. It is
    built by ``concat_data`` and, if that was never called, built on demand
    the first time a method needs it.
    """

    def __init__(self, train, test):
        """Store the two frames; the combined frame is created later."""
        self.train = train
        self.test = test
        # Combined train+test frame; None until it is first built.
        self.data = None

    def _combined(self):
        """Return the combined frame, building it first if it does not exist yet."""
        # getattr keeps this safe even for instances created without __init__.
        if getattr(self, 'data', None) is None:
            self.concat_data()
        return self.data

    def concat_data(self):
        """Stack train on top of test row-wise, store the result and return it.

        Calling this again rebuilds the frame from the original inputs, which
        discards any encoding previously applied to ``self.data``.
        """
        self.data = pd.concat((self.train, self.test), axis=0)
        return self.data

    def non_infrared_features(self):
        """Return the train column labels minus the first column and the last five.

        NOTE(review): despite the name, nothing here filters out infrared
        columns; the slice only assumes the first column is an identifier and
        the last five are the targets — confirm against the training file.
        """
        return self.train.columns[1:-5]

    def get_train_test(self):
        """Split the combined frame back into its train and test parts.

        Rows with a non-null ``Ca`` are returned as train, all others as test,
        so a training row with a missing ``Ca`` would end up in the test part.

        Returns:
            tuple: ``(train, test)`` frames taken from the combined frame.
        """
        data = self._combined()
        mask = data.Ca.notnull()
        train = data.loc[mask]
        test = data.loc[~mask]

        return train, test

    def encode_categorical_features(self, feature_name):
        """Label-encode one column of the combined frame in place.

        The encoder is fitted on the combined frame, so train and test rows
        share a single mapping.

        Args:
            feature_name: name of the column to encode.

        Returns:
            The encoded column of the combined frame.
        """
        data = self._combined()
        lbl = LabelEncoder()
        lbl.fit(data[feature_name])

        data[feature_name] = lbl.transform(data[feature_name])
        return data[feature_name]

In [4]:
# Wrap the raw train/test frames in the Data helper used for joint preprocessing.
d = Data(train, test)

In [5]:
# Feature column names, taken from the original training frame.
features = d.non_infrared_features()
# Build the combined train+test frame that the next two calls operate on.
d.concat_data()
# Label-encode 'Depth' on the combined frame so train and test share one encoding.
d.encode_categorical_features('Depth')
# Split back apart: rows with a non-null Ca are train, the rest are test.
train_, test_ = d.get_train_test()

In [6]:
# Feature matrices for the labelled rows and for the rows to be predicted.
X, Xtest = train_[features], test_[features]

# One target series per predicted property, taken from the labelled rows.
y_Ca, y_SOC, y_Sand, y_pH, y_P = (
    train_[target_name] for target_name in ('Ca', 'SOC', 'Sand', 'pH', 'P')
)

**Evaluation Metric: Mean Columnwise Root Mean Squared Error (MCRMSE)**

In [7]:
# Hold-out configuration forwarded to the project's split helper.
params = {'test_size': 0.2, 'random_state': 3}

# Row positions (used with .iloc below) of the training and hold-out parts.
itrain, itest = cross_validation.split_dataset(len(X), **params)

# Apply the same row split to the features and to every target.
X_train, X_test = X.iloc[itrain], X.iloc[itest]

y_train_Ca, y_test_Ca = y_Ca.iloc[itrain], y_Ca.iloc[itest]
y_train_P, y_test_P = y_P.iloc[itrain], y_P.iloc[itest]
y_train_Sand, y_test_Sand = y_Sand.iloc[itrain], y_Sand.iloc[itest]
y_train_SOC, y_test_SOC = y_SOC.iloc[itrain], y_SOC.iloc[itest]
y_train_pH, y_test_pH = y_pH.iloc[itrain], y_pH.iloc[itest]

In [96]:
def build_pipeline(n_components=200, colsample_bytree=0.6, random_state=4):
    """Return a fresh, unfitted PCA -> scaling -> XGBoost regression pipeline.

    Every target gets the same architecture, so the definition lives in one
    place instead of being copy-pasted once per target.

    Args:
        n_components: number of principal components kept by the PCA step.
        colsample_bytree: value passed to XGBRegressor's ``colsample_bytree``.
        random_state: seed for the randomized PCA.

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline instance, independent of
        any other pipeline returned by this function.
    """
    return Pipeline([
        ('pca', RandomizedPCA(n_components=n_components, whiten=True, random_state=random_state)),
        ('scale', StandardScaler()),
        ('model', xgb.XGBRegressor(colsample_bytree=colsample_bytree))
    ])


# One independent pipeline per target. The cells that follow use them as:
# pipeline1 -> Ca, pipeline2 -> P, pipeline3 -> Sand, pipeline4 -> SOC, pipeline5 -> pH.
pipeline1 = build_pipeline()
pipeline2 = build_pipeline()
pipeline3 = build_pipeline()
pipeline4 = build_pipeline()
pipeline5 = build_pipeline()

**Choose features based on the feature relevance data frame.**

In [97]:
# Load the feature relevance data frame stored under data/interim.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
feature_rel_df = joblib.load(os.path.join(basepath, 'data/interim/feature_relevance'))

In [98]:
# For each target, order the relevance table by that target's r-squared column
# and keep the first 1000 entries of `variables` as the feature subset.
# NOTE(review): the names say "top_2000" but the slice keeps 1000 — confirm
# which one is intended.
# NOTE(review): sort_values sorts ascending by default, so these are the 1000
# lowest-scoring variables; if a higher r-squared means "more relevant",
# ascending=False is probably what was meant — verify against the exploration notebook.
(top_2000_features_Ca,
 top_2000_features_P,
 top_2000_features_Sand,
 top_2000_features_SOC,
 top_2000_features_pH) = (
    feature_rel_df.sort_values(by=score_column).variables[:1000]
    for score_column in (
        'Ca_r_squared',
        'P_r_squared',
        'Sand_r_squared',
        'SOC_r_squared',
        'pH_r_squared',
    )
)

**Public Leaderboard Score: 0.55002, Private Leaderboard Score: 0.53763**

In [99]:
# Fit each target's pipeline on the training split, restricted to the feature
# columns selected for that target.
# The value of the last call is what the notebook echoes as this cell's output.
pipeline1.fit(X_train[top_2000_features_Ca], y_train_Ca)
pipeline2.fit(X_train[top_2000_features_P], y_train_P)
pipeline3.fit(X_train[top_2000_features_Sand], y_train_Sand)
pipeline4.fit(X_train[top_2000_features_SOC], y_train_SOC)
pipeline5.fit(X_train[top_2000_features_pH], y_train_pH)

Pipeline(steps=[('pca', RandomizedPCA(copy=True, iterated_power=3, n_components=200, random_state=4,
       whiten=True)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=0.1, ...g:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [100]:
# Hold-out predictions for every target, each made from that target's own
# pipeline and selected feature columns.
y_pred_Ca, y_pred_P, y_pred_Sand, y_pred_SOC, y_pred_pH = (
    target_pipeline.predict(X_test[selected_columns])
    for target_pipeline, selected_columns in (
        (pipeline1, top_2000_features_Ca),
        (pipeline2, top_2000_features_P),
        (pipeline3, top_2000_features_Sand),
        (pipeline4, top_2000_features_SOC),
        (pipeline5, top_2000_features_pH),
    )
)

In [101]:
# Score the hold-out predictions. Both lists name the targets in the same
# order (Ca, P, pH, Sand, SOC); presumably the metric pairs them element-wise.
print('MCRMSE on unseen examples: %f' % eval_metric.mcrmse(
    [y_test_Ca, y_test_P, y_test_pH, y_test_Sand, y_test_SOC],
    [y_pred_Ca, y_pred_P, y_pred_pH, y_pred_Sand, y_pred_SOC]
))

MCRMSE on unseen examples: 0.576012


**Train the models on the full dataset.**

In [87]:
# Refit the same five pipeline objects on all labelled rows before predicting
# the test set; this replaces the fits made on the training split.
# NOTE(review): the output recorded under this cell shows n_components=50 and
# colsample_bytree=1, and its execution count (87) is lower than that of the
# cells above it. It appears to come from an earlier run with different
# pipeline settings — re-run the notebook top to bottom to refresh it.
pipeline1.fit(X[top_2000_features_Ca], y_Ca)
pipeline2.fit(X[top_2000_features_P], y_P)
pipeline3.fit(X[top_2000_features_Sand], y_Sand)
pipeline4.fit(X[top_2000_features_SOC], y_SOC)
pipeline5.fit(X[top_2000_features_pH], y_pH)

Pipeline(steps=[('pca', RandomizedPCA(copy=True, iterated_power=3, n_components=50, random_state=4,
       whiten=True)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max...g:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [88]:
# Test-set predictions for every target, each made from that target's own
# pipeline and selected feature columns.
predict_Ca, predict_P, predict_Sand, predict_SOC, predict_pH = (
    target_pipeline.predict(Xtest[selected_columns])
    for target_pipeline, selected_columns in (
        (pipeline1, top_2000_features_Ca),
        (pipeline2, top_2000_features_P),
        (pipeline3, top_2000_features_Sand),
        (pipeline4, top_2000_features_SOC),
        (pipeline5, top_2000_features_pH),
    )
)

In [89]:
# Write the test-set predictions into the submission frame, one column per target.
# Assumes the rows of sample_sub are in the same order as the rows of Xtest — TODO confirm.
sample_sub['Ca']   = predict_Ca
sample_sub['P']    = predict_P
sample_sub['pH']   = predict_pH
sample_sub['SOC']  = predict_SOC
sample_sub['Sand'] = predict_Sand

In [90]:
# Save the submission as CSV under the project's submissions directory, without the row index.
sample_sub.to_csv(os.path.join(basepath, 'submissions/feature_relevance_xgbregressor.csv'), index=False)