In [146]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.decomposition import RandomizedPCA

# Global seaborn styling applied to every figure drawn after this cell.
sns.set_style('whitegrid')
sns.set_context('poster')

# NOTE(review): this filter silences *every* warning for the rest of the session,
# including deprecation notices emitted by the libraries imported above.
# Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Project root, resolved relative to the current user's home directory; the
# project's 'src' folder is appended to sys.path so its modules become importable.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(0)

In [2]:
# Read the three raw CSVs (training set, test set, submission template),
# in that order, from the project's data/raw folder.
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, relative_path))
    for relative_path in (
        'data/raw/training.csv',
        'data/raw/sorted_test.csv',
        'data/raw/sample_submission.csv',
    )
)

In [86]:
class Data:
    """Hold the train and test frames and prepare them jointly.

    The two frames are stacked into a single frame (``self.data``) so that
    transformations such as label encoding see the values of both. The stacked
    frame is built on demand, so ``get_train_test`` and
    ``encode_categorical_features`` no longer fail with ``AttributeError`` when
    ``concat_data`` has not been called explicitly beforehand.
    """

    def __init__(self, train, test):
        """Store the two input frames; the stacked frame is created lazily."""
        self.train = train
        self.test = test
        # Filled in by concat_data(); None means "not built yet".
        self.data = None

    def concat_data(self):
        """Stack train on top of test (row-wise), store and return the result.

        Original row labels are kept, so labels may repeat across the two parts.
        """
        self.data = pd.concat((self.train, self.test), axis=0)
        return self.data

    def _combined(self):
        """Return the stacked frame, building it first if it does not exist yet."""
        # getattr() also covers instances created without running __init__.
        if getattr(self, 'data', None) is None:
            self.concat_data()
        return self.data

    def non_infrared_features(self):
        """Return the training frame's column labels minus the first and last five.

        NOTE(review): the slice presumably drops a leading id column and five
        trailing target columns, which would make the result *all* predictor
        columns rather than only the non-infrared ones the name suggests.
        Confirm against the layout of training.csv.
        """
        return self.train.columns[1:-5]

    def get_train_test(self):
        """Split the stacked frame back into (train, test).

        Rows whose ``Ca`` value is present are treated as training rows; rows
        where it is missing are treated as test rows.
        """
        data = self._combined()
        mask = data['Ca'].notnull()
        train = data.loc[mask]
        test = data.loc[~mask]

        return train, test

    def encode_categorical_features(self, feature_name):
        """Replace ``feature_name`` in the stacked frame with integer codes.

        The encoder is fitted on the stacked frame, so the codes are consistent
        between the train and test parts. The column is overwritten in place
        and the encoded column is returned.
        """
        data = self._combined()

        lbl = LabelEncoder()
        lbl.fit(data[feature_name])

        data[feature_name] = lbl.transform(data[feature_name])
        return data[feature_name]

In [110]:
# Wrap the raw frames so they can be stacked, encoded and split together.
d = Data(train=train, test=test)

In [111]:
# Stack train and test first so the 'Depth' encoder is fitted on both parts.
d.concat_data()
d.encode_categorical_features('Depth')

# Predictor column labels are taken from the training frame.
features = d.non_infrared_features()

# Split the stacked (and now encoded) frame back into its two parts.
train_, test_ = d.get_train_test()

In [112]:
# Predictor matrices for the labelled rows and for the rows to be predicted.
X, Xtest = train_[features], test_[features]

# One target series per soil property.
y_Ca, y_SOC, y_Sand, y_pH, y_P = (
    train_[target] for target in ('Ca', 'SOC', 'Sand', 'pH', 'P')
)

**Evaluation Metric: Mean Columnwise Root Mean Squared Error (MCRMSE)**

In [113]:
def mcrmse(ytrue, ypred):
    """Mean columnwise root mean squared error.

    Computes the RMSE of every (true, predicted) column pair and averages the
    results over the number of columns actually supplied (previously the
    divisor was hard-coded to 5).

    Parameters
    ----------
    ytrue : sequence of array-likes
        One 1-D collection of observed values per target column.
    ypred : sequence of array-likes
        Predictions, in the same column order as ``ytrue``.

    Returns
    -------
    float
        Average of the per-column RMSE values; ``0.0`` when no columns are given.

    Raises
    ------
    ValueError
        If ``ytrue`` and ``ypred`` hold a different number of columns, or if a
        column pair has incompatible lengths (raised by NumPy).
    """
    n_columns = len(ytrue)
    if len(ypred) != n_columns:
        raise ValueError(
            'ytrue and ypred must contain the same number of columns '
            '(got %d and %d)' % (n_columns, len(ypred))
        )
    if n_columns == 0:
        return 0.0

    total = 0.0
    for actual, predicted in zip(ytrue, ypred):
        # Convert to flat float arrays so the subtraction is positional; this
        # avoids label alignment when both inputs are pandas Series.
        actual = np.asarray(actual, dtype=float).ravel()
        predicted = np.asarray(predicted, dtype=float).ravel()
        total += np.sqrt(np.mean((actual - predicted) ** 2))

    # `total` is a float, so this is true division on Python 2 as well.
    return total / n_columns

In [114]:
def split_dataset(train_length, **params):
    """Split the positions ``0 .. train_length - 1`` into two index groups.

    All keyword arguments are passed straight through to ``train_test_split``.
    Returns a ``(train_positions, test_positions)`` tuple.
    """
    positions = range(train_length)
    train_positions, test_positions = train_test_split(positions, **params)
    return train_positions, test_positions

In [115]:
# Settings forwarded to the splitter: 20% of the rows held out, fixed seed.
params = dict(test_size=0.2, random_state=3)

itrain, itest = split_dataset(len(X), **params)

# Apply the same positional split to the predictors and to every target.
X_train, X_test = X.iloc[itrain], X.iloc[itest]

y_train_Ca, y_test_Ca = y_Ca.iloc[itrain], y_Ca.iloc[itest]
y_train_P, y_test_P = y_P.iloc[itrain], y_P.iloc[itest]
y_train_Sand, y_test_Sand = y_Sand.iloc[itrain], y_Sand.iloc[itest]
y_train_SOC, y_test_SOC = y_SOC.iloc[itrain], y_SOC.iloc[itest]
y_train_pH, y_test_pH = y_pH.iloc[itrain], y_pH.iloc[itest]

In [161]:
def build_pipeline(n_components=100, kernel='rbf', random_state=4):
    """Create one unfitted PCA -> standardise -> SVR pipeline.

    Parameters
    ----------
    n_components : int
        Number of components kept by the randomized PCA step.
    kernel : str
        Kernel passed to the SVR estimator.
    random_state : int
        Seed for the randomized PCA step.

    Returns
    -------
    Pipeline
        A fresh pipeline with steps named 'pca', 'scale' and 'model'.
    """
    return Pipeline([
        ('pca', RandomizedPCA(n_components=n_components, whiten=True, random_state=random_state)),
        ('scale', StandardScaler()),
        ('model', SVR(kernel=kernel)),
    ])


# One independent, identically configured pipeline per target; each call
# returns a new object, so fitting one never affects another.
# NOTE(review): the recorded outputs of the fit cells in this notebook show
# SVR(kernel='linear'), while this cell requests 'rbf'. The saved scores may
# therefore come from an earlier version of this cell; re-run to confirm.
pipeline1 = build_pipeline()  # fitted on Ca
pipeline2 = build_pipeline()  # fitted on P
pipeline3 = build_pipeline()  # fitted on Sand
pipeline4 = build_pipeline()  # fitted on SOC
pipeline5 = build_pipeline()  # fitted on pH

**Private Leaderboard Score: 0.54610**

In [153]:
# Fit each target's pipeline on the training part of the hold-out split.
for pipe, target in (
    (pipeline1, y_train_Ca),
    (pipeline2, y_train_P),
    (pipeline3, y_train_Sand),
    (pipeline4, y_train_SOC),
    (pipeline5, y_train_pH),
):
    pipe.fit(X_train, target)

# Echo the last fitted pipeline as the cell output.
pipeline5

Pipeline(steps=[('pca', RandomizedPCA(copy=True, iterated_power=3, n_components=100, random_state=4,
       whiten=True)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [154]:
# Hold-out predictions, one array per target, in pipeline order.
y_pred_Ca, y_pred_P, y_pred_Sand, y_pred_SOC, y_pred_pH = (
    pipe.predict(X_test)
    for pipe in (pipeline1, pipeline2, pipeline3, pipeline4, pipeline5)
)

In [155]:
# Both lists use the same column order: Ca, P, pH, Sand, SOC.
holdout_score = mcrmse(
    [y_test_Ca, y_test_P, y_test_pH, y_test_Sand, y_test_SOC],
    [y_pred_Ca, y_pred_P, y_pred_pH, y_pred_Sand, y_pred_SOC],
)
print('MCRMSE on unseen examples: %f' % holdout_score)

MCRMSE on unseen examples: 0.479048


**Train model on full dataset.**

In [157]:
# Refit every pipeline on all labelled rows before predicting the test set.
for pipe, target in (
    (pipeline1, y_Ca),
    (pipeline2, y_P),
    (pipeline3, y_Sand),
    (pipeline4, y_SOC),
    (pipeline5, y_pH),
):
    pipe.fit(X, target)

# Echo the last fitted pipeline as the cell output.
pipeline5

Pipeline(steps=[('pca', RandomizedPCA(copy=True, iterated_power=3, n_components=100, random_state=4,
       whiten=True)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [158]:
# Test-set predictions, one array per target, in pipeline order.
predict_Ca, predict_P, predict_Sand, predict_SOC, predict_pH = (
    pipe.predict(Xtest)
    for pipe in (pipeline1, pipeline2, pipeline3, pipeline4, pipeline5)
)

In [159]:
# Write each target's predictions into the matching submission column.
for column, values in (
    ('Ca', predict_Ca),
    ('P', predict_P),
    ('pH', predict_pH),
    ('SOC', predict_SOC),
    ('Sand', predict_Sand),
):
    sample_sub[column] = values

In [160]:
# Save the filled-in submission without the row index.
submission_path = os.path.join(basepath, 'submissions/randomized_pca.csv')
sample_sub.to_csv(submission_path, index=False)