In [13]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression

# Global seaborn plot styling for every figure drawn in this notebook.
sns.set_style('whitegrid')
sns.set_context('poster')

# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Project root, resolved under the current user's home directory.
# NOTE(review): hard-coded to one machine's Desktop layout - adjust when running elsewhere.
basepath = os.path.expanduser('~/Desktop/src/African_Soil_Property_Prediction/')
# Make the project's `src` directory importable from this notebook.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(0)

In [2]:
# Load the raw CSV files from <basepath>/data/raw:
#   train      <- training.csv
#   test       <- sorted_test.csv
#   sample_sub <- sample_submission.csv (its target columns are overwritten with
#                 predictions near the end of the notebook)
train = pd.read_csv(os.path.join(basepath, 'data/raw/training.csv'))
test = pd.read_csv(os.path.join(basepath, 'data/raw/sorted_test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [51]:
class Data:
    """Hold the training and test frames and derive modelling views from them.

    Attributes:
        train: frame passed in as the training set.
        test: frame passed in as the test set.
        data: row-wise concatenation of ``train`` and ``test``; ``None`` until
            ``concat_data`` (or ``get_train_test``) has built it.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test
        # Filled in by concat_data(); starting at None lets get_train_test()
        # build it on demand instead of failing with AttributeError.
        self.data = None

    def concat_data(self):
        """Stack ``train`` on top of ``test``, cache the result and return it.

        Returns:
            The concatenated frame (also stored as ``self.data``).
        """
        self.data = pd.concat((self.train, self.test), axis=0)
        return self.data

    def non_infrared_features(self):
        """Return the training column labels from position 1 up to the last six.

        The selection is purely positional: the first column and the final six
        columns of ``self.train`` are excluded.

        NOTE(review): presumably this drops an id column plus the trailing
        non-feature/target columns; whether the remaining columns are really
        "non infrared" cannot be verified from this class - confirm against the
        training file's schema.
        """
        return self.train.columns[1:-6]

    def get_train_test(self):
        """Split the concatenated frame back into labelled and unlabelled rows.

        Rows whose ``Ca`` value is present are returned as train, rows whose
        ``Ca`` is missing as test. The concatenated frame is built first if
        ``concat_data`` has not been called yet.

        NOTE(review): a training row with a missing ``Ca`` value would end up in
        the test part - verify the training data has no nulls in that column.

        Returns:
            A ``(train, test)`` tuple of frames.
        """
        if getattr(self, 'data', None) is None:
            self.concat_data()

        mask = self.data.Ca.notnull()
        train = self.data.loc[mask]
        test = self.data.loc[~mask]

        return train, test

In [52]:
# Wrap the loaded train/test frames in the Data helper used by the following cells.
d = Data(train, test)

In [53]:
# Feature column labels, selected positionally from the training frame.
features = d.non_infrared_features()
# Build the combined train+test frame; this sets d.data, which get_train_test() reads.
d.concat_data()
# Split the combined frame back into rows with a Ca value (train_) and rows without (test_).
train_, test_ = d.get_train_test()

In [55]:
# Feature matrices: the same columns for the labelled and the unlabelled rows.
X, Xtest = train_[features], test_[features]

# One target Series per predicted column, taken from the labelled rows.
y_Ca, y_SOC, y_Sand, y_pH, y_P = (
    train_[target] for target in ('Ca', 'SOC', 'Sand', 'pH', 'P')
)

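**Evaluation metric: Mean Columnwise Root Mean Squared Error (MCRMSE)**

$$\mathrm{MCRMSE} = \frac{1}{5}\sum_{j=1}^{5}\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_{ij}-\hat{y}_{ij}\right)^{2}}$$

where $j$ runs over the five target columns (Ca, P, pH, SOC, Sand) and $n$ is the number of samples.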

In [74]:
def mcrmse(ytrue, ypred):
    """Mean columnwise root mean squared error.

    Computes the RMSE of every (truth, prediction) column pair and averages
    those RMSE values over the number of columns supplied.

    Args:
        ytrue: sequence of 1-D array-likes, one per target column.
        ypred: sequence of 1-D array-likes, in the same column order as ``ytrue``.

    Returns:
        The average of the per-column RMSE values, as a float.

    Raises:
        ValueError: if no columns are given, the two sequences differ in length,
            or a column pair is empty or differs in shape.
    """
    n_columns = len(ytrue)
    if n_columns == 0:
        raise ValueError('mcrmse needs at least one target column')
    if len(ypred) != n_columns:
        raise ValueError(
            'ytrue has %d columns but ypred has %d' % (n_columns, len(ypred))
        )

    column_rmse = []
    for actual, predicted in zip(ytrue, ypred):
        # Plain arrays make the subtraction positional; subtracting two pandas
        # Series would align on index labels instead.
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        if actual.shape != predicted.shape:
            raise ValueError(
                'column shapes differ: %s vs %s' % (actual.shape, predicted.shape)
            )
        if actual.size == 0:
            raise ValueError('cannot compute RMSE of an empty column')
        column_rmse.append(np.sqrt(np.mean((actual - predicted) ** 2)))

    # Average over the actual number of columns rather than a fixed 1/5, which
    # was wrong for other column counts and evaluated to 0 under Python 2.
    return float(np.mean(column_rmse))

In [75]:
def split_dataset(train_length, **params):
    """Split the row positions 0..train_length-1 into two index groups.

    Args:
        train_length: number of rows to split.
        **params: keyword arguments forwarded unchanged to ``train_test_split``.

    Returns:
        A ``(itrain, itest)`` pair of row-position collections as produced by
        ``train_test_split``.
    """
    row_positions = range(train_length)
    fit_positions, holdout_positions = train_test_split(row_positions, **params)

    return fit_positions, holdout_positions

In [76]:
# Hold-out configuration: 20% of the labelled rows, fixed seed for repeatability.
params = {'test_size': 0.2, 'random_state': 3}

itrain, itest = split_dataset(len(X), **params)

# Apply the same positional split to the features and to every target.
X_train, X_test = X.iloc[itrain], X.iloc[itest]

y_train_Ca, y_test_Ca = y_Ca.iloc[itrain], y_Ca.iloc[itest]
y_train_P, y_test_P = y_P.iloc[itrain], y_P.iloc[itest]
y_train_Sand, y_test_Sand = y_Sand.iloc[itrain], y_Sand.iloc[itest]
y_train_SOC, y_test_SOC = y_SOC.iloc[itrain], y_SOC.iloc[itest]
y_train_pH, y_test_pH = y_pH.iloc[itrain], y_pH.iloc[itest]

In [77]:
# One independent linear model per target:
#   est_1 -> Ca, est_2 -> P, est_3 -> Sand, est_4 -> SOC, est_5 -> pH
est_1, est_2, est_3, est_4, est_5 = (LinearRegression() for _ in range(5))

# Fit each model on the training part of the hold-out split.
est_1.fit(X_train, y_train_Ca)
est_2.fit(X_train, y_train_P)
est_3.fit(X_train, y_train_Sand)
est_4.fit(X_train, y_train_SOC)
est_5.fit(X_train, y_train_pH)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [78]:
# Predict every target for the held-out rows (X_test is the hold-out part of the
# labelled data, not the competition test set).
y_pred_Ca = est_1.predict(X_test)
y_pred_P = est_2.predict(X_test)
y_pred_Sand = est_3.predict(X_test)
y_pred_SOC = est_4.predict(X_test)
y_pred_pH = est_5.predict(X_test)

In [79]:
# Both lists use the same column order so each truth/prediction pair lines up.
holdout_truth = [y_test_Ca, y_test_P, y_test_pH, y_test_Sand, y_test_SOC]
holdout_preds = [y_pred_Ca, y_pred_P, y_pred_pH, y_pred_Sand, y_pred_SOC]
holdout_score = mcrmse(holdout_truth, holdout_preds)
print('MCRMSE on unseen examples: %f' % holdout_score)

diff sum  39.27354300057705
diff sum  383.3054763151937
diff sum  77.40729357350868
diff sum  88.4542401877954
diff sum  59.03077071743069
MCRMSE on unseen examples: 0.679266


**Train the models on the full dataset.**

In [63]:
# Refit the same five estimator objects on all labelled rows (X), so the
# predictions below no longer come from the hold-out fit.
est_1.fit(X, y_Ca)
est_2.fit(X, y_P)
est_3.fit(X, y_Sand)
est_4.fit(X, y_SOC)
est_5.fit(X, y_pH)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [64]:
# Predict every target for the unlabelled rows (Xtest).
predict_Ca = est_1.predict(Xtest)
predict_P = est_2.predict(Xtest)
predict_Sand = est_3.predict(Xtest)
predict_SOC = est_4.predict(Xtest)
predict_pH = est_5.predict(Xtest)

In [68]:
# Overwrite the target columns of the sample submission frame in place.
# NOTE(review): assumes the rows of Xtest are in the same order as the rows of
# sample_sub - confirm before submitting.
sample_sub['Ca'] = predict_Ca
sample_sub['P'] = predict_P
sample_sub['pH'] = predict_pH
sample_sub['SOC'] = predict_SOC
sample_sub['Sand'] = predict_Sand

In [69]:
# Write the filled-in submission to <basepath>/submissions/benchmark.csv without the index column.
sample_sub.to_csv(os.path.join(basepath, 'submissions/benchmark.csv'), index=False)