In [1]:
# -*- coding: utf-8 -*- 
"""
Example of y-randomization

Created on Tue Oct  11 16:00:00 2018
@author: Akitaka
"""
# Demonstration of y-randomization
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from time                    import time
from sklearn.datasets        import make_regression
from sklearn.model_selection import GridSearchCV, ShuffleSplit, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler, MinMaxScaler
from sklearn.neighbors       import KNeighborsRegressor
from my_library              import print_gscv_score_rgr, dcv_rgr, ad_knn
from sklearn.metrics         import mean_absolute_error
from sklearn.metrics         import mean_squared_error
from sklearn.metrics         import r2_score

# Wall-clock timestamp taken before any work is done.
# NOTE(review): `start` is not read again in the visible code -- presumably
# used for an elapsed-time report further down; confirm or remove.
start = time()

# settings
# Seed NumPy's global RNG once, before any stochastic call. The later steps
# either leave `random_state` unset (make_regression, train_test_split,
# KFold(shuffle=True)) -- in which case scikit-learn falls back to NumPy's
# global RandomState -- or call np.random directly (np.random.permutation).
# One seed here therefore makes a fresh Restart & Run All reproducible.
SEED = 0
np.random.seed(SEED)

# Feature scaling: zero mean / unit variance.
# (MinMaxScaler is imported as the drop-in alternative.)
scaler = StandardScaler()

# Hyperparameter grid for the kNN regressor: k = 3, 4, ..., 10.
range_k = np.arange(3, 11, dtype=int)
param_grid = [{'n_neighbors': range_k}]

# 5-fold cross-validation with shuffling.
# (ShuffleSplit(n_splits=5, test_size=0.2) is imported as the alternative.)
cv = KFold(n_splits=5, shuffle=True)

# Build a synthetic regression problem: 1000 samples, 4 features, all of them
# informative, then hold out 20% of the rows as the test set.
X, y = make_regression(
    n_samples=1000,
    n_features=4,
    n_informative=4,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Autoscaling: the scaler is fitted on the training rows only and the same
# fitted transform is then applied to the held-out rows.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("# modeling and prediction")
# Grid-search the kNN regressor over `param_grid` with the `cv` splitter;
# fit() returns the fitted search object itself, so the call can be chained.
model = KNeighborsRegressor()
gscv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
).fit(X_train, y_train)
# Project helper from my_library; it receives the fitted search, both data
# splits and the CV splitter.
print_gscv_score_rgr(gscv, X_train, X_test, y_train, y_test, cv)

# Number of y-randomization repetitions.
niter = 10
print("# y-randomization")
# One row per repetition; columns are RMSE, MAE, R^2, each computed on the
# training rows against the *shuffled* targets.
scores = np.zeros((niter, 3))
for iiter in range(niter):
    # Shuffle the training targets so they no longer correspond to their X rows.
    y_train_rand = np.random.permutation(y_train)
    # Use dedicated names for the randomized estimator and search. Reusing
    # `model` / `gscv` here would overwrite the search fitted on the real
    # targets, leaving `gscv` pointing at a model trained on noise once the
    # loop finishes.
    model_rand = KNeighborsRegressor()
    gscv_rand = GridSearchCV(model_rand, param_grid, cv=cv)
    gscv_rand.fit(X_train, y_train_rand)
    y_pred = gscv_rand.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train_rand, y_pred))
    mae = mean_absolute_error(y_train_rand, y_pred)
    r2 = r2_score(y_train_rand, y_pred)
    scores[iiter, :] = rmse, mae, r2

# Column-wise mean and (population, ddof=0) standard deviation over repetitions.
means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
print('In {:} iterations, average +/- standard deviation'.format(niter))
# Labels are listed in the same order as the columns of `scores`.
for metric_name, metric_mean, metric_std in zip(('RMSE', 'MAE', 'R^2'), means, stds):
    print('{}: {:.3f} (+/-{:.3f})'.format(metric_name, metric_mean, metric_std))




# modeling and prediction

Best parameters set found on development set:
{'n_neighbors': 5}
C:  RMSE, MAE, R^2 = 15.406, 10.984, 0.964
CV: RMSE, MAE, R^2 = 19.598, 13.817, 0.942
P:  RMSE, MAE, R^2 = 21.047, 14.909, 0.936

# y-randomization

Best parameters set found on development set:
{'n_neighbors': 10}
C:  RMSE, MAE, R^2 = 86.090, 69.849, -0.126
CV: RMSE, MAE, R^2 = 20.089, 14.220, 0.939
P:  RMSE, MAE, R^2 = 88.885, 71.909, -0.140


Best parameters set found on development set:
{'n_neighbors': 10}
C:  RMSE, MAE, R^2 = 85.021, 66.632, -0.098
CV: RMSE, MAE, R^2 = 19.990, 13.701, 0.939
P:  RMSE, MAE, R^2 = 84.035, 66.140, -0.019


Best parameters set found on development set:
{'n_neighbors': 10}
C:  RMSE, MAE, R^2 = 85.264, 67.644, -0.104
CV: RMSE, MAE, R^2 = 20.216, 14.207, 0.938
P:  RMSE, MAE, R^2 = 85.718, 68.834, -0.061


Best parameters set found on development set:
{'n_neighbors': 10}
C:  RMSE, MAE, R^2 = 87.585, 68.960, -0.165
CV: RMSE, MAE, R^2 = 19.704, 13.695, 0.941
P:  RMSE,