In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error
from scipy.stats import randint 
from sklearn.preprocessing import MinMaxScaler


# Seed NumPy's global RNG. The estimator and the randomized search in this notebook
# are additionally given their own explicit random_state values where they are built.
np.random.seed(42)

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and convergence
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Identifiers that are filtered out of the feature table (matched against its first column).
# NOTE(review): 'P22' is listed twice; the duplicate is harmless for membership tests
# and is kept so the list value is unchanged.
remove = [
    'P1', 'P11', 'P22', 'P22', 'P25', 'P37', 'P4',
    'P47', 'P48', 'P49', 'P54', 'P56', 'P59',
    'P16', 'P24', 'P68', 'P72',
    'P74', 'P75', 'P30', 'P67',
]

In [4]:
# Feature/label table stored as a pickled object array; rows whose first column
# appears in `remove` are dropped (presumably column 0 holds participant IDs — TODO confirm).
dataset = np.load("../data/Final_features_3k5k_order5_53.npy", allow_pickle=True)
dataset = dataset[np.isin(dataset[:, 0], remove, invert=True)]

# Second feature table; rows whose first column equals 21 or 48 are dropped.
# NOTE(review): the two tables are filtered independently and later indexed by the
# same row positions, so they are assumed to end up row-aligned — verify.
data2 = pd.read_csv("../data/feature_30_joint_51.csv")
data2 = data2.loc[~data2.iloc[:, 0].isin([21, 48])]

# Features: every column of data2 except the first. Labels: last column of dataset.
X = pd.DataFrame(data2.iloc[:, 1:], dtype=float)
Y = pd.DataFrame(dataset[:, -1], dtype=float)

In [5]:
# Sanity check: the feature and label frames must have the same number of rows.
X.shape, Y.shape

((51, 30), (51, 1))

In [6]:
# Rescale each feature column to [0, 1]. The scaler is fitted on all rows at once,
# i.e. before any train/test/pool split is applied.
# NOTE(review): this rebinds X from a DataFrame to an ndarray, so the cell is not
# safe to re-run without re-running the loading cell first.
feature_scaler = MinMaxScaler()
X = feature_scaler.fit_transform(X)

# Labels as a flat float array (replaces the single-column DataFrame built earlier).
Y = dataset[:, -1].astype("float")

In [7]:
# Row positions of the initial training set and of the fixed held-out test set.
train_idx = [1, 40, 38, 15, 45, 10, 19, 35]
test_idx = [0, 42, 30, 6, 17, 8, 33, 44, 11, 36]

# Pool of candidates: every row position in 0..50 that is in neither set, ascending.
query_idx = sorted(set(range(0, 51)) - set(train_idx) - set(test_idx))

In [8]:
# Show the candidate pool (row positions not used for the initial train or test sets).
print(query_idx)

[2, 3, 4, 5, 7, 9, 12, 13, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 34, 37, 39, 41, 43, 46, 47, 48, 49, 50]


In [9]:
def mape(model, feat, train_label):
  """Return the mean absolute percentage error, in percent, of ``model`` on ``feat``.

  Args:
    model: Fitted estimator exposing ``predict(feat)``.
    feat: Feature matrix passed unchanged to ``model.predict``.
    train_label: True target values (array-like); flattened to 1-D before use.

  Returns:
    ``100 * mean(|label - prediction| / |label|)`` as a float.

  Note:
    Labels equal to zero produce inf/nan, exactly as in the plain formula.
  """
  actual = np.asarray(train_label).reshape(-1)
  # Flatten predictions too: a column-shaped (n, 1) output would otherwise broadcast
  # against the (n,) labels into an (n, n) matrix and silently give a wrong mean.
  pred = np.asarray(model.predict(feat)).reshape(-1)
  mpe = 100*np.mean(np.abs((actual - pred)/actual))
  return mpe

In [10]:
def loss_fnc(train_label,pred):
    """Mean absolute percentage error (in percent) between labels and predictions.

    Args:
        train_label: True target values (array-like); flattened to 1-D before use.
        pred: Predicted values (array-like); flattened to 1-D before use.

    Returns:
        ``100 * mean(|label - pred| / |label|)`` as a float.

    Note:
        Labels equal to zero produce inf/nan, exactly as in the plain formula.
    """
    actual = np.asarray(train_label).reshape(-1)
    # Flatten predictions as well so an (n, 1) array cannot broadcast against the
    # (n,) labels into an (n, n) matrix.
    predicted = np.asarray(pred).reshape(-1)
    # Local is not called `mape`, to avoid shadowing the helper function of that name.
    pct_error = 100*np.mean(np.abs((actual - predicted)/actual))
    return pct_error

from sklearn.metrics import make_scorer
# Wrap the MAPE function as a scikit-learn scorer. greater_is_better=False marks it as
# a loss, so the scorer sign-flips the value and the hyper-parameter search (which
# maximises its score) ends up minimising MAPE.
loss = make_scorer(loss_fnc, greater_is_better=False)

In [11]:
# Greedy "oracle" selection with a random forest.
#
# In every round each remaining pool point is tentatively added to the current
# training set, a randomized hyper-parameter search is fitted on that set, and the
# tuned model is scored on the fixed test set. The pool point giving the lowest test
# MAPE is then moved permanently from the pool into the training set. The loop ends
# when the pool is empty.
#
# NOTE(review): this cell consumes `query_idx` in place and rebinds `train_idx`, so it
# must be preceded by a fresh run of the index-definition cell when re-executed.
print("Starting RF")
param_grid = {
    'bootstrap': [True, False],
    'max_depth': randint(10,50),
    # None means "use all features". It replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3; with warnings
    # silenced, candidates drawing 'auto' would fail without any visible message.
    'max_features': [2, 3, 4, 'sqrt', None],
    'min_samples_leaf': randint(1,10),
    'min_samples_split': randint(2,10),
    'n_estimators': randint(10,150)
}
print("params_initialised")
rf = RandomForestRegressor(random_state=0,verbose=0)
print("model_done")
# 20 sampled configurations, 2-fold CV, scored with the (negated) MAPE scorer.
grid_search_rf = RandomizedSearchCV(estimator = rf, param_distributions = param_grid,
                          cv=2,n_jobs = -1, verbose = 0, scoring=loss, n_iter=20,random_state=100)
print("grid_done")


print("Initialising Indexes")
indexAdded = []        # pool index chosen in each round, in order
recordedMPE = []       # lowest test MAPE reached in each round
best_param_dict = {}   # chosen pool index -> tuned estimator of that round

original_train = train_idx.copy()

# The test split never changes, so build it once instead of once per candidate.
X_test = np.float32(X[test_idx])
Y_test = np.float32(Y[test_idx])

while len(query_idx):
    print("Current length of Pool set is = {}".format(len(query_idx)))

    # Reset per round. Starting from +inf (not 100) guarantees that a candidate is
    # selected even when every candidate's MAPE is 100 or more.
    lowest_mpe = np.inf
    idx_best_reduction = None

    for datapoint_idx in query_idx:
        # Current training set plus this single candidate.
        train_idx = original_train.copy()
        train_idx.append(datapoint_idx)

        X_train = np.float32(X[train_idx])
        Y_train = np.float32(Y[train_idx])

        grid_search_rf.fit(X_train, Y_train)
        best_grid = grid_search_rf.best_estimator_
        grid_mpe = mape(best_grid, X_test, Y_test)

        if grid_mpe < lowest_mpe:
            idx_best_reduction = datapoint_idx
            final_grid = best_grid
            lowest_mpe = grid_mpe

    if idx_best_reduction is None:
        # Only reachable when every candidate's MAPE is nan/inf; fail loudly instead of
        # reusing a stale index from the previous round.
        raise RuntimeError("No candidate produced a finite MAPE; cannot pick a point to add")

    indexAdded.append(idx_best_reduction)
    recordedMPE.append(lowest_mpe)
    best_param_dict[idx_best_reduction] = final_grid
    print("Best Grid:",final_grid)
    print("Lowest MPE recorded = {}\n".format(lowest_mpe))

    # Move the selected point from the pool into the permanent training set.
    query_idx.remove(idx_best_reduction)
    original_train.append(idx_best_reduction)

Starting RF
params_initialised
model_done
grid_done
Initialising Indexes
Current length of Pool set is = 33
Best Grid: RandomForestRegressor(bootstrap=False, max_depth=11, max_features='sqrt',
                      min_samples_split=4, n_estimators=131, random_state=0)
Lowest MPE recorded = 5.010336821897234

Current length of Pool set is = 32
Best Grid: RandomForestRegressor(bootstrap=False, max_depth=11, max_features='sqrt',
                      min_samples_split=4, n_estimators=131, random_state=0)
Lowest MPE recorded = 4.3921962342401075

Current length of Pool set is = 31
Best Grid: RandomForestRegressor(max_depth=24, max_features=4, min_samples_leaf=2,
                      n_estimators=34, random_state=0)
Lowest MPE recorded = 4.50224887025255

Current length of Pool set is = 30
Best Grid: RandomForestRegressor(bootstrap=False, max_depth=40, max_features='sqrt',
                      min_samples_leaf=3, min_samples_split=5, n_estimators=27,
                      random_state=0)

In [12]:
# Tuned estimator selected in the last completed round of the selection loop.
final_grid

In [13]:
# Lowest test MAPE (in percent) per selection round, in the order points were added.
recordedMPE

[5.010336821897234,
 4.3921962342401075,
 4.50224887025255,
 3.8250877790203273,
 3.7094442298703476,
 3.4242737259583365,
 3.2069468972946193,
 3.0258858843193597,
 3.120691320309941,
 3.040827097473292,
 2.9019784628011127,
 3.01828003882067,
 2.958581135197995,
 2.7720419400554657,
 2.962946340684955,
 2.7535436527223927,
 2.544567149449337,
 2.5091622118842145,
 2.637942340289102,
 2.6970083218535232,
 2.794408540186443,
 2.9113591475017615,
 3.0004851267729262,
 2.9160698842547146,
 2.898355560238237,
 2.9147098592285237,
 2.989589021003574,
 3.1325498067943927,
 3.2184407027214057,
 3.320881082514346,
 3.681268114545423,
 3.708418602221715,
 4.0845091264641065]

In [14]:
import pickle

# Persist the per-round MAPE list so it can be reloaded without re-running the search.
# NOTE(review): the target directory must already exist; open() does not create it.
with open('../results/figure4/oracle.pkl', mode='wb') as pkl_file:
    pickle.dump(recordedMPE, pkl_file)