In [2]:
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import bz2

In [3]:
# load training data
Xtr_loadpath = 'Xtr.csv'
Xts_loadpath = 'Xts.csv'
ytr_loadpath = 'ytr.csv'

Xtr = np.loadtxt(Xtr_loadpath, delimiter=",")
Xts = np.loadtxt(Xts_loadpath, delimiter=",")
ytr = np.loadtxt(ytr_loadpath, delimiter=",")
# standardize the training data
scaler = StandardScaler()
Xtr_standardized = scaler.fit_transform(Xtr)
Xts_standardized = scaler.transform(Xts)

# save the standardized training data
Xtr_savepath = 'Xtr_standardized.csv'
Xts_savepath = 'Xts_standardized.csv'
ytr_savepath = 'ytr.csv'

# save the standardized training data
Xtr_savepath = 'Xtr_standardized.csv'
Xts_savepath = 'Xts_standardized.csv'
ytr_savepath = 'ytr.csv'
yts_hat_savepath = 'yts_hat_RF.csv'

np.savetxt(Xtr_savepath, Xtr_standardized, delimiter=",")
np.savetxt(ytr_savepath, ytr, delimiter=",")

In [15]:
# create regressor object
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
 
# Fit the algorithm on the data
regressor.fit(Xtr_standardized,ytr)
  
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [4]:
# Get training R2 and testing R2  
# Training set R2:
r2 = r2_score(ytr,regressor.predict(Xtr_standardized))
print('training R2: ',r2)
# 5-fold cross validation R2 for these parameters
kf = KFold(n_splits=5,shuffle=True,random_state=0)
scores = cross_val_score(regressor, Xtr_standardized, ytr, cv=kf, scoring='r2')
rsq_cv = np.mean(scores)
print("cross-validation R^2 = %f" % rsq_cv)

training R2:  0.9262031818673279
cross-validation R^2 = 0.434714


In [5]:
regressor.feature_importances_

array([0.04581799, 0.02862986, 0.17030439, 0.0273833 , 0.02744233,
       0.02988117, 0.05631938, 0.02899558, 0.03279808, 0.02734625,
       0.04012237, 0.04117503, 0.02889334, 0.02963665, 0.03299408,
       0.02880184, 0.02971115, 0.0303794 , 0.02595213, 0.0272026 ,
       0.03104661, 0.03680931, 0.0377986 , 0.02509774, 0.04165201,
       0.03780883])

### Tune parameters of bootstrap, max_depth, max_features, min_samples_leaf, min_samples_split, n_estimators

In [17]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 300, 1000]
}
# Create a based model
regressor = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = regressor, param_grid = param_grid, scoring='r2', 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search .fit(Xtr_standardized, ytr)
print("Best parameters:",grid_search.best_params_)
print("Highest R2: ", grid_search.best_score_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best parameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 300}
Highest R2:  0.38890267869124395


In [19]:
table1 = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["R2 score"])],axis=1)
table1

Unnamed: 0,bootstrap,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,R2 score
0,True,80,2,3,8,200,0.379042
1,True,80,2,3,8,300,0.380059
2,True,80,2,3,8,1000,0.380790
3,True,80,2,3,10,200,0.370186
4,True,80,2,3,10,300,0.373300
...,...,...,...,...,...,...,...
211,True,110,3,5,10,300,0.367333
212,True,110,3,5,10,1000,0.367504
213,True,110,3,5,12,200,0.361922
214,True,110,3,5,12,300,0.360515


In [31]:
regressor = RandomForestRegressor(bootstrap=True, max_depth=10, max_features=3, min_samples_leaf=1, min_samples_split= 5, n_estimators=300)

In [30]:
# Fit the algorithm on the data
regressor.fit(Xtr_standardized,ytr)
# Get training R2 and testing R2  
# Training set R2:
r2 = r2_score(ytr,regressor.predict(Xtr_standardized))
print('training R2: ',r2)
# 5-fold cross validation R2 for these parameters
kf = KFold(n_splits=5,shuffle=True,random_state=0)
scores = cross_val_score(regressor, Xtr_standardized, ytr, cv=kf, scoring='r2')
rsq_cv = np.mean(scores)
print("cross-validation R^2 = %f" % rsq_cv)

training R2:  0.867497151587342
cross-validation R^2 = 0.456493


In [33]:
# Create the parameter grid based on the results of random search 
param_grid2 = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [3,5,8],
    'n_estimators': [300,500]
}
# Create a based model
regressor = RandomForestRegressor(bootstrap=True, min_samples_leaf=1, max_features=3)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = regressor, param_grid = param_grid2, scoring='r2', 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search .fit(Xtr_standardized, ytr)
print("Best parameters:",grid_search.best_params_)
print("Highest R2: ", grid_search.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters: {'max_depth': 15, 'min_samples_split': 3, 'n_estimators': 500}
Highest R2:  0.4012207397528469


In [34]:
# Fit the algorithm on the data
regressor.fit(Xtr_standardized,ytr)
# Get training R2 and testing R2  
# Training set R2:
r2 = r2_score(ytr,regressor.predict(Xtr_standardized))
print('training R2: ',r2)
# 5-fold cross validation R2 for these parameters
kf = KFold(n_splits=5,shuffle=True,random_state=0)
scores = cross_val_score(regressor, Xtr_standardized, ytr, cv=kf, scoring='r2')
rsq_cv = np.mean(scores)
print("cross-validation R^2 = %f" % rsq_cv)

training R2:  0.9307226855091405
cross-validation R^2 = 0.472775


In [35]:
# Create the parameter grid based on the results of random search 
param_grid3 = {
    'max_depth': [15,20,25,30,100],
    'min_samples_split': [1,2,3],
    'n_estimators': [500,800]
}
# Create a based model
regressor = RandomForestRegressor(bootstrap=True, min_samples_leaf=1, max_features=3)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = regressor, param_grid = param_grid3, scoring='r2', 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search .fit(Xtr_standardized, ytr)
print("Best parameters:",grid_search.best_params_)
print("Highest R2: ", grid_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xiang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\xiang\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\Users\xiang\anaconda3\lib\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\xiang\anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\xiang\anacon

Best parameters: {'max_depth': 100, 'min_samples_split': 2, 'n_estimators': 500}
Highest R2:  0.43590511740508336


In [40]:
pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["R2 score"])],axis=1)

Unnamed: 0,max_depth,min_samples_split,n_estimators,R2 score
0,15,1,500,
1,15,1,800,
2,15,2,500,0.402522
3,15,2,800,0.403212
4,15,3,500,0.397612
5,15,3,800,0.39903
6,20,1,500,
7,20,1,800,
8,20,2,500,0.426131
9,20,2,800,0.42965


In [39]:
regressor.set_params(max_depth=100, min_samples_split=2, n_estimators=500)
# Fit the algorithm on the data
regressor.fit(Xtr_standardized,ytr)
# Get training R2 and testing R2  
# Training set R2:
r2 = r2_score(ytr,regressor.predict(Xtr_standardized))
print('training R2: ',r2)
# 5-fold cross validation R2 for these parameters
kf = KFold(n_splits=5,shuffle=True,random_state=0)
scores = cross_val_score(regressor, Xtr_standardized, ytr, cv=kf, scoring='r2')
rsq_cv = np.mean(scores)
print("cross-validation R^2 = %f" % rsq_cv)

training R2:  0.9334310165639778
cross-validation R^2 = 0.475371


In [46]:
# save the model: you must use the .json format for xgboost models!
# save the model: you must use the .bz2 format for sklearn models!
model_savepath = 'model_RF.bz2'
with bz2.BZ2File(model_savepath, 'w') as f:
    pickle.dump(regressor,f)

In [47]:
# generate kaggle submission file using the validation script
!python {"validation.py " + model_savepath + " --Xts_path " + Xts_savepath + " --Xtr_path " + Xtr_savepath + " --yts_hat_path " + yts_hat_savepath }

training R2 =  0.9334310165639778
test target predictions saved in yts_hat_xgboost_RF.csv
