In [179]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import BayesianRidge,Lasso,LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV,KFold
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from scipy import stats
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.ensemble import ExtraTreesClassifier
from scipy import stats


def split_data(X_train,y_train) :
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,test_size = 0.25, random_state = 43)
    return X_train, X_test, y_train, y_test

def standardize_data(X_train,X_test) :
    s = StandardScaler()
    train_scaler = RobustScaler()
    scaled_train_data = train_scaler.fit_transform(X_train)
    scaled_test_data = train_scaler.transform(X_test)
    
    return scaled_train_data,scaled_test_data

In [180]:
X_test = pd.read_csv("X_test.csv", index_col=0).values
X_train = pd.read_csv("X_train.csv", index_col=0).values
y_train = pd.read_csv("y_train.csv", index_col=0).values

#X_train, X_test, y_train, y_test = split_data(X_train,y_train)



In [188]:
X_train, X_test = imputation(imputers[0], X_train, X_test)



X_train_0,X_test_0 = standardize_data(X_train,X_test)
## Outlier detection with zscore

In [189]:
#Define some imputers
imputers = [
    SimpleImputer(missing_values=np.nan, strategy='median'),
    IterativeImputer(random_state=0, estimator=BayesianRidge()),
    IterativeImputer(random_state=0, estimator=DecisionTreeRegressor(max_features="sqrt", random_state=0)),
    IterativeImputer(random_state=0, estimator=ExtraTreesRegressor(n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2)),
    IterativeImputer(random_state=0, estimator=KNeighborsRegressor(n_neighbors=15)),
    KNNImputer(n_neighbors=10, weights="uniform"),
    IterativeImputer(random_state=0, estimator=RandomForestRegressor(n_estimators= 35, random_state = 0, max_depth= 30, min_samples_leaf=2))
]

def lasso(X_train,y_train,X_test) :
    pipeline = Pipeline([('scaler',StandardScaler()),('model',Lasso())])
    search = GridSearchCV(pipeline,{'model__alpha':np.arange(0.1,10,0.1)},cv = 5, scoring="r2",verbose=3)
    search.fit(X_train,y_train)
    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)
    X_train = X_train[:,importance > 0]
    X_test = X_test[:,importance > 0]
    return X_train,X_test

def lasso1(X_train,y_train,X_test) :
    ls=LassoCV(cv=5)
    ls.fit(X_train,y_train)
    mask=ls.coef_!=0
    X_train=X_train[:,mask]
    X_test = X_test[:,mask]
    

def imputation(imputer, X_train, X_test):
    imputer.fit(X_train)
    X_train_0 = imputer.transform(X_train)
    X_test_0 = imputer.transform(X_test)
    return X_train_0, X_test_0

def features_selection(X_train, y_train, X_test,  n_features):
    from sklearn import feature_selection
    model = feature_selection.SelectKBest(score_func=feature_selection.f_regression,k=n_features)
    model = model.fit(X_train, y_train)
    cols = model.get_support(indices=True)
    
    return cols

def outlier_detection(X_train, y_train):
    clf = IsolationForest(max_samples=100, random_state = 4)
    preds = clf.fit_predict(X_train)
    X_train_1 = X_train[preds==1]
    y_train_1 = y_train[preds==1]
    return X_train_1, y_train_1

In [177]:
class XGB():
    def __init__(self, X_train, y_train, X_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.n_original_features = X_train.shape[1]
        self.selected_features = np.arange(self.n_original_features)

        #self.outlier_detection(self.selected_features, self.y_train)

      
        #Use stacking regressor
        self.estimators = [('lasso', Lasso(alpha=0.0005, random_state =  0, max_iter=100)),
                            ('xgb',XGBRegressor(max_depth=6,# depth of the tree
                                    learning_rate=0.08,
                                    n_estimators=100,# number of the tree
                                    )),
                           ('abr',AdaBoostRegressor(random_state=0, n_estimators=100)),
                           ('dtr', DecisionTreeRegressor(max_features="sqrt", random_state=0)), 
                           ('etr', ExtraTreesRegressor(n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2)),
                           ('rfr', RandomForestRegressor(n_estimators= 15, random_state = 0, max_depth= 6, min_samples_leaf=2)),
                           ('knr', KNeighborsRegressor(n_neighbors=15)), 
                           ('gbr', GradientBoostingRegressor(n_estimators = 100,learning_rate=0.05,
                                                              max_depth = 10, random_state=0))]

        
        self.stacked_regressor = StackingRegressor(estimators=self.estimators)
        
        self.regressor = XGBRegressor(n_estimators = 105,
learning_rate =0.09,
subsample =0.6,
colsample_bytree = 1.0,
min_child_weight = 5,
gamma=0.4,
reg_alpha=0.5,
reg_lambda=1.0)
            

        
    def feature_selection(self, n_features = 200):
        self.regressor.fit(self.X_train, self.y_train)
        self.selected_features = np.argsort(self.regressor.feature_importances_)[::-1][:n_features]
        return self.selected_features
    
    def feature_selection_stacked(self, n_features = 200):
        self.stacked_regressor.fit(self.X_train, self.y_train)
        self.selected_features = np.argsort(self.stacked_regressor.feature_importances_)[::-1][:n_features]
        
        return self.selected_features

    def cross_validation(self, n_split = 8):
        ret = cross_val_score(self.regressor, self.X_train[:,self.selected_features], self.y_train, scoring='r2', cv=n_split)
        return ret

    def predict(self, write2csv = True):
        self.regressor.fit(self.X_train, self.y_train)
        pred = self.regressor.predict(self.X_test)

        if write2csv is True:
            submission = np.hstack([np.arange(0, len(pred)).reshape(-1,1), pred.reshape(-1,1)]) 
            submission_pd = pd.DataFrame(submission, columns=['id','y'])
            submission_pd.to_csv('submission.csv', index=None)

        return pred
    
    def predict_stacked(self, write2csv = True):
        self.stacked_regressor.fit(self.X_train[:,self.selected_features], self.y_train)
        pred = self.stacked_regressor.predict(self.X_test[:,self.selected_features])
        
        if write2csv is True:
            submission = np.hstack([np.arange(0, len(pred)).reshape(-1,1), pred.reshape(-1,1)]) 
            submission_pd = pd.DataFrame(submission, columns=['id','y'])
            submission_pd.to_csv('submission.csv', index=None)

        return pred

    def do_all(self, n_features = 200):
        self.feature_selection(n_features)
        pred = self.predict()
        return pred
  

In [190]:
selected_features = features_selection(X_train_0, y_train.ravel(), X_test_0,n_features=200)

# data imputation for X_train and X_test, then using SelectKbest to pick up the best 200 features

new_train = X_train[:,selected_features]
new_test = X_test[:,selected_features]

new_train,new_test = standardize_data(new_train,new_test)

X_train, X_test = imputation(imputers[5], new_train, new_test)


  corr /= X_norms


In [193]:
X_train.shape

(1104, 200)

In [192]:
#X_train, X_test =lasso(X_train,y_train,X_test)

X_train,y_train = outlier_detection(X_train,y_train)

In [178]:
samples = X_train.shape[1]
print(samples)

#estimator 100, learning rate 0.1, max_depth=7
xgb4 = XGB(X_train, y_train, X_test)
#xgb4.feature_selection(n_features=samples)
pred = xgb4.predict(write2csv=True)

#print(r2_score(y_test.ravel(),pred))

200


In [194]:
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK

def GBM(argsDict):
    max_depth = argsDict["max_depth"] + 5
    n_estimators = argsDict['n_estimators'] * 5 + 70
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.6
    colsample_bytree = argsDict["colsample_bytree"] * 0.1 + 0.6
    min_child_weight = argsDict["min_child_weight"] + 1
    gamma = argsDict["min_child_weight"] * 0.1
    reg_alpha = argsDict["reg_alpha"] * 0.1
    reg_lambda = argsDict["reg_lambda"] * 0.1
    print("max_depth:" + str(max_depth))
    print("n_estimators:" + str(n_estimators))
    print("learning_rate:" + str(learning_rate))
    print("subsample:" + str(subsample))
    print("colsample_bytree:" + str(colsample_bytree))
    print("min_child_weight:" + str(min_child_weight))
    print("gamma:" + str(gamma))
    print("reg_alpha:" + str(reg_alpha))
    print("reg_lambda:" + str(reg_lambda))

    gbm = XGBRegressor(learning_rate = learning_rate,
                        n_estimators = n_estimators,
                        max_depth = max_depth,
                        min_child_weight = min_child_weight,
                        subsample = subsample,
                        colsample_bytree = colsample_bytree,
                        gamma = gamma,
                        reg_alpha = reg_alpha,
                        reg_lambda = reg_lambda
    )

    metric = cross_val_score(gbm,X_train,y_train,cv=10,scoring="r2") 
    print(str(metric) + '\n')
    metric_m= metric.mean()
    return -metric_m

space = {"max_depth":hp.randint("max_depth",15),
         "n_estimators":hp.randint("n_estimators",10),  #[0,1,2,3,4,5] -> [50,]
         "learning_rate":hp.randint("learning_rate",6),  #[0,1,2,3,4,5] -> 0.05,0.06
         "subsample":hp.randint("subsample",5),#[0,1,2,3,4] -> [0.6,0.7,0.8,0.9,1.0]
         "colsample_bytree":hp.randint("colsample_bytree",5),#[0,1,2,3,4] -> [0.6,0.7,0.8,0.9,1.0]
         "min_child_weight":hp.randint("min_child_weight",7), #[0,1,2,3,4,5,6] -> +1
         "gamma":hp.randint("gamma", 7), # * 0.1
         "reg_alpha":hp.randint("reg_alpha", 30), # * 0.1
         "reg_lambda":hp.randint("reg_lambda", 30), # * 0.1
        }
algo = partial(tpe.suggest,n_startup_jobs=1)
best = fmin(GBM,space,algo=algo,max_evals=200)

print(best)
print(GBM(best))

max_depth:13                                                                    
n_estimators:75                                                                 
learning_rate:0.05                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:1.0                                                                   
reg_lambda:1.2000000000000002                                                   
[0.53175018 0.61221201 0.61057358 0.55871776 0.5686452  0.59591346              
 0.65388351 0.49454658 0.56629316 0.55204706]

max_depth:13                                                                    
n_estimators:75                                               

min_child_weight:4                                                              
gamma:0.30000000000000004                                                       
reg_alpha:1.6                                                                   
reg_lambda:0.0                                                                  
[0.52932522 0.64033695 0.68537114 0.56559228 0.57875466 0.61533352              
 0.65201167 0.60928167 0.61150643 0.534183  ]

max_depth:5                                                                     
n_estimators:105                                                                
learning_rate:0.09                                                              
subsample:0.6                                                                   
colsample_bytree:0.8                                                            
min_child_weight:2                                                              
gamma:0.1                                                     

max_depth:19                                                                    
n_estimators:105                                                                
learning_rate:0.09                                                              
subsample:1.0                                                                   
colsample_bytree:1.0                                                            
min_child_weight:3                                                              
gamma:0.2                                                                       
reg_alpha:2.8000000000000003                                                    
reg_lambda:0.2                                                                  
[0.59452321 0.63033999 0.6694707  0.56670436 0.63329869 0.59073295              
 0.68838568 0.6056768  0.61474782 0.56831532]

max_depth:15                                                                    
n_estimators:80                                               

min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:2.4000000000000004                                                    
reg_lambda:0.5                                                                  
[0.54385967 0.62006153 0.58093573 0.55638589 0.58404446 0.61566319              
 0.66229339 0.49174305 0.55200268 0.5737974 ]

max_depth:5                                                                     
n_estimators:105                                                                
learning_rate:0.15000000000000002                                               
subsample:0.7                                                                   
colsample_bytree:1.0                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:8                                                                     
n_estimators:115                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.7000000000000001                                                    
reg_lambda:1.3                                                                  
[0.57507713 0.63998675 0.64978695 0.57201836 0.63371519 0.66146295              
 0.69865603 0.65361173 0.63820284 0.61690553]

max_depth:9                                                                     
n_estimators:100                                              

min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:1.6                                                                   
reg_lambda:1.6                                                                  
[0.5713592  0.6494975  0.64092365 0.59581181 0.61948707 0.63701778              
 0.68639196 0.61045576 0.59988593 0.59417091]

max_depth:13                                                                    
n_estimators:100                                                                
learning_rate:0.13                                                              
subsample:0.7                                                                   
colsample_bytree:0.9                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:19                                                                    
n_estimators:70                                                                 
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:1.0                                                            
min_child_weight:2                                                              
gamma:0.1                                                                       
reg_alpha:1.9000000000000001                                                    
reg_lambda:1.1                                                                  
[0.55137807 0.66932789 0.63462226 0.58108334 0.65666197 0.5918907               
 0.67005607 0.5532782  0.58645894 0.57088049]

max_depth:6                                                                     
n_estimators:100                                              

min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.7000000000000001                                                    
reg_lambda:1.8                                                                  
[0.58582337 0.6505046  0.68391891 0.62419537 0.64818299 0.68946031              
 0.7127155  0.5733988  0.64315238 0.60893167]

max_depth:13                                                                    
n_estimators:115                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:14                                                                    
n_estimators:75                                                                 
learning_rate:0.05                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.30000000000000004                                                   
reg_lambda:1.3                                                                  
[0.49560409 0.61024056 0.58530468 0.55330002 0.57420118 0.61754877              
 0.65311334 0.48038334 0.57414429 0.5671184 ]

max_depth:16                                                                    
n_estimators:100                                              

min_child_weight:3                                                              
gamma:0.2                                                                       
reg_alpha:0.6000000000000001                                                    
reg_lambda:0.9                                                                  
[0.60874029 0.65620303 0.66132303 0.5775324  0.63428361 0.6149057               
 0.68196949 0.58549758 0.64334183 0.57411425]

max_depth:13                                                                    
n_estimators:115                                                                
learning_rate:0.13                                                              
subsample:0.8                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:15                                                                    
n_estimators:90                                                                 
learning_rate:0.13                                                              
subsample:1.0                                                                   
colsample_bytree:0.7                                                            
min_child_weight:6                                                              
gamma:0.5                                                                       
reg_alpha:1.7000000000000002                                                    
reg_lambda:1.7000000000000002                                                   
[0.57445448 0.64584582 0.62750576 0.59697049 0.63848205 0.63156955              
 0.6906188  0.60536881 0.61239696 0.57498015]

max_depth:13                                                                    
n_estimators:105                                              

min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.2                                                                   
reg_lambda:1.4000000000000001                                                   
[0.57174116 0.60406906 0.64209298 0.52852373 0.6479846  0.59762894              
 0.66752088 0.63207761 0.60721843 0.58000692]

max_depth:8                                                                     
n_estimators:115                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:1.0                                                            
min_child_weight:6                                                              
gamma:0.5                                                     

max_depth:15                                                                    
n_estimators:100                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:1.0                                                            
min_child_weight:4                                                              
gamma:0.30000000000000004                                                       
reg_alpha:2.6                                                                   
reg_lambda:0.0                                                                  
[0.63467905 0.63425267 0.66260181 0.59070138 0.61503412 0.65408996              
 0.67030836 0.58983216 0.60040296 0.56420887]

max_depth:13                                                                    
n_estimators:115                                              

min_child_weight:4                                                              
gamma:0.30000000000000004                                                       
reg_alpha:0.7000000000000001                                                    
reg_lambda:2.0                                                                  
[0.55072475 0.59935447 0.58988337 0.52627478 0.64217532 0.594691                
 0.63880839 0.61600675 0.65503072 0.56628047]

max_depth:8                                                                     
n_estimators:70                                                                 
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.9                                                            
min_child_weight:3                                                              
gamma:0.2                                                     

max_depth:14                                                                    
n_estimators:90                                                                 
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.6                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.0                                                                   
reg_lambda:2.4000000000000004                                                   
[0.58117383 0.67136377 0.6684626  0.61136158 0.65226522 0.66908015              
 0.6783412  0.59072468 0.64407669 0.5970362 ]

max_depth:15                                                                    
n_estimators:80                                               

min_child_weight:5                                                              
gamma:0.4                                                                       
reg_alpha:0.30000000000000004                                                   
reg_lambda:2.1                                                                  
[0.58954165 0.61138019 0.64518959 0.58097444 0.61596159 0.64170438              
 0.70200918 0.55214955 0.61086855 0.5769822 ]

max_depth:11                                                                    
n_estimators:100                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.8                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:14                                                                    
n_estimators:80                                                                 
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:2.8000000000000003                                                    
reg_lambda:0.2                                                                  
[0.61207269 0.65967972 0.63663144 0.55575776 0.63075912 0.6651473               
 0.67847246 0.60429324 0.60245222 0.61291153]

max_depth:13                                                                    
n_estimators:110                                              

min_child_weight:5                                                              
gamma:0.4                                                                       
reg_alpha:0.9                                                                   
reg_lambda:2.0                                                                  
[0.57916606 0.65094496 0.65294359 0.57174753 0.65697226 0.65931112              
 0.70214443 0.62362018 0.6119509  0.60478307]

max_depth:17                                                                    
n_estimators:100                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                      

max_depth:17                                                                    
n_estimators:115                                                                
learning_rate:0.07                                                              
subsample:0.9                                                                   
colsample_bytree:0.7                                                            
min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.8                                                                   
reg_lambda:1.8                                                                  
[0.59866195 0.64258622 0.68857077 0.5804728  0.64733567 0.67066707              
 0.70621516 0.5878756  0.64448217 0.60997806]

max_depth:13                                                                    
n_estimators:85                                               

min_child_weight:7                                                              
gamma:0.6000000000000001                                                        
reg_alpha:0.1                                                                   
reg_lambda:2.1                                                                  
[0.57665644 0.5815789  0.63916921 0.61773782 0.58901006 0.66219624              
 0.66992354 0.622738   0.62151368 0.59289609]

max_depth:7                                                                     
n_estimators:90                                                                 
learning_rate:0.07                                                              
subsample:1.0                                                                   
colsample_bytree:0.6                                                            
min_child_weight:1                                                              
gamma:0.0                                                     

max_depth:17                                                                    
n_estimators:75                                                                 
learning_rate:0.13                                                              
subsample:1.0                                                                   
colsample_bytree:0.6                                                            
min_child_weight:2                                                              
gamma:0.1                                                                       
reg_alpha:0.1                                                                   
reg_lambda:2.0                                                                  
[0.58567074 0.64100462 0.62650119 0.56296377 0.62219647 0.56813629              
 0.67192532 0.57109312 0.58936808 0.55401784]

max_depth:19                                                                    
n_estimators:100                                              

min_child_weight:6                                                              
gamma:0.5                                                                       
reg_alpha:1.7000000000000002                                                    
reg_lambda:2.2                                                                  
[0.56013984 0.65119162 0.64863858 0.56106326 0.62161148 0.66056105              
 0.68525404 0.57434896 0.57609286 0.54266159]

100%|██████| 200/200 [29:46<00:00,  8.93s/trial, best loss: -0.6436253281460141]
{'colsample_bytree': 0, 'gamma': 1, 'learning_rate': 1, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 9, 'reg_alpha': 9, 'reg_lambda': 7, 'subsample': 3}
max_depth:17
n_estimators:115
learning_rate:0.07
subsample:0.9
colsample_bytree:0.6
min_child_weight:7
gamma:0.6000000000000001
reg_alpha:0.9
reg_lambda:0.7000000000000001
[0.5976535  0.6697987  0.66337462 0.59076718 0.64398373 0.66329453
 0.69408908 0.64697428 0.65342785 0.61288982]

-0.64362532814601