In [28]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import BayesianRidge,Lasso,LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV,KFold
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from scipy import stats
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.ensemble import ExtraTreesClassifier
from scipy import stats


def split_data(X_train, y_train):
    """Carve a hold-out set out of the labelled data.

    Parameters
    ----------
    X_train, y_train
        Feature matrix and matching targets to be partitioned.

    Returns
    -------
    tuple
        ``(X_fit, X_holdout, y_fit, y_holdout)`` as produced by
        ``train_test_split`` with 25% of the rows held out and a fixed
        ``random_state`` of 43, so the partition is reproducible.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=0.25, random_state=43
    )
    return X_fit, X_holdout, y_fit, y_holdout

def standardize_data(X_train, X_test):
    """Scale both matrices with a ``RobustScaler`` fitted on the training data.

    Despite the function name, the scaling applied is ``RobustScaler`` and
    not ``StandardScaler``. The scaler is fitted on ``X_train`` only and
    then reused for ``X_test`` so that no test-set statistics leak into the
    transformation.

    Parameters
    ----------
    X_train
        Training feature matrix; used to fit the scaler.
    X_test
        Test feature matrix; transformed with the already fitted scaler.

    Returns
    -------
    tuple
        ``(scaled_train_data, scaled_test_data)``.
    """
    train_scaler = RobustScaler()
    scaled_train_data = train_scaler.fit_transform(X_train)
    scaled_test_data = train_scaler.transform(X_test)

    return scaled_train_data, scaled_test_data

In [29]:
# Read the three CSV files (first column is the row index) and keep only the
# raw numpy arrays. Files are read in the order test, train, targets.
# NOTE(review): y_train comes back 2-D here; later cells call .ravel() on it,
# so it presumably has a single target column — confirm.
X_test, X_train, y_train = (
    pd.read_csv(csv_name, index_col=0).values
    for csv_name in ("X_test.csv", "X_train.csv", "y_train.csv")
)

# Optional local validation: uncomment to replace X_test with a 25% hold-out
# of the labelled data (this also defines y_test).
#X_train, X_test, y_train, y_test = split_data(X_train,y_train)



In [30]:
# Candidate imputers. Later cells pick one by position (imputers[0] and
# imputers[5] are used below), so the order of this list matters.
imputers = [
    # [0] column-wise median
    SimpleImputer(missing_values=np.nan, strategy='median'),
    # [1] iterative imputation with a Bayesian ridge regressor
    IterativeImputer(random_state=0, estimator=BayesianRidge()),
    # [2] iterative imputation with a single decision tree
    IterativeImputer(
        random_state=0,
        estimator=DecisionTreeRegressor(max_features="sqrt", random_state=0),
    ),
    # [3] iterative imputation with extra-trees
    IterativeImputer(
        random_state=0,
        estimator=ExtraTreesRegressor(
            n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2
        ),
    ),
    # [4] iterative imputation with a k-NN regressor
    IterativeImputer(random_state=0, estimator=KNeighborsRegressor(n_neighbors=15)),
    # [5] direct k-NN imputation
    KNNImputer(n_neighbors=10, weights="uniform"),
    # [6] iterative imputation with a random forest
    IterativeImputer(
        random_state=0,
        estimator=RandomForestRegressor(
            n_estimators=35, random_state=0, max_depth=30, min_samples_leaf=2
        ),
    ),
]

In [23]:
# Median-impute missing values: the imputer is fitted on the training matrix
# only and then applied to both matrices. The fit/transform is written out
# here instead of calling `imputation`, because that helper is defined in a
# later cell and calling it at this point raises NameError on a fresh
# Restart & Run All.
imputers[0].fit(X_train)
X_train, X_test = imputers[0].transform(X_train), imputers[0].transform(X_test)

# Robust-scaled copies of the imputed matrices (scaler fitted on X_train).
X_train_0, X_test_0 = standardize_data(X_train, X_test)


In [24]:


def lasso(X_train, y_train, X_test):
    """Keep only the columns that a grid-searched Lasso gives a non-zero weight.

    A ``StandardScaler`` + ``Lasso`` pipeline is tuned over
    ``alpha in arange(0.1, 10, 0.1)`` with 5-fold cross-validation scored by
    R². The coefficients of the best pipeline's Lasso step decide which
    columns survive.

    Parameters
    ----------
    X_train, y_train
        Training matrix and targets used for the grid search.
    X_test
        Test matrix; filtered with the same column mask as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` restricted to the selected columns.
    """
    steps = [('scaler', StandardScaler()), ('model', Lasso())]
    alpha_grid = {'model__alpha': np.arange(0.1, 10, 0.1)}
    search = GridSearchCV(Pipeline(steps), alpha_grid, cv=5, scoring="r2", verbose=3)
    search.fit(X_train, y_train)

    # A column is kept when the magnitude of its fitted coefficient is positive.
    fitted_lasso = search.best_estimator_.named_steps['model']
    keep = np.abs(fitted_lasso.coef_) > 0
    return X_train[:, keep], X_test[:, keep]

def lasso1(X_train, y_train, X_test):
    """Keep only the columns that ``LassoCV`` gives a non-zero coefficient.

    Parameters
    ----------
    X_train, y_train
        Training matrix and targets used to fit ``LassoCV`` (5-fold CV).
    X_test
        Test matrix; filtered with the same column mask as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` restricted to the selected columns, mirroring
        the return value of ``lasso``. Previously the filtered matrices were
        computed but never returned, so the function always yielded ``None``.
    """
    ls = LassoCV(cv=5)
    ls.fit(X_train, y_train)
    mask = ls.coef_ != 0
    return X_train[:, mask], X_test[:, mask]
    

def imputation(imputer, X_train, X_test):
    """Fit ``imputer`` on the training matrix and apply it to both matrices.

    Parameters
    ----------
    imputer
        Object exposing ``fit(X)`` and ``transform(X)``.
    X_train
        Matrix the imputer is fitted on.
    X_test
        Matrix transformed with the already fitted imputer.

    Returns
    -------
    tuple
        ``(transformed X_train, transformed X_test)``.
    """
    imputer.fit(X_train)
    # Training data is transformed first, then the test data.
    return tuple(imputer.transform(part) for part in (X_train, X_test))

def features_selection(X_train, y_train, X_test, n_features):
    """Return the indices of the ``n_features`` best columns per ``SelectKBest``.

    Columns are ranked with the ``f_regression`` score function. The
    module-level ``feature_selection`` import is used; the previous
    function-scope re-import was redundant.

    Parameters
    ----------
    X_train, y_train
        Training matrix and targets the selector is fitted on.
    X_test
        Unused; kept so existing callers do not have to change.
    n_features
        Number of columns to keep (passed as ``k``).

    Returns
    -------
    Indices of the selected columns, as returned by
    ``get_support(indices=True)``.
    """
    selector = feature_selection.SelectKBest(
        score_func=feature_selection.f_regression, k=n_features
    )
    selector = selector.fit(X_train, y_train)
    return selector.get_support(indices=True)

def outlier_detection(X_train, y_train):
    """Drop the training rows that an ``IsolationForest`` flags as outliers.

    Parameters
    ----------
    X_train
        Training matrix the forest is fitted on.
    y_train
        Targets; filtered with the same row mask as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` restricted to rows predicted as ``1``
        (the label ``fit_predict`` assigns to inliers).
    """
    detector = IsolationForest(max_samples=100, random_state=4)
    inlier_rows = detector.fit_predict(X_train) == 1
    return X_train[inlier_rows], y_train[inlier_rows]

In [25]:
class XGB():
    """Bundle of a tuned XGBoost regressor and a stacked ensemble.

    The instance stores the train/test matrices and a set of selected column
    indices (all columns until one of the ``feature_selection*`` methods is
    called). Both ``predict`` and ``predict_stacked`` train and predict on
    the selected columns only.

    Parameters
    ----------
    X_train, y_train
        Training matrix and targets.
    X_test
        Matrix to predict on; must have the same columns as ``X_train``.
    """

    def __init__(self, X_train, y_train, X_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.n_original_features = X_train.shape[1]
        # Default: every column is selected.
        self.selected_features = np.arange(self.n_original_features)

        # Base learners of the stacking regressor.
        self.estimators = [
            ('lasso', Lasso(alpha=0.0005, random_state=0, max_iter=100)),
            ('xgb', XGBRegressor(max_depth=6,        # depth of each tree
                                 learning_rate=0.08,
                                 n_estimators=100,   # number of trees
                                 )),
            ('abr', AdaBoostRegressor(random_state=0, n_estimators=100)),
            ('dtr', DecisionTreeRegressor(max_features="sqrt", random_state=0)),
            ('etr', ExtraTreesRegressor(n_estimators=15, random_state=0,
                                        max_depth=7, min_samples_leaf=2)),
            ('rfr', RandomForestRegressor(n_estimators=15, random_state=0,
                                          max_depth=6, min_samples_leaf=2)),
            ('knr', KNeighborsRegressor(n_neighbors=15)),
            ('gbr', GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                              max_depth=10, random_state=0)),
        ]
        self.stacked_regressor = StackingRegressor(estimators=self.estimators)

        # Single-model regressor with hard-coded hyper-parameters.
        # NOTE(review): the values look like the output of a hyper-parameter
        # search (e.g. gamma=0.6000000000000001) — confirm their provenance.
        self.regressor = XGBRegressor(
            max_depth=17,
            n_estimators=115,
            learning_rate=0.07,
            subsample=0.9,
            colsample_bytree=0.6,
            min_child_weight=7,
            gamma=0.6000000000000001,
            reg_alpha=0.9,
            reg_lambda=0.7000000000000001,
        )

    @staticmethod
    def _top_features(importances, n_features):
        """Return the indices of the ``n_features`` largest importances, best first."""
        return np.argsort(importances)[::-1][:n_features]

    @staticmethod
    def _write_submission(pred, path='submission.csv'):
        """Write predictions to ``path`` as a two-column (``id``, ``y``) CSV."""
        ids = np.arange(0, len(pred)).reshape(-1, 1)
        submission = np.hstack([ids, np.asarray(pred).reshape(-1, 1)])
        submission_pd = pd.DataFrame(submission, columns=['id', 'y'])
        submission_pd.to_csv(path, index=False)

    def feature_selection(self, n_features = 200):
        """Select the ``n_features`` columns the XGBoost regressor ranks highest.

        The regressor is fitted on all columns and its
        ``feature_importances_`` are used for the ranking. The chosen indices
        are stored in ``self.selected_features`` and returned.
        """
        self.regressor.fit(self.X_train, self.y_train)
        self.selected_features = self._top_features(
            self.regressor.feature_importances_, n_features
        )
        return self.selected_features

    def feature_selection_stacked(self, n_features = 200):
        """Select the ``n_features`` columns the stacked ensemble ranks highest.

        ``StackingRegressor`` has no ``feature_importances_`` of its own, so
        the ranking is the mean of ``feature_importances_`` over the fitted
        base estimators that expose it; base estimators without that
        attribute are skipped.

        Raises
        ------
        AttributeError
            If none of the fitted base estimators exposes
            ``feature_importances_``.
        """
        self.stacked_regressor.fit(self.X_train, self.y_train)
        importances = [
            est.feature_importances_
            for est in self.stacked_regressor.estimators_
            if hasattr(est, "feature_importances_")
        ]
        if not importances:
            raise AttributeError(
                "none of the fitted base estimators expose feature_importances_"
            )
        self.selected_features = self._top_features(
            np.mean(importances, axis=0), n_features
        )
        return self.selected_features

    def cross_validation(self, n_split = 8):
        """Return the per-fold R² of the XGBoost regressor on the selected columns."""
        ret = cross_val_score(
            self.regressor,
            self.X_train[:, self.selected_features],
            self.y_train,
            scoring='r2',
            cv=n_split,
        )
        return ret

    def predict(self, write2csv = True):
        """Fit the XGBoost regressor and predict ``X_test``.

        Training and prediction use ``self.selected_features`` (all columns
        unless a selection method has been called), consistent with
        ``predict_stacked`` and ``cross_validation``.

        Parameters
        ----------
        write2csv
            When truthy, the predictions are also written to
            ``submission.csv``.

        Returns
        -------
        The predictions for ``X_test``.
        """
        cols = self.selected_features
        self.regressor.fit(self.X_train[:, cols], self.y_train)
        pred = self.regressor.predict(self.X_test[:, cols])

        if write2csv:
            self._write_submission(pred)

        return pred

    def predict_stacked(self, write2csv = True):
        """Fit the stacked ensemble on the selected columns and predict ``X_test``.

        Parameters
        ----------
        write2csv
            When truthy, the predictions are also written to
            ``submission.csv``.

        Returns
        -------
        The predictions for ``X_test``.
        """
        cols = self.selected_features
        self.stacked_regressor.fit(self.X_train[:, cols], self.y_train)
        pred = self.stacked_regressor.predict(self.X_test[:, cols])

        if write2csv:
            self._write_submission(pred)

        return pred

    def do_all(self, n_features = 200):
        """Select ``n_features`` columns, then fit, predict and write the CSV."""
        self.feature_selection(n_features)
        pred = self.predict()
        return pred
  

In [26]:
# Rank the columns of the imputed and scaled training matrix against the
# flattened targets and keep the indices of the best 200.
selected_features = features_selection(
    X_train=X_train_0,
    y_train=y_train.ravel(),
    X_test=X_test_0,
    n_features=200,
)

  corr /= X_norms


In [27]:
# Preview the first five rows of the imputed and scaled training matrix.
df_tr= pd.DataFrame(X_train_0)
df_tr.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,822,823,824,825,826,827,828,829,830,831
0,-0.46669,-0.282618,-0.263665,-0.618864,-0.976612,-0.786602,1.195982,1.385506,1.227129,0.253626,...,-0.004431,-0.439093,0.203576,0.060724,-1.100703,-0.780246,-0.128824,-0.493355,-1.829907,-0.714606
1,1.145548,-0.068001,1.574828,-0.755075,-0.013823,-0.010194,0.032176,1.236136,-1.506105,1.515109,...,-0.004431,1.406965,-0.199857,0.52649,-0.594855,-0.013314,-1.725024,0.008588,-1.409717,1.818198
2,-0.440705,0.057534,-0.063645,2.577001,-1.648303,-0.766725,0.007776,-0.015279,-0.005384,0.86113,...,0.281152,0.340408,-0.425105,-0.482133,1.632975,0.350198,0.471372,-1.850498,-0.125838,-0.439005
3,-2.894254,-2.34902,-3.00159,0.016241,-0.077519,0.214123,0.070541,0.011594,0.105183,0.112141,...,-0.065329,-3.386258,-2.778864,0.043016,-0.078849,-2.76168,0.030894,-1.695968,-0.141913,0.000119
4,-0.631951,1.536183,-0.04777,-0.852693,-0.476465,-0.449983,-0.33628,0.952266,0.44174,2.109532,...,0.029017,0.242487,-0.107779,0.000324,-1.749368,0.424026,-0.085102,0.436115,0.671595,-0.573007


In [13]:
# Indices of the columns kept by features_selection (SelectKBest, k=200).
print(selected_features)

[  2  15  18  21  23  26  27  29  40  69  77  87  89  92  98 100 101 107
 113 114 115 132 133 141 143 144 146 148 151 159 169 172 174 177 193 194
 200 202 203 209 213 214 218 220 230 231 232 233 240 242 245 248 254 260
 263 272 276 278 283 286 287 288 298 300 306 309 310 312 315 318 319 320
 325 326 327 333 334 342 345 349 350 358 359 362 369 370 374 380 381 383
 392 395 399 402 410 414 415 425 431 437 440 442 445 452 456 458 465 479
 484 485 493 496 502 507 512 517 520 523 528 531 538 542 543 546 547 548
 554 558 562 565 568 571 579 590 594 596 602 603 608 610 612 613 614 621
 633 636 640 641 642 644 648 649 654 657 659 665 668 670 671 672 675 677
 681 685 690 696 702 703 711 712 713 715 720 721 725 726 727 731 734 742
 745 748 759 766 768 769 773 774 777 778 780 783 788 790 796 801 817 819
 823 824]


In [None]:
# Restrict both matrices to the SelectKBest columns, robust-scale them
# (scaler fitted on the training columns only), then run the result through
# the KNN imputer (imputers[5]).
# NOTE(review): if the median-imputation cell ran before this one there
# should be no NaNs left for the KNN imputer to fill — confirm it is needed.
# This cell overwrites X_train / X_test, so it must not be run twice in a row:
# a second run would index the already reduced matrices with the original
# column indices.
new_train, new_test = standardize_data(
    X_train[:, selected_features],
    X_test[:, selected_features],
)

X_train, X_test = imputation(imputers[5], new_train, new_test)

In [None]:
# Sanity check: shape of the test matrix after column selection.
X_test.shape

In [None]:
# Optional extra column filtering with a grid-searched Lasso (disabled).
#X_train, X_test =lasso(X_train,y_train,X_test)

# Drop the training rows (and matching targets) that IsolationForest flags as
# outliers. This overwrites X_train / y_train, so re-running the cell filters
# the already filtered data again.
X_train,y_train = outlier_detection(X_train,y_train)

In [None]:
# NOTE: despite its name, `samples` holds the number of columns (features)
# of X_train, not the number of rows.
samples = X_train.shape[1]
print(samples)

# Hyper-parameters of the regressor are hard-coded in XGB.__init__
# (self.regressor); they are not set in this cell.
xgb4 = XGB(X_train, y_train, X_test)
# Optional importance-based column selection (disabled).
#xgb4.feature_selection(n_features=samples)
# Fits the regressor, predicts X_test and writes submission.csv.
pred = xgb4.predict(write2csv=True)

# Only usable when the hold-out split was enabled, since that is what
# defines y_test.
#print(r2_score(y_test.ravel(),pred))

In [None]:
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK

def GBM(argsDict):
    """Hyperopt objective: negated mean 10-fold R² of an ``XGBRegressor``.

    ``argsDict`` holds one small non-negative integer per hyper-parameter
    (the raw draws of the search space); each is mapped onto the real
    hyper-parameter range with a fixed offset and step. The decoded values
    are printed, the model is cross-validated on the module-level
    ``X_train`` / ``y_train``, and the negated mean score is returned so that
    minimising this function maximises R².

    ``gamma`` is decoded from ``argsDict["gamma"]``. It was previously read
    from ``argsDict["min_child_weight"]``, which tied the two parameters
    together and left the ``gamma`` dimension of the search space unused.

    Parameters
    ----------
    argsDict : dict
        Must contain the keys ``max_depth``, ``n_estimators``,
        ``learning_rate``, ``subsample``, ``colsample_bytree``,
        ``min_child_weight``, ``gamma``, ``reg_alpha`` and ``reg_lambda``.

    Returns
    -------
    float
        ``-mean(R²)`` over the 10 cross-validation folds.
    """
    # Decode the integer draws; insertion order is also the print order.
    params = {
        "max_depth": argsDict["max_depth"] + 5,
        "n_estimators": argsDict['n_estimators'] * 5 + 70,
        "learning_rate": argsDict["learning_rate"] * 0.02 + 0.05,
        "subsample": argsDict["subsample"] * 0.1 + 0.6,
        "colsample_bytree": argsDict["colsample_bytree"] * 0.1 + 0.6,
        "min_child_weight": argsDict["min_child_weight"] + 1,
        "gamma": argsDict["gamma"] * 0.1,
        "reg_alpha": argsDict["reg_alpha"] * 0.1,
        "reg_lambda": argsDict["reg_lambda"] * 0.1,
    }
    for name, value in params.items():
        print(name + ":" + str(value))

    gbm = XGBRegressor(**params)

    metric = cross_val_score(gbm, X_train, y_train, cv=10, scoring="r2")
    print(str(metric) + '\n')
    metric_m = metric.mean()
    return -metric_m

# Search space: every dimension is an integer draw in [0, upper) that GBM
# decodes into the real hyper-parameter value (mapping noted per line).
space = {"max_depth":hp.randint("max_depth",15),  # [0..14] -> +5 -> 5..19
         "n_estimators":hp.randint("n_estimators",10),  # [0..9] -> *5 + 70 -> 70..115
         "learning_rate":hp.randint("learning_rate",6),  # [0..5] -> *0.02 + 0.05 -> 0.05..0.15
         "subsample":hp.randint("subsample",5),# [0,1,2,3,4] -> [0.6,0.7,0.8,0.9,1.0]
         "colsample_bytree":hp.randint("colsample_bytree",5),# [0,1,2,3,4] -> [0.6,0.7,0.8,0.9,1.0]
         "min_child_weight":hp.randint("min_child_weight",7), # [0,1,2,3,4,5,6] -> +1
         "gamma":hp.randint("gamma", 7), # * 0.1
         "reg_alpha":hp.randint("reg_alpha", 30), # * 0.1
         "reg_lambda":hp.randint("reg_lambda", 30), # * 0.1
        }
# TPE search with n_startup_jobs=1 (presumably a single random warm-up trial
# before TPE takes over — confirm against the hyperopt docs).
algo = partial(tpe.suggest,n_startup_jobs=1)
# 200 evaluations, each of which runs a 10-fold cross-validation inside GBM,
# so this cell is expensive to re-run.
best = fmin(GBM,space,algo=algo,max_evals=200)

# `best` holds the raw integer draws per label, not the decoded
# hyper-parameters; GBM(best) decodes and prints them, re-runs the
# cross-validation and returns the negated mean R².
print(best)
print(GBM(best))