In [26]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import BayesianRidge,Lasso,LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV,KFold
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from scipy import stats
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

from scipy import stats

def split_data(X_train,y_train) :
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,test_size = 0.25, random_state = 43)
    return X_train, X_test, y_train, y_test

def standardize_data(X_train,X_test) :
    train_scaler = RobustScaler()
    scaled_train_data = train_scaler.fit_transform(X_train)
    scaled_test_data = train_scaler.transform(X_test)
    
    return scaled_train_data,scaled_test_data

#X_test = pd.read_csv("X_test.csv", index_col=0).values
X_train = pd.read_csv("X_train.csv", index_col=0).values
y_train = pd.read_csv("y_train.csv", index_col=0).values

X_train, X_test, y_train, y_test = split_data(X_train,y_train)

df_train = pd.DataFrame(X_train)
df_test = pd.DataFrame(X_test)

#df_train = df_train.fillna(df_train.mean())

#df_test = df_test.fillna(df_test.mean())

## Outlier detection with zscore

for i in range(X_train.shape[1]):
    data_d=df_train[i]
    data_d[(np.nan_to_num(np.abs(stats.zscore(data_d, nan_policy='omit')),0) > 3)]=np.nan
    df_train[i]=data_d

# Outlier detection with filter
def filter(df1, df2):
  # Filter feature selection
  from sklearn.feature_selection import VarianceThreshold 


  # Step 1: Removing Constant features
  constant_filter = VarianceThreshold(threshold=0)
  data_constant = constant_filter.fit_transform(df1)
  #print(data_constant.shape)
  constant_columns = [column for column in df1.columns if column not in df1.columns[constant_filter.get_support()]]
  data_cons1 = df1.drop(constant_columns,axis=1)
  data_cons2 = df2.drop(constant_columns,axis=1)
      
  # Step 2: Removing Quasi-Constant Features
  qcons_filter = VarianceThreshold(threshold=0.01)
  data_qcons = qcons_filter.fit_transform(df1)
  #print(data_qcons.shape)
  qcons_columns = [column for column in df1.columns if column not in df1.columns[qcons_filter.get_support()]]
  data_qcons1 = df1.drop(qcons_columns,axis=1)
  data_qcons2 = df2.drop(qcons_columns,axis=1)
  data_qcons_t1 = data_qcons1.T
  data_qcons_t2 = data_qcons2.T 
  # Step 3: Removing Duplicate Columns
  data_cons_dup1 = data_qcons_t1.drop_duplicates(keep='first').T
  data_cons_dup2 = data_qcons_t2.drop_duplicates(keep='first').T
  return data_cons_dup1, data_cons_dup2


X_train, X_test = filter(df_train, df_test)
X_train, X_test = X_train.values, X_test.values

X_train, X_test = standardize_data(X_train,X_test)



#Define some imputers
imputers = [
    SimpleImputer(missing_values=np.nan, strategy='median'),
    IterativeImputer(random_state=0, estimator=BayesianRidge()),
    IterativeImputer(random_state=0, estimator=DecisionTreeRegressor(max_features="sqrt", random_state=0)),
    IterativeImputer(random_state=0, estimator=ExtraTreesRegressor(n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2)),
    IterativeImputer(random_state=0, estimator=KNeighborsRegressor(n_neighbors=15)),
    KNNImputer(n_neighbors=10, weights="uniform"),
    IterativeImputer(random_state=0, estimator=RandomForestRegressor(n_estimators= 35, random_state = 0, max_depth= 30, min_samples_leaf=2))
]

def lasso(X_train,y_train,X_test) :
    pipeline = Pipeline([('scaler',StandardScaler()),('model',Lasso())])
    search = GridSearchCV(pipeline,{'model__alpha':np.arange(0.1,10,0.1)},cv = 5, scoring="r2",verbose=3)
    search.fit(X_train,y_train)
    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)
    X_train = X_train[:,importance > 0]
    X_test = X_test[:,importance > 0]
    return X_train,X_test

def lasso1(X_train,y_train,X_test) :
    ls=LassoCV(cv=5)
    ls.fit(X_train,y_train)
    mask=ls.coef_!=0
    X_train=X_train[:,mask]
    X_test = X_test[:,mask]
    

def imputation(imputer, X_train, X_test):
    imputer.fit(X_train)
    X_train_0 = imputer.transform(X_train)
    X_test_0 = imputer.transform(X_test)
    return X_train_0, X_test_0

def features_selection(X_train, y_train, X_test,  n_features):
    from sklearn import feature_selection
    from sklearn.linear_model import LinearRegression
    model = feature_selection.SelectKBest(score_func=feature_selection.f_regression,k=n_features)
    model = model.fit(X_train, y_train)
    train = model.transform(X_train)
    test = model.transform(X_test)

    return train, test

def outlier_detection(X_train, y_train,):
    clf = IsolationForest(max_samples=100, random_state = 4, contamination=float(0.2))
    preds = clf.fit_predict(X_train)
    X_train_1 = X_train[preds==1]
    y_train_1 = y_train[preds==1]
    return X_train_1, y_train_1


class XGB():
    def __init__(self, X_train, y_train, X_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.n_original_features = X_train.shape[1]
        self.selected_features = np.arange(self.n_original_features)

        #self.outlier_detection(self.selected_features, self.y_train)

      
        #Use stacking regressor
        self.estimators = [('lasso', Lasso(alpha=0.0005, random_state =  0, max_iter=100)),
                            ('xgb',XGBRegressor(max_depth=6,# depth of the tree
                                    learning_rate=0.08,
                                    n_estimators=100,# number of the tree
                                    )),
                           ('abr',AdaBoostRegressor(random_state=0, n_estimators=100)),
                           ('dtr', DecisionTreeRegressor(max_features="sqrt", random_state=0)), 
                           ('etr', ExtraTreesRegressor(n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2)),
                           ('rfr', RandomForestRegressor(n_estimators= 15, random_state = 0, max_depth= 6, min_samples_leaf=2)),
                           ('knr', KNeighborsRegressor(n_neighbors=15)), 
                           ('gbr', GradientBoostingRegressor(n_estimators = 100,learning_rate=0.05,
                                                              max_depth = 10, random_state=0))]

        
        self.stacked_regressor = StackingRegressor(estimators=self.estimators)
        
    def feature_selection(self, n_features = 200):
        self.regressor.fit(self.X_train, self.y_train)
        self.selected_features = np.argsort(self.regressor.feature_importances_)[::-1][:n_features]
        return self.selected_features
    
    def feature_selection_stacked(self, n_features = 200):
        self.stacked_regressor.fit(self.X_train, self.y_train)
        self.selected_features = np.argsort(self.stacked_regressor.feature_importances_)[::-1][:n_features]
        
        return self.selected_features

    def cross_validation(self, n_split = 8):
        ret = cross_val_score(self.regressor, self.X_train[:,self.selected_features], self.y_train, scoring='r2', cv=n_split)
        return ret

    def predict(self, write2csv = True):
        self.regressor.fit(self.X_train[:,self.selected_features], self.y_train)
        pred = self.regressor.predict(self.X_test[:,self.selected_features])

        if write2csv is True:
            submission = np.hstack([np.arange(0, len(pred)).reshape(-1,1), pred.reshape(-1,1)]) 
            submission_pd = pd.DataFrame(submission, columns=['id','y'])
            submission_pd.to_csv('submission.csv', index=None)

        return pred
    
    def predict_stacked(self, write2csv = True):
        self.stacked_regressor.fit(self.X_train[:,self.selected_features], self.y_train)
        pred = self.stacked_regressor.predict(self.X_test[:,self.selected_features])
        
        if write2csv is True:
            submission = np.hstack([np.arange(0, len(pred)).reshape(-1,1), pred.reshape(-1,1)]) 
            submission_pd = pd.DataFrame(submission, columns=['id','y'])
            submission_pd.to_csv('submission.csv', index=None)

        return pred

    def do_all(self, n_features = 200):
        self.feature_selection(n_features)
        pred = self.predict()
        return pred
    

# data imputation for X_train and X_test, then using SelectKbest to pick up the best 200 features


X_train, X_test = imputation(imputers[5], X_train, X_test)
X_train, X_test = features_selection(X_train, y_train.ravel(), X_test,n_features=200)

X_train, X_test =lasso(X_train,y_train,X_test)

X_train,y_train = outlier_detection(X_train,y_train)



samples = X_train.shape[1]

#estimator 100, learning rate 0.1, max_depth=7
xgb4 = XGB(X_train, y_train, X_test)
xgb4.feature_selection_stacked(n_features=samples)
pred = xgb4.predict_stacked(write2csv=True)

print(r2_score(y_test.ravel(),pred))

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END ..................model__alpha=0.1;, score=0.464 total time=   0.1s
[CV 2/5] END ..................model__alpha=0.1;, score=0.502 total time=   0.0s
[CV 3/5] END ..................model__alpha=0.1;, score=0.479 total time=   0.0s
[CV 4/5] END ..................model__alpha=0.1;, score=0.516 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END ..................model__alpha=0.1;, score=0.351 total time=   0.0s
[CV 1/5] END ..................model__alpha=0.2;, score=0.487 total time=   0.0s
[CV 2/5] END ..................model__alpha=0.2;, score=0.486 total time=   0.0s
[CV 3/5] END ..................model__alpha=0.2;, score=0.480 total time=   0.0s
[CV 4/5] END ..................model__alpha=0.2;, score=0.502 total time=   0.0s
[CV 5/5] END ..................model__alpha=0.2;, score=0.306 total time=   0.0s
[CV 1/5] END ..model__alpha=0.30000000000000004;, score=0.478 total time=   0.0s
[CV 2/5] END ..model__alpha=0.30000000000000004;, score=0.453 total time=   0.0s
[CV 3/5] END ..model__alpha=0.30000000000000004;, score=0.463 total time=   0.0s
[CV 4/5] END ..model__alpha=0.30000000000000004;, score=0.480 total time=   0.0s
[CV 5/5] END ..model__alpha=0.30000000000000004;, score=0.264 total time=   0.0s
[CV 1/5] END ..................model__alpha=0.4;, score=0.459 total time=   0.0s
[CV 2/5] END ...............

[CV 1/5] END ...model__alpha=2.3000000000000003;, score=0.281 total time=   0.0s
[CV 2/5] END ...model__alpha=2.3000000000000003;, score=0.247 total time=   0.0s
[CV 3/5] END ...model__alpha=2.3000000000000003;, score=0.287 total time=   0.0s
[CV 4/5] END ...model__alpha=2.3000000000000003;, score=0.248 total time=   0.0s
[CV 5/5] END ...model__alpha=2.3000000000000003;, score=0.162 total time=   0.0s
[CV 1/5] END ...model__alpha=2.4000000000000004;, score=0.273 total time=   0.0s
[CV 2/5] END ...model__alpha=2.4000000000000004;, score=0.238 total time=   0.0s
[CV 3/5] END ...model__alpha=2.4000000000000004;, score=0.277 total time=   0.0s
[CV 4/5] END ...model__alpha=2.4000000000000004;, score=0.238 total time=   0.0s
[CV 5/5] END ...model__alpha=2.4000000000000004;, score=0.159 total time=   0.0s
[CV 1/5] END ...model__alpha=2.5000000000000004;, score=0.264 total time=   0.0s
[CV 2/5] END ...model__alpha=2.5000000000000004;, score=0.229 total time=   0.0s
[CV 3/5] END ...model__alpha

[CV 3/5] END ..................model__alpha=4.5;, score=0.029 total time=   0.0s
[CV 4/5] END ..................model__alpha=4.5;, score=0.017 total time=   0.0s
[CV 5/5] END ..................model__alpha=4.5;, score=0.034 total time=   0.0s
[CV 1/5] END ..................model__alpha=4.6;, score=0.006 total time=   0.0s
[CV 2/5] END ..................model__alpha=4.6;, score=0.000 total time=   0.0s
[CV 3/5] END ..................model__alpha=4.6;, score=0.018 total time=   0.0s
[CV 4/5] END ..................model__alpha=4.6;, score=0.007 total time=   0.0s
[CV 5/5] END ..................model__alpha=4.6;, score=0.027 total time=   0.0s
[CV 1/5] END .................model__alpha=4.7;, score=-0.002 total time=   0.0s
[CV 2/5] END .................model__alpha=4.7;, score=-0.000 total time=   0.0s
[CV 3/5] END ..................model__alpha=4.7;, score=0.007 total time=   0.0s
[CV 4/5] END .................model__alpha=4.7;, score=-0.000 total time=   0.0s
[CV 5/5] END ...............

[CV 4/5] END .................model__alpha=7.1;, score=-0.000 total time=   0.0s
[CV 5/5] END .................model__alpha=7.1;, score=-0.008 total time=   0.0s
[CV 1/5] END .................model__alpha=7.2;, score=-0.002 total time=   0.0s
[CV 2/5] END .................model__alpha=7.2;, score=-0.000 total time=   0.0s
[CV 3/5] END .................model__alpha=7.2;, score=-0.001 total time=   0.0s
[CV 4/5] END .................model__alpha=7.2;, score=-0.000 total time=   0.0s
[CV 5/5] END .................model__alpha=7.2;, score=-0.008 total time=   0.0s
[CV 1/5] END .................model__alpha=7.3;, score=-0.002 total time=   0.0s
[CV 2/5] END .................model__alpha=7.3;, score=-0.000 total time=   0.0s
[CV 3/5] END .................model__alpha=7.3;, score=-0.001 total time=   0.0s
[CV 4/5] END .................model__alpha=7.3;, score=-0.000 total time=   0.0s
[CV 5/5] END .................model__alpha=7.3;, score=-0.008 total time=   0.0s
[CV 1/5] END ...............

[CV 1/5] END .................model__alpha=9.8;, score=-0.002 total time=   0.0s
[CV 2/5] END .................model__alpha=9.8;, score=-0.000 total time=   0.0s
[CV 3/5] END .................model__alpha=9.8;, score=-0.001 total time=   0.0s
[CV 4/5] END .................model__alpha=9.8;, score=-0.000 total time=   0.0s
[CV 5/5] END .................model__alpha=9.8;, score=-0.008 total time=   0.0s
[CV 1/5] END .................model__alpha=9.9;, score=-0.002 total time=   0.0s
[CV 2/5] END .................model__alpha=9.9;, score=-0.000 total time=   0.0s
[CV 3/5] END .................model__alpha=9.9;, score=-0.001 total time=   0.0s
[CV 4/5] END .................model__alpha=9.9;, score=-0.000 total time=   0.0s
[CV 5/5] END .................model__alpha=9.9;, score=-0.008 total time=   0.0s


  return f(*args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


AttributeError: 'StackingRegressor' object has no attribute 'feature_importances_'