In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from scipy import stats
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from scipy import stats

X_test = pd.read_csv("X_test.csv", index_col=0).values
X_train = pd.read_csv("X_train.csv", index_col=0).values
y_train = pd.read_csv("y_train.csv", index_col=0).values

## Outlier detection with zscore
# Each training column is z-scored on its own; entries with |z| > 3 are
# replaced by NaN so they are handled like any other missing value later on.
# X_test is not modified here.
df_train = pd.DataFrame(X_train)

for i in range(X_train.shape[1]):
    z_abs = np.abs(stats.zscore(df_train[i].to_numpy(dtype=float), nan_policy='omit'))
    # A NaN z-score (e.g. a missing entry) must never be flagged, so map it
    # to 0 before comparing. `nan=` is passed by keyword on purpose: the second
    # positional parameter of np.nan_to_num is `copy`, not the fill value.
    is_outlier = np.nan_to_num(z_abs, nan=0.0) > 3
    # Series.mask returns a new Series, avoiding a write through an extracted
    # column (chained assignment).
    df_train[i] = df_train[i].mask(is_outlier)

# Filter-based feature selection: drop quasi-constant and duplicate columns
def filter(df1, df2):
    """Drop low-variance and duplicated feature columns from a train/test pair.

    Every decision is taken from ``df1`` only, and exactly the same column
    labels are then removed from ``df2``. If both frames start with the same
    columns they therefore come back with the same columns in the same order.

    NOTE: the name shadows the builtin ``filter``; it is kept because callers
    use it.

    Args:
        df1: Training features as a DataFrame (one column per feature).
        df2: Test features as a DataFrame containing the column labels of
            ``df1``.

    Returns:
        Tuple ``(filtered_df1, filtered_df2)`` of DataFrames.
    """
    from sklearn.feature_selection import VarianceThreshold

    # Step 1: remove constant and quasi-constant features.
    # A variance threshold of 0.01 also rejects exactly-constant columns
    # (variance 0), so no separate threshold=0 pass is needed.
    qcons_filter = VarianceThreshold(threshold=0.01)
    qcons_filter.fit(df1)
    qcons_columns = df1.columns[~qcons_filter.get_support()]
    data_qcons1 = df1.drop(columns=qcons_columns)
    data_qcons2 = df2.drop(columns=qcons_columns)

    # Step 2: remove duplicate columns.
    # Duplicates are identified on the training frame only. Deduplicating each
    # frame independently could remove different columns from each and leave
    # train and test with mismatched features.
    is_duplicate = data_qcons1.T.duplicated(keep='first')
    duplicate_columns = is_duplicate.index[is_duplicate.to_numpy()]
    data_cons_dup1 = data_qcons1.drop(columns=duplicate_columns)
    data_cons_dup2 = data_qcons2.drop(columns=duplicate_columns)
    return data_cons_dup1, data_cons_dup2

df_test = pd.DataFrame(X_test)
# Run both splits through filter() and go straight back to plain ndarrays.
X_train, X_test = (frame.values for frame in filter(df_train, df_test))

# Candidate imputation strategies. Keep the order stable: entries are picked
# by position elsewhere in this file.
imputers = [
    # [0] per-column median
    SimpleImputer(missing_values=np.nan, strategy='median'),
    # [1] IterativeImputer driven by Bayesian ridge regression
    IterativeImputer(random_state=0, estimator=BayesianRidge()),
    # [2] IterativeImputer driven by a single decision tree
    IterativeImputer(
        random_state=0,
        estimator=DecisionTreeRegressor(max_features="sqrt", random_state=0),
    ),
    # [3] IterativeImputer driven by extra-trees
    IterativeImputer(
        random_state=0,
        estimator=ExtraTreesRegressor(
            n_estimators=15, random_state=0, max_depth=7, min_samples_leaf=2
        ),
    ),
    # [4] IterativeImputer driven by k-nearest-neighbours regression
    IterativeImputer(random_state=0, estimator=KNeighborsRegressor(n_neighbors=15)),
    # [5] direct k-nearest-neighbours imputation
    KNNImputer(n_neighbors=10, weights="uniform"),
    # [6] IterativeImputer driven by a random forest
    IterativeImputer(
        random_state=0,
        estimator=RandomForestRegressor(
            n_estimators=15, random_state=0, max_depth=6, min_samples_leaf=2
        ),
    ),
]

def imputation(imputer, X_train, X_test):
    """Fit ``imputer`` on the training matrix and apply it to both matrices.

    Args:
        imputer: Object exposing ``fit(X)`` and ``transform(X)``.
        X_train: Training features; the only data the imputer is fitted on.
        X_test: Test features, transformed with the train-fitted imputer.

    Returns:
        Tuple ``(imputed_train, imputed_test)``.
    """
    imputer.fit(X_train)
    return tuple(imputer.transform(split) for split in (X_train, X_test))

def features_selection(X_train, y_train, X_test,  n_features):
    """Keep the ``n_features`` columns ranked best by univariate F-regression.

    The selector is fitted on the training data only and then applied to both
    matrices, so train and test keep the same columns.

    Args:
        X_train: Training features (2-D).
        y_train: Training targets.
        X_test: Test features with the same columns as ``X_train``.
        n_features: Number of columns to keep, or ``"all"``. A number larger
            than the available column count is clamped to that count.

    Returns:
        Tuple ``(selected_train, selected_test)``.
    """
    from sklearn import feature_selection

    k = n_features
    if k != "all":
        # Earlier filtering may leave fewer columns than requested; asking
        # SelectKBest for more than exist is rejected or warned about,
        # depending on the scikit-learn version.
        k = min(k, np.shape(X_train)[1])

    selector = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=k)
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)

def outlier_detection(X_train, y_train):
    """Drop training rows that an IsolationForest does not label as inliers.

    Args:
        X_train: Training features; must support boolean-mask row indexing.
        y_train: Training targets aligned row-for-row with ``X_train``.

    Returns:
        Tuple ``(X_kept, y_kept)`` holding only the rows predicted as ``1``.
    """
    forest = IsolationForest(max_samples=100, random_state = 1, contamination='auto')
    # Build the keep-mask once and reuse it for features and targets.
    inlier_mask = forest.fit_predict(X_train) == 1
    return X_train[inlier_mask], y_train[inlier_mask]

class XGB():
    """Thin wrapper around an XGBRegressor with importance-based column selection.

    Attributes are set in ``__init__``:
      X_train, y_train, X_test  -- the arrays handed in by the caller
      n_original_features       -- column count of X_train
      selected_features         -- column indices currently in use (all columns at first)
      regressor                 -- the underlying XGBRegressor
    """

    def __init__(self, X_train, y_train, X_test):
        self.X_train, self.y_train, self.X_test = X_train, y_train, X_test
        self.n_original_features = X_train.shape[1]
        # Start with every column selected.
        self.selected_features = np.arange(self.n_original_features)
        self.regressor = XGBRegressor(
            max_depth=6,         # depth of each tree
            learning_rate=0.08,
            n_estimators=100,    # number of trees
        )

    def feature_selection(self, n_features = 200):
        """Fit on all columns, then keep the ``n_features`` most important ones.

        Returns the selected column indices, most important first.
        """
        self.regressor.fit(self.X_train, self.y_train)
        by_importance_desc = np.argsort(self.regressor.feature_importances_)[::-1]
        self.selected_features = by_importance_desc[:n_features]
        return self.selected_features

    def cross_validation(self, n_split = 8):
        """Return the per-fold R^2 scores on the currently selected columns."""
        train_subset = self.X_train[:, self.selected_features]
        return cross_val_score(self.regressor, train_subset, self.y_train, scoring='r2', cv=n_split)

    def predict(self, write2csv = True):
        """Refit on the selected columns and predict the test set.

        When ``write2csv`` is exactly ``True`` the predictions are also written
        to ``submission.csv`` with columns ``id`` and ``y``.
        """
        columns = self.selected_features
        self.regressor.fit(self.X_train[:, columns], self.y_train)
        pred = self.regressor.predict(self.X_test[:, columns])

        if write2csv is True:
            ids = np.arange(0, len(pred)).reshape(-1, 1)
            table = np.hstack([ids, pred.reshape(-1, 1)])
            pd.DataFrame(table, columns=['id', 'y']).to_csv('submission.csv', index=None)

        return pred

    def do_all(self, n_features = 200):
        """Run feature selection followed by prediction (writes the CSV)."""
        self.feature_selection(n_features)
        return self.predict()

# Impute missing values in X_train and X_test with imputers[2] (the
# IterativeImputer built on a DecisionTreeRegressor), then keep the 200 best
# columns according to SelectKBest / f_regression.
X_train_0_1, X_test_0_1 = imputation(imputers[2], X_train, X_test)
X_train_0, X_test_0 = features_selection(X_train_0_1, y_train.ravel(), X_test_0_1,n_features=200)

# Disabled: dump of the imputed matrices to CSV.
#pd.DataFrame(X_train_0_1).to_csv('x_train_ex.csv',index= None)
#pd.DataFrame(y_train).to_csv('y_train_ex.csv',index= None)
#pd.DataFrame(X_test_0_1).to_csv('x_test_ex.csv',index= None)


# Disabled: reload of previously dumped matrices.
#X_test_0 = pd.read_csv("X_test_.csv", index_col=0).values
#X_train_ = pd.read_csv("X_train_ex.csv", index_col=0).values
#y_train_1 = pd.read_csv("y_train.csv", index_col=0).values
# NOTE(review): an earlier note here read "estimator 100, learning rate 0.1,
# max_depth=7", but the XGB class in this file builds its regressor with
# n_estimators=100, learning_rate=0.08, max_depth=6 -- confirm which is intended.
xgb4 = XGB(X_train_0, y_train, X_test_0)
# Ranks columns by XGBoost importance and keeps the top 200. Presumably the
# input already has 200 columns after SelectKBest, in which case this only
# reorders them -- verify.
xgb4.feature_selection(n_features=200)
# Refits on the selected columns, predicts X_test_0 and writes submission.csv.
pred = xgb4.predict(write2csv=True)
print(pd.DataFrame(pred))

# R^2 cross-validation on the selected columns (XGB.cross_validation defaults
# to 8 folds); it runs after the submission file has been written.
cv_ret = xgb4.cross_validation()
print(cv_ret)