In [1]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV,KFold
from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from scipy import stats
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,RobustScaler,PowerTransformer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic
from scipy import stats

def standardize_data(X_train, X_test):
    """Scale both splits with a single RobustScaler fitted on the training split.

    Parameters
    ----------
    X_train : array-like
        Training features; the scaler is fitted on this split only.
    X_test : array-like
        Test features; transformed with the scaler fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(scaled_train_data, scaled_test_data)`` as returned by the scaler's
        ``transform``.
    """
    # Only one scaler is needed: the test split must be scaled with the
    # training-split fit, so nothing is ever fitted on X_test.
    scaler = RobustScaler()
    scaler.fit(X_train)

    scaled_train_data = scaler.transform(X_train)
    scaled_test_data = scaler.transform(X_test)

    return scaled_train_data, scaled_test_data

In [2]:
# Read the three CSV files (first column is the index) and keep only the
# underlying numpy arrays.
X_test_raw, X_train_raw, y_train_raw = (
    pd.read_csv(csv_name, index_col=0).values
    for csv_name in ("X_test.csv", "X_train.csv", "y_train.csv")
)

In [3]:
# Available imputers: index 0 fills NaNs with the column median,
# index 1 is a KNN imputer using 15 uniformly weighted neighbours.
imputers = [
    SimpleImputer(strategy='median', missing_values=np.nan),
    KNNImputer(weights="uniform", n_neighbors=15),
]

In [4]:
def imputation(imputer, X_train, X_test):
    """Fit ``imputer`` on the training split and fill both splits with it.

    The imputer is fitted in place on ``X_train`` only; ``X_train`` and then
    ``X_test`` are passed through ``imputer.transform``.

    Returns ``(transformed_train, transformed_test)``.
    """
    imputer.fit(X_train)
    filled_train, filled_test = (
        imputer.transform(split) for split in (X_train, X_test)
    )
    return filled_train, filled_test

def features_selection(X_train, y_train, X_test, n_features):
    """Return the column indices of the ``n_features`` best-scoring features.

    A ``SelectKBest`` selector with the ``f_regression`` score function is
    fitted on ``(X_train, y_train)``. ``X_test`` is accepted but not used.

    Returns the result of ``get_support(indices=True)`` on the fitted selector.
    """
    from sklearn import feature_selection

    selector = feature_selection.SelectKBest(
        k=n_features,
        score_func=feature_selection.f_regression,
    )
    return selector.fit(X_train, y_train).get_support(indices=True)

def outlier_detection(X_train, y_train):
    """Keep only the training rows that an IsolationForest predicts as ``1``.

    The forest (``max_samples=100``, ``random_state=4``) is fitted on
    ``X_train``; the resulting row mask is applied to both ``X_train`` and
    ``y_train``.

    Returns ``(X_train_kept, y_train_kept)``.
    """
    forest = IsolationForest(random_state=4, max_samples=100)
    inlier_mask = forest.fit_predict(X_train) == 1
    return X_train[inlier_mask], y_train[inlier_mask]

In [5]:
# Median-impute the raw matrices, then rescale them; these versions are the
# input to the feature-selection step.
X_train_0, X_test_0 = imputation(
    imputer=imputers[0], X_train=X_train_raw, X_test=X_test_raw
)
X_train_1, X_test_1 = standardize_data(X_train=X_train_0, X_test=X_test_0)

In [6]:
# Rank the columns of the median-imputed, scaled training data with
# SelectKBest and keep the indices of the 180 best ones.
selected_features = features_selection(
    X_train=X_train_1,
    y_train=y_train_raw.ravel(),
    X_test=X_test_1,
    n_features=180,
)

  correlation_coefficient /= X_norms


In [7]:
# Show the column indices returned by the feature-selection step.
print(selected_features)

[  2  15  21  23  26  27  29  40  69  77  87  89  92  98 100 101 107 113
 114 115 132 133 141 144 146 151 159 169 172 177 193 194 200 203 209 213
 214 218 220 230 231 232 233 242 245 248 254 260 263 276 278 283 286 287
 288 298 300 306 309 310 312 315 318 319 320 325 326 327 334 342 345 349
 350 358 359 362 369 370 374 380 381 383 395 399 402 410 414 415 425 431
 437 440 445 452 456 458 465 479 484 485 493 496 507 512 517 520 523 528
 531 538 542 543 546 547 554 558 562 565 571 590 594 596 602 603 608 610
 612 613 621 633 636 640 641 642 644 648 649 654 657 659 665 668 671 672
 675 677 681 690 696 702 703 711 712 713 720 721 725 726 731 734 742 745
 748 759 766 768 769 773 774 777 778 780 783 788 796 801 817 819 823 824]


In [8]:
# Go back to the raw (not yet imputed) matrices, keep only the 180 columns
# chosen by SelectKBest, rescale them, and then fill the missing values with
# the KNN imputer (imputers[1]).
new_train, new_test = (
    raw[:, selected_features] for raw in (X_train_raw, X_test_raw)
)
new_train1, new_test1 = standardize_data(new_train, new_test)
X_train_im, X_test_im = imputation(
    imputer=imputers[1], X_train=new_train1, X_test=new_test1
)

In [9]:
# Drop the training rows (and their targets) that the isolation forest
# flags as outliers.
X_train_o, y_train_o = outlier_detection(X_train=X_train_im, y_train=y_train_raw)

In [10]:
# Short aliases for the arrays that go into the model.
X_train, y_train, X_test = X_train_o, y_train_o, X_test_im

In [11]:
# Shape of the training matrix after feature selection and outlier removal.
X_train.shape

(1103, 180)

In [12]:
class Gaussian_r():
    """Gaussian-process regression on a fixed train/test split.

    Stores the training features and targets, the test features, and an
    unfitted ``GaussianProcessRegressor`` with a ``RationalQuadratic`` kernel,
    ``normalize_y=True`` and ``random_state=0``.
    """

    def __init__(self, X_train, y_train, X_test):
        self.X_train, self.y_train, self.X_test = X_train, y_train, X_test
        self.regressor_gauss = GaussianProcessRegressor(
            kernel=RationalQuadratic(),
            normalize_y=True,
            random_state=0,
        )

    def predict(self, write2csv = True):
        """Fit on the stored training data and predict the stored test data.

        The regressor is fitted again on every call. Only when ``write2csv``
        is exactly ``True`` are the predictions also written to
        ``submission.csv`` in the working directory, with columns ``id``
        (0 .. n-1) and ``y``. The ids are stacked into the same array as the
        predictions, so both columns share one dtype.

        Returns the array produced by the regressor's ``predict``.
        """
        model = self.regressor_gauss
        model.fit(self.X_train, self.y_train)
        pred = model.predict(self.X_test)

        if write2csv is not True:
            return pred

        ids = np.arange(0, len(pred)).reshape(-1, 1)
        table = np.hstack([ids, pred.reshape(-1, 1)])
        pd.DataFrame(table, columns=['id', 'y']).to_csv('submission.csv', index=None)
        return pred

In [13]:
# Fit the Gaussian-process model on the cleaned training data, predict the
# test set and write the submission file.
gr = Gaussian_r(X_train=X_train, y_train=y_train.ravel(), X_test=X_test)
pred = gr.predict(write2csv=True)