In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn import preprocessing
import xgboost as xgb

In [2]:
# Step 0: Load the proper csvs

xtrain = pd.read_csv("X_train.csv")
xtrain = xtrain.iloc[:, 1:]
xtest = pd.read_csv("X_test.csv")
xtest = xtest.iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

In [3]:
# Step 1: Treat missing values
# Treat missing values as column medians. Important (apparently) to use the medians from the training set in the test set

xtrain = xtrain.fillna(xtrain.median())
xtest = xtest.fillna(xtrain.median())

In [4]:
# Step 2: Scale the training and test data

scaler = preprocessing.StandardScaler()

xtrain_scaled = scaler.fit_transform(xtrain)
xtrain = pd.DataFrame(xtrain_scaled, columns = xtrain.columns)
xtest_scaled = scaler.fit_transform(xtest)
xtest = pd.DataFrame(xtest_scaled, columns = xtest.columns)

In [5]:
# Step 3: Remove unnecessary id column from ytrain that just ***** things up

ytrain1 = ytrain.loc[:, "y"]
ytrain2 = pd.DataFrame(data = ytrain1.values, columns= ['y'])
ytrain = ytrain2

In [6]:
# Remove features with zero weight from Lasso

clf = Lasso(alpha=0.3)
clf.fit(xtrain, ytrain)

features = (clf.coef_ != 0)

xtrain = xtrain.loc[:, features]
xtest = xtest.loc[:, features]

In [7]:



# Do weird Andreas PCA thing

from sklearn import decomposition
import matplotlib.pyplot as plt
pca = decomposition.PCA(n_components=2)
principal_components = pca.fit_transform(xtrain)
xpca = pd.DataFrame(data = principal_components, columns = ['PC1', 'PC2'])

outliers = (xpca['PC1'] <= 9) & (xpca['PC1'] >= -8) & (xpca['PC2'] <= 7) & (xpca['PC2'] >= -7)
print(np.count_nonzero(outliers))
xtrain = xtrain[outliers]
ytrain = ytrain[outliers]

'''

# Outlier detection with local outlier factor
from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(n_neighbors=700, contamination=0.08)
outliers = clf.fit_predict(xtrain)

# Remove outliers from xtrain and ytrain


outliers = outliers == 1
print(np.count_nonzero(outliers))
xtrain = xtrain[outliers]
ytrain = ytrain[outliers]

'''

1155


'\n\n# Outlier detection with local outlier factor\nfrom sklearn.neighbors import LocalOutlierFactor\n\nclf = LocalOutlierFactor(n_neighbors=700, contamination=0.08)\noutliers = clf.fit_predict(xtrain)\n\n# Remove outliers from xtrain and ytrain\n\n\noutliers = outliers == 1\nprint(np.count_nonzero(outliers))\nxtrain = xtrain[outliers]\nytrain = ytrain[outliers]\n\n'

# Run Bayesian Optimization on XGBoost parameters

In [8]:
# k-fold cross validation evaluation of xgboost model

from numpy import loadtxt
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold


clf = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)



cv_means = []
cv_stds = []
for i in np.arange(10):
    scores = cross_val_score(estimator = clf,
                                 X = xtrain,
                                 y = ytrain,
                                 scoring = 'r2',
                                 cv = KFold(n_splits=5, shuffle = True))
    cv_means.append(np.mean(scores))
    cv_stds.append(np.std(scores))

print(cv_means)
print(cv_stds)
print(np.mean(cv_means))
print(np.mean(cv_stds))

[0.6491839050176116, 0.6471371854589318, 0.6391528990274764, 0.6340039064060577, 0.6415884405202968, 0.6494298605082397, 0.65346915707234, 0.6405600475093319, 0.6494465034886889, 0.6423864066374126]
[0.026554213685352098, 0.03971028940539932, 0.02038010009558867, 0.026801380582643773, 0.03162171239701299, 0.025048251358722844, 0.023367925631136332, 0.011882191344254816, 0.02484607821746095, 0.05788755333923325]
0.6446358311646387
0.028809969605680507


In [8]:
from bayes_opt import BayesianOptimization
from numpy import loadtxt
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold

# Specify the function
def xgbfun(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, objective='reg:squarederror', booster='gbtree', tree_method='auto', n_jobs=1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, missing=None, num_parallel_tree=1, importance_type='gain'):
    model = xgb.XGBRegressor(max_depth=int(max_depth), learning_rate=learning_rate, n_estimators=int(n_estimators), verbosity=1, objective='reg:squarederror', booster='gbtree', tree_method='auto', n_jobs=1, 
                             gamma=gamma, min_child_weight=int(min_child_weight), max_delta_step=int(max_delta_step), subsample=subsample, 
                             colsample_bytree=colsample_bytree, colsample_bylevel=colsample_bylevel, colsample_bynode=colsample_bynode, 
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight, base_score=0.5, random_state=42, 
                             missing=None, num_parallel_tree=1, importance_type='gain')
    
    scores = cross_val_score(estimator = model,
                                 X = xtrain,
                                 y = ytrain,
                                 scoring = 'r2',
                                 cv = 10)
    return np.mean(scores)



# specify parameters and distributions to sample from
param_dist = {"max_depth": (2, 5),
              "learning_rate": (0.1, 0.9),
              "n_estimators": (1, 200),
              "gamma": (0.001, 2),
              "min_child_weight": (0.01, 3),
              "max_delta_step": (0.01, 3),
              "subsample": (0.01, 1),
              "colsample_bytree": (0.01, 1),
              "colsample_bylevel": (0.01, 1), 
              "colsample_bynode": (0.01, 1),
              "reg_alpha": (0, 1),
              "reg_lambda": (0, 1),
              "scale_pos_weight": (0.001, 1)}

optimizer = BayesianOptimization(
    f=xgbfun,
    pbounds=param_dist,
    verbose=2,
    random_state=1,
)

probe_params = { "max_depth":3, 
                "learning_rate":0.1, 
                "n_estimators":100, 
                "gamma":0, 
                "min_child_weight":1, 
                "max_delta_step":0, 
                "subsample":1, 
                "colsample_bytree":1, 
                "colsample_bylevel":1, 
                "colsample_bynode":1, 
                "reg_alpha":0, 
                "reg_lambda":1, 
                "scale_pos_weight":1 }
optimizer.probe(
    params=probe_params,
    lazy=True
)

optimizer.maximize(
    init_points=2,
    n_iter=200,
)

print(optimizer.max)

|   iter    |  target   | colsam... | colsam... | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6417  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 0.0     [0m | [0m 0.1     [0m | [0m 0.0     [0m | [0m 3.0     [0m | [0m 1.0     [0m | [0m 100.0   [0m | [0m 0.0     [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 1.0     [0m |
| [0m 2       [0m | [0m 0.4355  [0m | [0m 0.4229  [0m | [0m 0.7231  [0m | [0m 0.01011 [0m | [0m 0.6054  [0m | [0m 0.2174  [0m | [0m 0.2861  [0m | [0m 2.559   [0m | [0m 1.043   [0m | [0m 79.96   [0m | [0m 0.5388  [0m | [0m 0.4192  [0m | [0m 0.6855  [0m | [0m 0.2124  [0m |
| [0m 3       [0m | [0m 0.492

| [0m 27      [0m | [0m 0.5576  [0m | [0m 0.9428  [0m | [0m 0.9427  [0m | [0m 0.9367  [0m | [0m 0.1259  [0m | [0m 0.7563  [0m | [0m 2.454   [0m | [0m 4.759   [0m | [0m 2.582   [0m | [0m 96.15   [0m | [0m 0.03485 [0m | [0m 0.9652  [0m | [0m 0.9372  [0m | [0m 0.9409  [0m |
| [0m 28      [0m | [0m 0.6337  [0m | [0m 0.918   [0m | [0m 0.7615  [0m | [0m 1.0     [0m | [0m 0.001   [0m | [0m 0.1811  [0m | [0m 0.468   [0m | [0m 3.429   [0m | [0m 1.067   [0m | [0m 99.79   [0m | [0m 0.5632  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 1.0     [0m |
| [0m 29      [0m | [0m 0.5603  [0m | [0m 0.5344  [0m | [0m 0.5081  [0m | [0m 1.0     [0m | [0m 0.001   [0m | [0m 0.3589  [0m | [0m 0.4176  [0m | [0m 3.594   [0m | [0m 1.095   [0m | [0m 99.31   [0m | [0m 0.0     [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 0.8619  [0m |
| [0m 30      [0m | [0m 0.5057  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 0.47   

| [0m 55      [0m | [0m 0.6368  [0m | [0m 0.8672  [0m | [0m 0.7769  [0m | [0m 0.8996  [0m | [0m 0.4533  [0m | [0m 0.1722  [0m | [0m 0.5021  [0m | [0m 3.229   [0m | [0m 1.304   [0m | [0m 99.28   [0m | [0m 0.0     [0m | [0m 0.6159  [0m | [0m 0.4982  [0m | [0m 1.0     [0m |
| [0m 56      [0m | [0m 0.6387  [0m | [0m 0.891   [0m | [0m 0.745   [0m | [0m 0.5189  [0m | [0m 0.09894 [0m | [0m 0.1     [0m | [0m 0.2125  [0m | [0m 3.324   [0m | [0m 1.006   [0m | [0m 99.42   [0m | [0m 0.4426  [0m | [0m 0.6154  [0m | [0m 0.3771  [0m | [0m 1.0     [0m |
| [0m 57      [0m | [0m 0.6398  [0m | [0m 0.9898  [0m | [0m 0.4248  [0m | [0m 0.5766  [0m | [0m 0.2851  [0m | [0m 0.1     [0m | [0m 0.3794  [0m | [0m 3.319   [0m | [0m 0.8719  [0m | [0m 99.4    [0m | [0m 0.0     [0m | [0m 0.6066  [0m | [0m 0.7312  [0m | [0m 0.5363  [0m |
| [0m 58      [0m | [0m 0.6385  [0m | [0m 0.905   [0m | [0m 0.5882  [0m | [0m 1.0    

| [0m 83      [0m | [0m 0.5332  [0m | [0m 0.8675  [0m | [0m 0.8748  [0m | [0m 0.8604  [0m | [0m 0.2501  [0m | [0m 0.7086  [0m | [0m 2.276   [0m | [0m 4.751   [0m | [0m 2.39    [0m | [0m 95.92   [0m | [0m 0.1438  [0m | [0m 0.8551  [0m | [0m 0.8754  [0m | [0m 0.8633  [0m |
| [0m 84      [0m | [0m 0.6281  [0m | [0m 0.7271  [0m | [0m 0.7088  [0m | [0m 0.3702  [0m | [0m 0.07362 [0m | [0m 0.1696  [0m | [0m 0.5189  [0m | [0m 3.159   [0m | [0m 0.5601  [0m | [0m 99.78   [0m | [0m 0.2613  [0m | [0m 0.8084  [0m | [0m 0.6267  [0m | [0m 0.7858  [0m |
| [0m 85      [0m | [0m 0.6253  [0m | [0m 0.9044  [0m | [0m 0.8474  [0m | [0m 0.9539  [0m | [0m 0.07689 [0m | [0m 0.2068  [0m | [0m 0.3858  [0m | [0m 3.384   [0m | [0m 1.073   [0m | [0m 99.51   [0m | [0m 0.3406  [0m | [0m 0.7128  [0m | [0m 0.4342  [0m | [0m 0.8895  [0m |
| [0m 86      [0m | [0m 0.5031  [0m | [0m 0.9284  [0m | [0m 0.9326  [0m | [0m 0.9259 

| [0m 111     [0m | [0m 0.6327  [0m | [0m 0.4999  [0m | [0m 0.7141  [0m | [0m 0.5712  [0m | [0m 0.6007  [0m | [0m 0.1835  [0m | [0m 0.2178  [0m | [0m 3.554   [0m | [0m 1.074   [0m | [0m 100.0   [0m | [0m 0.4401  [0m | [0m 0.5939  [0m | [0m 0.8733  [0m | [0m 0.8053  [0m |
| [0m 112     [0m | [0m 0.6307  [0m | [0m 0.5991  [0m | [0m 0.6706  [0m | [0m 0.4785  [0m | [0m 0.305   [0m | [0m 0.1559  [0m | [0m 0.2352  [0m | [0m 3.355   [0m | [0m 0.8198  [0m | [0m 99.71   [0m | [0m 0.4024  [0m | [0m 0.7908  [0m | [0m 0.7525  [0m | [0m 0.7642  [0m |
| [0m 113     [0m | [0m 0.5957  [0m | [0m 0.6862  [0m | [0m 0.9434  [0m | [0m 0.4711  [0m | [0m 0.2908  [0m | [0m 0.2627  [0m | [0m 0.2544  [0m | [0m 2.997   [0m | [0m 0.6127  [0m | [0m 99.36   [0m | [0m 0.1591  [0m | [0m 0.6656  [0m | [0m 0.8661  [0m | [0m 0.4959  [0m |
| [0m 114     [0m | [0m 0.6365  [0m | [0m 0.7663  [0m | [0m 0.8354  [0m | [0m 0.5774 

| [0m 139     [0m | [0m 0.6393  [0m | [0m 0.5092  [0m | [0m 0.9421  [0m | [0m 1.0     [0m | [0m 0.001   [0m | [0m 0.1668  [0m | [0m 0.6477  [0m | [0m 3.29    [0m | [0m 1.062   [0m | [0m 99.43   [0m | [0m 0.00103 [0m | [0m 0.8201  [0m | [0m 0.9603  [0m | [0m 0.8464  [0m |
| [0m 140     [0m | [0m 0.6467  [0m | [0m 0.9793  [0m | [0m 0.9197  [0m | [0m 0.9655  [0m | [0m 0.2041  [0m | [0m 0.1     [0m | [0m 0.2755  [0m | [0m 3.318   [0m | [0m 0.9404  [0m | [0m 99.32   [0m | [0m 0.4522  [0m | [0m 0.9982  [0m | [0m 0.7594  [0m | [0m 0.8826  [0m |
| [0m 141     [0m | [0m 0.6259  [0m | [0m 0.9736  [0m | [0m 0.6947  [0m | [0m 0.9365  [0m | [0m 0.001   [0m | [0m 0.2076  [0m | [0m 0.4301  [0m | [0m 3.34    [0m | [0m 1.214   [0m | [0m 99.89   [0m | [0m 0.0     [0m | [0m 0.5949  [0m | [0m 1.0     [0m | [0m 0.9555  [0m |
| [0m 142     [0m | [0m 0.6313  [0m | [0m 0.3992  [0m | [0m 0.6972  [0m | [0m 0.229  

| [0m 167     [0m | [0m 0.6475  [0m | [0m 1.0     [0m | [0m 0.6643  [0m | [0m 1.0     [0m | [0m 0.001   [0m | [0m 0.1     [0m | [0m 0.2575  [0m | [0m 3.486   [0m | [0m 0.6684  [0m | [0m 99.58   [0m | [0m 0.5351  [0m | [0m 0.7802  [0m | [0m 0.4438  [0m | [0m 0.7855  [0m |
| [0m 168     [0m | [0m 0.6348  [0m | [0m 1.0     [0m | [0m 0.6097  [0m | [0m 0.6293  [0m | [0m 0.001   [0m | [0m 0.1     [0m | [0m 0.511   [0m | [0m 3.382   [0m | [0m 0.6952  [0m | [0m 99.25   [0m | [0m 0.4092  [0m | [0m 0.936   [0m | [0m 0.5566  [0m | [0m 1.0     [0m |
| [0m 169     [0m | [0m 0.5542  [0m | [0m 0.7044  [0m | [0m 0.08443 [0m | [0m 0.5838  [0m | [0m 0.06124 [0m | [0m 0.6282  [0m | [0m 2.449   [0m | [0m 2.666   [0m | [0m 0.9502  [0m | [0m 191.1   [0m | [0m 0.6598  [0m | [0m 0.5541  [0m | [0m 0.5826  [0m | [0m 0.9287  [0m |
| [0m 170     [0m | [0m 0.6352  [0m | [0m 0.6409  [0m | [0m 0.7946  [0m | [0m 0.9533 

| [0m 195     [0m | [0m 0.6167  [0m | [0m 0.522   [0m | [0m 0.8403  [0m | [0m 0.3478  [0m | [0m 0.3417  [0m | [0m 0.2166  [0m | [0m 0.4215  [0m | [0m 3.674   [0m | [0m 1.155   [0m | [0m 99.64   [0m | [0m 0.2566  [0m | [0m 0.6838  [0m | [0m 1.0     [0m | [0m 0.7931  [0m |
| [0m 196     [0m | [0m 0.6376  [0m | [0m 0.9016  [0m | [0m 0.6082  [0m | [0m 0.2867  [0m | [0m 0.3616  [0m | [0m 0.1     [0m | [0m 0.5543  [0m | [0m 3.491   [0m | [0m 0.3631  [0m | [0m 99.82   [0m | [0m 0.1824  [0m | [0m 0.8042  [0m | [0m 0.247   [0m | [0m 0.7291  [0m |
| [0m 197     [0m | [0m 0.6482  [0m | [0m 1.0     [0m | [0m 0.6918  [0m | [0m 1.0     [0m | [0m 0.001   [0m | [0m 0.1     [0m | [0m 0.4164  [0m | [0m 3.856   [0m | [0m 1.118   [0m | [0m 99.75   [0m | [0m 0.3484  [0m | [0m 0.6583  [0m | [0m 1.0     [0m | [0m 0.8307  [0m |
| [0m 198     [0m | [0m 0.6464  [0m | [0m 0.4563  [0m | [0m 1.0     [0m | [0m 0.9727 

# Optimizer found the following max:

{'params': {'colsample_bylevel': 0.5353897298174576,
  'colsample_bynode': 0.9143062230315995,
  'colsample_bytree': 0.6120552891547169,
  'gamma': 0.001,
  'learning_rate': 0.1,
  'max_delta_step': 0.32938061735583135,
  'max_depth': 3.083538576010634,
  'min_child_weight': 0.7394603332174594,
  'n_estimators': 99.53230647436956,
  'reg_alpha': 0.37206475685082335,
  'reg_lambda': 0.9178582198242464,
  'scale_pos_weight': 0.9119811686410297,
  'subsample': 0.4362866767805317},
 'target': 0.659532507321262}

In [17]:
# Run XGBoost on these optimal parameters and save to csv


model = xgb.XGBRegressor(colsample_bylevel= 0.5353897298174576, 
                         colsample_bynode =0.9143062230315995, 
                         colsample_bytree =0.6120552891547169, 
                         gamma =0.001, 
                         learning_rate =0.1, 
                         max_delta_step =int(0.32938061735583135), 
                         max_depth =int(3.083538576010634), 
                         min_child_weight =int(0.7394603332174594), 
                         n_estimators =int(99.53230647436956), 
                         reg_alpha =0.37206475685082335, 
                         reg_lambda =0.9178582198242464, 
                         scale_pos_weight =0.9119811686410297, 
                         subsample =0.4362866767805317,
                         objective="reg:squarederror", 
                         random_state=42)
    
model.fit(xtrain, ytrain)

y_pred = model.predict(xtest)

index = pd.read_csv("sample.csv")
index['y'] = y_pred

index.to_csv("predictionsLassoXGB_Bopt.csv")

In [19]:
scores = cross_val_score(estimator = clf,
                             X = xtrain,
                             y = ytrain,
                             scoring = 'r2',
                             cv = 10)
print(np.mean(scores))
print(np.std(scores))

0.6417232824885015
0.04520729339953197
