In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.decomposition import PCA, FastICA
import matplotlib.pyplot as plt

import random
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from mlxtend.regressor import StackingRegressor

import xgboost as xgb



## Reading and preprocessing data

In [None]:
x_train = pd.read_csv('data/x_train.csv', index_col='ID')
x_test = pd.read_csv('data/x_test.csv', index_col='ID')
y_train = pd.read_csv('data/y_train.csv')

x_train_sel = pd.read_csv('data/x_train_sel.csv')
x_test_sel = pd.read_csv('data/x_test_sel.csv')

x_train_pca = pd.read_csv('data/x_train_pca.csv')
x_test_pca = pd.read_csv('data/x_test_pca.csv')

In [2]:
x_train_raw = pd.read_csv('train.csv', index_col='ID')
x_test_raw = pd.read_csv('test.csv', index_col='ID')
y_train_raw = x_train_raw['y']
x_train_raw.drop('y', axis=1, inplace=True)

le = LabelEncoder()

In [4]:
def str_to_int(df, col_name):
    if df[col_name].dtype != 'int64':
        le.fit(df[col_name].values)
        df[col_name] = le.transform(df[col_name])

for col_name in x_train_raw.columns:
    str_to_int(x_train_raw, col_name)
    str_to_int(x_test_raw, col_name)

### Feature selection and generation

In [5]:
selector = SelectKBest(k=75, score_func=f_regression).fit(x_train_raw, y_train_raw)
x_train_new = selector.transform(x_train_raw)
x_test_new = selector.transform(x_test_raw)

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [7]:
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(x_train_raw)
pca2_results_test = pca.transform(x_test_raw)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(x_train_raw)
ica2_results_test = ica.transform(x_test_raw)

x_train_new = pd.DataFrame(x_train_new)
x_test_new = pd.DataFrame(x_test_new)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    x_train_new['pca_' + str(i)] = pca2_results_train[:,i-1]
    x_test_new['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    x_train_new['ica_' + str(i)] = ica2_results_train[:,i-1]
    x_test_new['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_mean = np.mean(y_train_raw)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    x_train_new, y_train_raw, random_state=42, train_size=.95)

In [9]:
X_train_unf, X_val_unf, y_train_unf, y_val_unf = train_test_split(
    x_train_raw, y_train_raw, random_state=42, train_size=.95)

## Tuning hyperparameters for adaboost

In [11]:
def find_best_params(adaboost, X, y):
    parameters = {
        'n_estimators':range(10, 25, 1),
        'learning_rate':np.arange(0.02, 0.04, 0.001)}
    grid = GridSearchCV(estimator=adaboost, 
                        param_grid=parameters, 
                        verbose=1, 
                        n_jobs=-1, 
                        scoring='r2')
    grid.fit(X, y)
    return grid.best_params_, grid.best_score_

In [12]:
def print_cross_val_score(adaboost, X, y):
    cross_val = cross_val_score(estimator=ada_boost, X=X, y=y, n_jobs=-1, cv=4, verbose=1)
    print('cross_val_score', np.mean(cross_val))

In [None]:
adaboost = AdaBoostRegressor(base_estimator=None, loss='square', random_state=42)
best_params, best_score = find_best_params(adaboost, x_train, y_train)
print(best_params, best_score)
# learning_rate=0.0011230000000000007, n_estimators=7

### Raw data / adaboost

In [None]:
#X_train_unf, X_val_unf, y_train_unf, y_val_unf
ada_boost = AdaBoostRegressor(base_estimator=None, 
                              n_estimators=23, 
                              learning_rate=0.00253, 
                              loss='square', 
                              random_state=42)
ada_boost.fit(X_train_unf, y_train_unf)
y_pred = ada_boost.predict(X_val_unf)
print('r2_score', r2_score(y_val_unf, y_pred))
print_cross_val_score(ada_boost, x_train, y_train)
y_pred = ada_boost.predict(x_test)

### SelectKBest data / adaboost

In [None]:
adaboost = AdaBoostRegressor(base_estimator=None, loss='square', 
                             random_state=42, n_estimators=7)
best_params, best_score = find_best_params(adaboost, x_train_new, y_train)
print(best_params, best_score)

In [None]:
#X_train, X_val, y_train, y_val
ada_boost = AdaBoostRegressor(base_estimator=None, 
                              n_estimators=7, 
                              learning_rate=0.019000000000000003, 
                              loss='square', 
                              random_state=42)
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_val)
print('r2_score', r2_score(y_val, y_pred))
print_cross_val_score(ada_boost, x_train_new, y_train_raw)
y_pred = ada_boost.predict(x_test_new)

### SelectKBest + PCA/ICA

In [None]:
adaboost = AdaBoostRegressor(base_estimator=None, loss='square', 
                             random_state=42)
best_params, best_score = find_best_params(adaboost, x_train_new, y_train_raw)
print(best_params, best_score)

In [13]:
#X_train, X_val, y_train, y_val
ada_boost = AdaBoostRegressor(base_estimator=None, 
                              n_estimators=21, 
                              learning_rate=0.030000000000000009, 
                              loss='square', 
                              random_state=42)
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_val)
print('r2_score', r2_score(y_val, y_pred))
print_cross_val_score(ada_boost, x_train_new, y_train_raw)
y_pred = ada_boost.predict(x_test_new)

r2_score 0.64013898092


[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.8s finished


cross_val_score 0.558803383458


### Training model xgboost

In [None]:
xgb_params = {
    'n_trees': 374, 
    'eta': 0.15000000000000002,
    'gamma': 0.6000000000000001,
    'max_depth': 4,
    'subsample': 0.9500000000000001,
    'min_child_weight': 2.0,
    'colsample_bytree': 0.9500000000000001,
    'objective': 'reg:linear',
    'nthread': 8,
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# xgb_params = {'colsample_bytree': 0.65, 'eta': 0.30000000000000004, 'eval_metric': 'rmse', 'gamma': 0.9500000000000001, 'max_depth': 8, 'min_child_weight': 1.0, 'n_estimators': 110.0, 'nthread': 6, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.9}
# xgb_params_from_iamtodor = {'colsample_bytree': 0.9500000000000001, 'eta': 0.15000000000000002, 'eval_metric': 'rmse', 'gamma': 0.6000000000000001, 'max_depth': 4, 'min_child_weight': 2.0, 'n_estimators': 169.0, 'nthread': 8, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.9500000000000001}

dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(x_test_new)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1500,
                   early_stopping_rounds=150,
                   verbose_eval=50, 
                   show_stdv=False
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

## Tuning hyperparameters

In [None]:
def score(params):
    print("Training with params : ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid)
    score = r2_score(y_val, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}


In [None]:
def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.0001, 0.5, 0.025),
             'max_depth' : sample(scope.int(hp.quniform('max_depth', 0, 10, 1))),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'eval_metric': 'rmse',
             'objective': 'reg:linear',
             'nthread' : 8,
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print(best)

In [None]:
trials = Trials()

optimize(trials)

## Generate submission file

In [None]:
#y_pred = model.predict(dtest)
output = pd.DataFrame({'id': x_test.index, 'y': y_pred})
output.to_csv('submition1.csv', index=False)