## Load and prep columns

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from scipy import stats

%config InlineBackend.figure_format = 'retina'

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


# Define columns
data_columns = ['symbol', 'quoteDate', 'adjustedPrice', 'volume', 'previousClose', 'change', 'changeInPercent', 
                '52WeekHigh', '52WeekLow', 'changeFrom52WeekHigh', 'changeFrom52WeekLow', 
                'percebtChangeFrom52WeekHigh', 'percentChangeFrom52WeekLow', 'Price200DayAverage', 
                'Price52WeekPercChange', '1WeekVolatility', '2WeekVolatility', '4WeekVolatility', '8WeekVolatility', 
                '12WeekVolatility', '26WeekVolatility','52WeekVolatility','4WeekBollingerPrediction', '4WeekBollingerType',
                '12WeekBollingerPrediction', '12WeekBollingerType', 'allordpreviousclose', 'allordchange', 
                'allorddayshigh', 'allorddayslow', 'allordpercebtChangeFrom52WeekHigh', 
                'allordpercentChangeFrom52WeekLow', 'asxpreviousclose', 'asxchange', 'asxdayshigh', 
                'asxdayslow', 'asxpercebtChangeFrom52WeekHigh', 'asxpercentChangeFrom52WeekLow', 'exDividendDate', 
                'exDividendPayout', '640106_A3597525W', 'AINTCOV', 'AverageVolume', 'BookValuePerShareYear', 
                'CashPerShareYear', 'DPSRecentYear', 'EBITDMargin', 'EPS', 'EPSGrowthRate10Years', 
                'EPSGrowthRate5Years', 'FIRMMCRT', 'FXRUSD', 'Float', 'GRCPAIAD', 'GRCPAISAD', 'GRCPBCAD', 
                'GRCPBCSAD', 'GRCPBMAD', 'GRCPNRAD', 'GRCPRCAD', 'H01_GGDPCVGDP', 'H01_GGDPCVGDPFY', 'H05_GLFSEPTPOP', 
                'IAD', 'LTDebtToEquityQuarter', 'LTDebtToEquityYear', 'MarketCap',
                'NetIncomeGrowthRate5Years', 'NetProfitMarginPercent', 'OperatingMargin', 'PE',
                'PriceToBook', 'ReturnOnAssets5Years', 'ReturnOnAssetsTTM', 'ReturnOnAssetsYear', 
                'ReturnOnEquity5Years', 'ReturnOnEquityTTM', 'ReturnOnEquityYear', 'RevenueGrowthRate10Years', 
                'RevenueGrowthRate5Years', 'TotalDebtToAssetsQuarter', 'TotalDebtToAssetsYear', 
                'TotalDebtToEquityQuarter', 'TotalDebtToEquityYear', 'bookValue', 'earningsPerShare', 
                'ebitda', 'epsEstimateCurrentYear', 'marketCapitalization', 'peRatio', 'pegRatio', 'pricePerBook', 
                'pricePerEpsEstimateCurrentYear', 'pricePerEpsEstimateNextYear', 'pricePerSales']

selected_columns = ['symbol', 'adjustedPrice', 'volume', 'previousClose', 'change', 
                    '52WeekHigh', '52WeekLow', 'changeFrom52WeekHigh', 'changeFrom52WeekLow', 
                    'percebtChangeFrom52WeekHigh', 'percentChangeFrom52WeekLow', 'Price200DayAverage', 
                    'Price52WeekPercChange', '1WeekVolatility', '2WeekVolatility', '4WeekVolatility', '8WeekVolatility', 
                    '12WeekVolatility', '26WeekVolatility','52WeekVolatility','4WeekBollingerPrediction', '4WeekBollingerType',
                    '12WeekBollingerPrediction', '12WeekBollingerType', 'allordchange', 
                    'allorddayshigh', 'allorddayslow', 'allordpercebtChangeFrom52WeekHigh', 
                    'allordpercentChangeFrom52WeekLow', 'asxchange', 'asxdayshigh', 
                    'asxdayslow', 'asxpercebtChangeFrom52WeekHigh', 'asxpercentChangeFrom52WeekLow', 'AverageVolume', 
                    'EBITDMargin', 'EPSGrowthRate10Years', 'EPSGrowthRate5Years', 'FIRMMCRT', 'FXRUSD', 'Float', 
                    'GRCPAIAD', 'GRCPBCAD', 'GRCPBMAD', 'GRCPNRAD', 'GRCPRCAD', 'H01_GGDPCVGDPFY', 'H05_GLFSEPTPOP', 
                    'IAD', 'LTDebtToEquityQuarter', 'LTDebtToEquityYear', 'MarketCap',
                    'NetIncomeGrowthRate5Years', 'NetProfitMarginPercent', 
                    'PriceToBook', 'ReturnOnAssets5Years', 'ReturnOnAssetsTTM', 'ReturnOnAssetsYear', 
                    'ReturnOnEquity5Years', 'ReturnOnEquityTTM', 'RevenueGrowthRate10Years', 
                    'RevenueGrowthRate5Years', 'TotalDebtToAssetsQuarter', 'TotalDebtToAssetsYear', 
                    'TotalDebtToEquityQuarter', 'bookValue', 'earningsPerShare', 
                    'ebitda', 'epsEstimateCurrentYear', 'marketCapitalization', 'peRatio', 'pegRatio', 'pricePerBook', 
                    'pricePerEpsEstimateCurrentYear', 'pricePerEpsEstimateNextYear', 'pricePerSales']


returns = {
    '1': 'Future1WeekReturn',
    '2': 'Future2WeekReturn',
    '4': 'Future4WeekReturn',
    '8': 'Future8WeekReturn',
    '12': 'Future12WeekReturn',
    '26': 'Future26WeekReturn',
    '52': 'Future52WeekReturn',
    '1ra': 'Future1WeekRiskAdjustedReturn',
    '2ra': 'Future2WeekRiskAdjustedReturn',
    '4ra': 'Future4WeekRiskAdjustedReturn',
    '8ra': 'Future8WeekRiskAdjustedReturn',
    '12ra': 'Future12WeekRiskAdjustedReturn',
    '26ra': 'Future26WeekRiskAdjustedReturn',
    '52ra': 'Future52WeekRiskAdjustedReturn'
}

# Load data
# raw_data = pd.read_csv('data/companyQuotes-20170417-001.csv')

increments = range(1, 77)

share_data = pd.DataFrame()
for increment in increments:
    path ='data/companyQuotes-20170514-%03d.csv.gz' % increment
    frame = pd.read_csv(path, compression='gzip', parse_dates=['quoteDate'], infer_datetime_format=True)
    share_data = share_data.append(frame, ignore_index=True)
    print('Loaded:', path)
    
print(share_data.head(5))
print(len(share_data))

  interactivity=interactivity, compiler=compiler, result=result)


Loaded: data/companyQuotes-20170514-001.csv.gz
Loaded: data/companyQuotes-20170514-002.csv.gz
Loaded: data/companyQuotes-20170514-003.csv.gz
Loaded: data/companyQuotes-20170514-004.csv.gz
Loaded: data/companyQuotes-20170514-005.csv.gz
Loaded: data/companyQuotes-20170514-006.csv.gz
Loaded: data/companyQuotes-20170514-007.csv.gz
Loaded: data/companyQuotes-20170514-008.csv.gz
Loaded: data/companyQuotes-20170514-009.csv.gz
Loaded: data/companyQuotes-20170514-010.csv.gz
Loaded: data/companyQuotes-20170514-011.csv.gz
Loaded: data/companyQuotes-20170514-012.csv.gz
Loaded: data/companyQuotes-20170514-013.csv.gz
Loaded: data/companyQuotes-20170514-014.csv.gz
Loaded: data/companyQuotes-20170514-015.csv.gz
Loaded: data/companyQuotes-20170514-016.csv.gz
Loaded: data/companyQuotes-20170514-017.csv.gz
Loaded: data/companyQuotes-20170514-018.csv.gz
Loaded: data/companyQuotes-20170514-019.csv.gz
Loaded: data/companyQuotes-20170514-020.csv.gz
Loaded: data/companyQuotes-20170514-021.csv.gz
Loaded: data/

In [None]:
# Clip values less than -99 (represents losing all money, can't go below -100)
for key in returns:
    return_column = returns[key]
    raw_data[return_column] = raw_data[return_column].clip(-99, 999, axis=0)

In [None]:
# Plot values for each potential target
for key in returns:
    print('-----')
    return_column = returns[key]
    print(return_column)
    raw_data.hist(column=return_column,bins=[-50, -45, -40, -35, -30, -25, -20, -15, -10, -5, 0, 
                                            5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
    pyplot.show()


    print('Instances: ', raw_data[return_column].count())
    print('Mean: ', raw_data[return_column].mean())
    print('Min: ', raw_data[return_column].min())
    print('25th percentile: ', raw_data[return_column].quantile(0.25))
    print('Median: ', raw_data[return_column].median())
    print('75th percentile: ', raw_data[return_column].quantile(0.75))
    print('Max: ', raw_data[return_column].max())
    print('Std deviation: ', raw_data[return_column].std())
    print('Variance: ', raw_data[return_column].var())
    print('Skew: ', raw_data[return_column].skew())

In [None]:
# Check outliers
outliers = raw_data.loc[(raw_data[target_column] > 100) | (raw_data[target_column] < -50)]
print(len(outliers))

exclude_symbols = outliers['symbol'].unique()

In [None]:
# Remove rows in the excluded symbols list
filtered_data = raw_data[~raw_data['symbol'].isin(exclude_symbols)]

## Apply filter for specific symbolx

In [None]:
# Run filter for a few companies
include_symbols = ['BHP', 'CBA', 'AOU', 'AYS', 'ATT', 'A01', 'BUD', 'AAP', 'AIV', 'AIB', '4DS']
reduced_data = raw_data[raw_data['symbol'].isin(include_symbols)]
print(len(reduced_data))

In [None]:
filtered_data = reduced_data

In [None]:
# Re-plot values for each potential target
for key in returns:
    print('-----')
    return_column = returns[key]
    print(return_column)
    filtered_data.hist(column=return_column,bins=[-50, -45, -40, -35, -30, -25, -20, -15, -10, -5, 0,
                                                  5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
    pyplot.show()


    print('Instances: ', filtered_data[return_column].count())
    print('Mean: ', filtered_data[return_column].mean())
    print('Min: ', filtered_data[return_column].min())
    print('25th percentile: ', filtered_data[return_column].quantile(0.25))
    print('Median: ', filtered_data[return_column].median())
    print('75th percentile: ', filtered_data[return_column].quantile(0.75))
    print('Max: ', filtered_data[return_column].max())
    print('Std deviation: ', filtered_data[return_column].std())
    print('Variance: ', filtered_data[return_column].var())
    print('Skew: ', filtered_data[return_column].skew())

## Use all data

In [None]:
filtered_data = raw_data

## Set-up learning data

In [2]:
# Set target column
target_column = returns['8']

In [3]:
# Remove rows missing the target column
share_data = share_data.dropna(subset=[target_column], how='all')

In [4]:
# Shift values to range of >= 1
import math

shift_val = 0

def get_shift_value(data_frame):
    # if the minimum value is < 1, shift all the values to make them >= 1
    min_val = min(data_frame.values)
    if min_val < 1:
        return (min_val * -1) + 1
    else:
        return 0

print(share_data[target_column].head(5))
    
shift_val = get_shift_value(share_data[target_column])
print(shift_val)

share_data[target_column] = share_data[target_column].add(shift_val)

print(share_data[target_column].head(5))

0   -15.714245
1   -22.222190
2   -24.999973
3   -25.301180
4   -34.482744
Name: Future8WeekReturn, dtype: float64
100.999995833
0    85.285751
1    78.777806
2    76.000023
3    75.698816
4    66.517252
Name: Future8WeekReturn, dtype: float64


In [5]:
# Set log values
print(share_data[target_column].head(5))
    
print('Min:',min(share_data[target_column].values),', Max:', max(share_data[target_column].values))
    
share_data[target_column] = np.log(share_data[target_column])

print(share_data[target_column].head(5))

print('Min:',min(share_data[target_column].values),', Max:', max(share_data[target_column].values))

0    85.285751
1    78.777806
2    76.000023
3    75.698816
4    66.517252
Name: Future8WeekReturn, dtype: float64
Min: 1.0 , Max: 799999238.06
0    4.446007
1    4.366631
2    4.330734
3    4.326763
4    4.197461
Name: Future8WeekReturn, dtype: float64
Min: 0.0 , Max: 20.5001213332


In [None]:
# Create y_data
y_data = share_data[target_column].values

In [10]:
# Filter down data to the X columns being used
share_data = share_data[data_columns]

# Convert quote dates data to year and month
share_data['quoteDate'] = pd.to_datetime(share_data['quoteDate'])
share_data['quoteYear'], share_data['quoteMonth'],  share_data['quoteDay'] = share_data['quoteDate'].dt.year, share_data['quoteDate'].dt.month, share_data['quoteDate'].dt.day

# Convert dividend dates data to year and month
share_data['exDividendDate'] = pd.to_datetime(share_data['exDividendDate'])
share_data['exDividendYear'], share_data['exDividendMonth'],  share_data['exDividendDay'] = share_data['exDividendDate'].dt.year, share_data['exDividendDate'].dt.month, share_data['exDividendDate'].dt.day


# Remove quote dates column
del share_data['quoteDate']
del share_data['exDividendDate']

KeyError: "['quoteDate' 'exDividendDate'] not in index"

In [11]:
print(share_data.dtypes)

print('Min:',min(y_data),', Max:', max(y_data))

symbol                                object
adjustedPrice                        float64
volume                                 int64
previousClose                        float64
change                               float64
changeInPercent                      float64
52WeekHigh                           float64
52WeekLow                            float64
changeFrom52WeekHigh                 float64
changeFrom52WeekLow                  float64
percebtChangeFrom52WeekHigh          float64
percentChangeFrom52WeekLow           float64
Price200DayAverage                   float64
Price52WeekPercChange                float64
1WeekVolatility                      float64
2WeekVolatility                      float64
4WeekVolatility                      float64
8WeekVolatility                      float64
12WeekVolatility                     float64
26WeekVolatility                     float64
52WeekVolatility                     float64
4WeekBollingerPrediction              object
4WeekBolli

## Convert non-numerical values

In [None]:
from datetime import datetime as dt
from dateutil.parser import parse

def is_date(string):
    try: 
        parse(string)
        return True
    except:
        return False

def convert_date_to_ordinal(date_val):
    if(pd.isnull(date_val)):
        return -99999
    
    elif(type(date_val) is str):
        if(is_date(date_val)):
            return parse(date_val).toordinal()
        else:
            return -99999

    elif(type(date_val) is int or type(date_val) is float):
        return date_val
    

# Fix date values - convert to ordinals
filtered_data['quoteDate'] = filtered_data['quoteDate'].apply(lambda x: convert_date_to_ordinal(x))

# print(filtered_data['exDividendDate'].apply(lambda x: convert_date_to_ordinal(x)))
filtered_data['exDividendDate'] = filtered_data['exDividendDate'].apply(lambda x: convert_date_to_ordinal(x))

print(filtered_data.head(5))

# Convert categorical variables to boolean fields
#  4WeekBollingerPrediction              
#  4WeekBollingerType                    
#  12WeekBollingerPrediction             
#  12WeekBollingerType                   

filtered_data = pd.get_dummies(data=filtered_data, columns=['symbol', '4WeekBollingerPrediction', '4WeekBollingerType', 
                                                            '12WeekBollingerPrediction', '12WeekBollingerType'])


# Fill nan values with placeholder and check for null values
filtered_data = filtered_data.fillna(-99999)
print(pd.isnull(filtered_data).any())

In [13]:
# Convert categorical variables to boolean fields
#  4WeekBollingerPrediction              
#  4WeekBollingerType                    
#  12WeekBollingerPrediction             
#  12WeekBollingerType                   

share_data = pd.get_dummies(data=share_data, columns=['symbol', '4WeekBollingerPrediction', '4WeekBollingerType', 
                                                            '12WeekBollingerPrediction', '12WeekBollingerType'])


# Fill nan values with placeholder and check for null values
share_data = share_data.fillna(-99999)
print(pd.isnull(share_data).any())

adjustedPrice                        False
volume                               False
previousClose                        False
change                               False
changeInPercent                      False
52WeekHigh                           False
52WeekLow                            False
changeFrom52WeekHigh                 False
changeFrom52WeekLow                  False
percebtChangeFrom52WeekHigh          False
percentChangeFrom52WeekLow           False
Price200DayAverage                   False
Price52WeekPercChange                False
1WeekVolatility                      False
2WeekVolatility                      False
4WeekVolatility                      False
8WeekVolatility                      False
12WeekVolatility                     False
26WeekVolatility                     False
52WeekVolatility                     False
allordpreviousclose                  False
allordchange                         False
allorddayshigh                       False
allorddaysl

In [None]:
# Check data types
print(share_data.dtypes)

# Check how many fields in X_data
print(share_data.values.shape)

adjustedPrice                        float64
volume                                 int64
previousClose                        float64
change                               float64
changeInPercent                      float64
52WeekHigh                           float64
52WeekLow                            float64
changeFrom52WeekHigh                 float64
changeFrom52WeekLow                  float64
percebtChangeFrom52WeekHigh          float64
percentChangeFrom52WeekLow           float64
Price200DayAverage                   float64
Price52WeekPercChange                float64
1WeekVolatility                      float64
2WeekVolatility                      float64
4WeekVolatility                      float64
8WeekVolatility                      float64
12WeekVolatility                     float64
26WeekVolatility                     float64
52WeekVolatility                     float64
allordpreviousclose                  float64
allordchange                         float64
allorddays

## Run xgboost with early stopping

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import time

# Split into train and test data
#X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7, test_size=0.3)

print('Training for', target_column)

# Fit model with training set
start = time.time()
model = xgb.XGBRegressor(nthread=-1, n_estimators=10000)

print(model)

kfold = KFold(n_splits=3, shuffle=True)

errs = []
r2s = []

for train_index, test_index in kfold.split(share_data.values):
    actuals = y_data[test_index]
    eval_set = [( share_data.values[test_index], actuals)]
    model.fit( share_data.values[train_index],y_data[train_index], early_stopping_rounds=30, eval_metric="mae", 
              eval_set=eval_set, verbose=True)
    predictions = model.predict( share_data.values[test_index])

    # Output model settings
    fit_time = time.time()
    print('Elapsed time: %d' % (fit_time - start))
    err = mean_absolute_error(actuals, predictions)
    errs.append(err)
    r2 = r2_score(actuals, predictions)
    r2s.append(r2)
    print("Fold mean absolute error: %s" % err)
    print("Fold r2: %s" % r2)
    

print('-----')
print("Average (3 folds) mean absolute error: %s" % np.mean(errs))
print("Average (3 folds) r2: %s" % np.mean(r2s))

## CVGridSearch with early stopping

In [None]:
from sklearn.metrics import mean_absolute_error
import time

# Split into train and test data
print('Splitting data')
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.75, test_size=0.25)

kfold = KFold(n_splits=4, shuffle=True)

print('Training for', target_column)

# Fit model with training set
start = time.time()
model = xgb.XGBRegressor(nthread=-1, n_estimators=10000, learning_rate=0.05, max_depth=50, min_child_weight=0,
                        gamma=1.5)
eval_set = [(X_test, y_test)]
over_fifty = [i/100.0 for i in range(0, 101, 5)]
under_fifty = [i/100.0 for i in range(0, 36, 5)]

paramGrid = {
            #"max_depth": [i for i in range(25, 151, 25)],
            #"learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]#,
            # "min_child_weight": [0,0.1,0.2], #over_fifty,
            #"gamma": [0.5, 1, 1.5, 2, 2.5, 3, 3.5]#under_fifty#,
            "scale_pos_weight": over_fifty,
            #"colsample_bylevel": over_fifty#,
            #"colsample_bytree": over_fifty,
            #"subsample": over_fifty#,
            #"max_delta_step": under_fifty,
            #"reg_lambda": over_fifty#,
            #"reg_alpha": under_fifty
            #"reg_lambda_bias": under_fifty
            }

fit_params = {
            "early_stopping_rounds": 50, 
            "eval_metric": "mae", 
            "eval_set": eval_set, 
            "verbose": False
            }

grid_search = GridSearchCV(model, paramGrid, scoring="r2", fit_params=fit_params, verbose=2, cv=kfold)

grid_result = grid_search.fit(X_train, y_train)
# Output model settings
fit_time = time.time()
print('Fit elapsed time: %d' % (fit_time - start))

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

params = grid_result.cv_results_['params']

## Run xgboost

In [None]:
from sklearn.metrics import mean_absolute_error
import time

# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7, test_size=0.3)

print('Training for', target_column)

# Fit model with training set
start = time.time()
model = xgb.XGBRegressor(base_score=0.35, colsample_bylevel=0.8, colsample_bytree=0.8, 
                         gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=70, 
                         min_child_weight=0, missing=None, n_estimators=9500, nthread=-1, 
                         reg_alpha=0.4, reg_lambda=0.3, scale_pos_weight=0, subsample=0.8)
model.fit(X_train, y_train)
# Output model settings
fit_time = time.time()
print(model)
print('Fit elapsed time: %d' % (fit_time - start))


# make predictions for test data
predictions = model.predict(X_test)
predition_time = time.time()
print('Prediction elapsed time: %d' % (predition_time - fit_time))

# evaluate predictions
mae = mean_absolute_error(y_test, predictions)
print('Mean absolute error:', mae)

# Evaluate distribution of errors - get error amount for each prediction
y_errors = np.absolute(np.subtract(y_test, predictions))

# Plot the distribution of errors
pyplot.figure(figsize=(20, 16))
plot_title = 'XGBoost ' + target_column + ' prediction errors'
pyplot.plot(y_test, y_errors)
pyplot.ylabel('Error')
pyplot.xlabel('Actual return')
pyplot.title(plot_title)
pyplot.show(plot_title)

# ---------- 8 Week Returns ---------------------------------------------
    # --- All data ---
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #       learning_rate=0.1, max_delta_step=0, max_depth=3,
    #       min_child_weight=1, missing=None, n_estimators=100, nthread=1,
    #       objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #       scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Mean absolute error:  27.209411857320072

    # --- Removed outliers: n_estimators=100 ---
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #       learning_rate=0.1, max_delta_step=0, max_depth=3,
    #       min_child_weight=1, missing=None, n_estimators=100, nthread=1,
    #       objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #       scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Mean absolute error:  23.8139769746

    # --- Removed outliers: n_estimators=200 ---
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #       learning_rate=0.1, max_delta_step=0, max_depth=3,
    #       min_child_weight=1, missing=None, n_estimators=200, nthread=1,
    #       objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #       scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Mean absolute error:  21.9375376132

    # --- Removed outliers: n_estimators=500 ---
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #       learning_rate=0.1, max_delta_step=0, max_depth=3,
    #       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
    #       objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #       scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Mean absolute error:  21.9761006957
    
    
# ---------- 8 Week Risk Adjusted Returns -------------------------------------
    # --- All data ---
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #   learning_rate=0.1, max_delta_step=0, max_depth=3,
    #   min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
    #   objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #   scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Fit elapsed time: 193
    # Prediction elapsed time: 3
    # Mean absolute error: 456.680567416

In [None]:
# Plot the distribution of errors
pyplot.figure(figsize=(20, 16))

print( min(y_test))
print(max(y_test))


plot_title = 'XGBoost ' + target_column + ' prediction errors'
pyplot.plot(y_test, y_errors)
pyplot.ylabel('Error')
pyplot.xlabel('Actual return')
pyplot.xlim([ min(y_test),max(y_test)])
pyplot.title(plot_title)
pyplot.show(plot_title)

## Xgboost for one symbol

In [None]:
from sklearn.metrics import mean_absolute_error
import time

# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7, test_size=0.3)

print('Training for', target_column)

# Fit model with training set
model = xgb.XGBRegressor(nthread=-1, colsample_bylevel=0.8, colsample_bytree=0.8,
                         learning_rate=0.075, max_depth=10,n_estimators=9500, 
                         subsample=0.8)
start = time.time()
model.fit(X_train, y_train)
elapsed = time.time() - start

# Output model settings
print(model)
print('Fit elapsed time: %d' % (elapsed))



start = time.time()
# make predictions for test data
predictions = model.predict(X_test)
elapsed = time.time() - start
print('Prediction elapsed time: %d' % (elapsed))

# evaluate predictions
mae = mean_absolute_error(y_test, predictions)
print('Mean absolute error:', mae)

# Evaluate distribution of errors - get error amount for each prediction
y_errors = np.absolute(np.subtract(y_test, predictions))

# Plot the distribution of errors
pyplot.figure(figsize=(20, 16))
plot_title = 'XGBoost ' + target_column + ' prediction errors'
pyplot.plot(y_test, y_errors)
pyplot.ylabel('Error')
pyplot.xlabel('Actual return')
pyplot.title(plot_title)
pyplot.show(plot_title)

# ---------- 8 Week Returns - CBA  ---------------------------------------------
    # Training for Future8WeekReturn
    # XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    #        learning_rate=0.1, max_delta_step=0, max_depth=3,
    #        min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
    #        objective='reg:linear', reg_alpha=0, reg_lambda=1,
    #        scale_pos_weight=1, seed=0, silent=True, subsample=1)
    # Fit elapsed time: 0
    # Prediction elapsed time: 0
    # Mean absolute error: 2.85405055196

#     Training for Future8WeekReturn
#     XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#            learning_rate=0.1, max_delta_step=0, max_depth=3,
#            min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
#            objective='reg:linear', reg_alpha=0, reg_lambda=1,
#            scale_pos_weight=1, seed=0, silent=True, subsample=1)
#     Fit elapsed time: 1
#     Prediction elapsed time: 0
#     Mean absolute error: 1.87473924615

#     Training for Future8WeekReturn
#     XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#            learning_rate=0.1, max_delta_step=0, max_depth=3,
#            min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
#            objective='reg:linear', reg_alpha=0, reg_lambda=1,
#            scale_pos_weight=1, seed=0, silent=True, subsample=1)
#     Fit elapsed time: 2
#     Prediction elapsed time: 0
#     Mean absolute error: 1.82999759228

#     Training for Future8WeekReturn
#     XGBRegressor(base_score=0.5, colsample_bylevel=0.8, colsample_bytree=0.8,
#            gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=10,
#            min_child_weight=1, missing=None, n_estimators=9500, nthread=-1,
#            objective='reg:linear', reg_alpha=0, reg_lambda=1,
#            scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
#     Fit elapsed time: 14
#     Prediction elapsed time: 0
#     Mean absolute error: 1.55688219974

## Optimise single symbol model

In [None]:
from sklearn.metrics import mean_absolute_error
import time

def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return array[idx]

if __name__ == "__main__":
    print("Work through parameter optimization")

    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7, test_size=0.3)

    
    model = xgb.XGBRegressor(nthread=-1)

    kfold = KFold(n_splits=5, shuffle=True)


    print("Set non-optimised baseline")
    round_err = []
    for r in range(0, 5):
        err = []
        for train_index, test_index in kfold.split(X_data):
            model.fit(X_data[train_index],y_data[train_index])
            predictions = model.predict(X_data[test_index])
            actuals = y_data[test_index]
            err.append(mean_absolute_error(actuals, predictions))

        print(np.mean(err))
        round_err.append(np.mean(err))

    baseline_error = np.mean(round_err)

    print("Average baseline error: %f" % baseline_error)
    print('-----')

    n_estimators=[7000, 7500, 8000, 8500, 9000, 9500, 10000]
        
    param_grid = dict(n_estimators=n_estimators)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    n_estimators_r = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        n_estimators_r.append(grid_result.best_params_['n_estimators'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    n_estimators = find_nearest(n_estimators_r, np.mean(n_estimators_r))
    
    model.n_estimators = n_estimators
    
    print("Averaged best n_estimators: %f " % n_estimators)
    print('-----')  
        
    learning_rate = [0.025, 0.05, 0.075, 0.1, 0.2, 0.3]
    param_grid = dict(learning_rate=learning_rate)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    learning_rates = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        learning_rates.append(grid_result.best_params_['learning_rate'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    learning_rate = find_nearest(learning_rates, np.mean(learning_rates))
    
    model.learning_rate = learning_rate
    
    print("Averaged best learning rate: %f " % learning_rate)
    print('-----')     

    max_depth = [2, 4, 6, 8, 10, 12, 14]
    param_grid = dict(max_depth=max_depth)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    max_depths = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        max_depths.append(grid_result.best_params_['max_depth'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    max_depth = find_nearest(max_depths, np.mean(max_depths))
    
    model.max_depth = max_depth
    
    print("Averaged best max depth: %f " % max_depth)
    print('-----')
    samples = [0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0] #[i/100.0 for i in range(60,101, 5)]
    param_grid = dict(subsample=samples)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    subsamples = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        subsamples.append(grid_result.best_params_['subsample'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    subsample = find_nearest(subsamples, np.mean(subsamples))
    
    model.subsample = subsample
    
    print("Averaged best subsample: %f " % subsample)
    print('-----')

    param_grid = dict(colsample_bytree=samples)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    colsample_bytrees = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        colsample_bytrees.append(grid_result.best_params_['colsample_bytree'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    colsample_bytree = find_nearest(colsample_bytrees, np.mean(colsample_bytrees))
    
    model.colsample_bytree = colsample_bytree
    
    print("Averaged best colsample_bytree: %f " % colsample_bytree)
    print('-----')

    param_grid = dict(colsample_bylevel=samples)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    colsample_bylevels = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        colsample_bylevels.append(grid_result.best_params_['colsample_bylevel'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    colsample_bylevel = find_nearest(colsample_bylevels, np.mean(colsample_bylevels))
    
    model.colsample_bylevel = colsample_bylevel
    
    print("Averaged best colsample_bylevel: %f " % colsample_bylevel)
    print('-----')

    # Retest with new parameters
    round_err = []
    for r in range(0, 5):
        err = []
        for train_index, test_index in kfold.split(X_data):
            xgb_model = xgb.XGBRegressor(nthread=-1, colsample_bytree = colsample_bytree, 
                                         learning_rate = learning_rate, max_depth = max_depth, 
                                         n_estimators = n_estimators, subsample = subsample,
                                         colsample_bylevel = colsample_bylevel)
            xgb_model.fit(X_data[train_index],y_data[train_index])
            predictions = model.predict(X_data[test_index])
            actuals = y_data[test_index]
            err.append(mean_absolute_error(actuals, predictions))
               
        print(np.mean(err))
        round_err.append(np.mean(err))

    tuned_error = np.mean(round_err)

    print("Average tuned error: %s" % tuned_error)
    improvement = baseline_error - tuned_error
    print('-----')
    print('Optimisation improvement result: %s, %s%%' % (improvement, improvement / baseline_error * 100))
    print('-----')
    print(xgb_model)
    print('-----')

## Secondary parameters

In [None]:
weights = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    
#     model.eval_metric = 'mae'
    
#     gamma = [0]
#     param_grid = dict(gamma=gamma)

#     grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
#     gammas = []

#     for r in range(0, 5):
#         grid_result = grid_search.fit(X_data, y_data)
#         # summarize results
#         print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#         gammas.append(grid_result.best_params_['gamma'])
#         means = grid_result.cv_results_['mean_test_score']
#         stds = grid_result.cv_results_['std_test_score']
#         params = grid_result.cv_results_['params']

#     gamma = find_nearest(gammas, np.mean(gammas))
    
#     model.gamma = gamma
    
#     print("Averaged best gamma: %f " % gamma)
#     print('-----')    
    
#     min_child_weight = [0]
#     param_grid = dict(min_child_weight=min_child_weight)

#     grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
#     min_child_weights = []

#     for r in range(0, 5):
#         grid_result = grid_search.fit(X_data, y_data)
#         # summarize results
#         print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#         min_child_weights.append(grid_result.best_params_['min_child_weight'])
#         means = grid_result.cv_results_['mean_test_score']
#         stds = grid_result.cv_results_['std_test_score']
#         params = grid_result.cv_results_['params']

#     min_child_weight = find_nearest(min_child_weights, np.mean(min_child_weights))
    
#     model.min_child_weight = min_child_weight
    
#     print("Averaged best min_child_weight: %f " % min_child_weight)
#     print('-----')

    gamma = 0
    min_child_weight = 0

    reg_lambda = [0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    param_grid = dict(reg_lambda=reg_lambda)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    reg_lambdas = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        reg_lambdas.append(grid_result.best_params_['reg_lambda'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    reg_lambda = find_nearest(reg_lambdas, np.mean(reg_lambdas))
    
    model.reg_lambda = reg_lambda
    
    print("Averaged best reg_lambda: %f " % reg_lambda)
    print('-----')

    scale_pos_weight = [0, 1, 2, 3, 4, 5]
    param_grid = dict(scale_pos_weight=scale_pos_weight)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    scale_pos_weights = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        scale_pos_weights.append(grid_result.best_params_['scale_pos_weight'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    scale_pos_weight = find_nearest(scale_pos_weights, np.mean(scale_pos_weights))
    
    model.scale_pos_weight = scale_pos_weight
    
    print("Averaged best scale_pos_weight: %f " % scale_pos_weight)
    print('-----')
    

    reg_alpha = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
    param_grid = dict(reg_alpha=reg_alpha)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    reg_alphas = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        reg_alphas.append(grid_result.best_params_['reg_alpha'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    reg_alpha = find_nearest(reg_alphas, np.mean(reg_alphas))
    
    model.reg_alpha = reg_alpha
    
    print("Averaged best reg_alpha: %f " % reg_alpha)
    print('-----')
        
    base_score = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
    param_grid = dict(base_score=base_score)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    base_scores = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        base_scores.append(grid_result.best_params_['base_score'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    base_score = find_nearest(base_scores, np.mean(base_scores))
    
    model.base_score = base_score
    
    print("Averaged best base_score: %f " % base_score)
    print('-----')

    
    # Retest with new parameters
    round_err = []
    for r in range(0, 5):
        err = []
        for train_index, test_index in kfold.split(X_data):
            xgb_model = xgb.XGBRegressor(nthread=-1, colsample_bytree = colsample_bytree, gamma=gamma, 
                                         learning_rate = learning_rate, max_depth = max_depth, 
                                         n_estimators = n_estimators, subsample = subsample,
                                         colsample_bylevel = colsample_bylevel, base_score = base_score,
                                         reg_alpha = reg_alpha, scale_pos_weight = scale_pos_weight,
                                         reg_lambda = reg_lambda, min_child_weight = min_child_weight)
            xgb_model.fit(X_data[train_index],y_data[train_index])
            predictions = model.predict(X_data[test_index])
            actuals = y_data[test_index]
            err.append(mean_absolute_error(actuals, predictions))
               
        print(np.mean(err))
        round_err.append(np.mean(err))

    tuned_error = np.mean(round_err)

    print("Average tuned error: %s" % tuned_error)
    improvement = baseline_error - tuned_error
    print('-----')
    print('Optimisation improvement result: %s, %s%%' % (improvement, improvement / baseline_error * 100))
    print('-----')
    print(xgb_model)
    print('-----')

In [None]:
model.max_depth = 70
    n_estimators=[7000, 7500, 8000, 8500, 9000, 9500, 10000]
        
    param_grid = dict(n_estimators=n_estimators)

    grid_search = GridSearchCV(model,param_grid, scoring="neg_mean_absolute_error", cv=kfold, verbose=1, n_jobs=-1)
    n_estimators_r = []

    for r in range(0, 5):
        grid_result = grid_search.fit(X_data, y_data)
        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        n_estimators_r.append(grid_result.best_params_['n_estimators'])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

    n_estimators = find_nearest(n_estimators_r, np.mean(n_estimators_r))
    
    model.n_estimators = n_estimators
    
    print("Averaged best n_estimators: %f " % n_estimators)
    print('-----')

## Compare model to baseline

In [None]:
import time
from sklearn.metrics import r2_score

kfold = KFold(n_splits=5, shuffle=True)

# Test with base parameters
print('-----')
print('Base model')

base_errs = []
base_r2s = []
for r in range(0, 5):
    err = []
    r2 = []
    for train_index, test_index in kfold.split(X_data):
        start = time.time()
        base_model = xgb.XGBRegressor(nthread=-1)
        base_model.fit(X_data[train_index],y_data[train_index])
        fit_time = time.time()
        predictions = base_model.predict(X_data[test_index])
        prediction_time = time.time()
        actuals = y_data[test_index]
        err.append(mean_absolute_error(actuals, predictions))
        r2.append(r2_score(actuals, predictions))
               
    print(np.mean(err))
    base_errs.append(np.mean(err))
    print(np.mean(r2))
    base_r2s.append(np.mean(r2))
    print('Fit elapsed time: %d, Prediction elapsed time: %d' % (fit_time - start, prediction_time - fit_time))

base_error = np.mean(base_errs)
base_r2 = np.mean(base_r2s)

print('-----')
print(base_model)
print("Average base error: %s" % base_error)
print("Average base r2: %s" % base_r2)


# Retest with new parameters
print('-----')
print('Optimised model')

opt_err = []
opt_r2s = []
for r in range(0, 5):
    err = []
    r2 = []
    for train_index, test_index in kfold.split(X_data):
        start = time.time()
        tuned_model = xgb.XGBRegressor(n_estimators=10000, nthread=-1,  learning_rate=0.05, max_depth=50)
        eval_set = [(X_data[test_index], y_data[test_index])]
        tuned_model.fit(X_data[train_index],y_data[train_index], early_stopping_rounds=50, eval_metric="mae", 
                        eval_set=eval_set, verbose=False)
        fit_time = time.time()
        predictions = tuned_model.predict(X_data[test_index])
        prediction_time = time.time()
        actuals = y_data[test_index]
        err.append(mean_absolute_error(actuals, predictions))
        r2.append(r2_score(actuals, predictions))
               
    print(np.mean(err))
    opt_err.append(np.mean(err))
    print(np.mean(r2))
    opt_r2s.append(np.mean(r2))
    print('Fit elapsed time: %d, Prediction elapsed time: %d' % (fit_time - start, prediction_time - fit_time))


tuned_error = np.mean(opt_err)
tuned_r2 = np.mean(opt_r2s)


print('-----')
print(tuned_model)
print("Average tuned error: %s" % tuned_error)
improvement = base_error - tuned_error
print('Optimisation improvement result: %s, %s%%' % (improvement, improvement / base_error * 100))
print('-----')

print("Average tuned r2: %s" % tuned_r2)
improvement = tuned_r2 - base_r2
print('Optimisation improvement result: %s, %s%%' % (improvement, improvement / base_r2 * 100))
print('-----')


#     --- CBA --
#     Base model
#     2.76593178309
#     Fit elapsed time: 0, Prediction elapsed time: 0
#     2.80202959964
#     Fit elapsed time: 0, Prediction elapsed time: 0
#     2.74822700498
#     Fit elapsed time: 0, Prediction elapsed time: 0
#     2.80035623995
#     Fit elapsed time: 0, Prediction elapsed time: 0
#     2.78568218851
#     Fit elapsed time: 0, Prediction elapsed time: 0
#     -----
#     XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#            learning_rate=0.1, max_delta_step=0, max_depth=3,
#            min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
#            objective='reg:linear', reg_alpha=0, reg_lambda=1,
#            scale_pos_weight=1, seed=0, silent=True, subsample=1)
#     Average base error: 2.78044536324
#     -----
#     Optimised model
#     1.42491403015
#     Fit elapsed time: 42, Prediction elapsed time: 0
#     1.4597646821
#     Fit elapsed time: 43, Prediction elapsed time: 0
#     1.46155690531
#     Fit elapsed time: 43, Prediction elapsed time: 0
#     1.45526380132
#     Fit elapsed time: 44, Prediction elapsed time: 0
#     1.48145221254
#     Fit elapsed time: 42, Prediction elapsed time: 0
#     -----
#     XGBRegressor(base_score=0.35, colsample_bylevel=0.8, colsample_bytree=0.8,
#            gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=70,
#            min_child_weight=0, missing=None, n_estimators=9500, nthread=-1,
#            objective='reg:linear', reg_alpha=0.4, reg_lambda=0.3,
#            scale_pos_weight=0, seed=0, silent=True, subsample=0.8)
#     Average tuned error: 1.45659032628
#     -----
#     Optimisation improvement result: 1.32385503695, 47.6130570468%
#     -----

## Data checks

In [None]:
# Check correlations 
filtered_data[data_columns].corr()

## Feature importance

In [None]:
from skrebate import ReliefF
from sklearn.model_selection import train_test_split

# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7, test_size=0.3)


fs = ReliefF()
fs.fit(X_train, y_train)

for feature_name, feature_score in zip(filtered_data.columns, fs.feature_importances_):
    print(feature_name, '\t', feature_score)