In [1]:
import os,sys,time,random,math,time
import tarfile, zipfile

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LinearRegression,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn import decomposition, datasets, ensemble
from sklearn.cluster import KMeans,MeanShift

from sklearn.base import clone as skclone
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import make_scorer,precision_score, recall_score, f1_score, average_precision_score, accuracy_score, mean_absolute_error

from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
from IPython.display import display, Image

import xgboost as xgb

use_xgb=True #disable for speed

from subprocess import check_output
datadir="./input/"
print(check_output(["ls", datadir]).decode("utf8"))

%matplotlib inline  


test.csv.zip
train.csv.zip



In [2]:
def loadData(datadir,filename):
    # Load the wholesale customers dataset
    #data = pd.read_csv(filename)
    data = ''
    print ("loading: "+datadir+filename)
    try:
        if zipfile.is_zipfile(datadir+filename):
            z = zipfile.ZipFile(datadir+filename)
            filename = z.open(filename[:-4])
        else:
            filename=datadir+filename
        data = pd.read_csv(filename, parse_dates=True)  
        print ("Dataset has {} samples with {} features each.".format(*data.shape))
    except Exception as e:
        print ("Dataset could not be loaded. Is the dataset missing?")
        print(e)
    return data

def writeData(data,filename):
    # Load the wholesale customers dataset
    try:
        data.to_csv(filename, index=False)
    except Exception as e:
        print ("Dataset could not be written.")
        print(e)
    verify=[]
    try:
        with open(filename, 'r') as f:
            for line in f:
                verify.append(line)
        f.closed
        return verify[:5]
    except IOError:
        sys.std
        
def LabelEncoder(data):
    # lifted in parts from:
    #https://www.kaggle.com/mmueller/allstate-claims-severity/yet-another-xgb-starter/code
    features = data.columns
    cats = [feat for feat in features if 'cat' in feat]
    for feat in cats:
        data[feat] = pd.factorize(data[feat], sort=True)[0]
    return data

# XGB!

def xgbfit(X_train,y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    

    xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.075,
        'objective': 'reg:linear',
        'max_depth': 6,
        'num_parallel_tree': 1,
        'min_child_weight': 1,
        'eval_metric': 'mae',
    }

    start_time = time.time()
    res = xgb.cv(xgb_params, dtrain, num_boost_round=750, nfold=4, seed=42, stratified=False,
                 early_stopping_rounds=15, verbose_eval=100, show_stdv=True, maximize=False)
    print("fit time:{}s".format(round((time.time()-start_time), 3) ))

    best_nrounds = res.shape[0] - 1
    cv_mean = res.iloc[-1, 0]
    cv_std = res.iloc[-1, 1]
    print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
    # XGB Train!
    start_time = time.time()
    gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
    print("Train time:{}s".format(round((time.time()-start_time), 3) ))
    return gbdt

In [3]:
def kmeansPlusmeanshift(data):
    start_time = time.time()
    startingClusterSize=int(len(data)*.05)
    print "kmeans.... for {} clusters".format(startingClusterSize)
    k_means =KMeans(n_clusters=startingClusterSize,n_jobs=10)
    k_means.fit(data.sample(frac=0.25).values)
    clusters=k_means.cluster_centers_
    print("kmeans round 1 time:{}s".format(round((time.time()-start_time), 3) ))
    print clusters[:15]
    
    start_time = time.time()
    #use the cluster centers of the guessed clusters to get an estimate of actual numbers of clusters. doing this for speed increase!
    print "\nmeanshift..."
    meanshift=MeanShift(n_jobs=10)
    meanshift.fit(clusters)
    newcenters=meanshift.cluster_centers_
    print("meanshift time:{}s".format(round((time.time()-start_time), 3) ))
    print newcenters[:15], "\nnum of clusters from meanshift:",len(newcenters)
    
    start_time = time.time()
    # use the new clusters number to predict each locations cluster
    print "\nredo kmeans with new cluster number from meanshift +1 to account for sampling..."
    k_means =KMeans(n_clusters=len(newcenters)+1,n_jobs=10)
    final_clusters=k_means.fit_predict(data.values)
    print("kmeans round 2 time:{}s".format(round((time.time()-start_time), 3) ))
    return final_clusters

In [4]:
def kmeansPlusmeanshift(data):  
    start_time = time.time()
    # use the new clusters number to predict each locations cluster
    print "\nredo kmeans with new cluster number from meanshift +1 to account for sampling..."
    k_means =KMeans(n_clusters=80,n_jobs=12)
    final_clusters=k_means.fit_predict(data.values)
    print("kmeans round 2 time:{}s".format(round((time.time()-start_time), 3) ))
    return final_clusters

In [5]:
data = loadData(datadir,'train.csv.zip')
display(data.info())
display(data.head(5))

test_data= loadData(datadir,'test.csv.zip') 
display(test_data.info())
display(test_data.head(5))

loading: ./input/train.csv.zip
Dataset has 188318 samples with 132 features each.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188318 entries, 0 to 188317
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 189.7+ MB


None

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


loading: ./input/test.csv.zip
Dataset has 125546 samples with 131 features each.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125546 entries, 0 to 125545
Columns: 131 entries, id to cont14
dtypes: float64(14), int64(1), object(116)
memory usage: 125.5+ MB


None

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,4,A,B,A,A,A,A,A,A,B,...,0.281143,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562
1,6,A,B,A,B,A,A,A,A,B,...,0.836443,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045
2,9,A,B,A,B,B,A,B,A,B,...,0.718531,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232
3,12,A,A,A,A,B,A,A,A,A,...,0.397069,0.36993,0.342355,0.40028,0.33237,0.3148,0.348867,0.341872,0.592264,0.555955
4,15,B,A,A,A,A,B,A,A,A,...,0.302678,0.398862,0.391833,0.23688,0.43731,0.50556,0.359572,0.352251,0.301535,0.825823


###  Pre Proccessing

In [6]:
# combine the two frames so we can encode the labels!
test_data['loss']=0

lengthofData=len(data)
lengthoftest_data=len(test_data)

print("data:",lengthofData)
print("test:",lengthoftest_data)

combineddata=pd.concat([data,test_data])
lengthofcombined=len(combineddata)
print("combined:",lengthofcombined)

('data:', 188318)
('test:', 125546)
('combined:', 313864)


In [7]:
# the categorical data that we need in a number format
combineddata=LabelEncoder(combineddata)
print("label encoded")

label encoded


In [8]:
#predict the cluster for each row
filename='clusters.npy'
if os.path.isfile(filename):
    print("File found, using it")
    combineddata['clusters']=joblib.load(filename)
else:
    print("no files, running clusters...")
    combineddata['clusters']=kmeansPlusmeanshift(combineddata.drop(['id','loss'],1))
    joblib.dump(combineddata['clusters'],filename)


File found, using it


In [9]:

# time to split the data back apart!
data=combineddata.iloc[:lengthofData].copy()
test_data=combineddata.iloc[lengthofData:].copy()
test_data.drop(['loss'],1,inplace=True) # didn't have this column before, make it go away!


x_test = test_data.copy()
x_test.drop(['id'],1,inplace=True)

# we don't want the ID columns in X, and of course not loss either
x=data.drop(['id','loss'],1)
# loss is our label
y=data['loss']

#minmax scaler
scaler= MinMaxScaler() 
x = scaler.fit_transform(x)
x_test_data = scaler.fit_transform(x_test)

#display(x[:5])
#display(y.head(5))

print("Pre-Processing done")
print("data:",len(x))
print("labels:",len(y))
print("test:",len(x_test_data))


Pre-Processing done
('data:', 188318)
('labels:', 188318)
('test:', 125546)


In [10]:
#del data,test_data
#del combineddata
#del scaler
#del x_test

### pick our sklearn regressors, and do some param optimizations

In [11]:
regressor_w_grid=[] # a list of regressions to use
#regrList.append([LinearRegression()])
regressor_w_grid.append([ExtraTreesRegressor(n_jobs = -1),
                         dict(n_estimators=[5,7,10,25,50,500],
                         max_features=['auto','sqrt','log2'])])
regressor_w_grid.append([Ridge(),
                         dict(alpha=[.5,1,2,4,40,400])])
regressor_w_grid.append([RandomForestRegressor(#criterion = 'mae',
                                      n_jobs =-1, 
                                      random_state=42),
                        dict(n_estimators=[5,7,10,25,50,500],
                             max_features=['auto','sqrt','log2'])])
regressor_w_grid.append([KNeighborsRegressor(n_jobs = -1),
                        dict(n_neighbors=[2,5,7,15],
                             leaf_size =[3,10,15,25,30,50,100])])
#regrList.append([SVR(), dict()]) # oh my so slow! and bad initial scores



regrList=np.array(regressor_w_grid).T[0]
paramater_grid=np.array(regressor_w_grid).T[1]
print regrList
print paramater_grid

print("number of scikitlearn regressors to use:",len(regrList))

[ ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)
 Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)
 KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform')]
[ {'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [5, 7,

In [12]:
#  train/validation split
X_train, X_validation, y_train, y_validation = train_test_split( x,
                                                                y,
                                                               test_size=0.80,
                                                                random_state=42)
display("sample train data size:{}".format(len(y_train)))

'sample train data size:37663'

In [13]:
start_time0 = time.time()
for i in range(len(regrList)):
    start_time = time.time()
    print("In:{}".format(regrList[i]))
    filename= 'grid_regr{}.pkl'.format(i)
    if os.path.isfile(filename):
        print filename," exists, importing "
        regrList[i]=joblib.load(filename) 
    else:
        print("{} not present, running a gridsearch".format(filename))
        #search the param_grid for best params based on the f1 score
        grid_search = GridSearchCV(regrList[i],
                                   param_grid= paramater_grid[i],
                                   n_jobs= -1,
                                   scoring=make_scorer(mean_absolute_error,greater_is_better=False)) 
        grid_search.fit(X_train,y_train)
        #reach into the grid search and pull out the best parameters, and set those on the clf
        params={}
        for p in grid_search.best_params_:
            params[p]=grid_search.best_params_[p]
        regrList[i].set_params(**params)
        print("run time:{}s".format(round((time.time()-start_time), 3) ))   
        joblib.dump(regrList[i],filename) 
        del grid_search
print("Full GridSearch run time:{}s".format(round((time.time()-start_time0), 3) ))   


#Full GridSearch run time:4774.187s

In:ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)
grid_regr0.pkl  exists, importing 
In:Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
grid_regr1.pkl  exists, importing 
In:RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)
grid_regr2.pkl  exists, importing 
In:KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_nei

In [14]:
del X_train, X_validation, y_train, y_validation

### Stacking Layer 1, train and predict for layer 2

Split the data into k-folds(divisions). train the regressors on each combination of k-1 folds, and then predict on the held-out fold. Preserve the prediction of each regressor for the next layer.

In [15]:
#prepare the fold divisions

data_size=x.shape[0]
print "size of train data:",data_size
folds=[]
num_folds=5
fold_start=0
for k in range(num_folds-1):
    fold_end=((data_size/num_folds)*(k+1))
    folds.append((fold_start,fold_end))
    fold_start=fold_end
folds.append((fold_start,data_size))
print "folds at:",folds
print "fold size:", (data_size/num_folds)
print "train size:",(data_size/num_folds)*(num_folds-1)

count=0
for i in folds:
    count+=i[1]-i[0]
print count

size of train data: 188318
folds at: [(0, 37663), (37663, 75326), (75326, 112989), (112989, 150652), (150652, 188318)]
fold size: 37663
train size: 150652
188318


In [16]:
x_layer2=[]
start_time0 = time.time()
MAE_tracking=[]

for fold_start,fold_end in folds:
    print("Fold:{} to {} of: {}".format(fold_start,fold_end,data_size))
    start_time1 = time.time()
    fold_result=[]
    
    X_test = x[fold_start:fold_end].copy()
    y_test = y[fold_start:fold_end].copy()
    X_train=np.concatenate((x[:fold_start], x[fold_end:]), axis=0)
    y_train=np.concatenate((y[:fold_start], y[fold_end:]), axis=0)
    print "\nfolding! len test {}, len train {}".format(len(X_test),len(X_train))
    
    for i in range(len(regrList)): # for each of the regressions we use, fit/predict the data
        print(regrList[i])
        start_time = time.time()
        estimator=skclone(regrList[i], safe=True)
        estimator.fit(X_train,y_train)
        print("\nfit time:{}s".format(round((time.time()-start_time), 3) ))
        start_time = time.time()
        curr_predict=np.array(estimator.predict(X_test)).copy()
        if fold_result == []:
            fold_result = curr_predict
        else:
            fold_result = np.column_stack((fold_result,curr_predict))  
        #show some stats on that last regressions run
        MAE=np.mean(abs(curr_predict - y_test))
        MAE_tracking.append(["run:{}-{}:{}".format(fold_start,fold_end,i),MAE])
        print("Mean abs error: {:.2f}".format(MAE))
        print("predict time:{}s".format(round((time.time()-start_time), 3) ))
        #print("Score: {:.2f}".format(estimator.score(X_test, y_test))) #delays the run...
        
    #XGB -- it doesn't fit the pattern of scikit, so do it seperatly
    if use_xgb == True:
        dtest = xgb.DMatrix(X_test)
        gbdt=xgbfit(X_train,y_train)

        # now do a prediction and spit out a score(MAE) that means something
        start_time = time.time()
        curr_predict=gbdt.predict(dtest)
        fold_result = np.column_stack((fold_result,curr_predict))   
        MAE=np.mean(abs(curr_predict - y_test))
        MAE_tracking.append(["run:{}-{}:{}".format(fold_start,fold_end,'XGB'),MAE])
        print("XGB Mean abs error: {:.2f}".format(MAE))
        print("XGB predict time:{}s".format(round((time.time()-start_time), 3) ))
    if x_layer2 == []:
        x_layer2=fold_result
    else:
        x_layer2=np.append(x_layer2,fold_result,axis=0)
        
    print "--layer2 length:",len(x_layer2)
    print "--layer2 shape:",np.shape(x_layer2)
    print("Fold run time:{}s".format(round((time.time()-start_time1), 3) ))   
print("Full run time:{}s".format(round((time.time()-start_time0), 3) ))   
#preserve the run
joblib.dump(x_layer2,'x_layer2.npy') 
joblib.dump(MAE_tracking,'MAE_tracking.npy')

Fold:0 to 37663 of: 188318

folding! len test 37663, len train 150655
ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=500, n_jobs=-1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)

fit time:72.699s
Mean abs error: 1241.16
predict time:4.165s
Ridge(alpha=40, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

fit time:3.952s
Mean abs error: 1334.21
predict time:0.012s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)





fit time:65.875s
Mean abs error: 1233.78
predict time:3.799s
KNeighborsRegressor(algorithm='auto', leaf_size=3, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
          weights='uniform')

fit time:50.201s
Mean abs error: 1306.24
predict time:358.25s
[0]	train-mae:2810.38+4.10425	test-mae:2810.39+13.8376
[100]	train-mae:1161+2.17175	test-mae:1205.7+9.12972
[200]	train-mae:1120.54+2.29236	test-mae:1191.1+8.86479
[300]	train-mae:1094.03+3.17877	test-mae:1187.68+8.64846
[400]	train-mae:1070.56+2.93079	test-mae:1186.26+8.62755
[500]	train-mae:1049.05+2.90489	test-mae:1185.27+8.80342
fit time:237.068s
CV-Mean: 1185.259796+8.80555038302
Train time:74.262s
XGB Mean abs error: 1191.91
XGB predict time:0.259s
--layer2 length: 37663
--layer2 shape: (37663, 5)
Fold run time:871.289s
Fold:37663 to 75326 of: 188318

folding! len test 37663, len train 150655
ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='sqrt', max_le




fit time:74.981s
Mean abs error: 1243.73
predict time:4.267s
Ridge(alpha=40, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

fit time:3.991s
Mean abs error: 1329.68
predict time:0.012s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

fit time:67.346s
Mean abs error: 1234.18
predict time:3.867s
KNeighborsRegressor(algorithm='auto', leaf_size=3, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
          weights='uniform')

fit time:55.961s
Mean abs error: 1316.09
predict time:432.621s
[0]	train-mae:2806.92+1.9964	test-mae:2807.01+5.73712
[100]	train-mae:1159.19+1.9946	test-mae:1203.68+8.02486
[200]	train-mae:1118.95+2.2491

['MAE_tracking.npy']

In [49]:
#preserve the run
x_layer2=joblib.load('x_layer2.npy') 
MAE_tracking=joblib.load('MAE_tracking.npy')

In [50]:
# add an avged column of all the runs

avg_column=np.mean(x_layer2, axis=1)

MAE=np.mean(abs(avg_column - y))
print("avgd Mean abs error: {:.2f}".format(MAE))
x_layer2=np.column_stack((x_layer2,avg_column))
print("length of new row: {}".format(len(x_layer2[0])))

avgd Mean abs error: 1206.77
length of new row: 6


In [51]:
display("test-first 3",x_layer2[:3])
print("length of row: {}".format(len(x_layer2[0])))


'test-first 3'

array([[ 2234.74742   ,   931.1027475 ,  2389.70176   ,  2263.95866667,
         2003.27526855,  1964.55717254],
       [ 2006.8304    ,  2431.8660271 ,  2019.39784   ,  1960.644     ,
         2156.67480469,  2115.08261436],
       [ 4745.98622   ,  5166.53495085,  4397.67864   ,  4233.80933333,
         4164.74316406,  4541.75046165]])

length of row: 6


### put each in a cluster

In [52]:

start_time = time.time()
# use the new clusters number to predict each locations cluster
print "\nredo kmeans with new cluster number from meanshift to account for sampling..."
k_means =KMeans(n_clusters=80,n_jobs=12)
final_clusters=k_means.fit_predict(x_layer2)
print("kmeans round 2 time:{}s".format(round((time.time()-start_time), 3) ))

print("length of row: {}".format(len(x_layer2[0]))
x_layer2=np.column_stack((x_layer2,final_clusters))
print("length of row: {}".format(len(x_layer2[0])))
joblib.dump(x_layer2,'x_layer2_w_clusters.npy') 


redo kmeans with new cluster number from meanshift to account for sampling...
kmeans round 2 time:48.971s


'Clusters sample:'

array([73, 18, 52, 10, 41,  1, 48, 23, 25,  7,  1, 23, 11, 54, 30], dtype=int32)

'test-first 3'

array([[ 2234.74742   ,   931.1027475 ,  2389.70176   ,  2263.95866667,
         2003.27526855,  1964.55717254],
       [ 2006.8304    ,  2431.8660271 ,  2019.39784   ,  1960.644     ,
         2156.67480469,  2115.08261436],
       [ 4745.98622   ,  5166.53495085,  4397.67864   ,  4233.80933333,
         4164.74316406,  4541.75046165]])

length of row: 6


'test-first 3'

array([[ 2234.74742   ,   931.1027475 ,  2389.70176   ,  2263.95866667,
         2003.27526855,  1964.55717254,    73.        ],
       [ 2006.8304    ,  2431.8660271 ,  2019.39784   ,  1960.644     ,
         2156.67480469,  2115.08261436,    18.        ],
       [ 4745.98622   ,  5166.53495085,  4397.67864   ,  4233.80933333,
         4164.74316406,  4541.75046165,    52.        ]])

length of row: 7


['x_layer2_w_clusters.npy', 'x_layer2_w_clusters.npy_01.npy']

In [53]:
x_layer2=joblib.load('x_layer2_w_clusters.npy') 



### train layer 2

In [60]:
x_layer3 = []

for fold_start,fold_end in folds:
    print("Fold:{} to {} of: {}".format(fold_start,fold_end,data_size))
    start_time1 = time.time()
    fold_result=[]
    
    X_layer2_validation = x_layer2[fold_start:fold_end].copy()
    y_layer2_validation = y[fold_start:fold_end].copy()
    X_layer2_train=np.concatenate((x_layer2[:fold_start], x_layer2[fold_end:]), axis=0)
    y_layer2_train=np.concatenate((y[:fold_start], y[fold_end:]), axis=0)
    print "\nfolding! len test {}, len train {}".format(len(X_test),len(X_train))
    

    layer2_regr=LinearRegression()
    layer2_regr.fit(X_layer2_train,y_layer2_train)
    layer2_predict_linear=layer2_regr.predict(X_layer2_validation)
    #show some stats on that last regressions run    
    MAE=np.mean(abs(layer2_predict_linear - y_layer2_validation))
    MAE_tracking.append(["run:{}".format('linearLayer2'),MAE])
    print("LinearRegression Mean abs error: {:.2f}".format(MAE))
    print("Score: {:.2f}".format(layer2_regr.score(X_layer2_validation, y_layer2_validation)))
    fold_result = layer2_predict_linear
    #with LinearReg: Mean abs error: 1172.67

    #KNeighborsRegressor
    layer2_regr=KNeighborsRegressor(n_jobs = -1)
    layer2_regr.fit(X_layer2_train,y_layer2_train)
    layer2_predict_KNeighbors=layer2_regr.predict(X_layer2_validation)
    #show some stats on that last regressions run    
    MAE=np.mean(abs(layer2_predict_KNeighbors - y_layer2_validation))
    MAE_tracking.append(["run:{}".format('linearLayer2'),MAE])
    print("KNeighborsRegressor Mean abs error: {:.2f}".format(MAE))
    print("Score: {:.2f}".format(layer2_regr.score(X_layer2_validation, y_layer2_validation)))
    fold_result = np.column_stack((fold_result,layer2_predict_KNeighbors))  

    #Mean abs error: 1291.64

    # The XGB version of layer 2
    print len(x_layer2)
    print len(y)
    dtest = xgb.DMatrix(X_layer2_validation)
    layer2_gbdt=xgbfit(X_layer2_train,y_layer2_train)
    # now do a prediction and spit out a score(MAE) that means something
    start_time = time.time()
    layer2_gbdt_predict=layer2_gbdt.predict(dtest)
    MAE=np.mean(abs(layer2_gbdt_predict- y_layer2_validation))
    MAE_tracking.append(["run:{}".format('XGBLayer2'),MAE])
    print("XGB Mean abs error: {:.2f}".format(MAE))
    print("XGB predict time:{}s".format(round((time.time()-start_time), 3) ))    
    fold_result = np.column_stack((fold_result,layer2_gbdt_predict))  
    
    #XGB Mean abs error: 1154.25
    
    # ? average those weighted to XGB
    layer2_avg_predict=(layer2_predict_linear+layer2_predict_KNeighbors+layer2_gbdt_predict+layer2_gbdt_predict)/4

    MAE=np.mean(abs(layer2_avg_predict- y_layer2_validation))
    print("AVG Mean abs error: {:.2f}".format(MAE))
    fold_result = np.column_stack((fold_result,layer2_avg_predict))  

    #AVG Mean abs error: 1163.71
    
    if x_layer3 == []:
        x_layer3=fold_result
    else:
        x_layer3=np.append(x_layer3,fold_result,axis=0)

Fold:0 to 37663 of: 188318

folding! len test 37666, len train 150652
LinearRegression Mean abs error: 1185.03
Score: 0.56
KNeighborsRegressor Mean abs error: 1298.95
Score: 0.50
188318
188318
[0]	train-mae:2810.3+4.11615	test-mae:2810.31+13.0248
fit time:7.431s
CV-Mean: 1155.171875+7.84539876263
Train time:1.45s
XGB Mean abs error: 1157.93
XGB predict time:0.011s
AVG Mean abs error: 1170.72
Fold:37663 to 75326 of: 188318

folding! len test 37666, len train 150652
LinearRegression Mean abs error: 1167.58
Score: 0.58
KNeighborsRegressor Mean abs error: 1284.89
Score: 0.51
188318
188318
[0]	train-mae:2808.97+4.34532	test-mae:2808.91+13.1953
fit time:7.145s
CV-Mean: 1157.572998+8.95502111568
Train time:1.421s
XGB Mean abs error: 1149.56
XGB predict time:0.011s
AVG Mean abs error: 1159.29
Fold:75326 to 112989 of: 188318

folding! len test 37666, len train 150652
LinearRegression Mean abs error: 1178.05
Score: 0.58




KNeighborsRegressor Mean abs error: 1296.77
Score: 0.51
188318
188318
[0]	train-mae:2806.9+2.06352	test-mae:2806.99+5.596
fit time:7.436s
CV-Mean: 1154.2679445+7.74710302501
Train time:1.882s
XGB Mean abs error: 1162.93
XGB predict time:0.011s
AVG Mean abs error: 1171.79
Fold:112989 to 150652 of: 188318

folding! len test 37666, len train 150652
LinearRegression Mean abs error: 1182.55
Score: 0.59
KNeighborsRegressor Mean abs error: 1301.08
Score: 0.51
188318
188318
[0]	train-mae:2808.43+3.44362	test-mae:2808.46+10.2724
fit time:8.108s
CV-Mean: 1154.108734+4.15215943059
Train time:1.319s
XGB Mean abs error: 1161.95
XGB predict time:0.011s
AVG Mean abs error: 1172.91
Fold:150652 to 188318 of: 188318

folding! len test 37666, len train 150652
LinearRegression Mean abs error: 1167.90
Score: 0.59
KNeighborsRegressor Mean abs error: 1280.27
Score: 0.51
188318
188318
[0]	train-mae:2812.13+4.99478	test-mae:2812.1+15.1572
fit time:6.766s
CV-Mean: 1158.17807+7.22327723449
Train time:1.427s
XGB 

### Layer 3

In [61]:
#  train/validation split
X_layer2_train, X_layer2_validation, y_layer2_train, y_layer2_validation = train_test_split( x_layer3,
                                                                                y,
                                                                                test_size=0.25,
                                                                                random_state=42)

In [62]:
# The XGB layer3?
print len(x_layer3)
print len(y)

dtest = xgb.DMatrix(X_layer2_validation)
layer3_gbdt=xgbfit(X_layer2_train,y_layer2_train)

# now do a prediction and spit out a score(MAE) that means something
start_time = time.time()
layer3_gbdt_predict=layer3_gbdt.predict(dtest)
MAE=np.mean(abs(layer3_gbdt_predict- y_layer2_validation))
MAE_tracking.append(["run:{}".format('XGBLayer2'),MAE])
print("XGB Mean abs error: {:.2f}".format(MAE))
print("XGB predict time:{}s".format(round((time.time()-start_time), 3) ))
#XGB Mean abs error: 1152.25

188318
188318
[0]	train-mae:2810.84+3.86282	test-mae:2810.9+11.9862
fit time:4.57s
CV-Mean: 1156.49072275+3.87762527898
Train time:1.013s
XGB Mean abs error: 1152.80
XGB predict time:0.016s


### MAE tracking:

In [None]:

MAE_tracking_graph=np.array(MAE_tracking)

print(MAE_tracking_graph.T)

plt.plot(MAE_tracking_graph.T[1])
plt.xlabel(MAE_tracking_graph.T[0])

plt.show()

del MAE_tracking_graph

### Predict layer 1 on test 

In [22]:
x_layer2_test = []
start_time1 = time.time()
for i in range(len(regrList)): # for each of the regressions we use, fit/predict the data
    start_time = time.time()            
    estimator=skclone(regrList[i], safe=True)
    print(estimator)
    estimator.fit(x,y)
    curr_predict=estimator.predict(x_test_data)
    print("predict time:{}s".format(round((time.time()-start_time), 3) ))
    
    if x_layer2_test == []:
        x_layer2_test = np.array(curr_predict.copy())
    else:
        x_layer2_test = np.column_stack((x_layer2_test,curr_predict))
    print curr_predict

#XGB -- it doesn't fit the pattern of scikit, so do it seperatly
if use_xgb == True:
    gbdt=xgbfit(x,y)
    dtest = xgb.DMatrix(x_test_data)
    # now do a prediction and spit out a score(MAE) that means something
    #start_time = time.time()
    curr_predict=gbdt.predict(dtest)
    x_layer2_test = np.column_stack((x_layer2_test,curr_predict))
    #print("Mean abs error: {:.2f}".format(np.mean(abs(cache[i+1] - y_test))))
    print("XGB predict time:{}s".format(round((time.time()-start_time), 3) ))

print("Fold run time:{}s".format(round((time.time()-start_time1), 3) ))   

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=500, n_jobs=-1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)
predict time:124.182s
[ 1968.14066  2490.3962   9540.60034 ...,  3106.19334  1732.70008
  4216.1039 ]
Ridge(alpha=40, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
predict time:5.089s
[  1142.23745145   1979.72828041  11311.5527931  ...,   2802.90769796
   1072.78012048   4663.58593472]
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)




predict time:111.534s
[ 2069.21534  2462.46922  9069.8269  ...,  3171.91794  1416.01394
  3897.07898]
KNeighborsRegressor(algorithm='auto', leaf_size=3, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
          weights='uniform')
predict time:1608.587s
[ 2116.656       2440.668       8861.596      ...,  2155.36533333
   768.71333333  2690.06533333]
[0]	train-mae:2809.84+7.17522	test-mae:2809.9+23.9038
[100]	train-mae:1167.28+1.35166	test-mae:1205.61+5.02426
[200]	train-mae:1129.26+1.88244	test-mae:1189.53+4.80291
[300]	train-mae:1105.42+2.28498	test-mae:1185.36+4.49929
[400]	train-mae:1085.12+2.0075	test-mae:1183.85+4.6061
fit time:279.369s
CV-Mean: 1183.10092175+4.58730875707
Train time:87.018s
XGB predict time:1976.875s
Fold run time:2217.685s


In [23]:
start_time = time.time()
# use the new clusters number to predict each locations cluster
print "\nredo kmeans with new cluster number from meanshift +1 to account for sampling..."
k_means =KMeans(n_clusters=80,n_jobs=12)
final_clusters=k_means.fit_predict(x_layer2_test)
print("kmeans round 2 time:{}s".format(round((time.time()-start_time), 3) ))
display("Clusters sample:",final_clusters[:15])

display("test-first 3",x_layer2_test[:3])
print("length of row: {}".format(len(x_layer2_test[0])))

x_layer2_test=np.column_stack((x_layer2_test,final_clusters))

display("test-first 3",x_layer2_test[:3])
print("length of row: {}".format(len(x_layer2_test[0])))
print("run time:{}s".format(round((time.time()-start_time), 3) ))   


redo kmeans with new cluster number from meanshift +1 to account for sampling...
kmeans round 2 time:54.323s


'Clusters sample:'

array([26, 19, 39, 51, 37, 30, 28, 12, 43, 35,  6, 64, 20, 45, 45], dtype=int32)

'test-first 3'

array([[  1968.14066   ,   1142.23745145,   2069.21534   ,   2116.656     ,
          1762.1595459 ],
       [  2490.3962    ,   1979.72828041,   2462.46922   ,   2440.668     ,
          2070.75366211],
       [  9540.60034   ,  11311.5527931 ,   9069.8269    ,   8861.596     ,
         10677.32910156]])

length of row: 5


'test-first 3'

array([[  1968.14066   ,   1142.23745145,   2069.21534   ,   2116.656     ,
          1762.1595459 ,     26.        ],
       [  2490.3962    ,   1979.72828041,   2462.46922   ,   2440.668     ,
          2070.75366211,     19.        ],
       [  9540.60034   ,  11311.5527931 ,   9069.8269    ,   8861.596     ,
         10677.32910156,     39.        ]])

length of row: 6
run time:54.337s


### Predict Layer 2

In [24]:
test_data['loss']=layer2_regr.predict(x_layer2_test)

result=test_data[['id','loss',]]
output_fname="result_submission_stack.csv"
display(writeData(result,output_fname))



['id,loss\n',
 '4,1835.71318186\n',
 '6,2159.34736559\n',
 '9,10456.9061043\n',
 '12,6456.47971617\n']

In [25]:
#the XGB version:
dtest = xgb.DMatrix(x_layer2_test)
test_data['loss']=layer2_gbdt.predict(dtest)

result=test_data[['id','loss',]]
output_fname="result_submission_stack_xgb.csv"
display(writeData(result,output_fname))



['id,loss\n',
 '4,1636.23242188\n',
 '6,1931.69592285\n',
 '9,9768.40332031\n',
 '12,5718.75927734\n']

In [26]:
#let's have a look at the std of the result, as a cross check
print("result std:",result.std(axis=0))

('result std:', id      170098.328125
loss      1976.653198
dtype: float32)


# EOF 