### Parameters

In [None]:
# Run-wide settings read by the cells below.
SEED = 0      # modifiable seed; a negative value seeds NumPy from the current microsecond instead
CLF_SS = 1      # sub-sample model types for faster run (used as the slice step in getBaseClfs)
# NOTE(review): TARGETS is also formatted into the input/output file names with '%02d',
# so the "-1 for all" setting presumably needs differently named files - confirm.
TARGETS = 0    # which target (0-4) to predict; -1 for all
nfolds = 5     # number of random fold labels drawn for the out-of-fold split

### Imports

In [None]:
import os
import numpy as np  
import pandas as pd 
import pickle

In [None]:
import multiprocessing
from joblib import Parallel, delayed

In [None]:
from collections import Counter
import datetime as datetime

In [None]:
import gc
import sys

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
# Default matplotlib figure size for the whole notebook.
rcParams['figure.figsize'] = (15, 5.5)

# Let pandas print up to 150 rows before truncating.
pd.options.display.max_rows = 150

In [None]:
# Wall-clock reference for the whole run.
start = datetime.datetime.now()

# A negative SEED requests a time-based seed; any other value is used verbatim.
np.random.seed(datetime.datetime.now().microsecond if SEED < 0 else SEED)

### Data Loading

In [None]:
# Directory holding the pre-computed .npy feature/target files.
path = './'

# Train features, test features and train target for the selected target index;
# TARGETS is formatted as a two-digit suffix in each file name.
X = np.load(os.path.join(path, 'X_tr_%02d.npy' % TARGETS))
X_te = np.load(os.path.join(path, 'X_te_%02d.npy' % TARGETS))
y = np.load(os.path.join(path, 'y_tr_%02d.npy' % TARGETS))
# One random fold label in [0, nfolds) per training row (drawn from the global NumPy RNG).
groups = np.random.randint(0, nfolds, len(y))

In [None]:
# Wrap the feature arrays in DataFrames (default integer column labels) so later
# cells can append named prediction columns and filter by column label.
X = pd.DataFrame(X)
X_te = pd.DataFrame(X_te)

In [None]:
# Repeat the loaded target five times and lay the copies out as five named columns.
# Assuming y is 1-D, every column therefore holds the same values - TODO confirm.
y = pd.DataFrame(np.reshape(list(y) *5, (5, -1)).T, columns=['age', 'd11', 'd12', 'd21', 'd22'])

### Model Setup

In [None]:
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, KFold, ShuffleSplit
from sklearn.svm import SVR, NuSVR
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, FunctionTransformer 

In [None]:
# Search space handed to the randomized search for the NuSVR meta-model.
nusvr_params = dict(
    kernel = ['rbf'],
    C = [1],
    gamma = ['scale'],
    nu = [0.5],
)

def trainNuSVR(x, y, groups, cv = 0, n_jobs = -1, **kwargs):
    """Tune a NuSVR over `nusvr_params` via trainModel and return its result.

    x, y, groups, cv, n_jobs and any extra keyword arguments are forwarded
    unchanged to trainModel.
    """
    estimator = NuSVR(cache_size=100, tol = 1e-3)
    return trainModel(x, y, groups, estimator, nusvr_params, cv, n_jobs, **kwargs)

In [None]:
# Search space handed to the randomized search for the ElasticNet meta-model.
enet_params = { 'alpha': np.logspace(-8, 1, 2),
                'l1_ratio': [ 0.5]}

def trainENet(x, y, groups, cv = 0, n_jobs = -1, **kwargs):
    """Tune an ElasticNet over `enet_params` via trainModel and return its result.

    x, y, groups, cv, n_jobs and any extra keyword arguments are forwarded
    unchanged to trainModel.
    """
    # `normalize` is intentionally not passed: False was the default, and the
    # argument was removed in scikit-learn 1.2, where passing it raises TypeError.
    clf = ElasticNet(selection = 'random', max_iter = 1000, tol = 1e-3)
    return trainModel(x, y, groups, clf, enet_params, cv, n_jobs, **kwargs)

In [None]:
def fnae(y_true, y_pred, tidx=0):
    """Normalized absolute error between standardized truths and predictions.

    Both inputs are first mapped back with the hard-coded mean/scale of target
    `tidx`. For tidx == 0 each prediction is snapped to the closest value that
    occurs in the mapped truths; for tidx > 0 both sides are raised to the
    power 1/1.5. The result is sum(|true - pred|) / sum(true) along axis 0,
    averaged.
    """
    target_means = np.array([ 50.00929913, 374.75058741, 462.62996118, 332.46733185, 381.32490681])
    target_scales = np.array([ 13.54897461, 108.25005321, 128.03349126, 112.01700719, 124.92278531])
    offset = target_means[tidx]
    factor = target_scales[tidx]

    actual = y_true * factor + offset
    estimate = y_pred * factor + offset

    if tidx == 0:
        # Snap every prediction onto the nearest observed truth value.
        candidates = np.unique(actual)
        for pos, value in enumerate(estimate):
            nearest = np.argmin(np.abs(value - candidates))
            estimate[pos] = candidates[nearest]

    if tidx > 0:
        exponent = 1./1.5
        actual = np.power(actual, exponent)
        estimate = np.power(estimate, exponent)

    abs_error_total = np.sum(np.abs(actual - estimate), axis=0)
    return np.mean(abs_error_total / np.sum(actual, axis=0))

# Scorer wrapper around fnae for the hyper-parameter searches: lower is better,
# and the de-standardization constants are picked with tidx=TARGETS.
fnae_scorer = make_scorer(fnae, greater_is_better = False, tidx=TARGETS)

In [None]:
def trainModel(x, y, groups, clf, params, cv = 0, n_jobs = None, 
                   verbose=0, splits=None, **kwargs):
    """Run a small randomized hyper-parameter search and return the best estimator.

    Parameters
    ----------
    x, y : training features and target passed to the search.
    groups : group labels forwarded to the search's fit call.
    clf : unfitted estimator to tune.
    params : parameter distributions / lists for RandomizedSearchCV.
    n_jobs : parallel jobs for the search; None is treated as -1.
    cv, verbose, splits, **kwargs : accepted for call compatibility, not used.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    if n_jobs is None:
        n_jobs = -1    

    n_iter = 3
    n_splits = 2
        
    folds = ShuffleSplit(n_splits = n_splits, train_size = 0.75, test_size = 0.20)
    search = RandomizedSearchCV(clf, params, cv = folds, n_iter = n_iter, 
                                verbose = 1, n_jobs = n_jobs, scoring = fnae_scorer)
    
    # `groups` must be passed by keyword: current scikit-learn no longer accepts
    # it as a third positional argument of fit().
    search.fit(x, y, groups = groups)
    
    print(pd.DataFrame(search.cv_results_['mean_test_score'])); print();  
    best = search.best_estimator_;  print(best)
    print("Best Score: {}".format(np.round(search.best_score_,4)))
    
    return best

In [None]:
def runBag(n = 3, model_type = trainENet, data = None, tidx = 0, **kwargs):
    """Train one tuned model per hold-out group and return the fitted models.

    Parameters
    ----------
    n : unused; kept for call compatibility.
    model_type : callable ``(x_train, y_train, groups_train, **kwargs)`` that
        returns a fitted estimator.
    data : tuple ``(X, y, groups)``; rows are split by comparing ``groups``
        with each distinct group label.
    tidx : target index used by fnae when printing the hold-out score.
        Defaults to 0, which is what was previously always used.
    **kwargs : forwarded to ``model_type``.

    Returns
    -------
    list of fitted estimators, one per group in sorted group order.
    """
    start_time = datetime.datetime.now()

    X, y, groups = data

    # Disabled in the original (kept for reference): drop rows with a missing target.
    #   valid = ~y.isnull()
    #   X = X[valid]; y = y[valid]; groups = groups[valid]

    group_list = sorted(dict.fromkeys(groups))

    clfs = []
    for group in group_list:
        gc.collect()
        in_holdout = groups == group
        in_train = groups != group

        x_holdout = X[in_holdout]
        y_holdout = y[in_holdout]
        x_train = X[in_train]
        y_train = y[in_train]
        groups_train = groups[in_train]

        clf = model_type(x_train, y_train, groups_train, **kwargs)
        clfs.append(clf)

        predicted = clf.predict(x_holdout)
        print("{}: {:.4f}".format(group,
              fnae(y_holdout, predicted, tidx = tidx)  ) )

    end_time = datetime.datetime.now()
    print("\nModel Bag Time: {}\n".format(str(end_time - start_time).split('.', 2)[0] ))
    return clfs

In [None]:
def trainBaseClfs(clfs, clf_names, data, target = None, **kwargs):
    """Fit every base model out-of-fold and collect its hold-out predictions.

    For each distinct group label the models in ``clfs`` are cloned, fitted on
    the rows outside the group (rows with a missing target are dropped) and
    used to predict the rows inside the group.

    Parameters
    ----------
    clfs : list of unfitted estimators.
    clf_names : list of display names, parallel to ``clfs``.
    data : tuple ``(X, y, groups)``; ``y`` must have a column named ``target``.
    target : name of the target column; must be an entry of ALL_TARGETS.
    **kwargs : accepted for call compatibility, not used.

    Returns
    -------
    (all_base_clfs, base_preds, clf_pred_names, X_ordered, y_ordered, groups_ordered)
        fitted models per fold, one concatenated hold-out prediction vector per
        model, the model names, and X / y / groups re-ordered fold by fold so
        they line up with the prediction vectors.
    """
    start_time = datetime.datetime.now()

    def elapsed():
        return str(datetime.datetime.now() - start_time).split('.', 2)[0]

    def train_model(model, X, y):
        # Fit only on rows whose target is present.
        ss = (~pd.DataFrame(y).isnull().any(axis=1))
        model.fit(X[ss], y[ss]); return model

    def predict_model(model, X):
        return model.predict(X)

    X, y, groups = data
    group_list = sorted(dict.fromkeys(groups))

    # Index of the target in ALL_TARGETS selects fnae's de-standardization
    # constants (previously computed but never used, so 'age' was always scored).
    y_idx = ALL_TARGETS.index(target)

    # Bound up front so it also exists when there are no groups.
    clf_pred_names = list(clf_names[:len(clfs)])

    X_ordered = []; y_ordered = []; groups_ordered = []
    all_base_clfs = []
    base_preds = [[] for _ in clfs]
    for group in group_list:
        print("Training Fold {} of {}:".format(group, len(group_list)))
        np.random.seed(SEED)

        x_holdout = X[groups == group]
        y_holdout = y[groups == group]
        x_train = X[groups != group]
        y_train = y[groups != group]

        X_ordered.append(x_holdout)
        y_ordered.append(y_holdout)
        groups_ordered.append(groups[groups == group])

        base_clfs = [clone(clf) for clf in clfs]

        # Train on the requested `target` column (was the notebook-global y_var).
        base_clfs = Parallel(n_jobs=-1)(delayed(train_model)(model, x_train, y_train[target]) for model in base_clfs)
        all_base_clfs.append(base_clfs)

        preds = Parallel(n_jobs=-1)(delayed(predict_model)(model, x_holdout) for model in base_clfs)

        for idx in range(len(base_clfs)):
            print("{:.4f} for {}".format(
                      fnae(y_holdout[target], preds[idx], tidx = y_idx), clf_names[idx]  ) )
            base_preds[idx].append(preds[idx])

        print("\nTime Elapsed: {}\n".format(elapsed()))

    for idx in range(0, len(base_preds)):
        base_preds[idx] = np.concatenate(base_preds[idx])

    # "\B" was an invalid escape sequence; a leading newline matches the other messages.
    print("\nBase Classifier Train Time: {}\n".format(elapsed()))
    return (all_base_clfs, base_preds, clf_pred_names,
        pd.concat(X_ordered), pd.concat(y_ordered), np.concatenate(groups_ordered))

In [None]:
def Lassos():
    """Build the Lasso base models and their display names.

    One model per alpha in logspace(-6, -1, 2). When CLF_SS > 1 each model
    (the same instance) and its name are listed twice.
    """
    models, labels = [], []
    copies = 2 if CLF_SS > 1 else 1
    for alpha in np.logspace(-6, -1, 2):
        estimator = Lasso(alpha = alpha,  selection = 'random', max_iter = 500, tol = 1e-3)
        label = 'Lasso alpha={}'.format(alpha)
        models.extend([estimator] * copies)
        labels.extend([label] * copies)

    return models, labels

In [None]:
def Ridges():
    """Build the Ridge base models and their display names.

    One model per alpha in logspace(-4, 2, 2). When CLF_SS > 1 each model
    (the same instance) and its name are listed twice.
    """
    models, labels = [], []
    copies = 2 if CLF_SS > 1 else 1
    for alpha in np.logspace(-4, 2, 2):
        estimator = Ridge(alpha = alpha, max_iter = 500, tol = 1e-3)
        label = 'Ridge alpha={}'.format(alpha)
        models.extend([estimator] * copies)
        labels.extend([label] * copies)

    return models, labels

In [None]:
def SVRs():
    """Build SVR base models for every (C, epsilon) pair and their display names.

    C takes the values of logspace(-1.5, 0.5, 2) (outer loop), epsilon the
    values 0.01 and 0.1 (inner loop).
    """
    c_values = np.logspace(-1.5, 0.5, 2)
    eps_values = [0.01, 0.1]
    grid = [(c, e) for c in c_values for e in eps_values]

    clfs = [SVR(C = c, epsilon = e, cache_size=1000, max_iter = 500, tol = 1e-3)
            for c, e in grid]
    clf_names = ['SVR C={}, epsilon={}'.format(c,e) for c, e in grid]

    return clfs, clf_names

In [None]:
def ENets():
    """Build ElasticNet base models and their display names.

    For every alpha in logspace(-6, -1, 41) and l1_ratio 0.98, first all
    models with normalize=False ('Enet ...'), then all with normalize=True
    ('Enet-n ...').
    """
    # NOTE(review): the `normalize` argument is rejected by recent scikit-learn
    # releases - confirm the installed version before using this set.
    alphas, l1_ratios = np.logspace(-6, -1, 41), [0.98]
    variants = (
        (False, 'Enet alpha={}, l1_ratio={}'),
        (True, 'Enet-n alpha={}, l1_ratio={}'),
    )

    clfs = []; clf_names = []
    for use_normalize, name_template in variants:
        for a in alphas:
            for l in l1_ratios:
                clfs.append(ElasticNet(alpha = a, l1_ratio = l,
                             normalize = use_normalize, selection = 'random',
                             max_iter = 5000, tol = 1e-5))
                clf_names.append(name_template.format(a,l))

    return clfs, clf_names

In [None]:
def getBaseClfs(y_var, model_sets=None):
    """Flatten several (models, names) pairs and keep every CLF_SS-th entry.

    y_var must be an entry of ALL_TARGETS (a ValueError is raised otherwise).
    model_sets is an iterable of pairs whose first item is a list of
    estimators and whose second item is the matching list of names.
    """
    # Only the lookup's side effect matters: it rejects unknown targets.
    ALL_TARGETS.index(y_var)

    flat_clfs = [clf for model_set in model_sets for clf in model_set[0]]
    flat_names = [name for model_set in model_sets for name in model_set[1]]

    return flat_clfs[::CLF_SS], flat_names[::CLF_SS]


In [None]:
# Names of all target columns, in column order.
ALL_TARGETS = y.columns.to_list()  
# Resolve TARGETS into a list of target names:
#   list of indices    -> those targets
#   non-negative int   -> that single target
#   anything else      -> every target
if isinstance(TARGETS, list):
    targets = [ALL_TARGETS[i] for i in TARGETS]
elif TARGETS is not None and TARGETS >= 0:
    targets = ALL_TARGETS[TARGETS: TARGETS + 1]
else:
    targets = ALL_TARGETS
# print(targets)

### Train Models

In [None]:
# Accumulators for the unfitted base models and the fitted out-of-fold results;
# `scalers` is initialised here but not filled in this cell.
all_raw_base_clfs = []; all_base_clfs = []; scalers = []
# Only the first selected target is trained in this cell.
y_var = targets[0]

print('---Training Models for {}---\n'.format(y_var))

# train base classifiers
model_sets=[Ridges(), Lassos()]
raw_base_clfs, base_clf_names = getBaseClfs(y_var, model_sets=model_sets)
all_raw_base_clfs.append((raw_base_clfs, base_clf_names))

# Out-of-fold training: Xe / ye / ge are X / y / groups re-ordered fold by fold.
base_clfs, base_clf_preds, base_clf_names, Xe, ye, ge = \
                trainBaseClfs(raw_base_clfs, base_clf_names, 
                              data = (X, y, groups), 
                              target=y_var, )
# Append one column per base model (named after the model) holding its hold-out predictions.
Xe = pd.concat( (Xe, pd.DataFrame( dict(zip(base_clf_names, base_clf_preds)), index=Xe.index) ),
                 axis = 'columns')

all_base_clfs.append((base_clfs, base_clf_preds, base_clf_names, Xe, ye, ge ))

# Meta model

In [None]:
def metaFilter(X):
    """Return only the columns of X whose label is not a column of X_te.

    Applied to the frames built in this notebook, this leaves the columns that
    were appended after the raw features.
    """
    raw_feature_labels = X_te.columns
    kept = [label for label in X.columns if label not in raw_feature_labels]
    return X[kept]

In [None]:
def runBag(n = 3, model_types = None, data = None, tidx = 0, **kwargs):
    """Train one tuned meta-model per hold-out group and return the fitted models.

    Parameters
    ----------
    n : unused; kept for call compatibility.
    model_types : sequence of callables ``(x_train, y_train, groups_train,
        **kwargs)`` returning a fitted estimator; entry ``i`` is used for the
        ``i``-th group in sorted order, so it needs at least one entry per group.
    data : tuple ``(X, y, groups)`` where ``X`` is a sequence of feature
        matrices, one per group position, and rows are split by comparing
        ``groups`` with each distinct group label.
    tidx : target index used by fnae when printing the hold-out score.
        Defaults to 0, which is what was previously always used.
    **kwargs : forwarded to every model callable.

    Returns
    -------
    list of fitted estimators, one per group in sorted group order.
    """
    start_time = datetime.datetime.now()

    X, y, groups = data
    group_list = sorted(dict.fromkeys(groups))

    clfs = []
    for midx, group in enumerate(group_list):
        gc.collect()
        in_holdout = groups == group
        in_train = groups != group

        x_holdout = X[midx][in_holdout]
        y_holdout = y[in_holdout]
        x_train = X[midx][in_train]
        y_train = y[in_train]
        groups_train = groups[in_train]

        clf = model_types[midx](x_train, y_train, groups_train, **kwargs)
        clfs.append(clf)

        predicted = clf.predict(x_holdout)
        print("{}: {:.4f}".format(group,
              fnae(y_holdout, predicted, tidx = tidx)  ) )

    end_time = datetime.datetime.now()
    print("\nModel Bag Time: {}\n".format(str(end_time - start_time).split('.', 2)[0] ))
    return clfs

In [None]:
def trainModel(x, y, groups, clf, params, cv = 0, n_jobs = None, 
                   verbose=0, splits=None, **kwargs):
    """Run a small randomized hyper-parameter search and return the best estimator.

    Parameters
    ----------
    x, y : training features and target passed to the search.
    groups : group labels forwarded to the search's fit call.
    clf : unfitted estimator to tune.
    params : parameter distributions / lists for RandomizedSearchCV.
    n_jobs : parallel jobs for the search; None is treated as -1.
    cv, verbose, splits, **kwargs : accepted for call compatibility, not used.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    if n_jobs is None:
        n_jobs = -1    

    # Quick-run settings. The original also assigned n_iter = 30 / n_splits = 10
    # first, but those values were immediately overwritten and never took effect.
    n_iter = 3
    n_splits = 2
        
    folds = ShuffleSplit(n_splits = n_splits, train_size = 0.75, test_size = 0.20)
    search = RandomizedSearchCV(clf, params, cv = folds, n_iter = n_iter, 
                                verbose = 1, n_jobs = n_jobs, scoring = fnae_scorer)
    
    # `groups` must be passed by keyword: current scikit-learn no longer accepts
    # it as a third positional argument of fit().
    search.fit(x, y, groups = groups)
    
    print(pd.DataFrame(search.cv_results_['mean_test_score'])); print();  
    best = search.best_estimator_;  print(best)
    print("Best Score: {}".format(np.round(search.best_score_,4)))
    
    return best

In [None]:
STOP

TODO before a full run: change the values back for the train classifiers
slightly increase SVR?
also reset n_iter and n_splits in trainModel above
raise n_iter, perhaps to 100 (or even 150), to get more reliable results?
also increase the resolution of the parameter ranges used by these trainModel calls; it is a random search anyway

also, slightly increase SVR C compared with the previous part?

also verify that all n_jobs are set to -1 (a few are set to 4)

In [None]:
# Search space handed to the randomized search for the NuSVR meta-model.
nusvr_params = dict(
    kernel = ['rbf'],
    C = [1],
    gamma = ['scale'],
    nu = [0.5],
)

def trainNuSVR(x, y, groups, cv = 0, n_jobs = -1, **kwargs):
    """Tune a NuSVR over `nusvr_params` via trainModel and return its result.

    x, y, groups, cv, n_jobs and any extra keyword arguments are forwarded
    unchanged to trainModel.
    """
    estimator = NuSVR(cache_size=100, tol = 1e-3)
    return trainModel(x, y, groups, estimator, nusvr_params, cv, n_jobs, **kwargs)

In [None]:
# Search space handed to the randomized search for the ElasticNet meta-model.
enet_params = { 'alpha': np.logspace(-8, 1, 2),
                'l1_ratio': [ 0.5]}

def trainENet(x, y, groups, cv = 0, n_jobs = -1, **kwargs):
    """Tune an ElasticNet over `enet_params` via trainModel and return its result.

    x, y, groups, cv, n_jobs and any extra keyword arguments are forwarded
    unchanged to trainModel.
    """
    # `normalize` is intentionally not passed: False was the default, and the
    # argument was removed in scikit-learn 1.2, where passing it raises TypeError.
    clf = ElasticNet(selection = 'random', max_iter = 1000, tol = 1e-3)
    return trainModel(x, y, groups, clf, enet_params, cv, n_jobs, **kwargs)

### Run metaclassifiers

In [None]:
# ElasticNet meta-models on standardized base-model predictions.
# Note: list multiplication repeats ONE StandardScaler instance five times, so
# all five entries (and scalers_stand below) refer to the same fitted object.
meta_scalers = [StandardScaler()] * 5
meta_models = [trainENet] * 5

# Fit the scaler(s) on the base-model prediction columns only (see metaFilter).
scalers_stand = [s.fit(metaFilter(Xe)) for s in meta_scalers]
# One transformed feature matrix per fold position; runBag trains one meta-model per fold.
all_clfs_stand = [runBag(data = ([s.transform(metaFilter(Xe)) for s in scalers_stand], ye[y_var], ge), model_types = meta_models)]

In [None]:
# NuSVR meta-models on the base-model predictions passed through FunctionTransformer
# (constructed with no function argument).
# Note: list multiplication repeats ONE transformer instance five times.
meta_scalers = [FunctionTransformer()] * 5
meta_models = [trainNuSVR] * 5

scalers_func = [s.fit(metaFilter(Xe)) for s in meta_scalers]
# Fix: transform the training features with scalers_func (was scalers_stand).
# The submission cell feeds these models through scalers_func at prediction time,
# so they must be trained on features produced by the same transformers.
all_clfs_func = [runBag(data = ([s.transform(metaFilter(Xe)) for s in scalers_func], ye[y_var], ge), model_types = meta_models)]

### Build Submission

In [None]:
def revert_transform(y_true, y_pred, tidx=0):
    """Map standardized predictions back to the output scale of target `tidx`.

    Truths and predictions are de-standardized with the hard-coded mean/scale,
    every prediction is replaced by the closest value occurring in the
    de-standardized truths, and for tidx > 0 the result is raised to the power
    1/1.5. Returns the predictions with singleton dimensions squeezed out.
    """
    # NOTE(review): snapping to observed values happens for every tidx here,
    # whereas fnae only snaps for tidx == 0 - confirm this is intended.
    target_means = np.array([ 50.00929913, 374.75058741, 462.62996118, 332.46733185, 381.32490681])
    target_scales = np.array([ 13.54897461, 108.25005321, 128.03349126, 112.01700719, 124.92278531])
    offset = target_means[tidx]
    factor = target_scales[tidx]

    actual = y_true * factor + offset
    estimate = y_pred * factor + offset

    candidates = np.unique(actual)
    for pos, value in enumerate(estimate):
        nearest = np.argmin(np.abs(value - candidates))
        estimate[pos] = candidates[nearest]

    if tidx > 0:
        exponent = 1./1.5
        actual = np.power(actual, exponent)
        estimate = np.power(estimate, exponent)
    return np.squeeze(estimate)

In [None]:
def predictAll(X_test, all_base_clfs, all_clfs, all_scalers):
    """Predict every selected target for X_test with the two-level stack.

    Parameters
    ----------
    X_test : DataFrame of test features.
    all_base_clfs : per target, a tuple whose item 0 is the list of fitted base
        models per fold and whose item 2 is the list of base-model names.
    all_clfs : per target, the list of fitted meta-models.
    all_scalers : list of fitted transformers, parallel to the meta-models.

    Returns
    -------
    (all_preds, Xf)
        all_preds - DataFrame with one column per target holding the mean of
        the meta-model predictions; Xf - the augmented feature frame of the
        last processed target (None when there are no targets).
    """
    start_time = datetime.datetime.now()

    def elapsed():
        return str(datetime.datetime.now() - start_time).split('.', 2)[0]

    def predict_model(model, X):
        return model.predict(X)

    all_preds = pd.DataFrame(columns = targets, index=X_test.index)
    Xf = None  # stays None only when `targets` is empty
    for tidx, y_var in enumerate(targets): # loop over targets
        print(y_var)
        Xi = X_test.copy()
        base_clfs = all_base_clfs[tidx][0]

        preds = []
        for g_idx, g_clfs in enumerate(base_clfs): # loop over groups
            print(g_idx)
            preds.append(Parallel(n_jobs=-1)(delayed(predict_model)(model, Xi) for model in g_clfs))
        # "\B" was an invalid escape sequence; a leading newline matches the other messages.
        print("\nBase Classifier Prediction Time: {}\n".format(elapsed()))

        # Average each base model's test prediction over the folds.
        c_preds = []; sub_preds = np.zeros((len(preds), len(Xi)))
        for c_idx in range(0, len(preds[0])):
            if len(preds[0][c_idx].shape) > 1:
                # Multi-output model: average every output column separately.
                for t_idx in range(0, preds[0][c_idx].shape[1]):
                    for g_idx, this_pred_group in enumerate(preds):
                        sub_preds[g_idx, :] = this_pred_group[c_idx][:, t_idx]
                    c_preds.append(np.mean( sub_preds, axis = 0))
            else:
                for g_idx, this_pred_group in enumerate(preds):
                    sub_preds[g_idx, :] = this_pred_group[c_idx]
                c_preds.append(np.mean( sub_preds, axis = 0))

        Xf = pd.concat( (Xi, pd.DataFrame( dict(zip(all_base_clfs[tidx][2], c_preds)), index=Xi.index) ),
                     axis = 'columns')
        print("\nTime Elapsed: {}\n".format(elapsed()))

        print('\nrunning stacker')
        mmodels = all_clfs[tidx]
        # Use however many meta-models were trained instead of a hard-coded 5.
        pred = Parallel(n_jobs=-1)(delayed(predict_model)(mmodels[aidx], all_scalers[aidx].transform(metaFilter(Xf)))
                                  for aidx in range(len(mmodels)))
        sub_preds = np.zeros((len(mmodels), len(Xi)))
        for g_idx, fold_pred in enumerate(pred):
            sub_preds[g_idx, :] = fold_pred
        all_preds[y_var] = np.mean(sub_preds, axis = 0)

    # "\P" was an invalid escape sequence as well.
    print("\nPrediction Time: {}\n".format(elapsed()))
    return all_preds, Xf

In [None]:
# Test-set predictions from the ElasticNet meta-models with their standard scalers.
y_oos_stand, Xf = predictAll(X_te, all_base_clfs, all_clfs_stand, scalers_stand)
# Map the standardized predictions back to the target's output scale.
s_pred_stand = revert_transform(y.iloc[:, TARGETS].values, y_oos_stand.values, tidx=TARGETS)
# Write the predictions as a single-column CSV without the index.
pd.DataFrame(s_pred_stand).to_csv('submission_%02d_stand.csv' % TARGETS, index=False)

In [None]:
# Test-set predictions from the NuSVR meta-models with their FunctionTransformer "scalers".
y_oos_func, Xf = predictAll(X_te, all_base_clfs, all_clfs_func, scalers_func)
# Map the standardized predictions back to the target's output scale.
s_pred_func = revert_transform(y.iloc[:, TARGETS].values, y_oos_func.values, tidx=TARGETS)
# Write the predictions as a single-column CSV without the index.
pd.DataFrame(s_pred_func).to_csv('submission_%02d_func.csv' % TARGETS, index=False)

In [None]:
# Which priorities should be chosen?
meta_selecter = [0, 1, 1, 0, 0]
break

In [None]:
# Build a mixed set of scalers / meta-models: entry i comes from the "stand" row
# when meta_selecter[i] == 0 and from the "func" row when it is 1.
# NOTE(review): the column index is fixed at 0, so every entry is the FIRST
# scaler / meta-model of the chosen row rather than the i-th one - confirm intended.
scalers_comb = list(np.array([scalers_stand, scalers_func])[meta_selecter, 0])
all_clfs_comb = [list(np.array([all_clfs_stand[0], all_clfs_func[0]])[meta_selecter, 0])]

In [None]:
# Test-set predictions from the combined selection of meta-models and scalers.
y_oos_comb, Xf = predictAll(X_te, all_base_clfs, all_clfs_comb, scalers_comb)
# Map the standardized predictions back to the target's output scale.
s_pred_comb = revert_transform(y.iloc[:, TARGETS].values, y_oos_comb.values, tidx=TARGETS)
# Write the predictions as a single-column CSV without the index.
pd.DataFrame(s_pred_comb).to_csv('submission_%02d_comb.csv' % TARGETS, index=False)