In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from yellowbrick.regressor import AlphaSelection
from xgboost import XGBRegressor

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, WhiteKernel, RBF, RationalQuadratic, Matern, ExpSineSquared
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import f_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers
from keras import initializers

Using TensorFlow backend.


# Tool functions

## Models

In [2]:
def fit_neural_network(dropout, X_train, y_train, n_hidden_layers=8, n_units=30):
    """Build, compile and fit a fully connected regression network.

    The network is a stack of identical hidden blocks
    (Dense -> LeakyReLU -> Dropout) followed by a single linear output unit.

    Parameters
    ----------
    dropout : float
        Dropout rate applied after every hidden block.
    X_train : array-like with a 2-D ``shape``
        Training features; the input layer is sized from ``X_train.shape[1]``.
    y_train : mapping / DataFrame
        Training targets; only ``y_train['y']`` is used.
    n_hidden_layers : int, default 8
        Number of hidden blocks (must be >= 1).
    n_units : int, default 30
        Width of every hidden Dense layer.

    Returns
    -------
    The fitted Keras ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``n_hidden_layers`` is smaller than 1.

    Notes
    -----
    The metric ``coeff_determination`` is not defined in this cell; it is
    expected to exist at notebook level before this function is called.
    """
    if n_hidden_layers < 1:
        raise ValueError("n_hidden_layers must be at least 1")

    # Size the input layer from the data itself instead of relying on a
    # notebook-level `indices` variable that may not exist on a fresh kernel.
    n_features = X_train.shape[1]

    # Create model
    model = Sequential()
    for layer_position in range(n_hidden_layers):
        # Only the first layer needs to be told the input size; later layers
        # infer it from the previous layer's output.
        extra_kwargs = {'input_dim': n_features} if layer_position == 0 else {}
        model.add(Dense(n_units,
                        kernel_regularizer=regularizers.l2(1),
                        kernel_initializer='RandomUniform',
                        **extra_kwargs))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dropout(rate=dropout))

    # Single linear output unit for regression
    model.add(Dense(1, kernel_initializer='RandomUniform'))

    # Compile model
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[coeff_determination])

    # Fit the model
    # NOTE(review): steps_per_epoch / validation_steps are combined with
    # in-memory inputs and validation_split; how Keras interprets that
    # depends on the installed version -- confirm against the pinned release.
    print("Start fitting ...")
    model.fit(x=X_train, y=y_train['y'], epochs=80, verbose=0, validation_split=0.1, shuffle=True,
              steps_per_epoch=50, initial_epoch=0, validation_steps=5)

    return model

In [3]:
def fit_extra_trees_regressor_with_grid_search(X_train, y_train):
    """Grid-search an ExtraTreesRegressor and return the fitted search object.

    The search is fitted on ``X_train`` against ``y_train['y']`` with
    5-fold cross-validation, R^2 scoring and 4 parallel jobs. The best
    parameter combination is printed.
    """
    # Only the tree depth actually varies; the other entries are fixed values
    search_space = {
        'max_depth': [10, 15, 20, 25],
        'n_estimators': [1000],
        'min_samples_split': [2],
    }

    # The estimator itself stays single-threaded; parallelism is left to the search
    # NOTE(review): `iid` no longer exists in recent scikit-learn releases -- confirm the pinned version
    search = GridSearchCV(ExtraTreesRegressor(n_jobs=1), search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for Extra Trees Regressor: " + str(search.best_params_))

    return search

In [4]:
def fit_gradient_boosting_with_grid_search(X_train, y_train):
    """Grid-search a GradientBoostingRegressor and return the fitted search object.

    The search is fitted on ``X_train`` against ``y_train['y']`` with
    5-fold cross-validation, R^2 scoring and 4 parallel jobs. The best
    parameter combination is printed.
    """
    # Learning rate and tree depth are explored; the rest is held fixed
    search_space = {
        'learning_rate': [0.06, 0.05, 0.04],
        'n_estimators': [1000],
        'min_samples_split': [2],
        'max_depth': [2, 4, 6],
    }

    search = GridSearchCV(GradientBoostingRegressor(), search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for Gradient Boosting: " + str(search.best_params_))

    return search

In [5]:
# ConstantKernel, DotProduct, WhiteKernel, RBF, RationalQuadratic, Matern, ExpSineSquared
def fit_gaussian_process_with_grid_search(X_train, y_train):
    """Cross-validate a GaussianProcessRegressor and return the fitted search object.

    The grid currently contains a single composite kernel, so the search
    amounts to a 5-fold, R^2-scored cross-validation of that kernel on
    ``X_train`` / ``y_train['y']``. The chosen parameters are printed.
    """
    # Composite kernel, assembled term by term:
    #   scaled RationalQuadratic * DotProduct  +  scaled constant  +  white noise
    interaction_term = 10 * RationalQuadratic() * DotProduct(sigma_0=1)
    constant_term = 2 * ConstantKernel()
    noise_term = WhiteKernel(noise_level=0.5)

    search_space = {'kernel': [interaction_term + constant_term + noise_term]}

    search = GridSearchCV(GaussianProcessRegressor(), search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for Gaussian Process: " + str(search.best_params_))

    return search

In [6]:
def fit_adaboost_with_grid_search(X_train, y_train):
    """Grid-search an AdaBoostRegressor and return the fitted search object.

    The candidates differ only in the depth of the decision tree used as
    base estimator. The search is fitted on ``X_train`` against
    ``y_train['y']`` with 5-fold cross-validation, R^2 scoring and
    4 parallel jobs. The best parameter combination is printed.
    """
    # One candidate base tree per depth to try
    candidate_trees = [DecisionTreeRegressor(max_depth=depth) for depth in (35, 40, 45)]

    search_space = {
        'base_estimator': candidate_trees,
        'n_estimators': [1000],
        'loss': ['square'],
    }

    search = GridSearchCV(AdaBoostRegressor(), search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for AdaBoost: " + str(search.best_params_))

    return search

In [7]:
def fit_random_forest_with_grid_search(X_train, y_train):
    """Grid-search a RandomForestRegressor and return the fitted search object.

    The search is fitted on ``X_train`` against ``y_train['y']`` with
    5-fold cross-validation, R^2 scoring and 4 parallel jobs. The best
    parameter combination is printed.
    """
    # Depth, split threshold and leaf budget are explored; forest size is fixed
    search_space = {
        'max_depth': [10, 15, 20],
        'n_estimators': [1000],
        'min_samples_split': [2, 3, 5],
        'max_leaf_nodes': [100, 150],
    }

    search = GridSearchCV(RandomForestRegressor(), search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for Random Forest: " + str(search.best_params_))

    return search

In [8]:
def fit_simple_linear_regression(X_train, y_train):
    """Fit a LinearRegression on ``X_train`` against ``y_train['y']`` and return it."""
    target = y_train['y']

    # fit() hands back the estimator itself, so it can be returned directly
    return linear_model.LinearRegression().fit(X_train, target)

In [9]:
def fit_xgboost_with_grid_search(X_train, y_train):
    """Grid-search an XGBRegressor and return the fitted search object.

    The search is fitted on ``X_train`` against ``y_train['y']`` with
    5-fold cross-validation, R^2 scoring and 4 parallel jobs. The best
    parameter combination is printed.
    """
    # max_depth and reg_lambda given here are only starting values:
    # both also appear in the grid below and are overridden per candidate.
    base_model = XGBRegressor(verbosity=1, max_depth=10, reg_lambda=1)

    search_space = {
        'max_depth': [1, 2, 4],
        'reg_lambda': [8, 10, 12],
        'min_child_weight': [6, 8, 12, 16],
    }

    search = GridSearchCV(base_model, search_space,
                          scoring='r2', n_jobs=4, iid=False, cv=5)
    search.fit(X_train, y_train['y'])

    print("Best parameters for XGBoost: " + str(search.best_params_))

    return search

In [10]:
def fit_svr_with_grid_search(X_train, y_train):
    """Grid-search an RBF-kernel SVR and return the fitted search object.

    The search is fitted on ``X_train`` against ``y_train['y']`` with
    5-fold cross-validation, R^2 scoring and 4 parallel jobs. The best
    parameter combination is printed.
    """
    # Create svr object
    svr = SVR()

    # Grid search
    # The values are the same as before, written in plain form
    # (10e5 == 1e6, 10e-2 == 0.1, 10e-1 == 1.0). The gamma list used to
    # contain both 10e-1 and 1, i.e. the value 1.0 twice, so every C value
    # was cross-validated twice for it; the duplicate is removed.
    parameters = {
        'kernel': ['rbf'],
        'C': [1e6, 1e7, 1e8, 1e9],
        'gamma': [0.1, 1.0, 2, 4, 8],
    }
    clf = GridSearchCV(svr, parameters, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train['y'])

    print("Best parameters for SVR: " + str(clf.best_params_))

    return clf

## Data preparation

In [11]:
def feature_extraction_lasso(X_train , X_val, y_train):
    """Keep only the columns that receive a non-zero Lasso coefficient.

    A LassoCV (5-fold, 100 log-spaced alphas) wrapped in a yellowbrick
    AlphaSelection is fitted on ``X_train`` / ``y_train['y']``. The chosen
    alpha and the coefficient counts are printed, and the same column
    mask is applied to both ``X_train`` and ``X_val``.

    Returns
    -------
    tuple
        ``(X_train_extracted, X_val_extracted)``.
    """
    candidate_alphas = np.logspace(-1, 10, 100)
    selector = AlphaSelection(LassoCV(cv=5, alphas=candidate_alphas))
    selector.fit(X_train, y_train['y'])

    weights = selector.coef_
    print("Best Alpha = " , selector.alpha_)
    print('# of coef before = ' , len(weights))
    print("# of coef after = " , np.sum(weights!=0))

    # Boolean column mask shared by the training and validation frames
    keep_column = weights != 0
    return (X_train.loc[:, keep_column], X_val.loc[:, keep_column])

In [12]:
def feature_select_by_correlation(X_train, y_train, X_val, nb_features):
    """Keep the ``nb_features`` columns most correlated with the target.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; correlations are computed on this frame only.
    y_train : pandas.DataFrame
        Training targets; only the ``'y'`` column is used.
    X_val : pandas.DataFrame
        Validation features; receives the same column selection.
    nb_features : int
        Number of columns to keep.

    Returns
    -------
    tuple
        ``(X_train_selected, X_val_selected)`` with columns ordered from the
        strongest to the weakest absolute Pearson correlation.
    """
    corr = X_train.corrwith(y_train['y'], axis=0, method='pearson')

    # Rank by label with sort_values instead of Series.argsort + iloc:
    # argsort does not return usable positions when a correlation is NaN
    # (e.g. for a constant column), which made iloc pick the wrong features.
    # Sorting the negated magnitudes ascending puts the strongest first and
    # NaN last; mergesort is stable, so ties keep their column order.
    ranked = (-corr.abs()).sort_values(kind='mergesort')
    best_feature_names = list(ranked.index[:nb_features])

    # Feature selection of the training set
    X_train_feature_extracted = X_train[best_feature_names]

    # Feature selection of the validation set
    X_val_feature_extracted = X_val[best_feature_names]

    return X_train_feature_extracted, X_val_feature_extracted

In [13]:
def drop_outliers_samples_gaussian_selection(X_train, y_train, random_state=None):
    """Remove training rows that an EllipticEnvelope flags as outliers.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the detector is fitted and evaluated on them.
    y_train : pandas.DataFrame
        Training targets, row-aligned with ``X_train`` (same length and order).
    random_state : int or None, default None
        Forwarded to ``EllipticEnvelope`` so runs can be made reproducible.

    Returns
    -------
    tuple
        ``(X_train_without_outliers, y_train_without_outliers)``.
    """
    # Use Elliptic Envelope for outlier detection, computation heavy
    elenv = EllipticEnvelope(support_fraction=0.95, random_state=random_state)
    elenv.fit(X_train)

    # Rows predicted as -1 are the outliers; everything else is kept
    is_inlier = np.asarray(elenv.predict(X_train)) != -1

    # Filter with a positional boolean mask. Passing row positions to
    # drop(index=...) treated them as index labels, which removes the wrong
    # rows (or raises KeyError) whenever the frames do not carry a default
    # 0..n-1 index, e.g. after a shuffled split or a previous filtering step.
    X_train_without_outliers = X_train.loc[is_inlier]
    y_train_without_outliers = y_train.loc[is_inlier]

    return X_train_without_outliers, y_train_without_outliers

In [14]:
def drop_outliers_samples_isolation_forest(X_train, y_train, random_state=None):
    """Remove training rows that an IsolationForest flags as outliers.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the detector is fitted and evaluated on them.
    y_train : pandas.DataFrame
        Training targets, row-aligned with ``X_train`` (same length and order).
    random_state : int or None, default None
        Forwarded to ``IsolationForest`` so runs can be made reproducible.

    Returns
    -------
    tuple
        ``(X_train_without_outliers, y_train_without_outliers)``.
    """
    # Use isolation forest for outlier detection, computation heavy
    forest = IsolationForest(random_state=random_state)
    forest.fit(X_train)

    # Rows predicted as -1 are the outliers; everything else is kept
    is_inlier = np.asarray(forest.predict(X_train)) != -1

    # Filter with a positional boolean mask. Passing row positions to
    # drop(index=...) treated them as index labels, which removes the wrong
    # rows (or raises KeyError) whenever the frames do not carry a default
    # 0..n-1 index, e.g. after a shuffled split or a previous filtering step.
    X_train_without_outliers = X_train.loc[is_inlier]
    y_train_without_outliers = y_train.loc[is_inlier]

    return X_train_without_outliers, y_train_without_outliers

In [15]:
def fill_nan_with_median(dataframe_with_nan, feature_medians_training):
    """Replace the NaNs of every column with that column's training median.

    Parameters
    ----------
    dataframe_with_nan : pandas.DataFrame
        Frame to fill. It is modified in place and also returned.
    feature_medians_training : sequence
        One fill value per column, in the same order as the frame's columns.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe_with_nan`` object, with NaNs filled.
    """
    for column_index, column in enumerate(dataframe_with_nan.columns):
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on `frame[column]` acts on a temporary object
        # and leaves the frame untouched under pandas Copy-on-Write.
        dataframe_with_nan[column] = dataframe_with_nan[column].fillna(
            feature_medians_training[column_index]
        )

    return dataframe_with_nan

In [16]:
def nan_2_median(X_train, X_val):
    """Fill NaNs in both frames with the per-feature medians of the training set.

    The medians are computed once, on ``X_train`` only (ignoring NaNs), and
    then handed to ``fill_nan_with_median`` for the training frame first and
    the validation frame second.

    Returns
    -------
    tuple
        ``(X_train_with_median, X_val_with_median)`` as returned by
        ``fill_nan_with_median``.
    """
    # Column-wise medians of the training data, NaNs excluded
    training_medians = np.nanmedian(X_train, axis=0)

    # Validation data is filled with training statistics, never its own
    filled_train = fill_nan_with_median(X_train, training_medians)
    filled_val = fill_nan_with_median(X_val, training_medians)

    return filled_train, filled_val

In [17]:
def _top_scoring_positions(scores, n_top):
    """Return the positions of the ``n_top`` highest scores, best first."""
    # A slice of [-0:] would select everything, so handle "none" explicitly
    if n_top <= 0:
        return np.array([], dtype=int)

    ranked_scores = np.asarray(scores, dtype=float)
    # argsort places NaN last, i.e. where the best scores are read from;
    # push NaN scores to the bottom of the ranking instead.
    ranked_scores = np.where(np.isnan(ranked_scores), -np.inf, ranked_scores)

    return ranked_scores.argsort()[-n_top:][::-1]


def feature_selection_with_f_regressor_and_random_forest(n_features_fr, n_features_rf, X_train, X_val, y_train,
                                                         random_state=None):
    """Select the union of the best features according to two rankings.

    Features are ranked once by their univariate ``f_regression`` score and
    once by the feature importances of a 50-tree RandomForestRegressor, both
    computed on ``X_train`` / ``y_train['y']``.

    Parameters
    ----------
    n_features_fr : int
        Number of top features to take from the f_regression ranking.
    n_features_rf : int
        Number of top features to take from the random-forest ranking.
    X_train, X_val : pandas.DataFrame
        Training and validation features; both get the same column selection.
    y_train : pandas.DataFrame
        Training targets; only the ``'y'`` column is used.
    random_state : int or None, default None
        Forwarded to the random forest so the ranking can be made reproducible.

    Returns
    -------
    tuple
        ``(X_train_selected, X_val_selected)`` with the selected columns in
        ascending column-position order.
    """
    # Using f_regression (first element of the returned pair holds the scores)
    f_scores = f_regression(X_train, y_train['y'])[0]
    indices_fr = _top_scoring_positions(f_scores, n_features_fr)

    # Using Random Forest Regressor
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=random_state)
    rf.fit(X_train, y_train['y'])
    indices_rf = _top_scoring_positions(rf.feature_importances_, n_features_rf)

    # Make the union of the two (sorted, without duplicates)
    indices = list(np.union1d(indices_rf, indices_fr).astype(int))

    return X_train.iloc[:, indices], X_val.iloc[:, indices]