In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#BOSTON HOUSING DATA
###################
# Read the CSV and discard the 'Unnamed: 0' column in one expression
# (presumably a row index written out by the tool that exported the file).
data = pd.read_csv('Boston.csv').drop(columns='Unnamed: 0')

In [None]:
def setup(data, 
          target, 
          split=0.7):
    
  from sklearn.model_selection import train_test_split
  global X_train, X_test, y_train, y_test, X, y, seed
  X = data.drop(target,axis=1)
  y = data[target]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split)
  import random
  seed = random.randint(150,900)

In [None]:
# Build the global train/test split (default 70/30) with 'TSX' as the target column.
# NOTE(review): 'TSX' must be a column of `data`; the commonly distributed Boston
# housing CSV names its target 'medv' -- confirm the column name.
setup(data, 'TSX')

In [None]:
def create_model(estimator = None, 
                 fold = 10, 
                 round = 4,  
                 verbose = True):
  """Cross-validate a regressor on the global training split and return it.

  Parameters
  ----------
  estimator : str or estimator object
      Either one of the ids in ``library`` below (e.g. 'lr', 'rf') or any
      object exposing ``fit`` and ``predict``.
  fold : int, default 10
      Number of K-fold splits.
  round : int, default 4
      Decimal places used in the score table.
  verbose : bool, default True
      When True, the per-fold table of MAE / MSE / RMSE / R2 / ME (plus
      'Mean' and 'SD' rows) is shown with ``display``.

  Returns
  -------
  The estimator object. It is refitted in every fold, so on return it holds
  the fit from the last fold's training rows, not from all of ``X_train``.

  Raises
  ------
  ValueError
      If ``estimator`` is None or a string that is not a known id.

  Notes
  -----
  Reads the globals ``X_train``, ``y_train`` and ``seed`` created by
  ``setup``. Warnings are silenced process-wide, as before.
  """
  import importlib
  import warnings

  # id -> (module, class name, takes random_state)
  library = {
    'lr':     ('sklearn.linear_model', 'LinearRegression', False),
    'lasso':  ('sklearn.linear_model', 'Lasso', True),
    'ridge':  ('sklearn.linear_model', 'Ridge', True),
    'en':     ('sklearn.linear_model', 'ElasticNet', True),
    'lars':   ('sklearn.linear_model', 'Lars', False),
    'llars':  ('sklearn.linear_model', 'LassoLars', False),
    'omp':    ('sklearn.linear_model', 'OrthogonalMatchingPursuit', False),
    'br':     ('sklearn.linear_model', 'BayesianRidge', False),
    'ard':    ('sklearn.linear_model', 'ARDRegression', False),
    'par':    ('sklearn.linear_model', 'PassiveAggressiveRegressor', True),
    'ransac': ('sklearn.linear_model', 'RANSACRegressor', True),
    'tr':     ('sklearn.linear_model', 'TheilSenRegressor', True),
    'huber':  ('sklearn.linear_model', 'HuberRegressor', False),
    'kr':     ('sklearn.kernel_ridge', 'KernelRidge', False),
    'svm':    ('sklearn.svm', 'SVR', False),
    'svr':    ('sklearn.svm', 'SVR', False),  # alias: compare_models calls it 'svr'
    'knn':    ('sklearn.neighbors', 'KNeighborsRegressor', False),
    'dt':     ('sklearn.tree', 'DecisionTreeRegressor', True),
    'rf':     ('sklearn.ensemble', 'RandomForestRegressor', True),
    'et':     ('sklearn.ensemble', 'ExtraTreesRegressor', True),
    'ada':    ('sklearn.ensemble', 'AdaBoostRegressor', True),
    'gbr':    ('sklearn.ensemble', 'GradientBoostingRegressor', True),
    'mlp':    ('sklearn.neural_network', 'MLPRegressor', True),
  }

  # Validate before doing any work: the old code printed a hint for None and
  # then crashed on an unbound `model`; unknown ids crashed later at `.fit`.
  if estimator is None:
    raise ValueError("Please pass a custom model object or one of the library ids: "
                     + ", ".join(sorted(library)))
  if isinstance(estimator, str) and estimator not in library:
    raise ValueError("Unknown estimator id '{}'. Choose one of: {}".format(
                     estimator, ", ".join(sorted(library))))

  import numpy as np
  import pandas as pd
  from sklearn.model_selection import KFold
  from sklearn.metrics import mean_absolute_error
  from sklearn.metrics import mean_squared_error
  from sklearn.metrics import max_error
  from sklearn.metrics import r2_score

  #ignore co-linearity and convergence warnings raised while fitting
  warnings.filterwarnings('ignore')

  if isinstance(estimator, str):
    module_name, class_name, seeded = library[estimator]
    regressor_cls = getattr(importlib.import_module(module_name), class_name)
    model = regressor_cls(random_state=seed) if seeded else regressor_cls()
  else:
    model = estimator

  #defining X_train and y_train
  data_X = X_train
  data_y = y_train

  # No random_state here: without shuffle=True it never had any effect, and
  # recent scikit-learn releases reject the combination outright.
  kf = KFold(n_splits=fold)

  metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'ME']
  fold_scores = []
  for train_i, test_i in kf.split(data_X, data_y):
    ytest = data_y.iloc[test_i]
    model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
    pred_ = model.predict(data_X.iloc[test_i])
    mse = mean_squared_error(ytest, pred_)
    fold_scores.append({'MAE': mean_absolute_error(ytest, pred_),
                        'MSE': mse,
                        'RMSE': np.sqrt(mse),
                        'R2': r2_score(ytest, pred_),
                        'ME': max_error(ytest, pred_)})

  model_results = pd.DataFrame(fold_scores, columns=metric_columns)

  # ddof=0 matches np.std, which the summary row has always used.
  model_avgs = pd.DataFrame({'Mean': model_results.mean(),
                             'SD': model_results.std(ddof=0)}).T

  # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
  model_results = pd.concat([model_results, model_avgs]).round(round)
  model_results = model_results.style.set_table_styles(
      [dict(selector='th', props=[('text-align', 'center')])])

  if verbose:
    display(model_results)
  return model

In [None]:
def compare_models(model_library = None, 
                   fold = 10, 
                   round = 4, 
                   sort = 'MAE', 
                   blacklist = None):
  """Cross-validate several regressors and return a styled comparison table.

  Parameters
  ----------
  model_library : list of estimator objects, optional
      Models to compare. When None, the built-in library of 22 regressors is
      used (minus anything in ``blacklist``).
  fold : int, default 10
      Number of K-fold splits.
  round : int, default 4
      Decimal places in the table.
  sort : str, default 'MAE'
      Column to sort by ('Model', 'MAE', 'MSE', 'RMSE', 'R2' or 'ME',
      case-insensitive). Error metrics sort ascending; 'R2' sorts descending
      so the best model is always on top.
  blacklist : list of str, optional
      Library ids to leave out. Only used when ``model_library`` is None.

  Returns
  -------
  pandas Styler with one row per model, holding the mean of each metric
  across folds. The best value in each metric column is highlighted.

  Raises
  ------
  ValueError
      If ``sort`` is not a known column or ``blacklist`` has an unknown id.

  Notes
  -----
  Reads the globals ``X_train``, ``y_train`` and ``seed`` created by
  ``setup``. Estimators passed in ``model_library`` are fitted in place.
  Warnings are silenced process-wide, as before.
  """
  import re
  import warnings

  metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'ME']
  library_ids = ['lr', 'lasso', 'ridge', 'en', 'lars', 'llars', 'omp', 'br', 'ard', 'par', 'ransac', 'tr',
                 'huber', 'kr', 'svr', 'knn', 'dt', 'rf', 'et', 'ada', 'gbr', 'mlp']

  # Cheap argument checks first, so a typo fails before any model is trained.
  sortable = {name.lower(): name for name in ['Model'] + metric_columns}
  sort_column = sortable.get(str(sort).lower())
  if sort_column is None:
    raise ValueError("sort must be one of {}, got {!r}".format(['Model'] + metric_columns, sort))

  excluded = set()
  if model_library is None and blacklist is not None:
    unknown = [item for item in blacklist if item not in library_ids]
    if unknown:
      raise ValueError("Unknown model ids in blacklist: {}".format(unknown))
    excluded = set(blacklist)

  #ignore warnings
  warnings.filterwarnings('ignore')

  import numpy as np
  import pandas as pd
  from sklearn.model_selection import KFold
  from sklearn.metrics import mean_absolute_error
  from sklearn.metrics import mean_squared_error
  from sklearn.metrics import max_error
  from sklearn.metrics import r2_score

  #defining X_train and y_train
  data_X = X_train
  data_y = y_train

  if model_library is None:
    from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, Lars,
                                      LassoLars, OrthogonalMatchingPursuit, BayesianRidge,
                                      ARDRegression, PassiveAggressiveRegressor,
                                      RANSACRegressor, TheilSenRegressor, HuberRegressor)
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.svm import SVR
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                                  AdaBoostRegressor, GradientBoostingRegressor)
    from sklearn.neural_network import MLPRegressor

    defaults = {
      'lr': LinearRegression(),
      'lasso': Lasso(random_state=seed),
      'ridge': Ridge(random_state=seed),
      'en': ElasticNet(random_state=seed),
      'lars': Lars(),
      'llars': LassoLars(),
      'omp': OrthogonalMatchingPursuit(),
      'br': BayesianRidge(),
      'ard': ARDRegression(),
      'par': PassiveAggressiveRegressor(random_state=seed),
      'ransac': RANSACRegressor(random_state=seed),
      'tr': TheilSenRegressor(random_state=seed),
      'huber': HuberRegressor(),
      'kr': KernelRidge(),
      'svr': SVR(),
      'knn': KNeighborsRegressor(),
      'dt': DecisionTreeRegressor(random_state=seed),
      'rf': RandomForestRegressor(random_state=seed),
      'et': ExtraTreesRegressor(random_state=seed),
      'ada': AdaBoostRegressor(random_state=seed),
      'gbr': GradientBoostingRegressor(random_state=seed),
      'mlp': MLPRegressor(random_state=seed),
    }
    # Filter by id. The old code deleted by position while the list was
    # shrinking, which removed the wrong models once two ids were blacklisted.
    model_library = [defaults[key] for key in library_ids if key not in excluded]
  else:
    model_library = list(model_library)

  # Display names: split CamelCase class names, then patch the acronyms that
  # the split turns into single letters. The same mapping is now used for
  # every branch (the blacklist branch used to apply classifier names).
  display_overrides = {
    'A R D Regression': 'Automatic Relevance Determination',
    'M L P Regressor': 'MLP Regressor',
    'R A N S A C Regressor': 'RANSAC Regressor',
    'S V R': 'Support Vector Regressor',
    'Lars': 'Least Angle Regression',
  }

  def readable_name(est):
    raw = type(est).__name__
    spaced = ' '.join(re.findall('[A-Z][a-z]*', raw)) or raw
    return display_overrides.get(spaced, spaced)

  model_names = [readable_name(est) for est in model_library]

  # No random_state: it has no effect without shuffle=True and recent
  # scikit-learn releases reject the combination.
  kf = KFold(n_splits=fold)

  rows = []
  for name, model in zip(model_names, model_library):
    per_fold = []
    for train_i, test_i in kf.split(data_X, data_y):
      ytest = data_y.iloc[test_i]
      model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
      pred_ = model.predict(data_X.iloc[test_i])
      mse = mean_squared_error(ytest, pred_)
      per_fold.append([mean_absolute_error(ytest, pred_), mse, np.sqrt(mse),
                       r2_score(ytest, pred_), max_error(ytest, pred_)])
    rows.append([name] + list(np.mean(per_fold, axis=0)))

  def highlight_min(s):
    is_min = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_min]

  def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

  results = pd.DataFrame(rows, columns=['Model'] + metric_columns).round(round)
  # Higher R2 is better, so it is the one column sorted descending.
  results = results.sort_values(by=[sort_column],
                                ascending=(sort_column != 'R2')).reset_index(drop=True)

  compare_models_ = results.style.apply(highlight_min, subset=['MAE', 'MSE', 'RMSE', 'ME'])
  compare_models_ = compare_models_.apply(highlight_max, subset=['R2'])
  compare_models_ = compare_models_.set_properties(**{'text-align': 'left'})
  compare_models_ = compare_models_.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

  return compare_models_

In [None]:
def ensemble_model(estimator,
                   method = 'Bagging', 
                   fold = 10,
                   n_estimators = 10,
                   round = 4,  
                   verbose = True):
    """Wrap an estimator in a bagging or boosting ensemble and cross-validate it.

    Parameters
    ----------
    estimator : estimator object
        Base regressor handed to the ensemble wrapper.
    method : str, default 'Bagging'
        'Bagging' builds a ``BaggingRegressor``; any other value builds an
        ``AdaBoostRegressor`` (unchanged from the original dispatch).
    fold : int, default 10
        Number of K-fold splits.
    n_estimators : int, default 10
        Number of base estimators. Now applied to both methods; it used to be
        silently ignored for boosting.
    round : int, default 4
        Decimal places used in the score table.
    verbose : bool, default True
        When True, the per-fold score table (plus 'Mean' and 'SD' rows) is
        shown with ``display``.

    Returns
    -------
    The ensemble object. It is refitted in every fold, so on return it holds
    the fit from the last fold's training rows.

    Notes
    -----
    Reads the globals ``X_train``, ``y_train`` and ``seed`` created by
    ``setup``. Warnings are silenced process-wide, as before.
    """
    #loading general libraries
    import warnings
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import max_error
    from sklearn.metrics import r2_score

    #defining X_train and y_train    
    data_X = X_train
    data_y = y_train

    #ignore warnings raised while fitting
    warnings.filterwarnings('ignore')

    if method == 'Bagging':
        from sklearn.ensemble import BaggingRegressor
        model = BaggingRegressor(estimator, bootstrap=True, n_estimators=n_estimators,
                                 random_state=seed)
    else:
        from sklearn.ensemble import AdaBoostRegressor
        model = AdaBoostRegressor(estimator, n_estimators=n_estimators, random_state=seed)

    # No random_state: it has no effect without shuffle=True and recent
    # scikit-learn releases reject the combination.
    kf = KFold(n_splits=fold)

    metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'ME']
    fold_scores = []
    for train_i, test_i in kf.split(data_X, data_y):
        ytest = data_y.iloc[test_i]
        model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
        pred_ = model.predict(data_X.iloc[test_i])
        mse = mean_squared_error(ytest, pred_)
        fold_scores.append({'MAE': mean_absolute_error(ytest, pred_),
                            'MSE': mse,
                            'RMSE': np.sqrt(mse),
                            'R2': r2_score(ytest, pred_),
                            'ME': max_error(ytest, pred_)})

    model_results = pd.DataFrame(fold_scores, columns=metric_columns)

    # ddof=0 matches np.std, which the summary row has always used.
    model_avgs = pd.DataFrame({'Mean': model_results.mean(),
                               'SD': model_results.std(ddof=0)}).T

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    model_results = pd.concat([model_results, model_avgs]).round(round)
    model_results = model_results.style.set_table_styles(
        [dict(selector='th', props=[('text-align', 'center')])])

    if verbose:
        display(model_results)
    return model

In [None]:
def blend_models(estimator_list = None, 
                 fold = 10, 
                 round = 4, 
                 sort = 'mae'):
  """Cross-validate a VotingRegressor built from the given estimators.

  Parameters
  ----------
  estimator_list : list of estimator objects, optional
      Regressors to blend. When None, all 22 library regressors are used.
  fold : int, default 10
      Number of K-fold splits.
  round : int, default 4
      Decimal places used in the score table.
  sort : str, default 'mae'
      Not used by this function; kept so existing calls keep working.

  Returns
  -------
  The ``VotingRegressor``. It is refitted in every fold, so on return it
  holds the fit from the last fold's training rows. The per-fold score table
  (plus 'Mean' and 'SD' rows) is always shown with ``display``.

  Raises
  ------
  ValueError
      If ``estimator_list`` is given but empty.

  Notes
  -----
  Reads the globals ``X_train``, ``y_train`` and ``seed`` created by ``setup``.
  """
  import re

  if estimator_list is not None:
    estimator_list = list(estimator_list)
    if not estimator_list:
      raise ValueError("estimator_list must contain at least one estimator")

  import numpy as np
  import pandas as pd
  from sklearn.ensemble import VotingRegressor
  from sklearn.model_selection import KFold
  from sklearn.metrics import mean_absolute_error
  from sklearn.metrics import mean_squared_error
  from sklearn.metrics import max_error
  from sklearn.metrics import r2_score

  data_X = X_train
  data_y = y_train

  if estimator_list is None:
    from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, Lars,
                                      LassoLars, OrthogonalMatchingPursuit, BayesianRidge,
                                      ARDRegression, PassiveAggressiveRegressor,
                                      RANSACRegressor, TheilSenRegressor, HuberRegressor)
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.svm import SVR
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                                  AdaBoostRegressor, GradientBoostingRegressor)
    from sklearn.neural_network import MLPRegressor

    estimator_list = [
      LinearRegression(), Lasso(random_state=seed), Ridge(random_state=seed),
      ElasticNet(random_state=seed), Lars(), LassoLars(), OrthogonalMatchingPursuit(),
      BayesianRidge(), ARDRegression(), PassiveAggressiveRegressor(random_state=seed),
      RANSACRegressor(random_state=seed), TheilSenRegressor(random_state=seed),
      HuberRegressor(), KernelRidge(), SVR(), KNeighborsRegressor(),
      DecisionTreeRegressor(random_state=seed), RandomForestRegressor(random_state=seed),
      ExtraTreesRegressor(random_state=seed), AdaBoostRegressor(random_state=seed),
      GradientBoostingRegressor(random_state=seed), MLPRegressor(random_state=seed),
    ]

  # Display names: split CamelCase class names, then patch the acronyms that
  # the split turns into single letters.
  display_overrides = {
    'A R D Regression': 'Automatic Relevance Determination',
    'M L P Regressor': 'MLP Regressor',
    'R A N S A C Regressor': 'RANSAC Regressor',
    'S V R': 'Support Vector Regressor',
    'Lars': 'Least Angle Regression',
  }

  def readable_name(est):
    raw = str(est).split("(")[0]
    spaced = ' '.join(re.findall('[A-Z][a-z]*', raw)) or raw
    return display_overrides.get(spaced, spaced)

  # VotingRegressor needs a unique name per estimator, so repeats of the same
  # class get a numeric suffix. The list keeps the caller's order; the old
  # set() round-trip shuffled it and did nothing about duplicate names.
  named_estimators = []
  used_names = set()
  for est in estimator_list:
    base = readable_name(est)
    label, copy_no = base, 1
    while label in used_names:
      copy_no += 1
      label = '{} {}'.format(base, copy_no)
    used_names.add(label)
    named_estimators.append((label, est))

  model = VotingRegressor(estimators=named_estimators, n_jobs=-1)

  # No random_state: it has no effect without shuffle=True and recent
  # scikit-learn releases reject the combination.
  kf = KFold(n_splits=fold)

  metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'ME']
  fold_scores = []
  for train_i, test_i in kf.split(data_X, data_y):
    ytest = data_y.iloc[test_i]
    model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
    pred_ = model.predict(data_X.iloc[test_i])
    mse = mean_squared_error(ytest, pred_)
    fold_scores.append({'MAE': mean_absolute_error(ytest, pred_),
                        'MSE': mse,
                        'RMSE': np.sqrt(mse),
                        'R2': r2_score(ytest, pred_),
                        'ME': max_error(ytest, pred_)})

  model_results = pd.DataFrame(fold_scores, columns=metric_columns)

  # ddof=0 matches np.std, which the summary row has always used.
  model_avgs = pd.DataFrame({'Mean': model_results.mean(),
                             'SD': model_results.std(ddof=0)}).T

  # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
  model_results = pd.concat([model_results, model_avgs]).round(round)
  model_results = model_results.style.set_table_styles(
      [dict(selector='th', props=[('text-align', 'center')])])
  display(model_results)
  return model

In [None]:
def stack_models(estimator_list, 
                 meta_model = None, 
                 fold = 10,
                 round = 4, 
                 restack = False, 
                 plot = False):
    """Stack base regressors under a meta model and cross-validate the meta model.

    Out-of-fold predictions of every base estimator are produced with
    ``cross_val_predict`` on the training split. They become the features of
    the meta model, which is then scored with K-fold cross-validation.

    Parameters
    ----------
    estimator_list : list of estimator objects
        Base regressors.
    meta_model : estimator object, optional
        Second-stage model; ``LinearRegression`` when None.
    fold : int, default 10
        Number of folds for both the base predictions and the meta model.
    round : int, default 4
        Decimal places used in the score table.
    restack : bool, default False
        When truthy, the original features are passed to the meta model next
        to the base predictions.
    plot : bool, default False
        When truthy, draw a correlation heatmap of the base predictions
        instead of displaying the score table.

    Returns
    -------
    list
        The base estimators followed by the meta model. This is now returned
        with ``plot=True`` as well (that path used to return None). This
        function never calls ``fit`` on the base estimators directly; the
        meta model holds the fit from the last fold.

    Raises
    ------
    ValueError
        If ``estimator_list`` is empty.

    Notes
    -----
    Reads the globals ``X_train`` and ``y_train`` created by ``setup``.
    """
    estimator_list = list(estimator_list)
    if not estimator_list:
        raise ValueError("estimator_list must contain at least one estimator")

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import cross_val_predict
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import max_error
    from sklearn.metrics import r2_score

    if meta_model is None:
        from sklearn.linear_model import LinearRegression
        meta_model = LinearRegression()

    #column names for the base predictions
    model_names = [str(item).split("(")[0] for item in estimator_list]

    # Target and predictions are re-indexed 0..n-1 so they line up row by row.
    data_y = pd.DataFrame(y_train).reset_index(drop=True).iloc[:, 0]

    base_predictions = pd.concat(
        [pd.DataFrame(cross_val_predict(base, X_train, y_train, cv=fold))
         for base in estimator_list],
        axis=1)
    base_predictions.columns = model_names

    # Target and predictions are kept in separate frames: the old code dropped
    # the target by label, which also dropped any prediction column of that name.
    if restack:
        data_X = pd.concat([X_train.reset_index(drop=True), base_predictions], axis=1)
    else:
        data_X = base_predictions

    #Correlation matrix of the base predictions
    base_prediction_cor = base_predictions.corr()

    #Meta Modeling Starts Here
    model = meta_model

    # No random_state: it has no effect without shuffle=True and recent
    # scikit-learn releases reject the combination.
    kf = KFold(n_splits=fold)

    metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'ME']
    fold_scores = []
    for train_i, test_i in kf.split(data_X, data_y):
        ytest = data_y.iloc[test_i]
        model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
        pred_ = model.predict(data_X.iloc[test_i])
        mse = mean_squared_error(ytest, pred_)
        fold_scores.append({'MAE': mean_absolute_error(ytest, pred_),
                            'MSE': mse,
                            'RMSE': np.sqrt(mse),
                            'R2': r2_score(ytest, pred_),
                            'ME': max_error(ytest, pred_)})

    model_results = pd.DataFrame(fold_scores, columns=metric_columns)

    # ddof=0 matches np.std, which the summary row has always used.
    model_avgs = pd.DataFrame({'Mean': model_results.mean(),
                               'SD': model_results.std(ddof=0)}).T

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    model_results = pd.concat([model_results, model_avgs]).round(round)
    model_results = model_results.style.set_table_styles(
        [dict(selector='th', props=[('text-align', 'center')])])

    models = estimator_list + [meta_model]

    if plot:
        import seaborn as sns
        ax = sns.heatmap(base_prediction_cor, vmin=-0.5, vmax=1, center=0, cmap='magma',
                         square=True, annot=True, linewidths=1)
    else:
        display(model_results)

    return models

In [None]:
def _tune_search_space(estimator, n_features, seed, bagged_tree=False):
    """Build the untuned estimator and its hyper-parameter grid for an estimator ID.

    Parameters
    ----------
    estimator : str
        One of the short IDs handled below ('lr', 'lasso', ..., 'mlp').
    n_features : int
        Number of training columns; bounds the grids that depend on it.
    seed : int
        Passed as ``random_state`` to the estimators that were originally
        created with one.
    bagged_tree : bool
        Only meaningful for ``'dt'``: drop ``max_features`` from the tree grid
        because the tuned tree is going to be wrapped in a BaggingRegressor.

    Returns
    -------
    tuple or None
        ``(base_estimator, param_grid, extra_search_kwargs)``, or ``None`` when
        ``estimator`` is not a supported ID.
    """
    on_off = [True, False]
    tenths_to_one = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    one_to_ten = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    shrink_alphas = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    mixed_alphas = [0.0001, 0.001, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    eps_values = [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.0005, 0.005, 0.00005, 0.02, 0.007]
    prior_values = [0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.0005, 0.005, 0.05, 0.1, 0.15, 0.2, 0.3]
    c_values = [0.01, 0.005, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    wide_eps = [1.1, 1.2, 1.3, 1.35, 1.4, 1.5, 1.55, 1.6, 1.7, 1.8, 1.9]
    boost_rounds = [10, 40, 70, 80, 90, 100, 120, 140, 150]
    deep_depths = [int(x) for x in np.linspace(10, 110, num=11)]
    forest_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                   'criterion': ['mse', 'mae'],
                   'max_depth': deep_depths,
                   'min_samples_split': [2, 5, 7, 9, 10],
                   'min_samples_leaf': [1, 2, 4],
                   'max_features': ['auto', 'sqrt', 'log2'],
                   'bootstrap': on_off}

    # Only some of the original searches were run with iid=False; that is kept
    # per estimator so their behaviour does not change.
    extra = {}

    if estimator == 'lr':
        from sklearn.linear_model import LinearRegression
        base = LinearRegression()
        grid = {'fit_intercept': on_off, 'normalize': on_off}
        extra = {'iid': False}

    elif estimator == 'lasso':
        from sklearn.linear_model import Lasso
        base = Lasso(random_state=seed)
        grid = {'alpha': shrink_alphas, 'fit_intercept': on_off, 'normalize': on_off}
        extra = {'iid': False}

    elif estimator == 'ridge':
        from sklearn.linear_model import Ridge
        base = Ridge(random_state=seed)
        grid = {'alpha': shrink_alphas, 'fit_intercept': on_off, 'normalize': on_off}
        extra = {'iid': False}

    elif estimator == 'en':
        from sklearn.linear_model import ElasticNet
        base = ElasticNet(random_state=seed)
        grid = {'alpha': mixed_alphas,
                'l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                'fit_intercept': on_off,
                'normalize': on_off}
        extra = {'iid': False}

    elif estimator == 'lars':
        from sklearn.linear_model import Lars
        base = Lars()
        grid = {'fit_intercept': on_off, 'normalize': on_off, 'eps': eps_values}

    elif estimator == 'llars':
        from sklearn.linear_model import LassoLars
        base = LassoLars()
        grid = {'alpha': mixed_alphas, 'fit_intercept': on_off, 'normalize': on_off,
                'eps': eps_values}

    elif estimator == 'omp':
        from sklearn.linear_model import OrthogonalMatchingPursuit
        base = OrthogonalMatchingPursuit()
        grid = {'n_nonzero_coefs': range(0, n_features + 1),
                'fit_intercept': on_off,
                'normalize': on_off}

    elif estimator == 'br':
        from sklearn.linear_model import BayesianRidge
        base = BayesianRidge()
        grid = {'alpha_1': prior_values, 'alpha_2': prior_values,
                'lambda_1': prior_values, 'lambda_2': prior_values,
                'compute_score': on_off, 'fit_intercept': on_off, 'normalize': on_off}

    elif estimator == 'ard':
        from sklearn.linear_model import ARDRegression
        base = ARDRegression()
        grid = {'alpha_1': prior_values, 'alpha_2': prior_values,
                'lambda_1': prior_values, 'lambda_2': prior_values,
                'threshold_lambda': [5000, 10000, 15000, 20000, 25000, 30000,
                                     35000, 40000, 45000, 50000, 55000, 60000],
                'compute_score': on_off, 'fit_intercept': on_off, 'normalize': on_off}

    elif estimator == 'par':
        from sklearn.linear_model import PassiveAggressiveRegressor
        base = PassiveAggressiveRegressor(random_state=seed)
        grid = {'C': c_values,
                'fit_intercept': on_off,
                'early_stopping': on_off,
                'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
                'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                'shuffle': on_off}

    elif estimator == 'ransac':
        from sklearn.linear_model import RANSACRegressor
        base = RANSACRegressor(random_state=seed)
        grid = {'min_samples': tenths_to_one,
                'max_trials': one_to_ten,
                'max_skips': one_to_ten,
                'stop_n_inliers': one_to_ten,
                'stop_probability': tenths_to_one,
                'loss': ['absolute_loss', 'squared_loss']}

    elif estimator == 'tr':
        from sklearn.linear_model import TheilSenRegressor
        base = TheilSenRegressor(random_state=seed)
        grid = {'fit_intercept': on_off,
                'max_subpopulation': [5000, 10000, 15000, 20000, 25000, 30000, 40000, 50000]}

    elif estimator == 'huber':
        from sklearn.linear_model import HuberRegressor
        base = HuberRegressor()
        grid = {'epsilon': wide_eps,
                'alpha': [0.00001, 0.0001, 0.0003, 0.005, 0.05, 0.1, 0.0005, 0.15],
                'fit_intercept': on_off}

    elif estimator == 'kr':
        from sklearn.kernel_ridge import KernelRidge
        base = KernelRidge()
        grid = {'alpha': tenths_to_one}

    elif estimator == 'svm':
        from sklearn.svm import SVR
        base = SVR()
        grid = {'C': c_values, 'epsilon': wide_eps, 'shrinking': on_off}

    elif estimator == 'knn':
        from sklearn.neighbors import KNeighborsRegressor
        base = KNeighborsRegressor()
        grid = {'n_neighbors': range(1, 51),
                'weights': ['uniform', 'distance'],
                'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90]}

    elif estimator == 'dt':
        from sklearn.tree import DecisionTreeRegressor
        base = DecisionTreeRegressor(random_state=seed)
        # max_depth candidates are drawn from numpy's global RNG, so the grid
        # itself differs between calls unless that RNG is seeded by the caller.
        grid = {"max_depth": np.random.randint(3, (n_features * .85), 4),
                "min_samples_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
                "min_samples_split": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
                "min_impurity_decrease": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                "criterion": ["mse", "mae", "friedman_mse"]}
        if bagged_tree:
            # A tree tuned with max_features raised an exception once wrapped
            # in BaggingRegressor, so that parameter is left out of this grid.
            extra = {'iid': False}
        else:
            grid["max_features"] = np.random.randint(3, n_features, 4)

    elif estimator == 'rf':
        from sklearn.ensemble import RandomForestRegressor
        base = RandomForestRegressor(random_state=seed)
        grid = forest_grid

    elif estimator == 'et':
        from sklearn.ensemble import ExtraTreesRegressor
        base = ExtraTreesRegressor(random_state=seed)
        grid = forest_grid

    elif estimator == 'ada':
        from sklearn.ensemble import AdaBoostRegressor
        base = AdaBoostRegressor(random_state=seed)
        grid = {'n_estimators': boost_rounds,
                'learning_rate': tenths_to_one,
                'loss': ["linear", "square", "exponential"]}

    elif estimator == 'gbr':
        from sklearn.ensemble import GradientBoostingRegressor
        base = GradientBoostingRegressor(random_state=seed)
        grid = {'loss': ['ls', 'lad', 'huber', 'quantile'],
                'n_estimators': boost_rounds,
                'learning_rate': tenths_to_one,
                'subsample': [0.1, 0.3, 0.5, 0.7, 0.9, 1],
                'criterion': ['friedman_mse', 'mse', 'mae'],
                'min_samples_split': [2, 4, 5, 7, 9, 10],
                'min_samples_leaf': [1, 2, 3, 4, 5],
                'max_depth': deep_depths,
                'max_features': ['auto', 'sqrt', 'log2']}

    elif estimator == 'mlp':
        from sklearn.neural_network import MLPRegressor
        base = MLPRegressor(random_state=seed)
        grid = {'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'solver': ['lbfgs', 'adam'],
                'alpha': [0.0001, 0.001, 0.01, 0.00001, 0.003, 0.0003, 0.0005, 0.005, 0.05],
                'hidden_layer_sizes': np.random.randint(50, 150, 10),
                'activation': ["tanh", "identity", "logistic", "relu"]}

    else:
        return None

    return base, grid, extra


def _fit_random_search(base, grid, scoring, n_iter, fold, seed, features, target, **extra):
    """Run a seeded RandomizedSearchCV on the given data and return its best (refitted) estimator."""
    from sklearn.model_selection import RandomizedSearchCV

    search = RandomizedSearchCV(estimator=base, param_distributions=grid, scoring=scoring,
                                n_iter=n_iter, cv=fold, random_state=seed, n_jobs=-1, **extra)
    search.fit(features, target)
    return search.best_estimator_


def tune_model(estimator = None, 
               fold = 10, 
               round = 4, 
               n_iter = 10, 
               optimize = 'mae',
               ensemble = False, 
               method = 'Bagging',
               verbose = True):
    """Tune a regressor with randomized search and report k-fold scores.

    Reads the globals ``X_train``, ``y_train`` and ``seed`` created by ``setup``.

    Parameters
    ----------
    estimator : str
        Estimator ID: 'lr', 'lasso', 'ridge', 'en', 'lars', 'llars', 'omp',
        'br', 'ard', 'par', 'ransac', 'tr', 'huber', 'kr', 'svm', 'knn', 'dt',
        'rf', 'et', 'ada', 'gbr' or 'mlp'.
    fold : int
        Number of folds, used both inside the search and for the score table.
    round : int
        Decimal places shown in the score table.
    n_iter : int
        Number of parameter settings sampled by each randomized search.
    optimize : str
        'mae', 'mse', 'me' or 'r2'; mapped to the matching scikit-learn scorer
        name. Any other value is handed to the search unchanged.
    ensemble : bool
        When true, wrap the tuned estimator according to ``method`` and tune
        the wrapper's own parameters as well.
    method : str
        'Bagging' (BaggingRegressor) or 'Boosting' (AdaBoostRegressor). Any
        other value leaves the tuned estimator unwrapped.
    verbose : bool
        Display the per-fold score table (MAE, MSE, RMSE, R2, ME plus Mean and
        SD rows) when true.

    Returns
    -------
    estimator
        The best estimator found, refitted by the search on all of ``X_train``.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported IDs.
    """
    from sklearn.base import clone
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import max_error
    from sklearn.metrics import r2_score
    from sklearn.model_selection import KFold

    data_X = X_train
    data_y = y_train

    scorer_aliases = {'mae': 'neg_mean_absolute_error',
                      'mse': 'neg_mean_squared_error',
                      'me': 'max_error',
                      'r2': 'r2'}
    if isinstance(optimize, str):
        optimize = scorer_aliases.get(optimize, optimize)

    use_bagging = bool(ensemble) and method == 'Bagging'
    use_boosting = bool(ensemble) and method == 'Boosting'

    space = _tune_search_space(estimator, len(data_X.columns), seed,
                               bagged_tree=(use_bagging and estimator == 'dt'))
    if space is None:
        raise ValueError(
            "Unknown estimator %r. Choose one of: 'lr', 'lasso', 'ridge', 'en', 'lars', "
            "'llars', 'omp', 'br', 'ard', 'par', 'ransac', 'tr', 'huber', 'kr', 'svm', "
            "'knn', 'dt', 'rf', 'et', 'ada', 'gbr', 'mlp'." % (estimator,))

    base, grid, extra = space
    best_model = _fit_random_search(base, grid, optimize, n_iter, fold, seed,
                                    data_X, data_y, **extra)

    # Optional second stage: wrap the tuned estimator and tune the wrapper.
    wrapper = None
    if use_bagging:
        from sklearn.ensemble import BaggingRegressor
        wrapper = BaggingRegressor(best_model, random_state=seed)
        wrapper_grid = {'n_estimators': [10, 15, 20, 25, 30],
                        'max_samples': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
                        'max_features': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
                        'bootstrap': [True, False],
                        'bootstrap_features': [True, False]}
    elif use_boosting:
        from sklearn.ensemble import AdaBoostRegressor
        wrapper = AdaBoostRegressor(best_model, random_state=seed)
        wrapper_grid = {'n_estimators': [10, 40, 70, 80, 90, 100, 120, 140, 150],
                        'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                        'loss': ["linear", "square", "exponential"]}

    if wrapper is not None:
        # Every wrapper search is fitted and scored with the requested metric,
        # so the reported scores and the returned model are the tuned ensemble.
        best_model = _fit_random_search(wrapper, wrapper_grid, optimize, n_iter, fold, seed,
                                        data_X, data_y, iid=False)

    # random_state has no effect on KFold unless shuffle=True, and newer
    # scikit-learn releases reject the combination, so it is not passed.
    kf = KFold(fold)

    fold_scores = {'MAE': [], 'MSE': [], 'RMSE': [], 'R2': [], 'ME': []}
    for train_i, test_i in kf.split(data_X, data_y):
        # Score a clone so the returned estimator keeps its fit on all of
        # X_train instead of ending up trained on the last fold only.
        fold_model = clone(best_model)
        fold_model.fit(data_X.iloc[train_i], data_y.iloc[train_i])
        ytest = data_y.iloc[test_i]
        pred_ = fold_model.predict(data_X.iloc[test_i])

        mse = mean_squared_error(ytest, pred_)
        fold_scores['MAE'].append(mean_absolute_error(ytest, pred_))
        fold_scores['MSE'].append(mse)
        fold_scores['RMSE'].append(np.sqrt(mse))
        fold_scores['R2'].append(r2_score(ytest, pred_))
        fold_scores['ME'].append(max_error(ytest, pred_))

    model_results = pd.DataFrame(fold_scores)
    model_avgs = pd.DataFrame({name: [np.mean(values), np.std(values)]
                               for name, values in fold_scores.items()},
                              index=['Mean', 'SD'])

    # pd.concat gives the same table as the removed DataFrame.append.
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)
    model_results = model_results.style.set_table_styles(
        [dict(selector='th', props=[('text-align', 'center')])])

    if verbose:
        display(model_results)
    return best_model

In [None]:
def plot_model(estimator, 
               plot = 'residual'):
    """Draw a diagnostic plot for a regressor.

    Reads the globals ``X_train``, ``X_test``, ``y_test``, ``X`` and ``y``
    created by ``setup``.

    Parameters
    ----------
    estimator : estimator object
        For 'residual' and 'error' the visualizer is only scored, never fitted
        here, so the estimator is expected to be fitted already
        (NOTE(review): confirm against the yellowbrick version in use).
        For 'feature' it must expose ``coef_``.
    plot : str
        'residual' : yellowbrick ResidualsPlot scored on the test split.
        'error'    : yellowbrick PredictionError scored on the test split.
        'cooks'    : yellowbrick CooksDistance fitted on the full ``X``/``y``;
                     ``estimator`` is not used.
        'feature'  : lollipop chart of ``abs(estimator.coef_)`` per column.

    Side effects
    ------------
    The 'feature' plot publishes two globals: ``coef_df`` (variable/value
    table) and ``var_imp_array_top_n`` (column names ordered from smallest to
    largest absolute coefficient), which ``optimize_model`` reads.

    Raises
    ------
    ValueError
        If ``plot`` is not one of the supported names.
    """
    global coef_df, var_imp_array_top_n

    model = estimator

    if plot == 'residual':
        from yellowbrick.regressor import ResidualsPlot
        visualizer = ResidualsPlot(model)
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        visualizer.show()

    elif plot == 'error':
        from yellowbrick.regressor import PredictionError
        visualizer = PredictionError(model)
        visualizer.score(X_test, y_test)
        visualizer.show()

    elif plot == 'cooks':
        from yellowbrick.regressor import CooksDistance
        visualizer = CooksDistance()
        visualizer.fit(X, y)
        visualizer.show()

    elif plot == 'feature':
        variables = abs(model.coef_)
        coef_df = pd.DataFrame({'Variable': X_train.columns, 'Value': variables})
        sorted_df = coef_df.sort_values(by='Value')
        my_range = range(1, len(sorted_df.index) + 1)

        plt.figure(figsize=(8, 5))
        plt.hlines(y=my_range, xmin=0, xmax=sorted_df['Value'], color='skyblue')
        plt.plot(sorted_df['Value'], my_range, "o")
        plt.yticks(my_range, sorted_df['Variable'])
        plt.title("Feature Importance Plot")
        plt.xlabel('Variable Importance')
        plt.ylabel('Features')

        # Exported as a global: ascending order, so the first entries are the
        # columns with the smallest absolute coefficients.
        var_imp_array_top_n = np.array(sorted_df.reset_index(drop=True)['Variable'])

    else:
        raise ValueError(
            "Unknown plot %r. Choose one of: 'residual', 'error', 'cooks', 'feature'." % (plot,))

In [None]:
def create_stacknet(estimator_list,
                    meta_model = None,
                    fold = 10,
                    round = 4,
                    restack = False):
    """Build a multi-level stack and cross-validate the meta model on its output.

    Reads the globals ``X_train`` and ``y_train`` created by ``setup``.

    Parameters
    ----------
    estimator_list : list of lists of estimators
        ``estimator_list[0]`` is the base level, trained on ``X_train``; every
        following list is a further level trained on the predictions
        accumulated so far.
    meta_model : estimator, optional
        Final model trained on the last level's predictions. Defaults to
        ``LinearRegression()``. It is fitted in place during scoring.
    fold : int
        Number of folds for the out-of-fold predictions and for scoring.
    round : int
        Decimal places shown in the score table.
    restack : bool
        When false, only the predictions produced by a level are handed to
        the next one; when true, earlier predictions are carried along too.

    Returns
    -------
    None
        The per-fold score table (MAE, MSE, RMSE, R2, ME plus Mean and SD
        rows) is displayed, not returned.
    """
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import max_error
    from sklearn.metrics import r2_score
    from sklearn.linear_model import LinearRegression

    base_level = estimator_list[0]
    inter_level = estimator_list[1:]

    if meta_model is None:
        meta_model = LinearRegression()

    # Target with a positional index so it lines up with the prediction frames.
    base_prediction = pd.DataFrame(y_train)
    base_prediction = base_prediction.reset_index(drop=True)

    # Base level: one column of out-of-fold predictions per model.
    base_array_df = pd.DataFrame()
    for model in base_level:
        base_array = pd.DataFrame(cross_val_predict(model, X_train, y_train, cv=fold))
        base_array_df = pd.concat([base_array_df, base_array], axis=1)

    for level in inter_level:
        for model in level:
            # NOTE(review): the frame grows inside this loop, so later models
            # in a level also see their siblings' predictions as features.
            # Kept as originally written; confirm whether that is intended.
            base_array = pd.DataFrame(
                cross_val_predict(model, base_array_df, base_prediction, cv=fold))
            # New predictions are prepended, so they occupy the leading columns.
            base_array_df = pd.concat([base_array, base_array_df], axis=1)

        if not restack:
            base_array_df = base_array_df.iloc[:, :len(level)]

    # The meta model is scored on the stacked predictions, not on the raw
    # training features (which would ignore every level built above).
    data_X = base_array_df
    data_y = base_prediction.iloc[:, 0]
    model = meta_model

    # random_state has no effect on KFold unless shuffle=True, and newer
    # scikit-learn releases reject the combination, so it is not passed.
    kf = KFold(fold)

    fold_scores = {'MAE': [], 'MSE': [], 'RMSE': [], 'R2': [], 'ME': []}
    for train_i, test_i in kf.split(data_X, data_y):
        Xtrain, Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain, ytest = data_y.iloc[train_i], data_y.iloc[test_i]

        model.fit(Xtrain, ytrain)
        pred_ = model.predict(Xtest)

        mse = mean_squared_error(ytest, pred_)
        fold_scores['MAE'].append(mean_absolute_error(ytest, pred_))
        fold_scores['MSE'].append(mse)
        fold_scores['RMSE'].append(np.sqrt(mse))
        fold_scores['R2'].append(r2_score(ytest, pred_))
        fold_scores['ME'].append(max_error(ytest, pred_))

    model_results = pd.DataFrame(fold_scores)
    model_avgs = pd.DataFrame({name: [np.mean(values), np.std(values)]
                               for name, values in fold_scores.items()},
                              index=['Mean', 'SD'])

    # pd.concat gives the same table as the removed DataFrame.append.
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)
    model_results = model_results.style.set_table_styles(
        [dict(selector='th', props=[('text-align', 'center')])])
    display(model_results)

In [None]:
def save_model(model, model_name):
    """Serialise ``model`` to ``<model_name>.pkl`` in the working directory using joblib.

    Parameters
    ----------
    model : object
        The fitted estimator (or any picklable object) to store.
    model_name : str
        File name without extension; '.pkl' is appended.
    """
    try:
        import joblib
    except ImportError:
        # Older environments only ship the copy bundled with scikit-learn;
        # newer scikit-learn releases no longer provide sklearn.externals.joblib.
        from sklearn.externals import joblib
    model_name = model_name + '.pkl'
    joblib.dump(model, model_name)

In [None]:
def load_model(model_name):
    """Load and return the object stored in ``<model_name>.pkl`` by ``save_model``.

    Parameters
    ----------
    model_name : str
        File name without extension; '.pkl' is appended.

    Returns
    -------
    object
        Whatever was serialised into the file.
    """
    try:
        import joblib
    except ImportError:
        # Older environments only ship the copy bundled with scikit-learn;
        # newer scikit-learn releases no longer provide sklearn.externals.joblib.
        from sklearn.externals import joblib
    model_name = model_name + '.pkl'
    # SECURITY: joblib.load is pickle-based and can execute arbitrary code;
    # only load files that come from a trusted source.
    return joblib.load(model_name)

## Final code up to here

# Work in Progress / Future Release 

In [None]:
def optimize_model(data_X=X_train, n=3):
    """Drop the first ``n`` columns listed in ``var_imp_array_top_n`` from the global ``X_train``.

    Work in progress. The removal happens in place on the global frame.

    Parameters
    ----------
    data_X : DataFrame
        Currently unused. Its default is bound to ``X_train`` when this cell
        is executed, so ``setup`` must have run first.
    n : int
        How many leading entries of ``var_imp_array_top_n`` to drop.

    Notes
    -----
    ``var_imp_array_top_n`` must already exist as a module-level name when
    this is called (NOTE(review): confirm what defines it).
    NOTE(review): only ``X_train`` is changed here; confirm whether ``X_test``
    and ``X`` need the same columns removed.
    """
    global X_train
    columns_to_remove = var_imp_array_top_n[:n]
    X_train.drop(columns=columns_to_remove, inplace=True)

# Modules now Available

## 1.0. compare_models

## 2.0. create_model

## 3.0. plot_model 

## 4.0. tune_model

## 5.0. ensemble_model 

## 6.0. blend_models

## 7.0. stack_models

## 8.0. create_stacknet

## 9.0. save_model 

## 10.0. load_model 

## 11.0. optimize_model (Future Release)

## 12.0. predict_stacknet (Future Release)