In [None]:
import numpy as np
import pandas as pd
import pickle
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVR,LinearSVR,NuSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score,make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)


import math
import matplotlib.pyplot as plt
from collections import Counter ,OrderedDict
from itertools import compress
# Hyper-parameter search ('make_scorer' and 'DecisionTreeRegressor' are already imported above)
from sklearn.model_selection import GridSearchCV
from pandas.plotting import scatter_matrix
# Import supplementary visualizations code visuals.py
import visuals as vs

%matplotlib inline

In [None]:
# Lightweight record types shared by the cells below.

# One exchange's data: an identifier plus its DataFrame.
Dataset = namedtuple('Dataset', ['exchange', 'df'])

# A train/test split ready for modelling, together with the split sizes and
# the scalers used on features and target.
DatasetMLModel = namedtuple(
    'DatasetMLModel',
    ['exchange', 'train_size', 'tscv_split', 'test_size',
     'X_train', 'y_train', 'X_test', 'y_test',
     'scaler_features', 'scaler_target'],
)

# Registry entry describing an estimator: display name, class to instantiate,
# optional parameters and a family/type label.
Regressor = namedtuple('Regressor', ['name', 'regressor_class', 'params', 'type'])

# Result record for a feature-selection run.
FeatureSelection = namedtuple('FeatureSelection', ['dataset', 'regressor', 'params', 'RFECV'])

In [None]:
# Load the pre-built datasets from disk. Later cells index the result by
# market name (e.g. 'btc_brl') and read `.exchange` / `.df` from each entry.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles produced by a trusted source.
with open('datasets/log_divided_close_datasets.pkl', 'rb') as input4:
    log_divided_close_datasets = pickle.load(input4)   
    

In [None]:
# Input columns examined for outliers and used as model features.
features = [
    'volume', 'amount', 'avg_price', 'open', 'high', 'quantity',
    'EWMA26', 'EWMA12', 'EWMA9', 'log_return',
    'Bollinger Upper', 'Bollinger Lower',
    'Heiking_Close', 'Heiking High', 'Heiking Open',
    'log_MACD', 'Variance12',
]
def clean_ouliers(dataset,features):
    """Flag, per feature, the rows lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    dataset : pandas.DataFrame or object exposing a ``df`` DataFrame
        Data to inspect. A bare DataFrame is accepted as well as a
        ``Dataset``-like wrapper, because ``preprossessing`` passes the
        training frame directly.
    features : iterable of str
        Column names to test.

    Returns
    -------
    pandas.DataFrame
        One row for every data point that is an outlier for at least one
        feature (indexed like the input). Each feature column holds 1.0 where
        the point is an outlier for that feature and 0.0 otherwise; ``count``
        is the number of features flagged for the row.
    """
    frame = dataset if isinstance(dataset, pd.DataFrame) else dataset.df
    features = list(features)

    flags = {}
    for feature in features:
        values = frame[feature]
        # Quartiles and the outlier step (1.5 times the interquartile range)
        q1, q3 = np.percentile(values, [25, 75])
        step = (q3 - q1) * 1.5
        inside = (values >= q1 - step) & (values <= q3 + step)
        # Stored positionally so duplicate index labels cannot break alignment
        flags[feature] = (~inside).values

    flag_frame = pd.DataFrame(flags, index=frame.index, columns=features)
    # Keep only rows flagged at least once; astype() returns a fresh frame,
    # so adding 'count' below never writes into a slice of the input.
    outliers = flag_frame[flag_frame.any(axis=1)].astype(float)
    outliers['count'] = outliers.sum(axis=1)
    return outliers





In [None]:
# NOTE(review): `outliers_index` is not assigned at module level in any visible
# cell (it only exists as a local inside `preprossessing`), so this cell
# presumably relies on leftover kernel state and would raise NameError after
# Restart & Run All -- confirm, then define it explicitly or remove the cell.
outliers_index.shape

In [None]:
def drop_the_last_row(dataset):
    """Remove the final row of ``dataset.df`` in place and return ``dataset``.

    The row is removed by index label, and the frame held by ``dataset`` is
    modified directly (no copy is made).
    """
    frame = dataset.df
    final_label = frame.index[-1]
    frame.drop(final_label, inplace=True)
    return dataset
def making_targets(dataset):
    """Add next-step target columns to ``dataset.df`` (in place).

    For each of 'Returns', 'close' and 'log_return' a '<column>_Target'
    column is created holding the following row's value. The last row has no
    following value, so it is removed via ``drop_the_last_row``.
    """
    frame = dataset.df
    for source_column in ('Returns', 'close', 'log_return'):
        frame[source_column + '_Target'] = frame[source_column].shift(-1)
    return drop_the_last_row(dataset)

def building_series_to_each_input(dataset,N=15,column_series=['close']):  
    """Append lagged copies of the given columns to ``dataset.df`` (in place).

    For every column in ``column_series`` and every lag ``n`` in ``1..N`` a
    column named '<column>_NN' (zero-padded lag) is added holding the value
    from ``n`` rows earlier. The first ``N`` rows, which lack a complete
    history, are then dropped.

    Parameters
    ----------
    dataset : object exposing a ``df`` DataFrame
    N : int
        Number of lags to build. ``N <= 0`` adds no columns and drops no rows.
    column_series : sequence of str
        Columns to lag (read-only; the default list is never mutated).

    Returns
    -------
    The same ``dataset`` object.
    """
    frame = dataset.df
    for lag in range(1, N + 1):
        for column in column_series:
            frame[column + '_{0:02d}'.format(lag)] = frame[column].shift(lag)

    # Use N itself rather than the loop variable, which is unbound when the
    # loop body never runs (N <= 0).
    if N > 0:
        frame.drop(frame.index[:N], inplace=True)
    return dataset

def building_series_to_each_input_log_return(dataset,N=15):
    """Add ``N`` lagged 'log_return' columns to ``dataset.df``.

    Thin wrapper around ``building_series_to_each_input``; ``dataset`` and
    ``N`` are forwarded (they were previously omitted, which raised TypeError).
    """
    return building_series_to_each_input(dataset, N, column_series=['log_return'])

def building_series_to_each_input_returns(dataset,N=15):
    """Add ``N`` lagged 'Returns' columns to ``dataset.df``.

    This variant used to be defined under the name
    ``building_series_to_each_input_log_return`` as well, silently replacing
    the 'log_return' wrapper above; it now has its own name.
    """
    return building_series_to_each_input(dataset, N, column_series=['Returns'])

def make_X_Y(dataset , train_size,features=['close'],target=['close_Target']):    
    """Split ``dataset.df`` chronologically into train and test parts.

    The first ``train_size`` rows form the training set and the remaining
    rows the test set; no shuffling takes place.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as DataFrames restricted to the
        ``features`` / ``target`` columns.
    """
    inputs = dataset.df[features]
    outputs = dataset.df[target]
    train_rows = slice(None, train_size)
    test_rows = slice(train_size, None)
    return inputs[train_rows], outputs[train_rows], inputs[test_rows], outputs[test_rows]

def scaler(X_train,y_train,X_test,y_test):   
    """Standardise features and target using statistics from the training set.

    Two ``StandardScaler`` instances are fitted on the training data only
    (one for features, one for the target) and then applied to the test data.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, scaler_features, scaler_target)``
        where the four frames are scaled DataFrames keeping the index and
        column labels of their inputs.
    """
    def _rewrap(values, template):
        # Scalers return plain arrays; restore the original labels.
        return pd.DataFrame(values, index=template.index, columns=template.columns)

    scaler_features = StandardScaler()
    scaler_target = StandardScaler()

    X_train_scaled = _rewrap(scaler_features.fit_transform(X_train), X_train)
    X_test_scaled = _rewrap(scaler_features.transform(X_test), X_test)
    y_train_scaled = _rewrap(scaler_target.fit_transform(y_train), y_train)
    y_test_scaled = _rewrap(scaler_target.transform(y_test), y_test)

    return (X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled,
            scaler_features, scaler_target)


def preprossessing(dataset,features,target, train_percent=0.80,evaluation_percent=0.20,
                   outlier_feature_threshold=7):
    """Split, de-outlier and scale a dataset for model fitting.

    Steps:
      1. chronological train/test split (``make_X_Y``);
      2. removal of training rows that ``clean_ouliers`` flags for at least
         ``outlier_feature_threshold`` features -- from features AND target,
         so the two stay aligned;
      3. standardisation fitted on the training part (``scaler``).

    Parameters
    ----------
    dataset : object exposing ``exchange`` and a ``df`` DataFrame
    features, target : list of str
        Feature and target column names.
    train_percent : float
        Fraction of rows used for training.
    evaluation_percent : float
        Used only to derive the number of time-series CV splits.
    outlier_feature_threshold : int
        Minimum number of features a training row must be an outlier in to be
        discarded (default 7, the value previously hard-coded).

    Returns
    -------
    DatasetMLModel
        ``train_size`` / ``test_size`` are the split sizes computed before
        outlier removal; the fitted scalers are included so predictions can be
        mapped back to the original scale.
    """
    rows, _ = dataset.df.shape
    train_size = int(rows * train_percent)
    tscv_split = int(rows // (train_size * evaluation_percent) - 1)
    test_size = rows - train_size
    X_train, y_train, X_test, y_test = make_X_Y(dataset, train_size, features, target)

    # clean_ouliers reads its data through a ``.df`` attribute, so wrap the
    # training frame instead of passing the bare DataFrame.
    outliers = clean_ouliers(Dataset(dataset.exchange, X_train), features)
    outliers_index = outliers[outliers['count'] >= outlier_feature_threshold].index

    # Drop the same rows from features and target; non-inplace drops avoid
    # writing into the slices returned by make_X_Y.
    X_train = X_train.drop(outliers_index)
    y_train = y_train.drop(outliers_index)

    X_train, y_train, X_test, y_test, scaler_features, scaler_target = scaler(
        X_train, y_train, X_test, y_test)
    return DatasetMLModel(dataset.exchange, train_size, tscv_split, test_size,
                          X_train, y_train, X_test, y_test,
                          scaler_features=scaler_features, scaler_target=scaler_target)






In [None]:
features=['volume','amount', 'avg_price','open','high','quantity',
          'EWMA26','EWMA12','EWMA9','log_return',
          'Bollinger Upper', 'Bollinger Lower','Heiking_Close','Heiking High',
          'Heiking Open','log_MACD','Variance12']
target=['log_return_Target']

# making_targets modifies the frame it is given (adds target columns, drops
# the last row). Work on a copy so that re-running this cell does not keep
# shortening the loaded dataset.
btc_brl_raw = log_divided_close_datasets['btc_brl']
btc_brl_with_targets = making_targets(Dataset(btc_brl_raw.exchange, btc_brl_raw.df.copy()))

# `dataset` is the modelling-ready split used by all following cells.
dataset = preprossessing(btc_brl_with_targets, features, target)

# Pairwise view of the scaled training features together with the target.
df_to_plot = pd.concat([dataset.X_train, dataset.y_train], axis=1)
scatter_matrix(df_to_plot, alpha=0.1, figsize=(15, 15), diagonal='kde');


In [None]:
import seaborn as sns

# Pearson correlation between every pair of columns in the plotted frame.
ax = sns.heatmap(df_to_plot.corr(method="pearson"))

In [None]:
from sklearn.decomposition import PCA

# Fit a whitened two-component PCA on the training features.
pca = PCA(n_components=2, whiten=True).fit(dataset.X_train)

# Project the training features onto the two components and label them
# for the biplot helper.
reduced_data = pd.DataFrame(pca.transform(dataset.X_train),
                            columns=['Dimension 1', 'Dimension 2'])
vs.biplot(dataset.X_train, reduced_data, pca)

In [None]:

# Refit PCA with four whitened components. This rebinds `pca`, so the
# four-component model is the one handed to the tuning cells further down.
pca = PCA(whiten=True, n_components=4, iterated_power=7)

# PCA.fit returns the estimator itself, so `pca_samples` refers to the fitted
# PCA object rather than to transformed samples.
pca_samples = pca.fit(dataset.X_train)

# Explained-variance / component-weight summary from the visuals helper.
pca_results = vs.pca_results(dataset.X_train, pca)

In [None]:
# Candidate kernels: a scaled, squared dot-product kernel for three different
# initial values of sigma_0.
kernels = [
    ConstantKernel(1.0, (1e-3, 1e3)) * (DotProduct(sigma_0=sigma_0, sigma_0_bounds=(0.0, 10.0)) ** 2)
    for sigma_0 in (5.0, 10.0, 1.0)
]

In [None]:
# Hand-picked hyper-parameter grids, keyed by the same names as `regressors`.
# Each inner dict is passed unchanged to GridSearchCV as its parameter grid
# (see fit_all). A later cell rebinds `parameters_per_regressor` to grids
# generated from `parameters_settings`, so this literal only drives the first
# search.
parameters_per_regressor={
    'NuSVR': {'C': [0.1],
           'gamma': [0.09,0.1,0.2], 
           'kernel': ['rbf'],
            'nu':[0.6],
           'max_iter':[15000]},
    # NOTE(review): with kernel fixed to 'rbf', scikit-learn documents `degree`
    # as ignored, so the three degree values presumably just triple the work --
    # confirm and trim.
     'SVR': {'C': [0.1,0.01],
           'gamma': [0.08,0.1,0.2],
           'kernel': ['rbf'],
           'degree': [2,3,4],
           'max_iter':[15000]},    

   'KernelRidge': {'alpha':range(2500,3500,100),
                   'gamma': [1],
                   'degree': [2,3],
                   'kernel': [ 'poly']},
    'GradientBoostingRegressor':{'loss' : [ 'huber'],
                             'n_estimators':range(5,20,1),
                             'alpha':[0.02,0.01,0.03],
                             'max_depth':[2,3]
                            },
    # AdaBoost is searched over three NuSVR base learners.
    'AdaBoostRegressor':{'base_estimator':[
                                           NuSVR(C=0.1,nu=0.70),
                                           NuSVR(C=0.1,nu=0.6),
                                           NuSVR(C=0.1,nu=0.8)
                                          ],
                         'loss':['linear', 'square', 'exponential'],
                      'n_estimators':range(10,300,10)},
    'RandomForestRegressor':{'max_depth':[2,3],
                         'n_estimators':range(70,90,2),
                        'bootstrap':[True]},
    'ExtraTreesRegressor':{'max_depth':[2,3],
                         'n_estimators':range(180,300,10),
                        'bootstrap':[True]},
    # Uses the `kernels` list defined in the previous cell.
    'GaussianProcessRegressor':{'kernel':kernels} , 
    'LinearSVR': {'C': [0.4],
           'loss': ['epsilon_insensitive'],
           'max_iter':[15000]},
    # Empty grid: the estimator is evaluated once with its defaults.
    'LinearRegression':{},
    'Lasso': { 'alpha':range(10,110,10)},
    # NOTE(review): the 'mse' criterion name is version-dependent in
    # scikit-learn -- verify against the installed release.
    'DecisionTreeRegressor': {'max_depth':range(1,4),'criterion':['mse','friedman_mse']},
}
# Registry of the estimators to evaluate, keyed by the names used in the
# parameter grids. Each entry is Regressor(name, class, params, type); only
# `regressor_class` is read by fit_all.
regressors={
    'LinearRegression': Regressor('LinearRegression', LinearRegression, None,'linear_model'), 
    'Lasso':Regressor('Lasso', Lasso, None,'linear_model'),
    'DecisionTreeRegressor':Regressor('DecisionTreeRegressor', DecisionTreeRegressor, None,'tree'),
    'GradientBoostingRegressor':Regressor('GradientBoostingRegressor', GradientBoostingRegressor, None,'ensemble'),
    'RandomForestRegressor':Regressor('RandomForestRegressor',RandomForestRegressor,None,'ensemble'),
    'ExtraTreesRegressor':Regressor('ExtraTreesRegressor',ExtraTreesRegressor,None,'ensemble'),
    'AdaBoostRegressor':Regressor('AdaBoostRegressor',AdaBoostRegressor,None,'ensemble'),
    'SVR':Regressor('SVR', SVR, None,'svm'),
    'NuSVR':Regressor('NuSVR', NuSVR, None,'svm'),
    'LinearSVR':Regressor('LinearSVR', LinearSVR, None,'svm'),
    # This entry previously instantiated SVR, so the results reported under
    # 'GaussianProcessRegressor' actually came from a second SVR run.
    'GaussianProcessRegressor':Regressor('GaussianProcessRegressor', GaussianProcessRegressor, None,'gaussian_process'),
    # NOTE(review): KernelRidge is labelled 'svm' although it is imported from
    # sklearn.kernel_ridge -- left unchanged; confirm whether `type` is used.
    'KernelRidge':Regressor('KernelRidge', KernelRidge, None,'svm')
}



In [None]:
def performance_metric(y_true, y_predict):
    """Return the R^2 (coefficient of determination) of the predictions.

    Kept as a named function so it can be wrapped by ``make_scorer``.
    """
    return r2_score(y_true, y_predict)
def fit_all(regressors,parameters_per_regressor,dataset, pca):
    """Grid-search every regressor on the PCA-reduced training data.

    Parameters
    ----------
    regressors : dict[str, Regressor]
        Registry of estimators; ``regressor_class`` is instantiated with no
        arguments for each entry.
    parameters_per_regressor : dict[str, dict]
        GridSearchCV parameter grid per regressor key.
    dataset : DatasetMLModel
        Provides ``X_train``, ``y_train`` and ``tscv_split`` (number of
        ``TimeSeriesSplit`` folds).
    pca : fitted transformer
        Applied to ``dataset.X_train`` before fitting.

    Returns
    -------
    pandas.DataFrame
        Indexed by regressor name with columns 'Score' (best mean CV R^2) and
        'BestEstimator'. A regressor whose search fails -- including one with
        no entry in ``parameters_per_regressor`` -- gets ``-inf`` and ``None``
        and the error is printed; the remaining regressors are still run.
    """
    scoring_fnc = make_scorer(performance_metric)

    # Loop-invariant inputs, computed once. A failure here is not specific to
    # any regressor, so it is allowed to propagate.
    X_reduced = pca.transform(dataset.X_train)
    y = dataset.y_train.values.ravel()

    data = []
    for regressor_key, regressor in regressors.items():
        print(regressor_key)
        try:
            grid = GridSearchCV(
                regressor.regressor_class(),
                parameters_per_regressor[regressor_key],
                # keyword form: `scoring` is keyword-only in current scikit-learn
                scoring=scoring_fnc,
                cv=TimeSeriesSplit(n_splits=dataset.tscv_split),
                error_score=-math.inf,
            )
            grid.fit(X_reduced, y)
            data_row = [regressor_key, grid.best_score_, grid.best_estimator_]
        except Exception as exp:
            # Best effort: report and record the failure, keep going.
            print('{}: {}'.format(regressor_key, exp))
            data_row = [regressor_key, -math.inf, None]
        # Appended after the try block rather than in `finally`, so that a
        # non-Exception interrupt is not masked by an unbound `data_row`.
        data.append(data_row)

    result_cv = pd.DataFrame(data, columns=['Regressor', 'Score', 'BestEstimator'])
    result_cv.set_index(['Regressor'], inplace=True)
    return result_cv


In [None]:
# First search: every registered regressor with the hand-picked grids, on the
# training data reduced by the most recently fitted `pca`.
scores=fit_all(regressors,parameters_per_regressor,dataset,pca)

In [None]:
# Rank the regressors by their best cross-validated score (failed fits, scored -inf, sort last).
scores.sort_values('Score',ascending=False)

In [None]:
# Inspect the best AdaBoost configuration found (None if that search failed).
scores['BestEstimator']['AdaBoostRegressor']


In [None]:
# Scratch: preview the 10-point log grid from 1e-3 to 1e3 (same call as the C/gamma grids in parameters_settings).
np.logspace(-3, 3, num=10, endpoint=True)


In [None]:
# Scratch: int() truncates the fractional part (10.869 -> 10).
int(10.869)

In [None]:
# Scratch: log-spaced grid from 800 to 1000; logspace takes exponents, hence log10(800) as the start.
np.logspace(np.log10(800),3,num=10,endpoint=True)

In [None]:
# Scratch cell exploring type handling for the settings tuples.
types=(int,float,object,str)
i=10.8697
# types[2] is `object`, so this comparison is False; the value is discarded
# because only a cell's last expression is displayed.
types[2]==str
# A degenerate linspace (start == end) cast to int yields ten 2s.
np.linspace(2, 2, num=10,endpoint=True).astype(int)


In [None]:

# Candidate kernels: a scaled, squared dot-product kernel for three different
# initial values of sigma_0.
kernels = [
    ConstantKernel(1.0, (1e-3, 1e3)) * (DotProduct(sigma_0=sigma_0, sigma_0_bounds=(0.0, 10.0)) ** 2)
    for sigma_0 in (5.0, 10.0, 1.0)
]

# Search-space description per regressor and hyper-parameter. Every value is a
# 6-tuple, unpacked by next_parameter_iteration as
#   (options, value type, space function, start, end, refine flag)
# - options:        candidate values for the current search round
# - value type:     options are cast with ndarray.astype(<type>) before use
# - space function: np.logspace / np.linspace used to regenerate the grid, or
#                   None for a fixed list
# - start, end:     arguments last given to the space function (exponents for
#                   np.logspace)
# - refine flag:    True -> the grid is narrowed around the best value each
#                   round; False -> the parameter collapses to the best value
parameters_settings={
    
    'NuSVR': {'C':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True),
           'gamma':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True), 
           'kernel':(['linear', 'poly', 'rbf', 'sigmoid'],object,None,None,None,False), 
           'nu':(np.linspace(0.1, 0.9, num=10,endpoint=True),float,np.linspace,0.1,0.9,True),
           'max_iter':([15000],int,None,None,None,False)
             },
    
    # NOTE(review): `degree` is cast to float here and for KernelRidge; SVR
    # presumably expects an integer degree -- confirm.
    'SVR': {'C':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True),
           'gamma':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True), 
           'kernel':(['linear', 'poly', 'rbf', 'sigmoid'],object,None,None,None,False), 
           'degree': (np.linspace(1, 10, num=10,endpoint=True),float,np.linspace,1,10,True),
           'max_iter':([15000],int,None,None,None,False)
           },
    
    'KernelRidge': {'alpha':(np.logspace(0,6,num=10,endpoint=True),float,np.logspace,0,6,True),
                   'gamma':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True), 
                   'degree': (np.linspace(1, 10, num=10,endpoint=True),float,np.linspace,1,10,True),  
                   'kernel':(['linear', 'poly', 'rbf', 'sigmoid'],object,None,None,None,False)
                   },
    
    # NOTE(review): `max_depth` carries linspace options but no space function
    # and a False refine flag, so it collapses to the best value after the
    # first round -- unlike max_depth for the forest models below. Confirm.
    'GradientBoostingRegressor':{'loss' : ([ 'ls', 'lad', 'huber', 'quantile'],object,None,None,None,False), 
                             'n_estimators':(np.logspace(0,3,num=10,endpoint=True),int,np.logspace,0,3,True),
                             'alpha':(np.logspace(-3,-0.022,num=10,endpoint=True),float,np.logspace,-3,-0.022,True), 
                             'max_depth':(np.linspace(1, 10, num=10,endpoint=True),int,None,None,None,False)
                            },                  
                    
    # NOTE(review): the `n_estimators` options are generated up to exponent
    # 2.62 with 5 points, but the recorded end is 3 -- the tuple disagrees with
    # itself; confirm which bound is intended.
    'AdaBoostRegressor':{'base_estimator':([
                                           NuSVR(C=0.1,nu=0.25),
                                           NuSVR(C=0.1,nu=0.50),
                                           NuSVR(C=0.1,nu=0.75),
                                           NuSVR(C=0.1,nu=0.9)
                                          ],object,None,None,None,False),
                         'loss':(['linear', 'square', 'exponential'],str,None,None,None,False), 
                      'n_estimators':(np.logspace(0,2.62,num=5,endpoint=True),int,np.logspace,0,3,True)  
                        } ,
    'RandomForestRegressor':{'max_depth':(np.linspace(1, 10, num=10,endpoint=True),int,np.linspace,1,10,True),
                         'n_estimators':(np.logspace(0,3,num=10,endpoint=True),int,np.logspace,0,3,True) } ,
                    
    'ExtraTreesRegressor':{'max_depth':(np.linspace(1, 10, num=10,endpoint=True),int,np.linspace,1,10,True),
                         'n_estimators':(np.logspace(0,3,num=10,endpoint=True),int,np.logspace,0,3,True) } ,
    'GaussianProcessRegressor':{'kernel':(kernels,object,None,None,None,False)} , 
    'LinearSVR': {'C':(np.logspace(-3,3,num=10,endpoint=True),float,np.logspace,-3,3,True),
           'loss':( ['epsilon_insensitive', 'squared_epsilon_insensitive' ],object,None,None,None,False),
           'max_iter':([15000],int,None,None,None,False)},
    # Empty settings: these estimators are evaluated with their defaults.
    'LinearRegression':{},
    'Lasso': { },
    'DecisionTreeRegressor': {}
    }  

In [None]:
def get_parameters_per_ML(parameters_settings):
    """Turn the settings tuples into plain GridSearchCV parameter grids.

    Parameters
    ----------
    parameters_settings : dict[str, dict[str, tuple]]
        Per regressor and parameter, a tuple whose first element holds the
        candidate options and whose second element is the value type.

    Returns
    -------
    dict[str, dict[str, list or numpy.ndarray]]
        Options declared with type ``object`` (kernels, estimators, category
        names) are passed through as a plain list; all other options are
        converted to an array of the declared type.
    """
    def _as_grid(setting):
        options, value_type = setting[0], setting[1]
        # Compare the *declared* type. The earlier test on type(options) could
        # never equal `object`, which left the pass-through branch unreachable.
        if value_type is object:
            return list(options)
        return np.array(options).astype(value_type)

    return {ML_algo: {parameter: _as_grid(setting)
                      for parameter, setting in parameters.items()}
            for ML_algo, parameters in parameters_settings.items()}
# Replace the hand-picked grids with the ones generated from
# `parameters_settings`, and display them for inspection.
parameters_per_regressor=get_parameters_per_ML(parameters_settings)
parameters_per_regressor

In [None]:
# Second search: same regressors, now with the generated grids. Rebinds `scores`.
scores=fit_all(regressors,parameters_per_regressor,dataset,pca)

In [None]:
def next_parameter_iteration(scores,parameters_settings,num=10,endpoint=True):
    """Derive the next round's search settings from the best estimators found.

    Parameters
    ----------
    scores : pandas.DataFrame
        Result of ``fit_all``: indexed by regressor name, with a
        'BestEstimator' column holding a fitted estimator or ``None``.
    parameters_settings : dict[str, dict[str, tuple]]
        Current settings; each value is
        ``(options, value type, space function, start, end, refine flag)``.
    num, endpoint
        Forwarded to the space function when a grid is regenerated.

    Returns
    -------
    dict
        New settings in the same layout. Regressors without a best estimator
        are omitted. For each parameter:

        * refine flag False -> options collapse to the single best value;
        * best value first in the grid -> the grid is regenerated from the
          unchanged ``start``/``end``;
        * best value last in the grid -> the window is moved upwards;
        * best value in the interior -> the window shrinks to the two
          neighbouring grid points;
        * best value not present in the grid -> the setting is carried over
          unchanged (this used to raise IndexError).
    """
    new_parameters_settings = {}
    for regressor, parameters in parameters_settings.items():
        best_estimator = scores['BestEstimator'][regressor]
        # Explicit None test: estimator truthiness is not a reliable signal
        # (objects defining __len__ can be falsy).
        if best_estimator is None:
            continue
        best_params = best_estimator.get_params()

        parameter_setting = {}
        for parameter, setting in parameters.items():
            options, value_type, space_function, start, end, refine = setting
            best_value = best_params[parameter]

            if not refine:
                options = [best_value]
            else:
                candidates = np.array(options).astype(value_type)
                matches, = np.where(candidates == best_value)
                if len(matches) == 0:
                    # Nothing to centre the new window on: keep searching the
                    # same grid instead of crashing.
                    parameter_setting[parameter] = setting
                    continue

                options = candidates
                index = int(matches[0])
                if index == 0:
                    options = [best_value]
                elif index == len(candidates) - 1:
                    if space_function is np.linspace:
                        # NOTE(review): the log10-based upper bound is kept
                        # from the original; it yields 1 for a grid ending at
                        # 0.9 but 2 for a grid ending at 10, i.e. a bound
                        # below `start`. Confirm the intended rule.
                        end = int(math.log10(end)) + 1
                        # .item() replaces np.asscalar, removed from NumPy
                        start = candidates[index - 1].item()
                    if space_function is np.logspace:
                        # Slide the window up by one decade.
                        start, end = end, end + 1
                else:
                    if space_function is np.linspace:
                        start = candidates[index - 1].item()
                        end = candidates[index + 1].item()
                    if space_function is np.logspace:
                        # logspace takes exponents, hence log10 of neighbours
                        start = math.log10(candidates[index - 1])
                        end = math.log10(candidates[index + 1])

            if space_function:
                options = space_function(start, end, num, endpoint)
            parameter_setting[parameter] = (options, value_type, space_function,
                                            start, end, refine)
        new_parameters_settings[regressor] = parameter_setting
    return new_parameters_settings

def tune_models (parameters_settings, iterations,dataset,pca):
    """Run ``iterations`` rounds of grid search with successively refined grids.

    Each round converts the settings to grids (``get_parameters_per_ML``),
    fits every entry of the module-level ``regressors`` registry
    (``fit_all``), then derives the next settings
    (``next_parameter_iteration``). A regressor that yields no best estimator
    in a round is absent from the following settings.

    Returns
    -------
    tuple
        ``(scores, parameters_settings)`` -- the scores of the last round and
        the settings derived from it. With ``iterations == 0`` no search is
        run and ``(None, parameters_settings)`` is returned (this used to
        raise because ``scores`` was never bound).
    """
    scores = None
    for _ in range(iterations):
        parameters_per_regressor = get_parameters_per_ML(parameters_settings)
        scores = fit_all(regressors, parameters_per_regressor, dataset, pca)
        parameters_settings = next_parameter_iteration(scores, parameters_settings,
                                                       num=10, endpoint=True)
    return scores, parameters_settings

In [None]:
# Keep a handle on the scores of the preceding search round.
scores1=scores

# Narrow every grid around the best values found in that round.
parameters_settings1=next_parameter_iteration(scores1,parameters_settings)

# Four further refinement rounds starting from the narrowed settings.
scores4, parameters_settings4=tune_models(parameters_settings1, 4,dataset,pca)

In [None]:
# Rank the regressors after the refinement rounds. The cell previously read
# `scores2`, which no visible cell defines; `scores4` is the result produced
# by the tuning cell above.
scores4.sort_values('Score',ascending=False)

In [None]:
# Scratch: None is falsy (bool(None) is False).
bool(None)