## Downloads required

In [None]:
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib

In [2]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

%cd /content/drive/My Drive/Trading

Mounted at /content/drive
/content/drive/My Drive/Trading


# Set up 

### Loading Libraries

In [3]:
#Basics
import pandas as pd
from pandas import DataFrame
import numpy as np
import talib as ta
from datetime import datetime, timedelta

#Visuals
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15,7)
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

import seaborn as sns

#ARIMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

#Sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn import cross_decomposition
from sklearn.model_selection import cross_val_score, train_test_split


from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import BayesianRidge, ElasticNetCV, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor


from sklearn.metrics import mean_absolute_error, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectKBest, chi2, VarianceThreshold

import xgboost as xgb


# Default figure size
sns.set(rc={"figure.figsize": (12, 6)})

# Hoping to get rid of these
from scipy import stats
import statsmodels.api as sm
from itertools import product

import warnings
warnings.filterwarnings('ignore')

fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)

rand_seed = 1234

  import pandas.util.testing as tm


## Load Data

In [4]:
import pandas_datareader as pdr
import datetime
import math

In [62]:
RIOT= pdr.DataReader('RIOT', 'yahoo')
CAN= pdr.DataReader('CAN', 'yahoo')

In [63]:
RIOT = RIOT.reset_index()
CAN = CAN.reset_index()

In [64]:
for item in (RIOT, CAN):
    item.columns = map(str.lower, item.columns)
    item['date'] = pd.to_datetime(item.date)
    #item.set_index('date', inplace=True)
    item = item.fillna(method="ffill")



# Feature Engineering



*   Lagged vars
*   Rolling means
*   Rolling variances







In [65]:
#tf = tf.reset_index()
for df in (RIOT, CAN):
  
  

  print("Creating overlapping items..")
  #add technical indicators - overlapping items
  # Bollinger bands
  df['upper_band'], df['middle_band'], df['lower_band'] = ta.BBANDS(df['close'], timeperiod =30)

  # To ensure the model focuses on the most current events rather than previous trends too heavily
  df['ewm_30'] = df["close"].ewm(span=30).mean()
  df['ewm_15'] = df["close"].ewm(span=15).mean()

  #df['HT_Trendline'] = ta.HT_TRENDLINE(df['close'])

  df['dema_15'] = ta.DEMA(df['close'],15)
  df['dema_30'] = ta.DEMA(df['close'],30)

  df['kama_15'] = ta.KAMA(df['close'],15)
  df['kama_30'] = ta.KAMA(df['close'],30)

  df['MIDPOINT_15'] = ta.MIDPOINT(df['close'],15)
  df['MIDPOINT_30'] = ta.MIDPOINT(df['close'],30)
  

  df['MIDPOINT_15'] = ta.MIDPOINT(df['close'],15)
  df['MIDPOINT_30'] = ta.MIDPOINT(df['close'],30)

  df['SAR'] =ta.SAR(df['high'], df['low'], 0, 0.2)

  df['SMA_5'] = df['close'].rolling(5).mean()
  df['SMA_7'] = df['close'].rolling(7).mean()
  df['SMA_14'] = df['close'].rolling(14).mean()
  df['SMA_30'] = df['close'].rolling(30).mean()

  
  #df['tema_15'] = ta.TEMA(df['close'],15)
  #df['tema_30'] = ta.TEMA(df['close'],30)

  df['trima_15'] = ta.TRIMA(df['close'],15)
  df['trima_30'] = ta.TRIMA(df['close'],30)

  df['WMA_5'] = ta.WMA(df['close'],5)
  df['WMA_7'] = ta.WMA(df['close'],7)
  df['WMA_14'] = ta.WMA(df['close'],14)
  df['WMA_30'] = ta.WMA(df['close'],30)

  print("Variances..")

  df['roll_var_5'] = df['close'].rolling(5).var()
  df['roll_var_7'] = df['close'].rolling(7).var()
  df['roll_var_14'] = df['close'].rolling(14).var()
  df['roll_var_30'] = df['close'].rolling(30).var()
 
  print("Standard deviations..")
  # set .std(ddof=0) for population std instead of sample
  df['5 Day STD'] = df['close'].rolling(window=5).std()
  df['7 Day STD'] = df['close'].rolling(window=7).std()
  df['14 Day STD'] = df['close'].rolling(window=14).std() 
  df['30 Day STD'] = df['close'].rolling(window=30).std() 



# Plot
  print("Done!")


Creating overlapping items..
Variances..
Standard deviations..
Done!
Creating overlapping items..
Variances..
Standard deviations..
Done!


## Momentum indicators

In [66]:
# Momentum indicators
for df in (RIOT, CAN):

  df['ADX_5'] = ta.ADX(df['high'], df['low'], df['close'],5)
  df['ADX_10'] = ta.ADX(df['high'], df['low'], df['close'],10)
  df['ADX_30'] = ta.ADX(df['high'], df['low'], df['close'],30)

  df['ADXR_10'] = ta.ADXR(df['high'], df['low'], df['close'],10)

  df['APO'] = ta.APO(df.close, fastperiod=10, slowperiod=30, matype = 0)

  #df['Aroonup_10'] = ta.AROONOSC(df['close'],10)
  df['aroondown'], df['aroonup'] = ta.AROON(df['high'], df['low'], timeperiod=10)
  df['aroon_osc'] = ta.AROONOSC(df['high'], df['low'], timeperiod=10)

  df['BOP'] = ta.BOP(df['open'],df['high'], df['low'], df['close'])

  df['CCI_30'] = ta.CCI(df['high'], df['low'], df['close'],30)

  df['CMO_10'] = ta.CMO(df['close'],10)

  df['DX_10'] = ta.DX(df['high'], df['low'], df['close'],10)

  df['macd'], df['macdsignal'], df['macdhist'] = ta.MACD(df['close'], fastperiod=10, slowperiod=30, signalperiod=5)

  #df['MACD_5'] = ta.MACD(df['close'],5)
  #df['MACD_10'] = ta.MACD(df['close'],10)
  #df['MACD_30'] = ta.MACD(df['close'],30)

  df['MINUS_DI_10'] = ta.MINUS_DI(df['high'], df['low'], df['close'],10)
  df['MINUS_DM_10'] = ta.MINUS_DM(df['high'], df['low'],10)

  df['MOM_10'] = ta.MOM(df['close'],10)


  df['PLUS_DI_10'] = ta.PLUS_DI(df['high'], df['low'], df['close'],10)
  df['PLUS_DM_10'] = ta.PLUS_DM(df['high'], df['low'],10)

  df['PPO'] = ta.PPO(df['close'], fastperiod=10, slowperiod=30, matype=0)


  df['roc_5'] = ta.ROC(df['close'], 5)
  df['roc_10'] = ta.ROC(df['close'], 10)
  df['roc_30'] = ta.ROC(df['close'], 30)

  df['rsi_5'] = ta.RSI(df['close'], 5)
  df['rsi_10'] = ta.RSI(df['close'], 10)
  df['rsi_30'] = ta.RSI(df['close'], 30)

  #add stochastc oscillators 
  df['slowk'], df['slowd'] = ta.STOCH(df['high'], df['low'], df['close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
  df['fastk'], df['fastd'] = ta.STOCHF(df['high'], df['low'], df['close'], fastk_period=14, fastd_period=3, fastd_matype=0)

  df['fastk_rsi'], df['fastd_rsi']  = ta.STOCHRSI(df['close'], timeperiod=10, fastk_period=5, fastd_period=3, fastd_matype=0)

  df['TRIX_10'] = ta.TRIX(df['close'], 10)

  df['ULTOSC'] = ta.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=5, timeperiod2=10, timeperiod3=30)
 
  df['WILLR'] = ta.WILLR(df['high'], df['low'], df['close'],10)
  
  print("Oscillators added..")

  df['HT_DCPERIOD'] = ta.HT_DCPERIOD(df['close'])
  #df['HT_DCPHASE'] = ta.HT_DCPHASE(df['close'])
  df['HT_DCTREND'] = ta.HT_TRENDMODE(df['close'])

  print("Cycle indicators added..")
  df['Tomorrow']= df['close'].shift()

Oscillators added..
Cycle indicators added..
Oscillators added..
Cycle indicators added..


In [67]:
for df in (RIOT, CAN):
  df = df.drop(['high', 'low', 'open', 'volume', 'adj close'], axis = 1)

# Save dataframe

In [68]:
RIOT = RIOT.drop(['high', 'low', 'open', 'volume', 'adj close'], axis = 1)
CAN = CAN.drop(['high', 'low', 'open', 'volume', 'adj close'], axis = 1)

RIOT = RIOT.dropna()
CAN = CAN.dropna()

## Datetime conversion
RIOT['date'] = pd.to_datetime(RIOT.date)
## Datetime conversion
CAN['date'] = pd.to_datetime(CAN.date)

In [None]:
cd Thesis\ -\ submitting/

[Errno 2] No such file or directory: 'Thesis - submitting/'
/content/ta-lib/ta-lib


In [None]:
btc.to_csv('btc_w_features.csv')

eth.to_csv('eth_w_features.csv')

In [69]:
RIOT.head()

Unnamed: 0,date,close,upper_band,middle_band,lower_band,ewm_30,ewm_15,dema_15,dema_30,kama_15,kama_30,MIDPOINT_15,MIDPOINT_30,SAR,SMA_5,SMA_7,SMA_14,SMA_30,trima_15,trima_30,WMA_5,WMA_7,WMA_14,WMA_30,roll_var_5,roll_var_7,roll_var_14,roll_var_30,5 Day STD,7 Day STD,14 Day STD,30 Day STD,ADX_5,ADX_10,ADX_30,ADXR_10,APO,aroondown,aroonup,aroon_osc,BOP,CCI_30,CMO_10,DX_10,macd,macdsignal,macdhist,MINUS_DI_10,MINUS_DM_10,MOM_10,PLUS_DI_10,PLUS_DM_10,PPO,roc_5,roc_10,roc_30,rsi_5,rsi_10,rsi_30,slowk,slowd,fastk,fastd,fastk_rsi,fastd_rsi,TRIX_10,ULTOSC,WILLR,HT_DCPERIOD,HT_DCTREND,Tomorrow
59,2016-06-23,3.6,4.030811,3.534367,3.037922,3.457598,3.507361,3.543469,3.596605,3.438309,3.404644,3.5,3.72,2.31,3.566,3.495714,3.52,3.534367,3.470328,3.539646,3.573333,3.541429,3.500571,3.521497,0.00223,0.017095,0.023585,0.063739,0.047223,0.130749,0.153573,0.252466,17.484966,31.25715,44.030941,43.948204,-0.076367,50.0,0.0,-50.0,0.1875,25.759392,10.048032,14.745657,0.072011,0.066409,0.005602,21.344338,0.548846,-0.04,28.727804,0.738704,-2.16069,10.429445,-1.098907,15.384617,60.015704,55.024016,55.151819,64.074059,63.703689,66.666649,64.074059,79.503949,83.025079,0.038424,59.943398,-17.808234,15.226947,0,3.52
60,2016-06-24,3.48,4.007426,3.550033,3.092641,3.459068,3.50394,3.532576,3.591082,3.43874,3.40567,3.5,3.815,2.31,3.56,3.51,3.501429,3.550033,3.463437,3.537938,3.544667,3.5375,3.495238,3.517989,0.00325,0.014667,0.019613,0.054106,0.057009,0.121106,0.140047,0.232606,19.417596,28.273249,43.465393,40.638425,-0.094033,40.0,90.0,50.0,-0.222222,-35.422222,-1.18065,1.418146,0.062805,0.065207,-0.002402,26.988701,0.683961,-0.02,26.233926,0.664833,-2.648802,-0.8547,-0.571428,15.614619,45.903656,49.409675,53.376656,59.259249,62.345666,53.33333,59.259249,0.0,49.691746,0.047237,49.895973,-34.246574,14.97319,0,3.6
61,2016-06-27,3.21,4.006752,3.550367,3.093981,3.442738,3.467188,3.460133,3.552072,3.424798,3.404827,3.46,3.82,2.31,3.484,3.502857,3.472857,3.550367,3.457031,3.531642,3.428,3.4625,3.456381,3.496052,0.02643,0.01919,0.02436,0.053867,0.162573,0.13853,0.156078,0.232094,24.633235,26.798718,42.747132,38.265113,-0.104367,30.0,80.0,50.0,-1.0,-113.027388,-21.264889,13.527935,0.02335,0.051255,-0.027905,30.796511,0.785565,-0.1,23.457116,0.59835,-2.939603,-10.584955,-3.021145,0.3125,27.630671,39.367555,49.656505,48.333329,57.222212,25.000007,48.333329,0.0,26.501316,-0.000409,37.162788,-71.232872,14.731542,0,3.48
62,2016-06-28,3.22,4.009225,3.538033,3.066841,3.428149,3.436282,3.403105,3.517757,3.415888,3.402673,3.46,3.82,2.31,3.406,3.461429,3.437857,3.538033,3.4575,3.521471,3.34,3.391786,3.422667,3.474738,0.03228,0.030514,0.023634,0.057419,0.179666,0.174683,0.153732,0.239624,29.379468,25.675019,42.022983,36.951004,-0.100033,20.0,70.0,50.0,0.222222,-110.111147,-20.259059,15.561734,-0.005949,0.032187,-0.038136,29.769195,0.737009,-0.08,21.751657,0.538515,-2.827372,-10.803321,-2.42424,-10.306404,28.94025,39.87047,49.790582,34.841273,47.47795,26.190482,34.841273,3.212188,1.070729,-0.073075,34.169267,-69.863011,14.561804,0,3.21
63,2016-06-29,3.42,3.840916,3.504367,3.167818,3.427616,3.434247,3.403456,3.511462,3.416,3.403366,3.425,3.59,2.31,3.386,3.437143,3.422143,3.504367,3.458125,3.510008,3.344667,3.381429,3.420286,3.467123,0.02858,0.027357,0.020249,0.029293,0.169056,0.1654,0.142299,0.171151,28.285358,23.823561,41.381608,33.96476,-0.062367,10.0,60.0,50.0,0.64,-70.623336,-1.542526,7.160432,-0.004844,0.019843,-0.024688,26.766051,0.663308,0.04,23.18905,0.574664,-1.779685,-2.840906,1.183431,-22.799092,51.074086,49.228737,52.412716,36.241582,39.805394,57.534256,36.241582,62.984744,22.065644,-0.118334,42.929969,-42.465744,14.438388,1,3.22


# Feature selection

In [None]:
#!pip install eli5

In [None]:
#eth = eth.drop(['index'], axis =1)

In [70]:
CAN = CAN.set_index('date')

In [72]:
CAN.head()

Unnamed: 0_level_0,close,upper_band,middle_band,lower_band,ewm_30,ewm_15,dema_15,dema_30,kama_15,kama_30,MIDPOINT_15,MIDPOINT_30,SAR,SMA_5,SMA_7,SMA_14,SMA_30,trima_15,trima_30,WMA_5,WMA_7,WMA_14,WMA_30,roll_var_5,roll_var_7,roll_var_14,roll_var_30,5 Day STD,7 Day STD,14 Day STD,30 Day STD,ADX_5,ADX_10,ADX_30,ADXR_10,APO,aroondown,aroonup,aroon_osc,BOP,CCI_30,CMO_10,DX_10,macd,macdsignal,macdhist,MINUS_DI_10,MINUS_DM_10,MOM_10,PLUS_DI_10,PLUS_DM_10,PPO,roc_5,roc_10,roc_30,rsi_5,rsi_10,rsi_30,slowk,slowd,fastk,fastd,fastk_rsi,fastd_rsi,TRIX_10,ULTOSC,WILLR,HT_DCPERIOD,HT_DCTREND,Tomorrow
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
2020-02-18,5.75,7.228943,5.689333,4.149724,5.738617,5.765708,5.941804,5.507743,5.61595,5.839475,6.22,6.22,13.0,6.322,5.818571,5.626429,5.689333,5.356406,5.509542,6.418,6.185,5.765238,5.627548,1.82092,1.953181,1.036209,0.613034,1.349415,1.397562,1.017944,0.782965,48.34201,28.221009,9.389184,23.263845,-0.123333,60.0,70.0,10.0,-0.947368,46.368172,0.181718,28.572378,0.054964,-0.095432,0.150396,18.928191,1.893694,0.17,34.071449,3.408719,-2.167798,25.820564,3.046596,-5.271831,47.600185,50.090859,45.290766,47.48859,53.297825,32.876716,47.48859,45.564331,58.893834,0.270293,48.443356,-67.123284,19.452723,0,6.78
2020-02-19,5.71,7.221678,5.683667,4.145655,5.736738,5.758742,5.906556,5.514004,5.617077,5.838783,6.22,6.22,13.0,6.584,5.984286,5.629286,5.683667,5.40875,5.499625,6.214,6.157857,5.776381,5.628882,0.90523,1.654895,1.036592,0.611762,0.951436,1.286427,1.018131,0.782152,43.32919,28.256146,9.734063,22.403966,-0.057667,50.0,60.0,10.0,-0.161765,26.671954,-0.508853,28.572378,0.031777,-0.053029,0.084806,17.599095,1.704324,0.6,31.679027,3.067847,-1.014602,29.772725,11.741681,-2.891158,47.054137,49.745573,45.139583,40.410965,50.938616,31.963475,40.410965,0.0,37.394705,0.38764,46.794502,-68.036525,18.494683,0,5.75
2020-02-20,5.32,7.20957,5.667,4.12443,5.709415,5.703886,5.785093,5.471599,5.614724,5.834659,6.22,6.22,13.0,6.04,6.091429,5.590714,5.667,5.480938,5.491708,5.792667,5.991786,5.735143,5.605419,0.40475,1.381681,1.038253,0.615394,0.6362,1.175449,1.018947,0.784471,35.205791,26.734885,9.867361,20.941645,-0.003,40.0,50.0,10.0,-0.333333,-39.758035,-7.422215,13.04354,-0.03209,-0.046049,0.013959,22.798951,2.123892,0.38,29.638673,2.761062,-0.052937,-33.830843,7.69231,-8.591065,41.282457,46.288893,43.669317,29.299853,39.06647,23.059368,29.299853,0.0,15.18811,0.394721,29.807619,-76.940632,17.499709,0,5.71
2020-02-21,5.86,7.19353,5.658333,4.123137,5.719278,5.723404,5.811392,5.501289,5.615745,5.83481,6.22,6.22,13.0,5.884,6.3,5.580714,5.658333,5.590469,5.484417,5.732667,5.933929,5.771048,5.617871,0.29243,0.863033,1.030838,0.609525,0.540768,0.928996,1.015302,0.780721,30.049593,25.92121,10.070095,20.567081,0.103667,30.0,40.0,10.0,0.787402,-1.735128,2.952495,18.598132,-0.017255,-0.036451,0.019196,20.899489,1.911502,0.98,30.449429,2.784956,1.832107,-11.746984,20.081967,-4.248362,51.565049,51.476248,46.180237,30.136993,33.282604,35.388135,30.136993,38.699764,12.899921,0.397642,35.457663,-64.611865,16.752901,0,5.32
2020-02-24,5.65,7.134155,5.630333,4.126512,5.714745,5.714227,5.783069,5.502541,5.616009,5.832747,6.22,6.22,13.0,5.658,5.958572,5.585714,5.630333,5.742344,5.47675,5.654667,5.771429,5.780286,5.617333,0.04157,0.292848,1.03118,0.584865,0.203887,0.541154,1.01547,0.764765,27.092781,25.449877,10.300396,20.476623,0.241667,20.0,30.0,10.0,-0.548386,9.757543,-1.171765,21.207882,-0.030331,-0.034411,0.00408,19.4356,1.720352,1.1,29.898269,2.646461,4.292227,-16.666668,24.175821,-12.942985,47.519802,49.414118,45.366706,29.680372,29.705739,30.593613,29.680372,60.246985,32.982249,0.372863,37.044661,-69.406387,16.392226,0,5.86


## Reset from RIOT to CAN

In [94]:
RIOT = RIOT.set_index('date')
TA= RIOT

In [95]:
X = TA.iloc[:,0:(len(TA.columns)-1)]
y = TA.iloc[:,-1]

In [96]:
import pandas as pd

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.8, random_state = 42, shuffle= False)

## Use Random forrest regressor to figure out the best feature

In [97]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [98]:
rf.fit(X_train, y_train)

print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_train, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_valid, y_valid)))

R^2 Training Score: 0.99 
OOB Score: 0.91 
R^2 Validation Score: -0.02


# Use elif to get the best and worst feature

In [25]:
#!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |███                             | 10kB 17.5MB/s eta 0:00:01[K     |██████▏                         | 20kB 12.5MB/s eta 0:00:01[K     |█████████▎                      | 30kB 9.7MB/s eta 0:00:01[K     |████████████▍                   | 40kB 7.8MB/s eta 0:00:01[K     |███████████████▌                | 51kB 4.6MB/s eta 0:00:01[K     |██████████████████▌             | 61kB 5.1MB/s eta 0:00:01[K     |█████████████████████▋          | 71kB 5.2MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 5.5MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 5.4MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 5.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 5.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.0


In [99]:
def imp_df(column_names, importances):
    df = pd.DataFrame({'feature': column_names,
                       'feature_importance': importances}) \
           .sort_values('feature_importance', ascending = False) \
           .reset_index(drop = True)
    return df

In [100]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(rf, cv = None, refit = False, n_iter = 50).fit(X_train, y_train)
perm_imp_eli5 = imp_df(X_train.columns, perm.feature_importances_)

In [101]:
from sklearn.base import clone 

def drop_col_feat_imp(model, X_train, y_train, random_state = 42):
    
    # clone the model to have the exact same specification as the one initially trained
    model_clone = clone(model)
    # set random_state for comparability
    model_clone.random_state = random_state
    # training and scoring the benchmark model
    model_clone.fit(X_train, y_train)
    benchmark_score = model_clone.score(X_train, y_train)
    # list for storing feature importances
    importances = []
    
    # iterating over all columns and storing feature importance (difference between benchmark and new model)
    for col in X_train.columns:
        model_clone = clone(model)
        model_clone.random_state = random_state
        model_clone.fit(X_train.drop(col, axis = 1), y_train)
        drop_col_score = model_clone.score(X_train.drop(col, axis = 1), y_train)
        importances.append(benchmark_score - drop_col_score)
    
    importances_df = imp_df(X_train.columns, importances)
    return importances_df

In [102]:
drop_imp = drop_col_feat_imp(rf, X_train, y_train)
#var_imp_plot(drop_imp, 'Drop Column feature importance')

In [103]:
drop_imp.sort_values(by= ['feature_importance'])

drop_imp.to_csv('drop_features_list_RIOT.csv')

In [104]:
drop_imp = drop_imp.set_index('feature') 

In [105]:
to_del = drop_imp.drop("close", axis =0)

In [106]:
to_del = to_del[(to_del['feature_importance']<0)]

In [107]:
to_del= to_del.reset_index()

In [108]:
del_list = to_del["feature"].tolist()


In [109]:
clean = TA.drop(del_list, axis=1)
clean = clean.reset_index()

In [110]:
len(CAN.columns)

70

In [111]:
len(clean.columns)

36

In [112]:
clean.tail(2)

Unnamed: 0,date,close,upper_band,dema_30,kama_15,kama_30,SMA_5,SMA_7,SMA_14,SMA_30,trima_15,trima_30,WMA_7,WMA_14,WMA_30,roll_var_7,roll_var_14,roll_var_30,5 Day STD,7 Day STD,14 Day STD,ADX_5,ADX_10,APO,aroondown,BOP,DX_10,macd,MINUS_DM_10,slowd,fastk_rsi,fastd_rsi,ULTOSC,HT_DCPERIOD,HT_DCTREND,Tomorrow
1178,2021-03-01,53.599998,72.232302,54.258718,56.560892,47.222041,48.816,54.31,53.456429,36.763667,57.164532,35.62825,50.797143,54.495143,44.679441,103.380552,148.320003,325.351058,4.197621,10.167623,12.17867,39.909954,35.984774,21.196333,90.0,0.630705,3.810269,10.593591,25.758488,42.857884,100.0,33.333333,43.134678,33.674208,1,43.740002
1179,2021-03-02,49.099998,73.034844,54.437301,56.409168,47.267987,48.875999,51.134286,54.116429,37.541,57.606094,37.610584,49.494642,53.914286,45.475333,47.858471,135.090559,325.813704,4.199479,6.917982,11.622846,31.97187,33.367902,19.379,80.0,-0.92723,9.816057,9.661434,23.182639,39.932761,48.910836,49.636945,42.570749,32.958695,1,53.599998


In [None]:
#clean = clean.drop(['SAR'], axis =1)

In [39]:
ls

drop_features_list_CAN.csv


In [113]:
clean.to_csv('cleaned_RIOT.csv')