In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
!pip install -q plotly==4.2.1

## **IMPORT LIBRARIES**

In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import plot_confusion_matrix



In [4]:
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.graphics.tsaplots as tsa
import statsmodels.stats.diagnostic as diag

In [None]:
!pip install chart_studio

In [None]:
!pip install dash

In [None]:
from plotly import tools
import gc
import datetime
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots

## **READ THE DATA**

In [None]:
df = pd.read_csv('Data/BTC-USD.csv',header=0)
print(df.head())

df['Date']=pd.to_datetime(df['Date'])
print(df.info())

## **EXPLORATORY DATA ANALYSIS**

### **LJung Box Test for White Noise Detection**

### Autocorrelation Plot

In [None]:
tsa.plot_acf(df['Close'], lags=40, alpha=0.05, title='Auto-correlation coefficients for lags 1 through 40')

**Testing using Ljung Box**

In [None]:
diag.acorr_ljungbox(df['Close'], lags=[40], boxpierce=True)

Test Statistic for Ljungbox = 59590.70658337

p-value for Ljungbox = 0 


Test Statistic for Box-Pierce Test = 58902.34933406

p-value for Box-Pierce = 0

Both the p-values from Chi-Square table are less than 0.01 , So with 99% confidence , we say that the time series is not Pure White noise

**Price Trend**

In [None]:

plt.plot('Date', 'Close', data=df)
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title('Closing price trend')
plt.xticks(rotation=45)
plt.savefig('priceTrend.jpg', format='jpg')

In [None]:
import datetime
from datetime import timezone

data = df

start = datetime.datetime(2015,11,19)#2015-11-19
end = datetime.datetime(2020,11,19)#2020-11-19

# find rows between start and end time and find the first row (00:00 monday morning)
weekly_rows = data[(data['Date'] >= start) & (data['Date'] <= end)].groupby([pd.Grouper(key='Date', freq='W-MON')]).first().reset_index()
weekly_rows.head()

## **VISUALIZATION**

**HIGH PRICE MOVEMENT**

In [None]:
# Load data
import plotly.graph_objects as go


# Create figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(x = list(weekly_rows.Date),
               y = list(weekly_rows.High)))

# Set title
fig.update_layout(
    title_text="Time series with range slider and selectors"
)

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
               
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)
fig.show()

**Volume Movement**

In [None]:
  # Load data
import plotly.graph_objects as go


# Create figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(x = list(weekly_rows.Date),
               y = list(weekly_rows.Volume)))

# Set title
fig.update_layout(
    title_text="Time series with range slider and selectors"
)

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)
fig.show()

In [None]:
df['Daily Lag']=df['Close'].shift(1)
#df.head()
df['Daily Returns']=(df['Daily Lag']/df['Close'])-1
#df.head()
#df['Daily Returns'].hist()
mean=df['Daily Returns'].mean()
std=df['Daily Returns'].std()
print("\n Mean: ",mean)
print("\n Standard deviation: ",std)

## **MODELS FOR CLASSIFICATION**

In [None]:
logRegData=df
logRegData['PriceDifference']=logRegData['Open']-logRegData['Close']
logRegData['UpDown']=np.where(logRegData['PriceDifference']>0,'-1','1')
logRegData.head()
logRegData['LagPrice1']=logRegData['Close'].shift(1)
logRegData['LagPrice2']=logRegData['Close'].shift(2)
logRegData['LagPrice3']=logRegData['Close'].shift(3)
logRegData['LagPrice4']=logRegData['Close'].shift(4)
logRegData['LagPrice5']=logRegData['Close'].shift(5)
logRegData['MA_10']=logRegData['Close'].rolling(window=10).mean()
logRegData['MA_20']=logRegData['Close'].rolling(window=20).mean()
logRegData['STD_10']=logRegData['Close'].rolling(window=10).std()
logRegData['STD_20']=logRegData['Close'].rolling(window=20).std()
#logRegData['MACD_10']=logRegData['Close'].rolling(window=10).macd()
logRegData['EMA_12']=logRegData['Close'].ewm(span=12,adjust=False).mean()
logRegData['EMA_26']=logRegData['Close'].ewm(span=26,adjust=False).mean()
logRegData['MACD']=logRegData['EMA_12']-logRegData['EMA_26']
logRegData['UpperBollinger']=logRegData['MA_20']+2*logRegData['STD_20']
logRegData['LowerBollinger']=logRegData['MA_20']-2*logRegData['STD_20']

logRegData=logRegData.iloc[20:]
logRegData.head()

## **TRAIN TEST SPLIT**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#X=logRegData[['MA_10','MA_20','STD_10','STD_20','MACD','UpperBollinger','LowerBollinger']]
X=logRegData[['MA_10','STD_10','MACD','UpperBollinger','LowerBollinger']]
y=logRegData[['UpDown']]
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.25)
y_train=y_train['UpDown'].to_list()
y_test=y_test['UpDown'].to_list()

## **LOGISTIC REGRESSION MODEL** 

In [None]:
#building logistic regression model
#training logistic regression model

from sklearn.linear_model import LogisticRegression
corr_matrix=X.corr()
print(corr_matrix)

log_model=LogisticRegression(max_iter=150)
log_model.fit(X_train,y_train)
y_pred=log_model.predict(X_test)
print(y_pred)

In [None]:
#log_model.predict_proba(X_test)
#from sklearn.metrics import log_loss
#print(log_loss(y_test,y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
plot_confusion_matrix(log_model,X_test,y_test)

In [None]:
f1_log_reg = f1_score(y_test,y_pred,zero_division='warn',average=None).mean()*100
print (accuracy_score(y_test,y_pred)*100)

## **SUPPORT VECTOR CLASSIFIER MODEL**

In [None]:
from matplotlib import style
style.use("ggplot")
from sklearn import svm

In [None]:
y_train = [int(element) for element in y_train]
print(y_train)

In [None]:
y_test = [int(element) for element in y_test]

**RBF Kernel**

In [None]:
clf = svm.SVC(kernel='rbf', random_state=1, max_iter=100, gamma=0.1, C=0.02)
clf.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import GridSearchCV,StratifiedShuffleSplit

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
iter_values = [100,150,200,250]

param_grid = dict(max_iter = iter_values, gamma=gamma_range, C=C_range)

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid_svm = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, cv=cv,scoring='f1')

grid_svm.fit(X_train,y_train)



In [None]:
print("The best parameters are %s with a score of %0.2f" % (grid_svm.best_params_, grid_svm.best_score_))

In [None]:
y_pred_svc = grid_svm.predict(X_test)
y_pred_svc

In [None]:
print(confusion_matrix(y_test,y_pred_svc))
print(classification_report(y_test,y_pred_svc))

plot_confusion_matrix(grid_svm,X_test,y_test)

In [None]:
y_pred_svc = clf.predict(X_test)
y_pred_svc

In [None]:
print(confusion_matrix(y_test,y_pred_svc))
print(classification_report(y_test,y_pred_svc))

plot_confusion_matrix(clf,X_test,y_test)

In [None]:
f1_svc = f1_score(y_test,y_pred_svc,zero_division='warn',average=None).mean()*100
print(accuracy_score(y_test,y_pred_svc)*100)

**Polynomial Kernel**

In [None]:
poly = svm.SVC(kernel='poly', random_state=1, gamma=0.1, C=0.02, max_iter=100)
poly.fit(X_train,y_train)
y_pred_svc_poly = poly.predict(X_test)
y_pred_svc_poly

In [None]:
print(confusion_matrix(y_test,y_pred_svc_poly))
print(classification_report(y_test,y_pred_svc_poly))
print(accuracy_score(y_test,y_pred_svc_poly)*100)
plot_confusion_matrix(poly,X_test,y_test)

In [None]:
f1_svc_poly = f1_score(y_test,y_pred_svc_poly,zero_division='warn',average=None).mean()*100
print(accuracy_score(y_test,y_pred_svc_poly)*100)

## **RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_rf

In [None]:

print(confusion_matrix(y_test,y_pred_rf))
print(classification_report(y_test,y_pred_rf))

plot_confusion_matrix(rf,X_test,y_test)

In [None]:
f1_rf = f1_score(y_test,y_pred_rf,zero_division='warn',average=None).mean()*100
print (accuracy_score(y_test,y_pred_rf)*100)

## **COMPARISON**

In [None]:
f1_names = ['LOGISTIC REGRESSION','SVC RBF','SVC POLY','RANDOM FOREST']
f1_data = [f1_log_reg, f1_svc, f1_svc_poly, f1_rf]
f1_df = pd.DataFrame()
f1_df['MODELS'] = f1_names
f1_df['F1 SCORES'] = f1_data
print(f1_df)
#fig = px.bar(f1_df,x = 'MODELS', y ='F1 SCORES', color =['time','medal','lifeExp','smoker'],
 #            title = "F1 Scores Comparison")

fig = px.bar(f1_df,x = 'MODELS', y ='F1 SCORES',color_continuous_scale=px.colors.sequential.Viridis,
             title = "F1 Scores Comparison")

fig.update_layout(
    autosize=False,
    width=350,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=100,
        pad=4
    ),
    
)
fig.update_traces(marker_color ='lightcoral')

fig.show()



## **TIME SERIES**

In [None]:
timeSeriesData=df[['Date','Close']]
print(timeSeriesData.head())
print(len(timeSeriesData))

In [None]:
train_data=timeSeriesData[0:1371] 
test_data=timeSeriesData[1371:]
print(train_data.head())
print(train_data.tail())
print(test_data.head())


## **SIMPLE MOVING AVERAGE METHOD**

In [None]:
#simple moving average method
timeSeriesData=df[['Date','Close']]
for i in range(0,timeSeriesData.shape[0]-4):
    timeSeriesData.loc[timeSeriesData.index[i+4],'SMA_5'] = np.round(((timeSeriesData.iloc[i,1]+ timeSeriesData.iloc[i+1,1] +timeSeriesData.iloc[i+2,1]+timeSeriesData.iloc[i+3,1]+timeSeriesData.iloc[i+4,1])/5),1)
timeSeriesData=timeSeriesData.iloc[5:]
print(timeSeriesData.head())

In [None]:
plt.figure(figsize=[15,10])
plt.grid(True)
plt.plot(timeSeriesData['Close'],label='Actual price')
plt.plot(timeSeriesData['SMA_5'],label='SMA 5 days')
plt.legend(loc=2)
plt.show()
plt.savefig('movingAvg.jpg')

In [None]:
train_data=timeSeriesData[0:1371] 
test_data=timeSeriesData[1371:]
print(train_data.head())
print(train_data.tail())
print(test_data.head())

In [None]:
import math
y=test_data['Close'].to_list()
y_hat=test_data['SMA_5'].to_list()
sum_=0
for i in range (0,len(test_data)):  
  diff=y[i]-y_hat[i]
  squared_diff=diff**2  
  sum_=sum_+squared_diff
rmse=math.sqrt(sum_/len(test_data))
print("The Root Mean Square Error is: ",rmse)

## **Augmented Dicky Fuller Test**


In [None]:
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = timeseries.rolling(window=52,center=False).mean() 
    rolstd = timeseries.rolling(window=52,center=False).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='green',label='Bitcoin Data')
    mean_ = plt.plot(rolmean, color='red', label='Rolling Mean')
    std_ = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.xlabel('Time(Days)',fontsize=24)
    plt.ylabel('$(Dollar)',fontsize=24)
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

In [None]:
indexed_data = df.set_index('Date')

daily_target_prices = indexed_data['Close']

test_stationarity(daily_target_prices)

Since the p-value is larger than 0.05 , the the moving average is not constant over time and the null hypothesis of the Dickey-Fuller test cannot be rejected.The Daily Time Series is not stationary . 

In order to apply ARIMA , we have to transform the time-series into a stationary one.

In [None]:
daily_target_prices_log = np.log(daily_target_prices)

In [None]:
test_stationarity(daily_target_prices_log)

The series is still non-stationary.

**Decomposition of the series**

In [None]:
decomposition = seasonal_decompose(daily_target_prices)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid


plt.figure(figsize=(7,7))
plt.subplot(411)
plt.plot(daily_target_prices_log[-80:], label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend[-80:], label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal[-80:],label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual[-80:], label='Residuals')
plt.legend(loc='best')
plt.tight_layout()

**Removing Trend and Seasonality with Differencing**

In [None]:
daily_prices_log_diff = daily_target_prices_log - daily_target_prices_log.shift()

In [None]:
plt.plot(daily_prices_log_diff)

In [None]:
daily_prices_log_diff.dropna(inplace=True)

In [None]:
test_stationarity(daily_prices_log_diff)

Since the p-value is less than 0.05 , the series is stationary

## **Auto Regressive Integrated Moving Average Model : ARIMA MODEL**

### **TRAIN TEST SPLIT**

In [None]:

size = int(len(daily_target_prices_log)*(0.7))
train, test = daily_target_prices_log[0:size], daily_target_prices_log[size:len(daily_target_prices_log)]

train_log_diff, test_log_diff = daily_prices_log_diff[0:size], daily_prices_log_diff[size:len(daily_prices_log_diff)]

#predictions = list()

In [None]:

#Model Fitting
model = ARIMA(daily_target_prices_log,order=(2,1,0))
results_ARIMA = model.fit(disp=-1)  
plt.plot(daily_prices_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.7f'% sum((results_ARIMA.fittedvalues-daily_prices_log_diff)**2))
plt.show()

## **MODEL TESTING**

In [None]:
history = [price for price in train]

original_values = list()
errors = list()
predictions = list()

print('Predicted vs Expected values')

for t in range(len(test)):
  model = ARIMA(history,order=(2,1,0))
  model_fit = model.fit(disp=-1)
    
  output = model_fit.forecast()
    
  pred = output[0]
    
        
  original_value = test[t]
  history.append(original_value)
    
  pred = np.exp(pred)
    
    
  original_value = np.exp(original_value)
    
  # Calculating the error
  error = ((abs(pred - original_value)) / original_value) * 100
  errors.append(error)
  print('predicted = %f,   expected = %f,   error = %f ' % (pred, original_value, error), '%')
    
  predictions.append(float(pred))
  original_values.append(float(original_value))
    
# After iterating over whole test set the overall mean error is calculated.   
print('\n Mean Error in Predicting Test Case Articles : %f ' % (sum(errors)/float(len(errors))), '%')
plt.figure(figsize=(8, 6))
test_day = [t for t in range(len(test))]
labels={'Orginal','Predicted'}
print(len(test_day))
print(len(predictions))
plt.plot(test_day, predictions, color= 'green')
plt.plot(test_day, original_values, color = 'orange')
plt.title('Expected Vs Predicted Forecasting')
plt.xlabel('Day')
plt.ylabel('Closing Price')
plt.legend(labels)
plt.show()