In [None]:
!pip install pmdarima

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
import matplotlib
import matplotlib.pylab as plt
import matplotlib.pyplot as plt2
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old name
# stops resolving in later releases, so use whichever name this installation provides.
matplotlib.style.use('seaborn' if 'seaborn' in matplotlib.style.available else 'seaborn-v0_8')
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 5
from plotly.graph_objs import *
from tqdm import tqdm
import statsmodels.api as sm
from pmdarima.arima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the price history of one ticker and derive the first difference of the closing price.
path = '/kaggle/input/price-volume-data-for-all-us-stocks-etfs/Stocks/pypl.us.txt'
#path = '/kaggle/input/price-volume-data-for-all-us-stocks-etfs/Stocks/mrk.us.txt'

# Parse the dates explicitly after reading: read_csv's `date_parser` argument is deprecated,
# while pd.to_datetime with a fixed format keeps the same strict '%Y-%m-%d' parsing.
prices_raw = pd.read_csv(path, sep=',', index_col='Date')
prices_raw.index = pd.to_datetime(prices_raw.index, format='%Y-%m-%d')
# NOTE(review): missing values are replaced by 0 as before; for price columns a 0 would look
# like a crash in the series - confirm the file has no gaps.
prices_raw = prices_raw.fillna(0)

# Keep only the close and its day-over-day change; the first row has no previous close,
# so its difference is NaN and the row is dropped.
df = (
    prices_raw[['Close']]
    .assign(Close_diff=prices_raw['Close'].diff())
    .dropna()
)
df.head()

In [None]:
# Closing price over the whole loaded history
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['Close'])
ax.grid(True)
ax.set_xlabel('Date')
ax.set_ylabel('Close Prices')
ax.set_title('Financial instrument closing price')
plt.show()

In [None]:
# Split into train and test sets

# Option 1 (inactive): split on a percentage of the rows
#test_size = 0.1
#split_at = int(len(df) * (1 - test_size))
#train_data = df.iloc[:split_at]
#test_data = df.iloc[split_at:]

# Option 2 (active): split on a date range
train_start = '2017-04-01'
train_end = '2017-10-10'

# Label-based slicing includes both endpoints, so the training window contains `train_end`.
train_data = df.loc[train_start:train_end]
# The test window must start strictly after the last training date. Slicing from `train_end`
# again would put that day in both sets, and a forecast that begins one step after the
# training data would then be indexed one row too early against the test dates.
test_data = df.loc[df.index > pd.Timestamp(train_end)]
assert train_data.index.intersection(test_data.index).empty, 'train/test windows overlap'

plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.plot(train_data['Close'], 'black', label='Train data')
plt.plot(test_data['Close'], 'red', label='Test data')
plt.legend()
plt.show()

In [None]:
# Check if a time series is stationary
def timeseries_stationary(ts, window=12):
    """Plot rolling statistics of ``ts`` and print an Augmented Dickey-Fuller test report.

    Parameters
    ----------
    ts : pandas.Series
        Series to inspect; it is passed to ``adfuller`` unchanged.
    window : int, default 12
        Number of observations in the rolling window used for the mean and
        standard-deviation curves.

    Returns
    -------
    None
        The figure is shown and the report is printed as side effects.
    """
    # Determine rolling statistics
    rolmean = ts.rolling(window).mean()
    rolstd = ts.rolling(window).std()
    plt.plot(rolmean, color='red', label='Mean')
    plt.plot(rolstd, color='black', label = 'Std')
    plt.legend(loc='best')
    plt.title('Mean and Standard Deviation')
    plt.show(block=False)

    print("ADF Result")
    adft = adfuller(ts, autolag='AIC')
    # The first four entries of the result are labelled below; entry 4 is the
    # mapping of critical values, appended one row per level.
    output = pd.Series(
        data=adft[0:4],
        index=['ADF Statistic', 'p-value', 'Lags used', 'Observations used'],
    )
    for level, critical_value in adft[4].items():
        output['critical value: %s'%level] = critical_value
    print(output)
    
# Run the check on the closing price and then on its first difference
for column in ('Close', 'Close_diff'):
    timeseries_stationary(train_data[column])

In [None]:
# Use to determine AR and MA parameters
# Keep the returned figures under their own names: binding them to `plt` would replace the
# matplotlib module alias, and any cell calling plt.figure()/plt.plot() afterwards would fail.
pacf_fig = sm.graphics.tsa.plot_pacf(train_data['Close_diff'], lags=40, zero=False)
acf_fig = sm.graphics.tsa.plot_acf(train_data['Close_diff'], lags=40, zero=False)
plt.show()

In [None]:
# Search for an ARIMA order on the training closes; the options are grouped by purpose.
search_options = dict(
    # bounds of the p/q search
    start_p=0, start_q=0,
    max_p=3, max_q=3,
    # differencing: d is left as None, with the 'adf' test selected
    d=None, test='adf',
    # seasonal settings (seasonal search is switched off)
    seasonal=False, m=1, start_P=0, D=0,
    # search strategy and reporting
    stepwise=True, trace=True,
    error_action='ignore', suppress_warnings=True,
)
model = auto_arima(train_data['Close'], **search_options)

print(model.summary())
plt.show()

In [None]:
# Build Model
# The legacy statsmodels.tsa.arima_model.ARIMA class (fit(disp=...), forecast() returning a
# 3-tuple) is no longer implemented in recent statsmodels releases, so this cell uses the
# replacement class. It is imported under a separate name so the top-level ARIMA import is
# left untouched.
from statsmodels.tsa.arima.model import ARIMA as ARIMAModel

ARIMA_ORDER = (1, 1, 2)  # (p, d, q)
n_steps = len(test_data['Close'])

# With d=1 the replacement class fits no trend by default, whereas the legacy class fitted a
# constant on the differenced series. trend='t' (a drift term) keeps that specification.
# The estimation method differs, so fitted numbers can deviate slightly from the legacy ones.
model = ARIMAModel(train_data['Close'], order=ARIMA_ORDER, trend='t')
fitted = model.fit()

# Forecast with 95 % confidence interval
forecast = fitted.get_forecast(steps=n_steps)
# Convert to plain arrays: the training index carries no frequency, so the labels statsmodels
# attaches to the forecast are not the test dates. The dates are attached explicitly below.
fc = np.asarray(forecast.predicted_mean)
se = np.asarray(forecast.se_mean)
conf = np.asarray(forecast.conf_int(alpha=0.05))  # column 0: lower bound, column 1: upper bound

# Make as pandas series
forecast_series = pd.Series(fc, index=test_data['Close'].index)
lower_bound = pd.Series(conf[:, 0], index=test_data.index)
upper_bound = pd.Series(conf[:, 1], index=test_data.index)

# Plot
plt2.figure(figsize=(12,5), dpi=100)
plt2.plot(train_data['Close'], label='training')
plt2.plot(test_data['Close'], label='testing')
plt2.plot(forecast_series, label='forecast')
plt2.fill_between(lower_bound.index, lower_bound, upper_bound,
                 color='b', alpha=.15)
plt2.title('Forecast vs Actual')
plt2.legend(loc='upper left', fontsize=8)
plt2.show()

In [None]:
# Root mean squared error of the forecast against the held-out closing prices
mse = mean_squared_error(test_data['Close'], fc)
rmse = math.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')