In [None]:
import import_ipynb
#import YahooFinance
import yfinance as yf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras

import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima_model import ARIMA

In [None]:
# Download the full available daily history for the WTI crude front-month future.
data = yf.download("CL=F", period="max")
# NOTE(review): recent yfinance releases may return two-level columns
# (price field, ticker) even for a single ticker, which would make
# data['Open'] a DataFrame instead of a Series. Keep only the first level
# (assumed to be the price field, the default layout) so the rest of the
# notebook can index columns by plain name. No-op for single-level columns.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

In [None]:
# Move the index into a regular 'Date' column so rows are addressed by integer
# position (later cells read data['Date'] for tick labels).
# Guarded so that re-running this cell is harmless: an unconditional in-place
# reset would first add a stray 'index' column and then raise on the next run.
if 'Date' not in data.columns:
    data = data.reset_index(drop=False)

In [None]:
# Chronological 60/40 split: the first 60% of rows are used for fitting,
# the remaining 40% are held out for evaluation.
split_at = int(len(data) * 0.6)
train_data, test_data = data[:split_at], data[split_at:]

# Only the 'Open' price column is modelled.
x_train, x_test = train_data['Open'], test_data['Open']

In [None]:
# Overview plot of the complete Open-price series (train and test together).
plt.figure(figsize=(12,7))
plt.title('WTI Prices')
plt.xlabel('Dates')
plt.ylabel('Prices')
plt.plot(data['Open'], 'black', label='Open Price')
# Label every 1000th row with its date, the same spacing the other plots in
# this notebook use; a step of 50 crowds the axis with overlapping labels.
plt.xticks(np.arange(0,len(data), 1000), data['Date'][0:len(data):1000])
plt.legend()
plt.show()


In [None]:
# Show where the held-out period starts. The whole series is drawn in orange
# (labelled as training data) and the test slice is drawn over it in blue.
split_fig, split_ax = plt.subplots(figsize=(12,7))
split_ax.set_title('WTI Prices')
split_ax.set_xlabel('Dates')
split_ax.set_ylabel('Prices')
split_ax.plot(data['Open'], 'orange', label='Training Data')
split_ax.plot(test_data['Open'], 'blue', label='Testing Data')
# One date label per 1000 rows.
split_ax.set_xticks(np.arange(0,len(data), 1000))
split_ax.set_xticklabels(data['Date'][0:len(data):1000])
split_ax.legend()

In [None]:
#Augmented Dickey-Fuller Test
def test_stationarity(timeseries, window = 5, cutoff = 0.01):
    """Plot rolling statistics of a series and report an Augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. It is plotted against its own index and passed to
        ``adfuller`` as-is.
    window : int, default 5
        Window length for the rolling mean and rolling standard deviation.
    cutoff : float, default 0.01
        p-value threshold; below it the series is reported as likely
        stationary, otherwise as likely non-stationary.

    Returns
    -------
    None
        Results are shown as a figure and printed.

    Notes
    -----
    The x-axis tick labels are read from the notebook-level ``data`` frame
    (``data['Date']``, every 1000th row), so ``data`` must already exist.
    """
    # Rolling statistics
    rolmean = timeseries.rolling(window).mean()
    rolstd = timeseries.rolling(window).std()

    # Plot the series together with its rolling mean / std
    plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue',label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.xticks(np.arange(0,len(data), 1000), data['Date'][0:len(data):1000])
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC', maxlag = 20 )
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    # dftest[4] maps each significance level to its critical value
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value

    # Report the verdict once, after the loop; previously this sat inside the
    # loop and printed the same line once per critical value.
    pvalue = dftest[1]
    if pvalue < cutoff:
        print('p-value = %.4f. The series is likely stationary.' % pvalue)
    else:
        print('p-value = %.4f. The series is likely non-stationary.' % pvalue)
    print(dfoutput)
    
# Run the stationarity check on the raw (undifferenced) Open series.
test_stationarity(data['Open'])

In [None]:
# Lag-2 difference of the Open price (each value minus the value two rows
# earlier). The leading rows have no predecessor at that lag and come out as
# NaN, so they are dropped.
# NOTE(review): the ARIMA models below use d=1; confirm a lag of 2 is intended.
close_diffs = data['Open'].diff(2).dropna()

# Re-run the stationarity check on the differenced series.
test_stationarity(close_diffs)

In [None]:
# Autocorrelation followed by partial autocorrelation of the differenced
# series, each drawn and shown as its own figure.
# TODO: break these into two separate cells
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(close_diffs)
    plt.xlabel('Lags (Days)')
    plt.show()

In [None]:
# Fit a single ARIMA(p=1, d=1, q=1) model on the training Open prices and
# print its summary table.
# NOTE(review): ARIMA here is the one imported from statsmodels.tsa.arima_model
# (hence fit(disp=0)); confirm the installed statsmodels still ships it.
arima_order = (1, 1, 1)
wti_arima = ARIMA(train_data['Open'], order=arima_order)
wti_arima_fit = wti_arima.fit(disp=0)
print(wti_arima_fit.summary())

In [None]:
# Create list of x train valuess
history = [x for x in x_train]
# establish list for predictions
model_predictions = []
# Count number of test data points
N_test_observations = len(x_test)
# loop through every data point
for time_point in list(x_test.index):
    model = ARIMA(history, order=(1,1,1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    model_predictions.append(yhat)
    true_test_value = x_test[time_point]
    history.append(true_test_value)
MAE_error = keras.metrics.mean_absolute_error(x_test, model_predictions).numpy()
print('Testing Mean Squared Error is {}'.format(MAE_error))
%store model_predictions

In [None]:
# %store model_predictions
#%store -r model_predictions
# Check to see if it reloaded
#model_predictions[:5]
# Load model
#from statsmodels.tsa.arima.model import ARIMAResults
#loaded = ARIMAResults.load('arima_111.pkl')

In [None]:
# Figure 1: predictions vs. actual prices over the whole test period, with the
# training series for context.
# The figure is created up front with an explicit size. Previously
# plt.figure(...) was called after plotting, which only opened an extra empty
# figure, and the size came from a global plt.rcParams change instead.
plt.figure(figsize=(20, 10))
plt.plot(x_test.index, model_predictions, color='blue', marker='o', linestyle='dashed',
         label='Predicted Price')
plt.plot(x_test.index, x_test, color='red', label='Actual Price')
plt.plot(train_data['Open'], 'orange', label='Training Data')
plt.title('WTI Prices Prediction')
plt.xlabel('Date')
plt.ylabel('Prices')
# One date label per 1000 rows.
plt.xticks(np.arange(0,len(data), 1000), data['Date'][0:len(data):1000])
plt.legend()
plt.show()

# Figure 2: zoom on the last 100 test points.
# NOTE(review): no date tick labels are set here, so the x-axis shows integer
# row positions even though it is labelled 'Date'.
plt.figure(figsize=(10, 10))
plt.plot(x_test.index[-100:], model_predictions[-100:], color='blue',label='Predicted Price')
plt.plot(x_test.index[-100:], x_test[-100:], color='red', label='Actual Price')
plt.title('WTI Prices Prediction')
plt.xlabel('Date')
plt.ylabel('Prices')
plt.legend()
plt.show()

In [None]:
from sklearn import metrics
import math
import sklearn

In [None]:
# Mean squared error of the walk-forward predictions against the held-out
# Open prices, and its square root (RMSE, in price units).
mse = metrics.mean_squared_error(y_true=x_test, y_pred=model_predictions)
rmse = math.sqrt(mse)


In [None]:
# Print the RMSE, then the first rows of the test series, one after the other.
print(rmse, x_test.head(), sep='\n')

In [None]:
# Export the actual and predicted test values as comma-terminated text
# ("v1,v2,...,vn,"), one file per series.
real = list(x_test)

# The file names were previously swapped: the actual values were written to
# pred-arima.txt and the ARIMA predictions to pred-real.txt.
# `with` closes both files even if a write fails.
with open("pred-real.txt","w") as real_file, open("pred-arima.txt","w") as arima_file:
    for i in range(0,len(test_data)):
        real_file.write(str(real[i]) +",")
        # each stored prediction is indexable; its first element is the value
        arima_file.write(str(model_predictions[i][0]) +",")

In [None]:
# Show how many actual test-set values `real` holds.
print(len(real))