In [None]:
### Please ignore the warning messages due to duplicate methods
library(readxl)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tseries)
library(forecast)
library(TTR)
library(zoo)
library(xts)

In [None]:
### Load the Air Quality workbook from its local download folder.
airquality <- read_excel(
  "C:\\Users\\poonam\\Downloads\\AirQualityUCI\\AirQualityUCI.xlsx"
)
### Work on a plain data.frame copy of the sheet.
a <- data.frame(airquality)
### Keep only the date, the time of day and the CO reading.
df_co <- a[, c("Date", "Time", "CO.GT.")]
testdata <- df_co
### Reduce the Time column to its hour component.
testdata1 <- mutate(testdata, Time = hour(Time))
### date2 is the Date value with its hour field set from Time,
### i.e. one combined date-time stamp per row.
testdata2 <- mutate(testdata1, date2 = update(Date, hour = Time))
### From here on only the timestamp and the CO reading are carried along.
cleanedset1 <- testdata2[, c("date2", "CO.GT.")]
### Readings equal to -200 are treated as missing and become NA.
cleanedset1$CO.GT.[which(cleanedset1$CO.GT. == -200)] <- NA
### Fill the NA gaps with zoo: with "extend", interior gaps are interpolated
### and gaps at either end repeat the nearest available value.
z <- zoo(cleanedset1["CO.GT."])
x <- na.fill(z, fill = "extend")
### Back to a data.frame, with a descriptive name for the filled column.
b <- data.frame(x)
names(b) <- "hourly averaged concentration CO"
### Put the filled column next to the timestamps ...
cleanset2 <- cbind(cleanedset1, b)
### ... and keep just the timestamp and the filled CO column.
cleanset3 <- cleanset2[, c("date2", "hourly averaged concentration CO")]
### Quick visual check of the cleaned data.
plot(cleanset3)

In [None]:
### Split the cleaned table into its timestamp column (used as the index)
### and its value column (used as the data).
x3.index <- cleanset3["date2"]
x3.data <- cleanset3["hourly averaged concentration CO"]
### Build the time series: CO values ordered by their date-time stamps.
x4 <- zoo(x3.data, order.by = x3.index$date2)
### Basic inspection of the resulting series object.
str(x4)

In [None]:
### Show the time of the first observation in the series.
start(x4)

In [None]:
### Show the time of the last observation in the series.
end(x4)

In [None]:
### Show the last few values returned by cycle() for the series
### (each observation's position within the series' cycle).
tail(cycle(x4))

In [None]:
### Plot the hourly CO series over time with an explicit title and axis labels.
plot(x4,main="original data(hourly CO concentration )",xlab="Time",ylab="CO.GT.")

In [None]:
### Making the data stationary, step 1:
### plot the log of the series; the log transform is used here to
### stabilise a variance that is not constant over time.
plot(log(x4))

In [None]:
### Making the data stationary, step 2:
### difference the logged series so that the mean is constant over time,
### then plot the result.
plot(diff(log(x4)))

In [None]:
### Augmented Dickey-Fuller test to check stationarity.
### The test is run on the first column of the log-differenced series,
### with "stationary" as the alternative hypothesis; the result is printed.
adf_test<-adf.test(diff(log(x4))[,1],alternative='stationary')
print(adf_test)


In [None]:
### ARIMA model
### Autocorrelation function (ACF) of the original, non-stationary series.
acf(x4)

### ACF of the stationary (log-differenced) series.
### The ACF plots the total correlation between the series and its lags.
acf(diff(log(x4)))

### Partial autocorrelation function (PACF) of the stationary series.
pacf(diff(log(x4)))

### Fit an ARIMA model with automatically selected orders.
### NOTE(review): the model is fitted to x4 itself, not to the logged or
### differenced series inspected above - confirm this is intended.
mymodel <- auto.arima(x4)
#mymodel

### Repeat the search using AIC as the criterion and trace the candidate
### (p, d, q) combinations. The result is only displayed, not stored;
### `mymodel` above remains the model used downstream.
auto.arima(x4, ic = "aic", trace = TRUE)

### Residual diagnostics for the fitted model: the residual series itself,
### then its ACF and PACF.
plot.ts(mymodel$residuals)
acf(ts(mymodel$residuals), main = "ACF Residual")
### Title corrected: this is the partial autocorrelation plot, it was
### previously labelled 'ACF Residual' like the plot above.
pacf(ts(mymodel$residuals), main = "PACF Residual")

In [None]:
### Forecast 360 steps ahead from the fitted model, with a 95% prediction
### interval, and plot the forecast.
myforecast <- forecast(mymodel, h = 360, level = 95)
plot(myforecast, xlab = "Time", ylab = "CO.GT.")

In [None]:
### Overlay the model's fitted values (red) and the observed series (blue).
### NOTE(review): the fitted values and the zoo series are different
### time-series classes - confirm both share the same x-axis scale so the
### two lines actually overlap.
original_val <- x4
predict_val <- fitted(myforecast)
plot(predict_val, ylab = "CO.GT.", main = "Real/Predicted values", col = "red")
lines(original_val, col = "blue")

In [None]:
### Accuracy measures of the model, computed from the forecast object.
accuracy(myforecast)

In [None]:
### Pearson correlation test between the fitted (predicted) values and the
### actual series; the test result is stored in `res` and then displayed.
res<-cor.test(predict_val,original_val,method="pearson")
res