In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Start a Spark session for this notebook, or attach to one that already exists
spark = (
    SparkSession.builder
    .appName("TimeSeriesForecasting")
    .getOrCreate()
)


In [None]:
# Read the CSV, treating the first row as the header and letting Spark infer column types
df = spark.read.csv("T1.csv", header=True, inferSchema=True)

# Keep only the timestamp and wind-speed columns and give them code-friendly names
dataset = (
    df.select(["Date/Time", "Wind Speed (m/s)"])
    .withColumnRenamed("Date/Time", "timeStamp")
    .withColumnRenamed("Wind Speed (m/s)", "windSpeed")
)

In [None]:
# Re-type the timeStamp column as a Spark timestamp.
# NOTE(review): a plain cast presumably only understands ISO-like strings and
# yields null for anything else -- confirm the "Date/Time" values in T1.csv
# actually survive this conversion.
dataset = dataset.withColumn("timeStamp", col("timeStamp").cast("timestamp"))


In [None]:
# Check stationarity (Augmented Dickey-Fuller test)
def adfuller_test(timeseries):
    """Run the Augmented Dickey-Fuller test on a series and print the outcome.

    Missing values (NaN / None) are removed before testing, because the
    underlying ``adfuller`` routine rejects input that contains them. When
    anything is removed, a one-line notice reports how many values were dropped.

    Parameters
    ----------
    timeseries : array-like of numbers
        The observations to test, in time order.

    Returns
    -------
    None
        The ADF statistic, p-value and critical values are printed.

    Raises
    ------
    ValueError
        If no non-missing observations remain.
    """
    values = np.asarray(timeseries, dtype=float)
    missing_mask = np.isnan(values)
    n_missing = int(missing_mask.sum())
    if n_missing:
        # Make the clean-up visible instead of silently shortening the series.
        print(f"Dropped {n_missing} missing value(s) before testing.")
        values = values[~missing_mask]
    if values.size == 0:
        raise ValueError("adfuller_test needs at least one non-missing observation")

    result = adfuller(values)
    print("ADF Statistic:", result[0])
    print("p-value:", result[1])
    print("Critical Values:")
    # result[4] maps each significance level (e.g. "5%") to its critical value.
    for key, value in result[4].items():
        print(f"\t{key}: {value}")

In [None]:
# Bring the Spark DataFrame into pandas; the statistical routines used in the
# following cells work on in-memory pandas objects
pandas_df = dataset.toPandas()
wind_speed_series = pandas_df.loc[:, "windSpeed"]

In [None]:
# Run the Augmented Dickey-Fuller test on the wind-speed series; adfuller_test
# prints the test statistic, the p-value and the critical values
adfuller_test(wind_speed_series)


In [None]:
# Plot the time series
# Index by timestamp so the x-axis shows time. The guard keeps this cell
# re-runnable: once timeStamp has become the index it is no longer a column,
# and calling set_index on it a second time would raise KeyError.
if "timeStamp" in pandas_df.columns:
    pandas_df = pandas_df.set_index("timeStamp")

fig, ax = plt.subplots()
pandas_df.plot(ax=ax)
ax.set(title="Wind speed over time", xlabel="Time", ylabel="Wind speed (m/s)")
plt.show()

In [None]:
# Fit an ARIMA model with order (p, d, q) = (5, 1, 0):
# five autoregressive lags, first-order differencing, no moving-average terms
model = ARIMA(
    endog=wind_speed_series,
    order=(5, 1, 0),
)
model_fit = model.fit()

# Show the fitted model's summary table
print(model_fit.summary())

In [None]:
# Forecasting
# One constant drives both the forecast horizon and the printed message, so
# the two cannot drift apart when the horizon is changed.
FORECAST_STEPS = 24
forecast = model_fit.forecast(steps=FORECAST_STEPS)
print(f"Forecasted values for the next {FORECAST_STEPS} time steps:")
print(forecast)

# Stop SparkSession
spark.stop()