In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.arima_model import ARMA

# Create the Spark session for this notebook, or reuse one that already
# exists, under the application name "TimeSeriesForecasting".
spark = (
    SparkSession.builder
    .appName("TimeSeriesForecasting")
    .getOrCreate()
)

In [None]:
# Read T1.csv into a Spark DataFrame: the first row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv('T1.csv')

# Keep only the two columns used downstream and give them identifier-friendly
# names in the same projection.
dataset = df.select(
    col("Date/Time").alias("timeStamp"),
    col("Wind Direction (°)").alias("windDirection"),
)

In [None]:
# Re-type the timeStamp column as a Spark timestamp, replacing the column
# under the same name.
# NOTE(review): a plain cast only parses strings in Spark's default timestamp
# layout; the raw "Date/Time" format is not visible here, so verify the column
# is not null after this step (an explicit format may be needed).
as_timestamp = col("timeStamp").cast(TimestampType())
dataset = dataset.withColumn("timeStamp", as_timestamp)

In [None]:
# Train ARMA model
# Collect the single windDirection column to the driver; the collected rows
# form an (n_rows, 1) array, which is flattened into a 1-D series.
data = np.array(dataset.select("windDirection").collect()).flatten()

# `statsmodels.tsa.arima_model.ARMA` was removed in statsmodels 0.13 and now
# raises NotImplementedError when constructed. An ARMA(p, q) model is an
# ARIMA(p, 0, q) model, so the supported `statsmodels.tsa.arima.model.ARIMA`
# with d=0 fits the same ARMA(3, 3) model family.
model = ARIMA(data, order=(3, 0, 3))

# The fitted results still expose `fittedvalues`, `predict` and `save`, which
# the later cells rely on.
results_MA = model.fit()

In [None]:
# Plot the time series data
# Overlay the model's in-sample fitted values (red) on the observed series.
# The title reports the mean squared error between the two.
fitted = results_MA.fittedvalues
mse = ((fitted - data) ** 2).mean()

# Explicit figure/axes interface so the plot does not depend on pyplot's
# implicit "current figure" state left behind by other cells.
fig, ax = plt.subplots()
ax.plot(data, label='observed')
ax.plot(fitted, color='red', label='fitted')
ax.set_title('Fitting data _ MSE: %.2f' % mse)
ax.set_xlabel('Observation index')
ax.set_ylabel('Wind direction (°)')
ax.legend()
plt.show()

In [None]:
# Forecasting
# Out-of-sample prediction starting at the first index past the training
# series.
# NOTE(review): statsmodels' `predict` treats `end` as inclusive, so this span
# yields 25 values — confirm that 25 (rather than 24) steps are intended.
first_step = len(data)
last_step = first_step + 24
forecast = results_MA.predict(first_step, last_step)

# Save the fitted results object to disk under a fixed file name.
results_MA.save("wind_direction_arma_model")

# Shut down the Spark session.
spark.stop()