In [4]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
#from pyspark.ml.regression import ARMA
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag
from pyspark.sql.functions import row_number, when
from pyspark.sql.functions import to_timestamp
from pyspark.sql.window import Window
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

In [5]:
# Initialize SparkSession
# SPARK_HOME already present in the environment wins; the literal below is only
# the fallback for the machine this notebook was written on.
spark_home = os.environ.setdefault('SPARK_HOME', 'C:/spark-3.5.0-bin-hadoop3')
spark_bin = spark_home.rstrip('/\\') + '/bin'

# PATH entries must be joined with os.pathsep; a bare `+=` glues the new
# directory onto the last existing entry and corrupts both. The membership
# check keeps PATH from growing every time this cell is re-run.
path_entries = [entry for entry in os.environ.get('PATH', '').split(os.pathsep) if entry]
if spark_bin not in path_entries:
    path_entries.append(spark_bin)
    os.environ['PATH'] = os.pathsep.join(path_entries)

spark = SparkSession.builder \
    .appName("TimeSeriesAnalysis") \
    .getOrCreate()

In [6]:
# Read the raw CSV: first row is the header, column types are inferred by Spark
df = spark.read.options(header=True, inferSchema=True).csv('../Dataset/T1.csv')


In [23]:
# Keep only the timestamp and wind-speed columns and give them
# identifier-friendly names (the raw headers contain '/', spaces and parentheses)
dataset = df.select(df["Date/Time"], df["Wind Speed (m/s)"]).toDF("timeStamp", "windSpeed")


In [24]:
# Print the DataFrame's text representation (column names and types) to confirm the rename
print(dataset)

DataFrame[timeStamp: string, windSpeed: double]


In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp


# Pattern of the raw "Date/Time" strings: day month year hour:minute
input_time_format = "dd MM yyyy HH:mm"
# NOTE(review): this pattern is defined but never applied in this cell
output_time_format = "yyyy MM dd HH:mm"

# Parse the raw strings into timestamps, then cast straight back to string.
# The cast uses Spark's default rendering, not output_time_format.
parsed_time = to_timestamp(col("timeStamp"), input_time_format)
dataset = dataset.withColumn("timeStamp", parsed_time.cast("string"))
dataset.show(truncate=False)


+-------------------+-----------+
|timeStamp          |windSpeed  |
+-------------------+-----------+
|2018-01-01 00:00:00|5.31133604 |
|2018-01-01 00:10:00|5.672166824|
|2018-01-01 00:20:00|5.216036797|
|2018-01-01 00:30:00|5.659674168|
|2018-01-01 00:40:00|5.577940941|
|2018-01-01 00:50:00|5.604052067|
|2018-01-01 01:00:00|5.793007851|
|2018-01-01 01:10:00|5.306049824|
|2018-01-01 01:20:00|5.584629059|
|2018-01-01 01:30:00|5.523228168|
|2018-01-01 01:40:00|5.724115849|
|2018-01-01 01:50:00|5.934198856|
|2018-01-01 02:00:00|6.547413826|
|2018-01-01 02:10:00|6.199746132|
|2018-01-01 02:20:00|6.505383015|
|2018-01-01 02:30:00|6.634116173|
|2018-01-01 02:40:00|6.378912926|
|2018-01-01 02:50:00|6.446652889|
|2018-01-01 03:00:00|6.415082932|
|2018-01-01 03:10:00|6.437530994|
+-------------------+-----------+
only showing top 20 rows



In [26]:
#from pyspark.sql.functions import col

# Turn the string column back into a real timestamp column
timestamp_col = col("timeStamp").cast("timestamp")
dataset = dataset.withColumn("timeStamp", timestamp_col)

# Put the rows in chronological order by sorting on the timestamp's epoch
# seconds (Spark DataFrames have no index to set)
dataset = dataset.orderBy(col("timeStamp").cast("long"))


In [29]:
# Chronological window shared by the imputation and lag steps in this cell and
# by later cells that reference `windowSpec`.
# NOTE: there is no partitionBy, so Spark evaluates this window on one partition.
windowSpec = Window.orderBy("timeStamp")

# Impute missing values using the lag function: a null reading takes the
# previous row's reading, everything else is kept as-is
dataset = dataset.withColumn(
    "windSpeed_imputed",
    when(col("windSpeed").isNull(), lag("windSpeed").over(windowSpec)).otherwise(col("windSpeed")),
)

# Fill values that are still null (first row, or a null preceded by a null) with a default value
dataset = dataset.na.fill(0, subset=["windSpeed_imputed"])

# Create lag features
lag_cols = [f"lag_{i}" for i in range(1, 5)]
for i, lag_col in enumerate(lag_cols, start=1):
    dataset = dataset.withColumn(lag_col, lag("windSpeed_imputed", i).over(windowSpec))

# Drop rows with missing lag features. Restricting to the lag columns matters:
# a bare dropna() would also discard every row whose raw windSpeed is null,
# i.e. exactly the rows that were imputed above.
dataset = dataset.dropna(subset=lag_cols)


In [None]:
# Generate a column for ticks: a 1-based running row number in windowSpec order.
# (The previous expression, lag(...).isNull().cast("int"), was a 0/1 flag for
# "previous reading is null", which collapses the plot's x-axis to two values.)
dataset = dataset.withColumn("Ticks", row_number().over(windowSpec))

In [None]:
# Collect the Spark frame to pandas and draw the imputed wind speed against the tick column
original_data = dataset.toPandas()
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(original_data["Ticks"], original_data["windSpeed_imputed"])
ax.set_xlabel("Ticks")
ax.set_ylabel("Wind Speed (m/s)")
ax.set_title("Original Plot")
plt.show()

In [None]:
# Define a function for stationarity check using ADF test
def stationarity_check(df, column="windSpeed_imputed"):
    """Run the Augmented Dickey-Fuller test on one column and print the result.

    Parameters
    ----------
    df
        Object exposing ``select(column).toPandas()`` (a Spark DataFrame in
        this notebook). The selected column is collected to pandas.
    column : str, optional
        Name of the column to test. Defaults to ``"windSpeed_imputed"``.

    Returns
    -------
    None
        The test statistic, p-value and critical values are printed.
    """
    # Missing values are dropped before the series is handed to adfuller
    series = df.select(column).toPandas()[column].dropna()
    result = adfuller(series)

    print('Augmented Dickey-Fuller test:')
    print(f'Test Statistic: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'   {key}: {value}')

# Run the ADF stationarity report on the prepared dataset
stationarity_check(df=dataset)

In [None]:
# Create lag features (only those not already present).
# Recomputing existing lags on an already-trimmed frame would null out the
# first four rows again and silently shrink the data on every run.
lag_cols = [f"lag_{i}" for i in range(1, 5)]
for i, lag_col in enumerate(lag_cols, start=1):
    if lag_col not in dataset.columns:
        dataset = dataset.withColumn(lag_col, lag("windSpeed_imputed", i).over(windowSpec))

# Drop rows with missing lag features. Only the lag columns are checked, so
# rows with a null in an unrelated column are kept.
dataset = dataset.dropna(subset=lag_cols)

In [None]:
# Pack the four lag columns into a single vector column named "features"
feature_cols = [f"lag_{i}" for i in range(1, 5)]
assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol("features")
dataset = assembler.transform(dataset)

In [None]:
# Fit an autoregressive model.
# pyspark.ml.regression provides no ARMA estimator (the import of it is
# commented out at the top of this notebook), so the previous ARMA(...) call
# raised NameError. A linear regression of windSpeed_imputed on the assembled
# lag_1..lag_4 vector is an AR(4) fit, and it yields a Spark ML model, so the
# later cells can keep calling model.transform(...) and saving the model.
# NOTE(review): this has no moving-average terms; if a true ARMA(3, 3) is
# required, fit statsmodels' ARIMA with order=(3, 0, 3) on the pandas series instead.
ar_estimator = LinearRegression(featuresCol="features", labelCol="windSpeed_imputed")
model = ar_estimator.fit(dataset)


In [None]:
# Plot predictions
predictions = model.transform(dataset)

# Actual and predicted values are taken from the same frame and ordered by
# time, so both curves share one x-axis. (Previously the actual series was
# plotted against "Ticks" and the predictions against "timeStamp".)
predictions_pd = (
    predictions
    .select("timeStamp", "windSpeed_imputed", "prediction")
    .orderBy("timeStamp")
    .toPandas()
)

plt.figure(figsize=(20, 10))
plt.plot(predictions_pd["timeStamp"], predictions_pd["windSpeed_imputed"], label="Actual")
plt.plot(predictions_pd["timeStamp"], predictions_pd["prediction"], label="Predicted")
plt.xlabel("Time")
plt.ylabel("Wind Speed (m/s)")
plt.title("Fitted data")
plt.legend()
plt.show()

In [None]:
# Save the model.
# Going through write().overwrite() replaces a model left by a previous run;
# a plain save() raises when the target path already exists, which breaks re-runs.
# NOTE(review): the target is named "humidityModel" although this notebook
# models wind speed; confirm the intended name before relying on it.
model.write().overwrite().save("humidityModel")

In [None]:
# Generate future dates
# Step forward at the data's own sampling interval (median gap between
# consecutive timestamps) rather than in calendar months, and keep every step.
# The previous version used monthly offsets and discarded the first one.
FORECAST_STEPS = 24
history_times = pd.to_datetime(original_data["timeStamp"]).sort_values()
sample_step = history_times.diff().median()
last_time = history_times.iloc[-1]
future_dates = [last_time + sample_step * k for k in range(1, FORECAST_STEPS + 1)]

# Future rows carry only a timeStamp; every other column is NaN after the concat.
# ignore_index avoids mixing the integer index of the history with a date index.
future_datest_df = pd.DataFrame({"timeStamp": future_dates})
future_df = pd.concat([original_data, future_datest_df], ignore_index=True)

In [None]:
# Make predictions for future dates
# Future rows are the ones with no observed value. Their lag features are
# unknown, so the forecast is built recursively: predict one step, then feed
# that prediction back in as the newest lag for the next step.
is_future = future_df["windSpeed_imputed"].isna().to_numpy()
observed = future_df.loc[~is_future, "windSpeed_imputed"].tolist()
n_lags = len(feature_cols)
if len(observed) < n_lags:
    raise ValueError(f"need at least {n_lags} observed values to seed the forecast")

recent = observed[-n_lags:]  # oldest -> newest
forecasts = []
for _ in range(int(is_future.sum())):
    # feature_cols is ordered lag_1..lag_n, and lag_1 is the most recent value
    lag_values = tuple(float(value) for value in reversed(recent))
    step_df = assembler.transform(spark.createDataFrame([lag_values], schema=list(feature_cols)))
    next_value = float(model.transform(step_df).first()["prediction"])
    forecasts.append(next_value)
    recent = recent[1:] + [next_value]

# History rows get NaN in the forecast column; only future rows are filled
future_df["forecast"] = np.nan
future_df.loc[is_future, "forecast"] = forecasts