In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
import pandas as pd

In [4]:
spark = SparkSession.builder \
    .appName("SparkML ARIMA Example") \
    .getOrCreate()

24/08/26 19:11:27 WARN Utils: Your hostname, Ankbot resolves to a loopback address: 127.0.1.1; using 192.168.1.140 instead (on interface wlo1)
24/08/26 19:11:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 19:11:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [26]:
# Load the data into a DataFrame
df = spark.read.csv("timeseries_data.csv", header=True, inferSchema=True)

# Convert the timestamp column to a proper timestamp data type
df = df.withColumn("timestamp", to_timestamp(col("timestamp")))

# Sort data by timestamp
df = df.orderBy("timestamp")

In [27]:
df.show()

+-------------------+-----+
|          timestamp|value|
+-------------------+-----+
|2023-01-01 00:00:00|  100|
|2023-01-01 01:00:00|  102|
|2023-01-01 02:00:00|  101|
|2023-01-01 03:00:00|  105|
|2023-01-01 04:00:00|  107|
|2023-01-01 05:00:00|  110|
|2023-01-01 06:00:00|  108|
|2023-01-01 07:00:00|  111|
|2023-01-01 08:00:00|  115|
|2023-01-01 09:00:00|  120|
|2023-01-01 10:00:00|  118|
|2023-01-01 11:00:00|  122|
|2023-01-01 12:00:00|  125|
|2023-01-01 13:00:00|  130|
|2023-01-01 14:00:00|  128|
|2023-01-01 15:00:00|  133|
|2023-01-01 16:00:00|  135|
|2023-01-01 17:00:00|  138|
|2023-01-01 18:00:00|  140|
|2023-01-01 19:00:00|  142|
+-------------------+-----+
only showing top 20 rows



In [28]:
# Assemble features for SparkML (if you want to use regression or other ML models)
assembler = VectorAssembler(inputCols=["value"], outputCol="features")
df_assembled = assembler.transform(df)

# Example of using a linear regression model in SparkML
lr = LinearRegression(featuresCol="features", labelCol="value")
lr_model = lr.fit(df_assembled)
predictions = lr_model.transform(df_assembled)

predictions.select("timestamp", "value", "prediction").show()

+-------------------+-----+------------------+
|          timestamp|value|        prediction|
+-------------------+-----+------------------+
|2023-01-01 00:00:00|  100|100.00000000000016|
|2023-01-01 01:00:00|  102|102.00000000000014|
|2023-01-01 02:00:00|  101|101.00000000000014|
|2023-01-01 03:00:00|  105|105.00000000000013|
|2023-01-01 04:00:00|  107|107.00000000000011|
|2023-01-01 05:00:00|  110|110.00000000000009|
|2023-01-01 06:00:00|  108| 108.0000000000001|
|2023-01-01 07:00:00|  111|111.00000000000009|
|2023-01-01 08:00:00|  115|115.00000000000006|
|2023-01-01 09:00:00|  120|120.00000000000003|
|2023-01-01 10:00:00|  118|118.00000000000004|
|2023-01-01 11:00:00|  122|122.00000000000001|
|2023-01-01 12:00:00|  125|124.99999999999999|
|2023-01-01 13:00:00|  130|129.99999999999997|
|2023-01-01 14:00:00|  128|127.99999999999997|
|2023-01-01 15:00:00|  133|132.99999999999994|
|2023-01-01 16:00:00|  135|134.99999999999994|
|2023-01-01 17:00:00|  138|137.99999999999991|
|2023-01-01 1

24/08/26 19:22:45 WARN Instrumentation: [eb9c4880] regParam is zero, which might cause numerical instability and overfitting.


In [29]:
df_assembled.show()

+-------------------+-----+--------+
|          timestamp|value|features|
+-------------------+-----+--------+
|2023-01-01 00:00:00|  100| [100.0]|
|2023-01-01 01:00:00|  102| [102.0]|
|2023-01-01 02:00:00|  101| [101.0]|
|2023-01-01 03:00:00|  105| [105.0]|
|2023-01-01 04:00:00|  107| [107.0]|
|2023-01-01 05:00:00|  110| [110.0]|
|2023-01-01 06:00:00|  108| [108.0]|
|2023-01-01 07:00:00|  111| [111.0]|
|2023-01-01 08:00:00|  115| [115.0]|
|2023-01-01 09:00:00|  120| [120.0]|
|2023-01-01 10:00:00|  118| [118.0]|
|2023-01-01 11:00:00|  122| [122.0]|
|2023-01-01 12:00:00|  125| [125.0]|
|2023-01-01 13:00:00|  130| [130.0]|
|2023-01-01 14:00:00|  128| [128.0]|
|2023-01-01 15:00:00|  133| [133.0]|
|2023-01-01 16:00:00|  135| [135.0]|
|2023-01-01 17:00:00|  138| [138.0]|
|2023-01-01 18:00:00|  140| [140.0]|
|2023-01-01 19:00:00|  142| [142.0]|
+-------------------+-----+--------+
only showing top 20 rows



In [30]:
df.show()

+-------------------+-----+
|          timestamp|value|
+-------------------+-----+
|2023-01-01 00:00:00|  100|
|2023-01-01 01:00:00|  102|
|2023-01-01 02:00:00|  101|
|2023-01-01 03:00:00|  105|
|2023-01-01 04:00:00|  107|
|2023-01-01 05:00:00|  110|
|2023-01-01 06:00:00|  108|
|2023-01-01 07:00:00|  111|
|2023-01-01 08:00:00|  115|
|2023-01-01 09:00:00|  120|
|2023-01-01 10:00:00|  118|
|2023-01-01 11:00:00|  122|
|2023-01-01 12:00:00|  125|
|2023-01-01 13:00:00|  130|
|2023-01-01 14:00:00|  128|
|2023-01-01 15:00:00|  133|
|2023-01-01 16:00:00|  135|
|2023-01-01 17:00:00|  138|
|2023-01-01 18:00:00|  140|
|2023-01-01 19:00:00|  142|
+-------------------+-----+
only showing top 20 rows



In [38]:
# Convert Spark DataFrame to Pandas DataFrame for ARIMA
data = df.collect()
pdf = pd.DataFrame(data, columns=df.columns)
#pdf = df.toPandas()

# Set the timestamp as index
pdf.set_index('timestamp', inplace=True)

# Fit the ARIMA model
model = ARIMA(pdf['value'], order=(5, 1, 0))  # Example order (p,d,q)
arima_model = model.fit()

# Make predictions
forecast = arima_model.forecast(steps=10)  # Forecast the next 10 steps
print(forecast)

2023-01-02 01:00:00    161.014818
2023-01-02 02:00:00    163.260033
2023-01-02 03:00:00    167.620581
2023-01-02 04:00:00    170.774071
2023-01-02 05:00:00    173.716787
2023-01-02 06:00:00    176.156320
2023-01-02 07:00:00    180.046492
2023-01-02 08:00:00    183.247630
2023-01-02 09:00:00    186.148445
2023-01-02 10:00:00    188.703396
Freq: H, Name: predicted_mean, dtype: float64


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'


In [40]:
from pyspark.sql.functions import window

# Example: Aggregating and then applying ARIMA
df_resampled = df.groupBy(window("timestamp", "1 hour")).agg({"value": "avg"})
pdf_resampled = df_resampled.toPandas()

# Fit ARIMA model on resampled data
model = ARIMA(pdf_resampled['avg(value)'], order=(5, 1, 0))
arima_model = model.fit()

# Forecast
forecast_resampled = arima_model.forecast(steps=10)
print(forecast_resampled)

25    109.529605
26    139.156622
27    110.903015
28    138.943936
29    114.313223
30    136.527726
31    114.721689
32    135.307438
33    115.832929
34    134.173056
Name: predicted_mean, dtype: float64


  warn('Non-stationary starting autoregressive parameters'
