In [3]:
import pandas as pd  # type: ignore
import numpy as np

df = pd.read_csv("MSJO_60_01-03_2023.csv", delimiter=";")
print(df)

# Basis Datansatz sortieren
df_sorted = df.sort_values(by=["MESS_ID", "VALUEDATE"], ascending=[True, True])
# df_sorted.to_csv("sortierte_datei.csv", index=False)

# Drop rows with any missing values in MESS_ID, VALUEDATE, or PVALUE
df = df.dropna(subset=["MESS_ID", "VALUEDATE", "PVALUE"], how="any")

# Drop rows with Pvalue = 0
df = df[df["PVALUE"] != "0"]

# Drop Duplicates
duplicates = df.duplicated().sum()
df.drop_duplicates(inplace=True)

# Set PVLAUE to float
df["PVALUE"] = df["PVALUE"].str.replace(",", ".").astype(float)

# Quartile berechnen
Q1 = df["PVALUE"].quantile(0.25)
Q3 = df["PVALUE"].quantile(0.75)
IQR = Q3 - Q1

# Definiere die Schwellenwerte
obere_schwelle = Q3 + 1.5 * IQR
print(obere_schwelle)

# Filtere die Zeilen
df_filtered = df[(df["PVALUE"] >= obere_schwelle)]

# Gefiltertes DataFrame anzeigen
print("Gefiltertes DataFrame:")
print(df_filtered)

# New CSV File
# df.to_csv("cleaned.csv", index=False)
# df_filtered.to_csv("test.csv", index=False)

# Calculate mean
pvalue_mean = df["PVALUE"].mean()

# Drop outliers
df = df[(df["PVALUE"] <= obere_schwelle)]

# ------------------------------------------------

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split

# # Convert VALUEDATE to datetime and extract the date as a numerical feature
# df["VALUEDATE"] = pd.to_datetime(df["VALUEDATE"])
# df["DATEORDINAL"] = df["VALUEDATE"].apply(lambda date: date.toordinal())

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split( df[["DATEORDINAL"]], df["PVALUE"], test_size=0.2, random_state=42 )

# # Time series forecasting
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Print the coefficient and intercept of the model
# print(f"Coefficient: {model.coef_}")
# print(f"Intercept: {model.intercept_}")

#--------------------------------------------------

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

df['VALUEDATE'] = pd.to_datetime(df['VALUEDATE'])  # Convert 'date' column to datetime
df.set_index('VALUEDATE', inplace=True) 

df['avg_value'] = df.mean(axis=1)

# Resample 60-second data to 15-minute intervals
df_60s_resampled = df['avg_value'].resample('15T').mean()
df_60s_resampled = df_60s_resampled.fillna(df_60s_resampled.mean())

# Train-Test Split
train_size = int(len(df_60s_resampled) * 0.8)
train_60s, test_60s = df_60s_resampled[:train_size], df_60s_resampled[train_size:]
# train_15min, test_15min = df_15min[:train_size], df_15min[train_size:]

# Train ARIMA model on 60-second resampled data
arima_60s = ARIMA(train_60s, order=(5, 1, 0))
arima_60s_fit = arima_60s.fit()

# Train ARIMA model on 15-minute data
# arima_15min = ARIMA(train_15min, order=(5, 1, 0))
# arima_15min_fit = arima_15min.fit()

# Forecasting
arima_60s_forecast = arima_60s_fit.forecast(steps=len(test_60s))
# arima_15min_forecast = arima_15min_fit.forecast(steps=len(test_15min))

# Evaluation
arima_60s_mse = mean_squared_error(test_60s, arima_60s_forecast)
# arima_15min_mse = mean_squared_error(test_15min, arima_15min_forecast)

print(f'ARIMA 60-second resampled MSE: {arima_60s_mse}')
# print(f'ARIMA 15-minute MSE: {arima_15min_mse}')

        MESS_ID                    VALUEDATE  PVALUE
0          5235  2023-02-08 00:00:00.0000000  0,0375
1          5235  2023-02-08 00:01:00.0000000   0,045
2          5235  2023-02-08 00:02:00.0000000   0,045
3          5235  2023-02-08 00:03:00.0000000   0,045
4          5235  2023-02-08 00:04:00.0000000   0,045
...         ...                          ...     ...
316453     5241  2023-01-17 21:34:00.0000000       0
316454     5241  2023-01-17 21:35:00.0000000       0
316455     5241  2023-01-17 21:36:00.0000000       0
316456     5241  2023-01-17 21:37:00.0000000       0
316457     5241               2023-01-17 21:     NaN

[316458 rows x 3 columns]
5.4825
Gefiltertes DataFrame:
        MESS_ID                    VALUEDATE     PVALUE
37194      5425  2023-01-10 15:00:00.0000000   9.208419
37195      5425  2023-01-10 15:01:00.0000000  11.302227
37196      5425  2023-01-10 15:02:00.0000000  11.289743
37197      5425  2023-01-10 15:03:00.0000000  11.369067
37198      5425  2023-01-10

  df_60s_resampled = df['avg_value'].resample('15T').mean()


ARIMA 60-second resampled MSE: 3.4563358084501887
