In [None]:
# Clean data
import pandas as pd  # type: ignore

df60 = pd.read_csv("MSJO 60 01-03 2023.csv", delimiter=";")
df900 = pd.read_csv("MSJO 900 01-03 2023.csv", delimiter=";")
print(df60)

# Basis Datansatz sortieren
df_sorted = df60.sort_values(by=["MESS_ID", "VALUEDATE"], ascending=[True, True])
# df_sorted.to_csv("sortierte_datei.csv", index=False)

# Drop rows with any missing values in MESS_ID, VALUEDATE, or PVALUE
df60 = df60.dropna(subset=["MESS_ID", "VALUEDATE", "PVALUE"], how="any")
df900 = df900.dropna(subset=["MESS_ID", "VALUEDATE", "PVALUE"], how="any")

# Drop rows with Pvalue = 0
df60 = df60[df60["PVALUE"] != "0"]
df900 = df900[df900["PVALUE"] != "0"]

# Drop Duplicates
duplicates = df60.duplicated().sum()
df60.drop_duplicates(inplace=True)
duplicates = df900.duplicated().sum()
df900.drop_duplicates(inplace=True)

# Set PVLAUE to float
df60["PVALUE"] = df60["PVALUE"].str.replace(",", ".").astype(float)
df900["PVALUE"] = df900["PVALUE"].str.replace(",", ".").astype(float)

# Quartile berechnen
Q1_60 = df60["PVALUE"].quantile(0.25)
Q3_60 = df60["PVALUE"].quantile(0.75)
IQR60 = Q3_60 - Q1_60
Q1_900 = df900["PVALUE"].quantile(0.25)
Q3_900 = df900["PVALUE"].quantile(0.75)
IQR900 = Q3_900 - Q1_900

# Definiere die Schwellenwerte
obere_schwelle_60 = Q3_60 + 1.5 * IQR60
print(obere_schwelle_60)
obere_schwelle_900 = Q3_900 + 1.5 * IQR900
print(obere_schwelle_900)

# Filtere die Zeilen
df_filtered_60 = df60[(df60["PVALUE"] >= obere_schwelle_60)]
# df_filtered_900 = df900[(df60["PVALUE"] >= obere_schwelle_900)]

# Gefiltertes DataFrame anzeigen
print("Gefiltertes DataFrame:")
print(df_filtered_60)
# print(df_filtered_900)

# New CSV File
df60.to_csv("cleaned60.csv", index=False)
df900.to_csv("cleaned900.csv", index=False)
# df_filtered.to_csv("test.csv", index=False)

# Calculate mean
pvalue_mean = df60["PVALUE"].mean()

# Drop outliers
df60 = df60[(df60["PVALUE"] <= obere_schwelle_60)]
df900 = df900[(df900["PVALUE"] <= obere_schwelle_900)]

# ------------------------------------------------

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split

# # Convert VALUEDATE to datetime and extract the date as a numerical feature
# df["VALUEDATE"] = pd.to_datetime(df["VALUEDATE"])
# df["DATEORDINAL"] = df["VALUEDATE"].apply(lambda date: date.toordinal())

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split( df[["DATEORDINAL"]], df["PVALUE"], test_size=0.2, random_state=42 )

# # Time series forecasting
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Print the coefficient and intercept of the model
# print(f"Coefficient: {model.coef_}")
# print(f"Intercept: {model.intercept_}")

#--------------------------------------------------


In [None]:
# ARIMA Modell
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

df60 = pd.read_csv("cleaned60.csv")
df900 = pd.read_csv("cleaned900.csv")

# Convert 'VALUEDATE' column to datetime and set it as index
df60['VALUEDATE'] = pd.to_datetime(df60['VALUEDATE'])
df60.set_index('VALUEDATE', inplace=True)

df900['VALUEDATE'] = pd.to_datetime(df900['VALUEDATE'])
df900.set_index('VALUEDATE', inplace=True)

# Calculate the average value for each DataFrame
df60['avg_value'] = df60.mean(axis=1)
df900['avg_value'] = df900.mean(axis=1)

# Resample 60-second data to 15-minute intervals and handle missing values
df_60s_resampled = df60['avg_value'].resample('15T').mean()
df_60s_resampled = df_60s_resampled.fillna(df_60s_resampled.mean())

# Ensure df900's avg_value is handled similarly
df900['avg_value'] = df900['avg_value'].fillna(df900['avg_value'].mean())

# Train-Test Split
train_size_60s = int(len(df_60s_resampled) * 0.8)
train_size_15min = int(len(df900) * 0.8)

train_60s, test_60s = df_60s_resampled[:train_size_60s], df_60s_resampled[train_size_60s:]
train_15min, test_15min = df900['avg_value'][:train_size_15min], df900['avg_value'][train_size_15min:]

# Train ARIMA model on 60-second resampled data
arima_60s = ARIMA(train_60s, order=(5, 1, 0))
arima_60s_fit = arima_60s.fit()

# Train ARIMA model on 15-minute data
arima_15min = ARIMA(train_15min, order=(5, 1, 0))
arima_15min_fit = arima_15min.fit()

# Forecasting
arima_60s_forecast = arima_60s_fit.forecast(steps=len(test_60s))
arima_15min_forecast = arima_15min_fit.forecast(steps=len(test_15min))

# Evaluation
arima_60s_mse = mean_squared_error(test_60s, arima_60s_forecast)
arima_15min_mse = mean_squared_error(test_15min, arima_15min_forecast)

print(f'ARIMA 60-second resampled MSE: {arima_60s_mse}')
print(f'ARIMA 15-minute MSE: {arima_15min_mse}')


In [3]:
# Prophet Modell
import pandas as pd
from prophet import Prophet

# Laden der 60-Sekunden-Daten
df60 = pd.read_csv('cleaned60.csv')
df60['VALUEDATE'] = pd.to_datetime(df60['VALUEDATE'])  # Umwandlung der 'VALUEDATE'-Spalte in Datetime
df60.set_index('VALUEDATE', inplace=True)  # Setzen der 'VALUEDATE'-Spalte als Index

# Laden der 15-Minuten-Daten
df900 = pd.read_csv('cleaned900.csv')
df900['VALUEDATE'] = pd.to_datetime(df900['VALUEDATE'])  # Umwandlung der 'VALUEDATE'-Spalte in Datetime
df900.set_index('VALUEDATE', inplace=True)  # Setzen der 'VALUEDATE'-Spalte als Index

# Fehlende Werte mit dem Mittelwert füllen
df60['avg_value'] = df60.mean(axis=1).fillna(df60.mean(axis=1).mean())
df900['avg_value'] = df900.mean(axis=1).fillna(df900.mean(axis=1).mean())

# Resample 60-Sekunden-Daten auf 15-Minuten-Intervalle
df_60s_resampled = df60['avg_value'].resample('15T').mean().fillna(df60.mean(axis=1).mean())

# Vorbereitung der Daten für Prophet
df_60s_prophet = df_60s_resampled.reset_index().rename(columns={'VALUEDATE': 'ds', 'avg_value': 'y'})
df_15min_prophet = df900.reset_index().rename(columns={'VALUEDATE': 'ds', 'avg_value': 'y'})

# Train-Test Split
train_size = int(len(df_60s_prophet) * 0.8)
train_60s_prophet = df_60s_prophet[:train_size]
test_60s_prophet = df_60s_prophet[train_size:]

train_15min_prophet = df_15min_prophet[:train_size]
test_15min_prophet = df_15min_prophet[train_size:]

# Training des Prophet-Modells auf 60-Sekunden-Daten
model_60s = Prophet()
model_60s.fit(train_60s_prophet)

# Training des Prophet-Modells auf 15-Minuten-Daten
model_15min = Prophet()
model_15min.fit(train_15min_prophet)

# Prognose
future_60s = model_60s.make_future_dataframe(periods=len(test_60s_prophet), freq='15T')
forecast_60s = model_60s.predict(future_60s)

future_15min = model_15min.make_future_dataframe(periods=len(test_15min_prophet), freq='15T')
forecast_15min = model_15min.predict(future_15min)

# Evaluation
from sklearn.metrics import mean_squared_error

test_60s_values = test_60s_prophet['y'].values
forecast_60s_values = forecast_60s.iloc[-len(test_60s_prophet):]['yhat'].values
mse_60s_prophet = mean_squared_error(test_60s_values, forecast_60s_values)

test_15min_values = test_15min_prophet['y'].values
forecast_15min_values = forecast_15min.iloc[-len(test_15min_prophet):]['yhat'].values
mse_15min_prophet = mean_squared_error(test_15min_values, forecast_15min_values)

print(f'Prophet 60-second resampled MSE: {mse_60s_prophet}')
print(f'Prophet 15-minute MSE: {mse_15min_prophet}')


  df_60s_resampled = df60['avg_value'].resample('15T').mean().fillna(df60.mean(axis=1).mean())
15:33:29 - cmdstanpy - INFO - Chain [1] start processing
15:33:29 - cmdstanpy - INFO - Chain [1] done processing
15:33:30 - cmdstanpy - INFO - Chain [1] start processing
15:33:32 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
  dates = pd.date_range(


Prophet 60-second resampled MSE: 8021425984472.248
Prophet 15-minute MSE: 6.2598526691826744e+16
