# In [None]:
import pandas as pd  # type: ignore
import numpy as np

# Load the two semicolon-separated measurement exports ("60" and "900" files).
df60 = pd.read_csv("MSJO 60 01-03 2023.csv", sep=";")
df900 = pd.read_csv("MSJO 900 01-03 2023.csv", sep=";")
# Print the raw "60" frame as a quick sanity check of the import.
print(df60)

# Sort the base data set by meter id and timestamp. This happens before any
# cleaning and is only needed for the optional CSV export below.
df_sorted = df60.sort_values(by=["MESS_ID", "VALUEDATE"], ascending=[True, True])
# df_sorted.to_csv("sortierte_datei.csv", index=False)


def _clean_measurements(frame):
    """Return a cleaned copy of *frame* and the number of duplicate rows removed.

    Cleaning steps, in order:
      1. drop rows with a missing MESS_ID, VALUEDATE or PVALUE,
      2. parse PVALUE (decimal comma) into float,
      3. drop rows whose PVALUE is zero,
      4. drop exact duplicate rows.

    Raises:
        ValueError: if a PVALUE entry cannot be parsed as a number.
    """
    cleaned = frame.dropna(subset=["MESS_ID", "VALUEDATE", "PVALUE"], how="any")

    # Parse before filtering so that "0", "0,0" and "0,00" are all recognised
    # as zero. astype(str) keeps this working when pandas already parsed the
    # column as numeric (no decimal commas present in the file).
    pvalue = (
        cleaned["PVALUE"]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )
    # assign() builds a new frame instead of writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    cleaned = cleaned.assign(PVALUE=pvalue)

    cleaned = cleaned[cleaned["PVALUE"] != 0]

    n_duplicates = int(cleaned.duplicated().sum())
    cleaned = cleaned.drop_duplicates()
    return cleaned, n_duplicates


df60, duplicates_60 = _clean_measurements(df60)
df900, duplicates_900 = _clean_measurements(df900)
# `duplicates` keeps holding the df900 count so code relying on that name
# continues to work; the per-frame counts are available separately above.
duplicates = duplicates_900

# Compute quartiles and the interquartile range (IQR) of PVALUE per data set
Q1_60 = df60["PVALUE"].quantile(0.25)
Q3_60 = df60["PVALUE"].quantile(0.75)
IQR60 = Q3_60 - Q1_60
Q1_900 = df900["PVALUE"].quantile(0.25)
Q3_900 = df900["PVALUE"].quantile(0.75)
IQR900 = Q3_900 - Q1_900

# Define the upper outlier thresholds (Q3 + 1.5 * IQR)
obere_schwelle_60 = Q3_60 + 1.5 * IQR60
print(obere_schwelle_60)
obere_schwelle_900 = Q3_900 + 1.5 * IQR900
print(obere_schwelle_900)

# Select the outlier rows. Each mask is built from the frame it is applied to,
# and the comparison is strict so these are exactly the rows removed by the
# "Drop outliers" step below (which keeps PVALUE <= threshold).
df_filtered_60 = df60[df60["PVALUE"] > obere_schwelle_60]
df_filtered_900 = df900[df900["PVALUE"] > obere_schwelle_900]

# Show the outlier rows
print("Gefiltertes DataFrame:")
print(df_filtered_60)
print(df_filtered_900)

# New CSV File
# df.to_csv("cleaned.csv", index=False)
# df_filtered.to_csv("test.csv", index=False)

# Mean of the df60 PVALUE column, taken before the outliers are removed
pvalue_mean = df60["PVALUE"].mean()

# Drop outliers: keep only rows at or below the upper threshold
df60 = df60[df60["PVALUE"] <= obere_schwelle_60]
df900 = df900[df900["PVALUE"] <= obere_schwelle_900]

# ------------------------------------------------

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split

# # Convert VALUEDATE to datetime and extract the date as a numerical feature
# df["VALUEDATE"] = pd.to_datetime(df["VALUEDATE"])
# df["DATEORDINAL"] = df["VALUEDATE"].apply(lambda date: date.toordinal())

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split( df[["DATEORDINAL"]], df["PVALUE"], test_size=0.2, random_state=42 )

# # Time series forecasting
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Print the coefficient and intercept of the model
# print(f"Coefficient: {model.coef_}")
# print(f"Intercept: {model.intercept_}")

#--------------------------------------------------

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Index both frames by timestamp so they can be resampled by time.
# assign() is used instead of column assignment because df60/df900 are
# filtered slices at this point (avoids SettingWithCopyWarning).
# NOTE(review): the text format of VALUEDATE is not visible here; if it is
# day-first, pass dayfirst=True or an explicit format= to to_datetime - confirm.
df60 = df60.assign(VALUEDATE=pd.to_datetime(df60['VALUEDATE'])).set_index('VALUEDATE')
df900 = df900.assign(VALUEDATE=pd.to_datetime(df900['VALUEDATE'])).set_index('VALUEDATE')

# The modelled quantity is the measurement itself. A row-wise mean over all
# columns would blend PVALUE with MESS_ID (and fails on non-numeric columns in
# pandas >= 2.0), so avg_value is taken from PVALUE directly.
df60 = df60.assign(avg_value=df60['PVALUE'])
df900 = df900.assign(avg_value=df900['PVALUE'])

# Resample 60-second data to 15-minute intervals; empty intervals are filled
# with the overall mean of the resampled series.
# NOTE(review): rows of every MESS_ID that fall into the same interval are
# averaged together - confirm that mixing meters is intended.
df_60s_resampled = df60['avg_value'].resample('15min').mean()
df_60s_resampled = df_60s_resampled.fillna(df_60s_resampled.mean())

# Put the 15-minute data on the same regular 15-minute grid so ARIMA receives
# a single univariate, evenly spaced series instead of the whole DataFrame.
df_15min_resampled = df900['avg_value'].resample('15min').mean()
df_15min_resampled = df_15min_resampled.fillna(df_15min_resampled.mean())

# Train-Test Split: chronological 80/20, sized separately for each series
train_size = int(len(df_60s_resampled) * 0.8)
train_60s, test_60s = df_60s_resampled.iloc[:train_size], df_60s_resampled.iloc[train_size:]
train_size_15min = int(len(df_15min_resampled) * 0.8)
train_15min = df_15min_resampled.iloc[:train_size_15min]
test_15min = df_15min_resampled.iloc[train_size_15min:]

# Train ARIMA model on 60-second resampled data
arima_60s = ARIMA(train_60s, order=(5, 1, 0))
arima_60s_fit = arima_60s.fit()

# Train ARIMA model on 15-minute data
arima_15min = ARIMA(train_15min, order=(5, 1, 0))
arima_15min_fit = arima_15min.fit()

# Forecasting: predict as many steps as each test set contains
arima_60s_forecast = arima_60s_fit.forecast(steps=len(test_60s))
arima_15min_forecast = arima_15min_fit.forecast(steps=len(test_15min))

# Evaluation: mean squared error of each forecast against its own test set
arima_60s_mse = mean_squared_error(test_60s, arima_60s_forecast)
arima_15min_mse = mean_squared_error(test_15min, arima_15min_forecast)

print(f'ARIMA 60-second resampled MSE: {arima_60s_mse}')
print(f'ARIMA 15-minute MSE: {arima_15min_mse}')

#-------------------------------------------