# In [12]:
import pandas as pd

# Read the two semicolon-separated input files
df60 = pd.read_csv("MSJO 60 01-03 2023.csv", sep=";")
df900 = pd.read_csv("MSJO 900 01-03 2023.csv", sep=";")

# Convert PVALUE to float: the column is handled as text here, so every ","
# is replaced with "." before casting
for frame in (df60, df900):
    frame["PVALUE"] = frame["PVALUE"].str.replace(",", ".").astype(float)

# Remove PVALUE outliers separately for each MESS_ID using the IQR rule
def filter_outliers(df, iqr_factor=1.5):
    """Return the rows of ``df`` whose PVALUE lies inside its group's IQR fences.

    Rows are grouped by ``MESS_ID``. For every group the first and third
    quartiles of ``PVALUE`` are computed, and a row is kept only when
    ``Q1 - iqr_factor * IQR <= PVALUE <= Q3 + iqr_factor * IQR`` (both bounds
    inclusive). Rows whose PVALUE is NaN fail that comparison and are dropped.

    Args:
        df: DataFrame with at least the columns ``MESS_ID`` and ``PVALUE``
            (``PVALUE`` must be numeric).
        iqr_factor: Multiplier applied to the IQR to position the fences.
            The default of 1.5 reproduces the previous hard-coded behaviour.

    Returns:
        A new DataFrame holding the surviving rows, concatenated group by
        group in the order ``groupby`` yields them, with the original index
        labels preserved. If there is nothing to group (for example an empty
        input), an empty DataFrame with the same columns is returned.
    """
    kept_groups = []

    for _, group in df.groupby("MESS_ID"):
        values = group["PVALUE"]

        # Quartiles and the resulting lower/upper fences for this MESS_ID
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_factor * (q3 - q1)
        lower = q1 - margin
        upper = q3 + margin

        # Keep only the rows between the fences (inclusive)
        kept_groups.append(group[(values >= lower) & (values <= upper)])

    if not kept_groups:
        # pd.concat raises ValueError when given an empty list, so return an
        # empty frame with the input's columns instead of crashing.
        return df.iloc[0:0].copy()

    return pd.concat(kept_groups)

# Strip outliers from each dataset, then order the rows by MESS_ID and VALUEDATE
# (ascending on both keys).
# NOTE(review): VALUEDATE is not converted to datetime anywhere in the visible
# code, so this sorts on the raw column values - confirm that matches
# chronological order for the date format in the input files.
sort_keys = ["MESS_ID", "VALUEDATE"]
df60_filtered = filter_outliers(df60).sort_values(by=sort_keys)
df900_filtered = filter_outliers(df900).sort_values(by=sort_keys)

# Write the cleaned data to CSV without the index column
for cleaned, target in (
    (df60_filtered, "cleaned60.csv"),
    (df900_filtered, "cleaned900.csv"),
):
    cleaned.to_csv(target, index=False)


# ------------------------------------------------

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split

# # Convert VALUEDATE to datetime and extract the date as a numerical feature
# df["VALUEDATE"] = pd.to_datetime(df["VALUEDATE"])
# df["DATEORDINAL"] = df["VALUEDATE"].apply(lambda date: date.toordinal())

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split( df[["DATEORDINAL"]], df["PVALUE"], test_size=0.2, random_state=42 )

# # Time series forecasting
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Print the coefficient and intercept of the model
# print(f"Coefficient: {model.coef_}")
# print(f"Intercept: {model.intercept_}")

#--------------------------------------------------