In [1]:
# Wildcard import: every public SymPy name is placed in the notebook namespace,
# so later cells can call SymPy functions without a module prefix.
from sympy import *
# Presumably SymPy's init_printing (supplied by the wildcard import above);
# it is called once here, before any other cell runs.
init_printing()

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.stattools import adfuller

# Input file locations.
# All five files are stored in the same directory; keeping that directory in a
# single constant means the notebook can be pointed at another location by
# editing one line instead of five.
DATA_DIR = "/Users/nicolaigarderhansen/Desktop/Bachelorprojekt/Data"

# Files later merged as NedbørByg / NedbørHessel and as the base frame.
file_path_bygholm = f"{DATA_DIR}/Nedbor_Dag_Bygholm Skov_DMI.csv"
file_path_hesselballe = f"{DATA_DIR}/Nedbor_Dag_Hesselballe_DMI.csv"
file_path_korup_parken = f"{DATA_DIR}/Bygholm_Kørup_Parken_VNF_døgn_1975-2023.csv"

# Files later merged as KørupBro and Vandstand.
file_path_kørup_bro = f"{DATA_DIR}/28.02_Vandforing, Dognmiddel (DMP)_Dag.csv"
file_path_waterlevel = f"{DATA_DIR}/28.02_Vandstand (DMP)_Dag.csv"

# Load the raw CSV files. Every file is semicolon-delimited and decoded as
# ISO-8859-1; those shared options are collected once.
csv_format = {"encoding": "ISO-8859-1", "delimiter": ";"}
# Four of the files are read with their first 13 lines skipped.
csv_format_skip_header = {"skiprows": 13, **csv_format}

bygholm_df = pd.read_csv(file_path_bygholm, **csv_format_skip_header)
hesselballe_df = pd.read_csv(file_path_hesselballe, **csv_format_skip_header)
# This file is read from its first line and parsed with a decimal comma.
korup_parken_df = pd.read_csv(file_path_korup_parken, decimal=",", **csv_format)

kørup_bro_df = pd.read_csv(file_path_kørup_bro, **csv_format_skip_header)
waterlevel_df = pd.read_csv(file_path_waterlevel, **csv_format_skip_header)

# Normalise the column headers of every loaded frame.
# Each frame is paired with the names its columns receive, assigned by position.
frames_and_headers = [
    (bygholm_df, ["timestamp", "NedbørByg", "ksMRK"]),
    (hesselballe_df, ["timestamp", "NedbørHessel", "ksMRK"]),
    (korup_parken_df, ["timestamp", "KarupBro", "BygholmPark"]),
    (kørup_bro_df, ["timestamp", "KørupBro", "ksMRK"]),
    (waterlevel_df, ["timestamp", "Vandstand", "ksMRK"]),
]

for frame, headers in frames_and_headers:
    # Strip surrounding whitespace from the raw header names ...
    frame.columns = frame.columns.str.strip()
    # ... then replace them wholesale (this overwrites the stripped names).
    frame.columns = headers

# Parse the timestamp column of every frame as day-month-year dates.
date_format = "%d-%m-%Y"

# These four frames are parsed strictly: a value that does not match the
# format raises instead of being silently dropped.
for frame in (bygholm_df, hesselballe_df, korup_parken_df, waterlevel_df):
    frame["timestamp"] = pd.to_datetime(frame["timestamp"], format=date_format)

# Kørup: keep only the part before the first space (drops a trailing time such
# as "00:00"), then parse; values that still do not match become NaT.
kørup_date_text = (
    kørup_bro_df["timestamp"]
    .astype(str)
    .str.strip()
    .str.split(" ")
    .str[0]
)
kørup_bro_df["timestamp"] = pd.to_datetime(
    kørup_date_text, format=date_format, errors='coerce'
)

# Build the analysis frame in a single chain, starting from korup_parken_df.
# Every merge is a left join on timestamp, so the rows of korup_parken_df
# define which dates are kept; merge returns a new frame, leaving
# korup_parken_df itself untouched.
merged_df = (
    korup_parken_df
    .merge(bygholm_df[["timestamp", "NedbørByg"]], on="timestamp", how="left")
    .merge(hesselballe_df[["timestamp", "NedbørHessel"]], on="timestamp", how="left")
    .merge(kørup_bro_df[["timestamp", "KørupBro"]], on="timestamp", how="left")
    .merge(waterlevel_df[["timestamp", "Vandstand"]], on="timestamp", how="left")
    # The BygholmPark column is not used further.
    .drop(columns=["BygholmPark"])
    # Keep only rows where both NedbørByg and NedbørHessel are present.
    .dropna(subset=["NedbørByg", "NedbørHessel"])
    # Sequential counter 0..n-1 over the remaining rows.
    .assign(Time=lambda frame: range(len(frame)))
    # Replace the gappy index left behind by dropna with a fresh 0..n-1 index.
    .reset_index(drop=True)
)

# Show the dimensions and the first rows of the cleaned, merged dataset.
# The DataFrame's own `.shape` attribute is used instead of a bare `shape(...)`
# call: no `shape` function is imported in this cell, so the bare call only
# works when some earlier wildcard import happens to provide that name.
print(merged_df.shape)
merged_df.head()

(5110, 7)


Unnamed: 0,timestamp,KarupBro,NedbørByg,NedbørHessel,KørupBro,Vandstand,Time
0,2010-01-04,1806.75,0.2,0.2,1822.633,7.516,0
1,2010-01-05,1676.6,0.6,0.5,1676.103,7.7,1
2,2010-01-06,1562.37,0.3,0.2,1543.136,7.891,2
3,2010-01-07,1427.74,0.0,0.1,1387.737,7.862,3
4,2010-01-08,1280.8,0.0,0.0,1220.419,7.83,4


In [10]:
# Pairwise Pearson correlation between the columns of the merged frame
# (Pearson is the method pandas uses when none is given).
merged_df.corr(method="pearson")

Unnamed: 0,timestamp,KarupBro,NedbørByg,NedbørHessel,KørupBro,Vandstand,Time
timestamp,1.0,0.044957,0.010718,-0.001288,-0.018054,0.523953,1.0
KarupBro,0.044957,1.0,0.110146,0.180354,0.997713,0.703265,0.044957
NedbørByg,0.010718,0.110146,1.0,0.61716,0.095418,0.124816,0.010718
NedbørHessel,-0.001288,0.180354,0.61716,1.0,0.157565,0.170333,-0.001288
KørupBro,-0.018054,0.997713,0.095418,0.157565,1.0,0.664444,-0.018054
Vandstand,0.523953,0.703265,0.124816,0.170333,0.664444,1.0,0.523953
Time,1.0,0.044957,0.010718,-0.001288,-0.018054,0.523953,1.0
