# BWDB Tidal Data Processing

In [6]:
import mikeio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from datetime import datetime, timedelta
from calendar import monthrange


# function for creating blank time series at "specified interval" 
# creating a dictinary with first_date & end_date of each month of each year 
def blank_ts_and_year_dict(dataframe):

    df = dataframe

    start_end_time_dict = {}

    total_time_df = pd.DataFrame()

    start_yr = df['Date'].dt.year.unique().min()
    end_yr = df['Date'].dt.year.unique().max()


    year_list = [y for y in range(start_yr, end_yr+1 , 1)]

    for yr in year_list:

        year_dict = {}

        yr_df = df[df['Date'].dt.strftime('%Y') == str(yr)]

        months_list = [m for m in range(1, 12+1 , 1)]


        yearly_time_df = pd.DataFrame()


        for month in months_list:

            start_end_time_list = []

            end_day = monthrange(yr, month)[1]

            start_date = (datetime(year= yr, month= month, day=1, hour=0, minute=0, second =0)).strftime('%Y-%m-%d %H:%M:%S')
            end_date = (datetime(year= yr, month= month, day= end_day, hour=23, minute=30, second =0)).strftime('%Y-%m-%d %H:%M:%S')

            start_end_time_list.append(start_date)
            start_end_time_list.append(end_date)

            monthly_time_list = pd.date_range(start_date, end_date, freq="30min")

            monthly_time_df = pd.DataFrame({'Date':monthly_time_list})

            yearly_time_df = pd.concat([yearly_time_df,monthly_time_df])

            year_dict[str(month)] = start_end_time_list

        total_time_df = pd.concat([total_time_df,yearly_time_df])

        start_end_time_dict[str(yr)] = year_dict

    return total_time_df, start_end_time_dict


# Defining a function to extract daily maximum and minimum value and calculating their average
def daily_avg_wl_df(timeseries_df, column_name):
    
    without_duplicate_df = timeseries_df

    max_min_column = column_name

    # Extracting daily max and min value from timeseries data
    # Defining two dataframes to store daily maximum and minimum value
    max_df = pd.DataFrame()
    min_df = pd.DataFrame()

    # looping through all the unique date in the timeseries dataframe to determine daily maximum and minimum value
    for day in without_duplicate_df['Date'].dt.date.unique():

        # Creating a dataframe by matching the date
        day_df = without_duplicate_df[without_duplicate_df['Date'].dt.date == day]

        # Finding daily maximum and minimum value for the required cloumn and also extracting associated other column data
        max_row_df = day_df.nlargest(1, max_min_column)
        min_row_df = day_df.nsmallest(1, max_min_column)

        # Updating the maximum and minimum value in the max_df and min_df respectively
        max_df = pd.concat([max_df, max_row_df])
        min_df = pd.concat([min_df, min_row_df])

    # Creating a dataframe by merging maximum and minimum dataframe
    daily_max_min_df = pd.concat([max_df, min_df])

    # Sorting the dataframe by "Date" [Oldest to Newest]
    daily_max_min_df.sort_values(by='Date',ascending=True, inplace = False)

    # Calculating daily average value from daily maximum and minimum value and reseting the index
    davg_df = daily_max_min_df.groupby(by=daily_max_min_df['Date'].dt.date).mean()
    davg_df.reset_index(inplace = True)
    davg_df['Date'] = pd.to_datetime(davg_df['Date'], format="%Y-%m-%d")
    
    return daily_max_min_df, davg_df


# Function for calculating the average of each column the dataframe except the date column and then transposing it
def df_column_avg(df):
    
    a_df = pd.DataFrame(df.loc[:, df.columns != "Date"].mean())
    ret_df = a_df.transpose()
    
    return ret_df



# Function for creating segment wise water level average 
def seg_wl_average(no_of_intervals, time_dic, timeseries_df):

    N = no_of_intervals
    start_end_time_dict = time_dic
    m_df = timeseries_df
    
    wl_df = pd.DataFrame()
    
    for yr_key in start_end_time_dict:
        for month_key in start_end_time_dict[yr_key]:
            first_date = start_end_time_dict[yr_key][month_key][0]
            last_date = start_end_time_dict[yr_key][month_key][1]

            start_datetime = pd.to_datetime(first_date)
            end_datetime = pd.to_datetime(last_date)


            dtm_list = []

            diff = end_datetime - start_datetime

            increment = (diff.round('D') - timedelta(days = diff.round('D').days % N))/ N

            for idx in range(1, N+1, 1):

                if idx == 0:
                    dtm_list.append((start_datetime + idx * increment).strftime("%Y/%m/%d"))

                elif 0 < idx < N:
                    dtm_list.append((start_datetime + idx * increment - timedelta(days=1)).strftime("%Y/%m/%d"))

                else:
                    dtm_list.append((end_datetime).strftime("%Y/%m/%d"))
                    

            for i in range(0,len(dtm_list)):

                if i == 0:
                    mask = (m_df['Date'] >= start_datetime) & (m_df['Date'] <= dtm_list[i])
                    v_df = m_df.loc[mask]


                    fdt_df = pd.DataFrame({'Date/Time':[dtm_list[i]]})
                    avg_df = df_column_avg(v_df)
                    n_df = pd.concat([fdt_df,avg_df], axis=1, join="inner")
                    n_df['Remarks'] = i+1

                    wl_df = pd.concat([wl_df, n_df])

                elif 0 < i < len(dtm_list):
                    mask = (m_df['Date'] > dtm_list[i-1]) & (m_df['Date'] <= dtm_list[i])
                    v_df = m_df.loc[mask]

                    fdt_df = pd.DataFrame({'Date/Time':[dtm_list[i]]})
                    avg_df = df_column_avg(v_df)
                    n_df = pd.concat([fdt_df,avg_df], axis=1, join="inner")
                    n_df['Remarks'] = i+1

                    wl_df = pd.concat([wl_df, n_df])

                else:
                    break
    
    return wl_df

for file in glob.glob(r'G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\*.dfs0'):
    
    try:
        path_list = file.split(os.sep)

        # Reading dfs0 file
        ds = mikeio.read(file)

        # Converting dataset to dataframe
        df = ds.to_dataframe()

        # Creating the date/time column and renaming it
        df.reset_index(inplace =True)
        df.rename(columns = {'index':'Date'}, inplace = True)
        df.sort_values(by='Date',ascending=True, inplace = True)

        # Rounding datetime time to nearest 30 minutes
        df['Date'] = df.loc[:,'Date'].dt.round(freq='30T')


        # Creating a datetime timeseries at 30 minutes interval 
        total_time_df, start_end_time_dict = blank_ts_and_year_dict(df)
        total_time_df.reset_index(inplace = True, drop=True)

        # Droping the 'Duplicate rows' and keeping the first value from duplicate rows
        without_duplicate_df = df.drop_duplicates(subset=['Date'], keep='first')


        # Merging Blank timeseries dataframe and processed dataframe from input timeseries together
        ts_df = total_time_df.merge(without_duplicate_df, how="outer", on="Date")
        ts_df.sort_values(by='Date',ascending=True, inplace = True)
        ts_df.reset_index(inplace = True, drop = True)

        # Extracting the daily maximum and minimum and calculating their average, then returning their average using defined function
        daily_max_min_df, davg_df = daily_avg_wl_df(timeseries_df = without_duplicate_df, column_name = 'Modified_Cyclone')

        # Defining a dataframe blank daily timeseries dataframe (frequency = 1 day)
        daily_ts_df = pd.DataFrame({'Date':ts_df['Date'].dt.date.unique()})
        daily_ts_df['Date'] = pd.to_datetime(daily_ts_df['Date'], format="%Y-%m-%d")

        # Merging daily blank timeseries dataframe and processed (from input timeseries) daily average water level dataframe together
        equi_davg_df = daily_ts_df.merge(davg_df, how="outer", on="Date")

        # Total number time segment in a month (example: fortnightly-2, weekly-4 etc.) [User Input]
        no_of_intervals = 2   ## Fortnightly

        # Applying function to calculate fortnightly average water level
        fawl_df = seg_wl_average(no_of_intervals = no_of_intervals, time_dic = start_end_time_dict , timeseries_df = equi_davg_df)


        # Saving 30 minutes interval timeseries file
        ts_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Output\30min_TS\{}_30min_TS.xlsx".format(path_list[-1][:-5]), 
                       float_format="%.3f", index= False)

        # Saving daily maximum and minimum timeseries file
        daily_max_min_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Output\Daily_Max_Min\{}_Daily_Max_Min.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

        # Saving daily average water level timeseries file
        equi_davg_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Output\Daily_Avg\{}_Daily_Avg.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

        # Saving fortnightly average water level
        fawl_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Output\Fortnightly_Avg\{}_Fortnightly_Avg.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

#         print(path_list[-1][:-5])
        
    except:
        print('!!!!ERROR!!!! ' + path_list[-1][:-5])

!!!!ERROR!!!! SW180 (Narayanganj)
!!!!ERROR!!!! SW3A (Brahmanbaria)


In [1]:
import mikeio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from datetime import datetime, timedelta
from calendar import monthrange


# function for creating blank time series at "specified interval" 
# creating a dictinary with first_date & end_date of each month of each year 
def blank_ts_and_year_dict(dataframe):

    df = dataframe

    start_end_time_dict = {}

    total_time_df = pd.DataFrame()

    start_yr = df['Date'].dt.year.unique().min()
    end_yr = df['Date'].dt.year.unique().max()


    year_list = [y for y in range(start_yr, end_yr+1 , 1)]

    for yr in year_list:

        year_dict = {}

        yr_df = df[df['Date'].dt.strftime('%Y') == str(yr)]

        months_list = [m for m in range(1, 12+1 , 1)]


        yearly_time_df = pd.DataFrame()


        for month in months_list:

            start_end_time_list = []

            end_day = monthrange(yr, month)[1]

            start_date = (datetime(year= yr, month= month, day=1, hour=0, minute=0, second =0)).strftime('%Y-%m-%d %H:%M:%S')
            end_date = (datetime(year= yr, month= month, day= end_day, hour=23, minute=30, second =0)).strftime('%Y-%m-%d %H:%M:%S')

            start_end_time_list.append(start_date)
            start_end_time_list.append(end_date)

            monthly_time_list = pd.date_range(start_date, end_date, freq="30min")

            monthly_time_df = pd.DataFrame({'Date':monthly_time_list})

            yearly_time_df = pd.concat([yearly_time_df,monthly_time_df])

            year_dict[str(month)] = start_end_time_list

        total_time_df = pd.concat([total_time_df,yearly_time_df])

        start_end_time_dict[str(yr)] = year_dict

    return total_time_df, start_end_time_dict


# Defining a function to extract daily maximum and minimum value and calculating their average
def daily_avg_wl_df(timeseries_df, column_name):
    
    without_duplicate_df = timeseries_df

    max_min_column = column_name

    # Extracting daily max and min value from timeseries data
    # Defining two dataframes to store daily maximum and minimum value
    max_df = pd.DataFrame()
    min_df = pd.DataFrame()

    # looping through all the unique date in the timeseries dataframe to determine daily maximum and minimum value
    for day in without_duplicate_df['Date'].dt.date.unique():

        # Creating a dataframe by matching the date
        day_df = without_duplicate_df[without_duplicate_df['Date'].dt.date == day]

        # Finding daily maximum and minimum value for the required cloumn and also extracting associated other column data
        max_row_df = day_df.nlargest(1, max_min_column)
        min_row_df = day_df.nsmallest(1, max_min_column)

        # Updating the maximum and minimum value in the max_df and min_df respectively
        max_df = pd.concat([max_df, max_row_df])
        min_df = pd.concat([min_df, min_row_df])

    # Creating a dataframe by merging maximum and minimum dataframe
    daily_max_min_df = pd.concat([max_df, min_df])

    # Sorting the dataframe by "Date" [Oldest to Newest]
    daily_max_min_df.sort_values(by='Date',ascending=True, inplace = False)

    # Calculating daily average value from daily maximum and minimum value and reseting the index
    davg_df = daily_max_min_df.groupby(by=daily_max_min_df['Date'].dt.date).mean()
    davg_df.reset_index(inplace = True)
    davg_df['Date'] = pd.to_datetime(davg_df['Date'], format="%Y-%m-%d")
    
    return daily_max_min_df, davg_df


# Function for calculating the average of each column the dataframe except the date column and then transposing it
def df_column_avg(df):
    
    a_df = pd.DataFrame(df.loc[:, df.columns != "Date"].mean())
    ret_df = a_df.transpose()
    
    return ret_df



# Function for creating segment wise water level average 
def seg_wl_average(no_of_intervals, time_dic, timeseries_df):

    N = no_of_intervals
    start_end_time_dict = time_dic
    m_df = timeseries_df
    
    wl_df = pd.DataFrame()
    
    for yr_key in start_end_time_dict:
        for month_key in start_end_time_dict[yr_key]:
            first_date = start_end_time_dict[yr_key][month_key][0]
            last_date = start_end_time_dict[yr_key][month_key][1]

            start_datetime = pd.to_datetime(first_date)
            end_datetime = pd.to_datetime(last_date)


            dtm_list = []

            diff = end_datetime - start_datetime

            increment = (diff.round('D') - timedelta(days = diff.round('D').days % N))/ N

            for idx in range(1, N+1, 1):

                if idx == 0:
                    dtm_list.append((start_datetime + idx * increment).strftime("%Y/%m/%d"))

                elif 0 < idx < N:
                    dtm_list.append((start_datetime + idx * increment - timedelta(days=1)).strftime("%Y/%m/%d"))

                else:
                    dtm_list.append((end_datetime).strftime("%Y/%m/%d"))
                    

            for i in range(0,len(dtm_list)):

                if i == 0:
                    mask = (m_df['Date'] >= start_datetime) & (m_df['Date'] <= dtm_list[i])
                    v_df = m_df.loc[mask]


                    fdt_df = pd.DataFrame({'Date/Time':[dtm_list[i]]})
                    avg_df = df_column_avg(v_df)
                    n_df = pd.concat([fdt_df,avg_df], axis=1, join="inner")
                    n_df['Remarks'] = i+1

                    wl_df = pd.concat([wl_df, n_df])

                elif 0 < i < len(dtm_list):
                    mask = (m_df['Date'] > dtm_list[i-1]) & (m_df['Date'] <= dtm_list[i])
                    v_df = m_df.loc[mask]

                    fdt_df = pd.DataFrame({'Date/Time':[dtm_list[i]]})
                    avg_df = df_column_avg(v_df)
                    n_df = pd.concat([fdt_df,avg_df], axis=1, join="inner")
                    n_df['Remarks'] = i+1

                    wl_df = pd.concat([wl_df, n_df])

                else:
                    break
    
    return wl_df

for file in glob.glob(r'G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\Problem\*.dfs0'):
    
    try:
        path_list = file.split(os.sep)

        # Reading dfs0 file
        ds = mikeio.read(file)

        # Converting dataset to dataframe
        df = ds.to_dataframe()

        # Creating the date/time column and renaming it
        df.reset_index(inplace =True)
        df.rename(columns = {'index':'Date'}, inplace = True)
        df.sort_values(by='Date',ascending=True, inplace = True)

        # Rounding datetime time to nearest 30 minutes
        df['Date'] = df.loc[:,'Date'].dt.round(freq='30T')


        # Creating a datetime timeseries at 30 minutes interval 
        total_time_df, start_end_time_dict = blank_ts_and_year_dict(df)
        total_time_df.reset_index(inplace = True, drop=True)

        # Droping the 'Duplicate rows' and keeping the first value from duplicate rows
        without_duplicate_df = df.drop_duplicates(subset=['Date'], keep='first')


        # Merging Blank timeseries dataframe and processed dataframe from input timeseries together
        ts_df = total_time_df.merge(without_duplicate_df, how="outer", on="Date")
        ts_df.sort_values(by='Date',ascending=True, inplace = True)
        ts_df.reset_index(inplace = True, drop = True)

        # Extracting the daily maximum and minimum and calculating their average, then returning their average using defined function
        daily_max_min_df, davg_df = daily_avg_wl_df(timeseries_df = without_duplicate_df, column_name = 'Modified_Cyclone')

        # Defining a dataframe blank daily timeseries dataframe (frequency = 1 day)
        daily_ts_df = pd.DataFrame({'Date':ts_df['Date'].dt.date.unique()})
        daily_ts_df['Date'] = pd.to_datetime(daily_ts_df['Date'], format="%Y-%m-%d")

        # Merging daily blank timeseries dataframe and processed (from input timeseries) daily average water level dataframe together
        equi_davg_df = daily_ts_df.merge(davg_df, how="outer", on="Date")

        # Total number time segment in a month (example: fortnightly-2, weekly-4 etc.) [User Input]
        no_of_intervals = 2   ## Fortnightly

        # Applying function to calculate fortnightly average water level
        fawl_df = seg_wl_average(no_of_intervals = no_of_intervals, time_dic = start_end_time_dict , timeseries_df = equi_davg_df)


        # Saving 30 minutes interval timeseries file
        ts_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\Problem\{}_30min_TS.xlsx".format(path_list[-1][:-5]), 
                       float_format="%.3f", index= False)

        # Saving daily maximum and minimum timeseries file
        daily_max_min_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\Problem\{}_Daily_Max_Min.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

        # Saving daily average water level timeseries file
        equi_davg_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\Problem\{}_Daily_Avg.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

        # Saving fortnightly average water level
        fawl_df.to_excel(r"G:\MJI\BWDB_Tidal (104 nos.)\Tidal (104 nos.)\Problem\{}_Fortnightly_Avg.xlsx".format(path_list[-1][:-5]),
                       float_format="%.3f", index= False)

#         print(path_list[-1][:-5])
        
    except:
        print('!!!!ERROR!!!! ' + path_list[-1][:-5])