# In [None]:
# =========================================================
# Imports
# =========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


# =========================================================
# Classes
# =========================================================

class precip_time_series:
    """Precipitation record of a single station with rain-event extraction.

    The raw file is read with pandas; the first remaining data column is the
    one every method treats as the precipitation amount (units are not
    visible in this file -- TODO confirm against the data source).

    Attributes:
        data: DataFrame indexed by timestamp.
        statid: Station identifier taken from the ``station`` column.
        padded: True once ``pad_and_resample`` has been applied.
        events: List of ``(start, end)`` timestamp tuples, or None before
            ``get_events`` / ``init_events`` has run.
        dimensioneless_curves: List of normalised cumulative rainfall arrays
            (one per event), or None before they have been computed.
    """

    def __init__(self, data_path):
        """Load the raw station file found at ``data_path``."""
        self.data, self.statid = self.read_raw_data_as_pandas_df(data_path)

        # Set by pad_and_resample()
        self.padded = False

        # Filled by get_events() / init_events()
        self.events = None

        # Filled by create_dimensioneless_curves()
        self.dimensioneless_curves = None

    def read_raw_data_as_pandas_df(self, raw_data_file_path):
        """Read the raw CSV file and return ``(data, station_id)``.

        The second CSV column is used as the timestamp index and the
        ``station`` column is removed after its first value has been stored
        as the station id (as a string).
        """
        # Read file with timestamp as index
        precip = pd.read_csv(raw_data_file_path, encoding="ISO-8859-1", index_col=1)

        # Timestamps str -> datetime
        precip.index = pd.to_datetime(precip.index)

        # Save ID of station
        station_id = str(precip["station"].iloc[0])

        # Remove column with station ID
        precip = precip.drop("station", axis=1)

        return precip, station_id

    def return_specific_event(self, event_idx):
        """Return the rows of ``data`` belonging to event number ``event_idx``.

        Both the start and the end timestamp of the event are included.
        Events must have been computed first (``self.events`` is indexed
        directly).
        """
        start, end = self.events[event_idx]
        return self.data.loc[start:end]

    def pad_and_resample(self, freq='5min', pad_value=0):
        """Resample ``data`` to a regular grid and fill empty bins.

        Args:
            freq: Target frequency passed to ``DataFrame.resample``.
            pad_value: Value written into bins that contain no observation.

        Values falling into the same bin are summed.
        """
        # min_count=1 leaves bins without any observation as NaN; a plain
        # sum() would turn them into 0 and pad_value would never be applied.
        resampled = self.data.resample(freq).sum(min_count=1)
        self.data = resampled.fillna(pad_value)
        self.padded = True

    def get_events(self, threshold='11h', min_duration=30, min_precip=1):
        """Detect rain events and drop those that are too short or too small.

        Args:
            threshold: Dry period separating two events (see ``init_events``).
            min_duration: Minimum event duration in minutes.
            min_precip: Minimum event total, in the units of ``data``.

        The data is padded and resampled with default settings first when
        that has not been done yet.
        """
        if not self.padded:
            self.pad_and_resample()

        self.init_events(threshold)
        self.filter_events_by_length(min_duration)
        self.filter_events_by_amount(min_precip)

    def init_events(self, threshold):
        """Split the record into events separated by long dry periods.

        A new event starts whenever the time between two consecutive
        non-zero timestamps is strictly longer than ``threshold`` (anything
        ``pd.to_timedelta`` accepts, e.g. ``'11h'``). Each event is stored as
        ``(first wet timestamp, last wet timestamp)`` in ``self.events``;
        a record without any non-zero value gives an empty list.

        The events are derived from the wet timestamps themselves rather
        than from neighbour lookups in a rolling sum, so records that begin
        or end with a dry period are handled and no exact-zero comparison of
        a floating-point rolling sum is involved.
        """
        precip = self.data

        # Timestamps at which the first data column is non-zero
        is_wet = precip.iloc[:, 0].to_numpy() != 0
        wet_times = precip.index[is_wet]

        if len(wet_times) == 0:
            self.events = []
            return

        # Positions (within wet_times) that open a new event
        dry_gap = pd.to_timedelta(threshold).to_timedelta64()
        breaks = np.flatnonzero(np.diff(wet_times.values) > dry_gap) + 1

        first_pos = np.concatenate(([0], breaks))
        last_pos = np.concatenate((breaks - 1, [len(wet_times) - 1]))

        # Save events as list of tuples
        self.events = [
            (wet_times[first], wet_times[last])
            for first, last in zip(first_pos, last_pos)
        ]

    def filter_events_by_length(self, min_duration):
        """Keep only events lasting at least ``min_duration`` minutes.

        The duration is measured as end timestamp minus start timestamp.
        """
        shortest = pd.Timedelta(minutes=min_duration)
        self.events = [
            event for event in self.events if event[1] - event[0] >= shortest
        ]

    def filter_events_by_amount(self, min_precip):
        """Keep only events whose total (first data column) is >= ``min_precip``."""
        self.events = [
            event
            for event in self.events
            if self.data.loc[event[0]:event[1]].sum().values[0] >= min_precip
        ]

    def create_dimensioneless_curves(self):
        """Compute one dimensionless cumulative curve per event.

        Events are computed with default settings when they do not exist
        yet. The result is stored in ``self.dimensioneless_curves``.
        """
        # Make sure events have been computed
        if self.events is None:
            self.get_events()

        # List of arrays containing the values of the dimensionless curves
        self.dimensioneless_curves = [
            self.get_dimensioneless_curve(self.data.loc[event[0]:event[1]].values)
            for event in self.events
        ]

    def get_dimensioneless_curve(self, series):
        """Return the cumulative sum of ``series`` scaled to end at 1.

        ``series`` is flattened by ``np.cumsum``; a leading 0 is prepended,
        so the result has one element more than the input. A series whose
        total is zero produces non-finite values (division by zero).
        """
        # Cumulative rainfall, starting from zero
        cumulative_rainfall = np.append([0], np.cumsum(series))

        # Normalise by the event total
        return cumulative_rainfall / cumulative_rainfall[-1]



class rainfall_analysis:
    """Event-based metrics computed on a ``precip_time_series``.

    Attributes:
        ts: The time series object being analysed.
        metrics: Dict mapping metric name to its computed values; filled by
            ``get_metrics``.
    """

    def __init__(self, ts: "precip_time_series"):
        """Store ``ts`` and make sure it is ready for analysis.

        Padding/resampling, event extraction and the dimensionless curves
        are each run on ``ts`` with default settings when they have not been
        done yet.
        """
        self.ts = ts
        self.metrics = {}

        # Prepare ts for analysis
        if not ts.padded:
            ts.pad_and_resample()

        if ts.events is None:
            ts.get_events()

        if ts.dimensioneless_curves is None:
            ts.create_dimensioneless_curves()

    def huff_quantile(self, ts):
        """Return the quarter (1-4) of the event holding the largest sum.

        ``ts`` is split into four equal-length parts by ``split_ts_sum``;
        ties resolve to the earliest quarter (``np.argmax``).
        """
        quarter_sums = self.split_ts_sum(ts, 4)
        return np.argmax(quarter_sums) + 1

    def interpolate_rainfall(self, ts, bin_number):
        """Linearly resample ``ts`` onto ``bin_number + 1`` evenly spaced points.

        The samples of ``ts`` are placed evenly on a normalised time axis
        from 0 to 1 and interpolated at ``np.linspace(0, 1, bin_number + 1)``.
        For 2-D input only the first column is used.

        Returns:
            ``(interpolated_values, target_points)``, or None when ``ts`` is
            None or has fewer than two samples.
        """
        if ts is None or len(ts) < 2:
            return None

        values = np.asarray(ts, dtype=float)
        if values.ndim > 1:
            values = values[:, 0]

        # Define target points for bin_number bins
        target_points = np.linspace(0, 1, bin_number + 1)

        # Sample positions normalised to the range 0..1
        normalized_time = np.arange(len(values)) / (len(values) - 1)

        # Targets never leave [0, 1], so no extrapolation is needed
        interpolated_values = np.interp(target_points, normalized_time, values)

        return interpolated_values, target_points

    def split_ts_sum(self, ts, num):
        """Sum ``ts`` over ``num`` consecutive parts of equal length.

        Every sample is treated as covering one slot of equal width. When
        the number of samples is not divisible by ``num``, a sample that
        straddles a split point is shared between the two neighbouring
        parts in proportion to the slot width lying on each side of the
        split point. The sums therefore always add up to the total of
        ``ts``. For 2-D input (e.g. a DataFrame) only the first column is
        used; timestamps are not needed.

        Returns:
            List with ``num`` sums, in chronological order.
        """
        values = np.asarray(ts, dtype=float)
        if values.ndim > 1:
            values = values[:, 0]
        n_samples = len(values)

        # Equal split is trivial when the length is divisible by num; plain
        # chunk sums avoid any rounding from the cumulative-sum route below.
        if n_samples % num == 0:
            return [chunk.sum() for chunk in np.array_split(values, num)]

        # Cumulative total at the slot boundaries 0, 1, ..., n_samples
        cumulative = np.concatenate(([0.0], np.cumsum(values)))

        # Split points measured in slots; interior ones may fall inside a slot
        edges = np.linspace(0.0, n_samples, num + 1)

        # Linear interpolation inside a slot spreads its value evenly over it
        cumulative_at_edges = np.interp(edges, np.arange(n_samples + 1), cumulative)

        return list(np.diff(cumulative_at_edges))

    def get_metrics(self):
        """Compute the event metrics and store them in ``self.metrics``.

        Currently fills ``"huff_quantile"`` with one row per event holding
        the value returned by ``huff_quantile`` for that event.
        """
        padded_precip = self.ts.data

        #####################################
        # Rainfall metrics
        #####################################

        # Huff quantiles
        self.metrics["huff_quantile"] = np.array(
            [
                [self.huff_quantile(padded_precip.loc[start:end])]
                for start, end in self.ts.events
            ]
        )



# =========================================================
# Input
# =========================================================

# Location of the raw station record (CSV)
raw_data_file =  "/nfs/a319/gy17m2a/PhD/datadir/DanishRainData/Sample1.csv"

# =========================================================
# Script
# =========================================================

# Build the time series object from the raw file
ts = precip_time_series(raw_data_file)

# Put the record on a regular 5-minute grid
ts.pad_and_resample(freq='5min')

# Event extraction and the dimensionless curves are not called explicitly:
# rainfall_analysis() runs both when they have not been computed yet.
#ts.get_events()
#ts.create_dimensioneless_curves()

# Set up the analysis on the prepared time series
analysis = rainfall_analysis(ts)
