In [497]:
# Import to be able to import python package from src
import sys
sys.path.insert(0, '../src')

import pandas as pd
import ontime as on

from darts.datasets import EnergyDataset

# Load data

In [498]:
ts = EnergyDataset().load()

## Create test dataframe

In [499]:
df = ts.pd_dataframe()
df = df.interpolate()
cols = ['generation biomass', 'generation solar', 'generation nuclear']
df = df[cols]
df = df[:1000]

In [500]:
def rolling_corr(df, window):

    df = df.rolling(window=window).corr()
    
    cols = df.columns
    
    # Initialize new columns
    for col_x in cols:
        for col_y in cols:
            df[f"{col_x}_{col_y}"] = pd.NA
    
    # Populate the new columns
    df = df.reset_index(level=1)
    for index, row in df.iterrows():
        for col in cols:
            new_col_name = f"{col}_{row['component']}"
            df.at[index, new_col_name] = row[col]
    
    # Drop unneeded columns
    
    # 'component' column and diagonal 
    df = df.drop(columns=list(cols) + ['component'])
    for col_x in cols:
        for col_y in cols:
            col_name = f"{col_x}_{col_y}"
            if col_x == col_y:
                df = df.drop(columns=col_name)
    
    # drop columns that are equivalent
    cols = df.columns.tolist()
    to_drop_as_set = []
    to_drop_as_str = []
    
    for col in cols:
        prefix, suffix = col.split('_')
        col_set = frozenset([prefix, suffix])
        to_drop_as_set.append(col_set)
    
    unique_sets = set(to_drop_as_set)
    for col in unique_sets:
        col = list(col)
        to_drop_as_str.append(f"{col[0]}_{col[1]}")
    
    # Drop the marked components from the dataframe
    df = df.drop(columns=to_drop_as_str)
    
    # For some reason, drop duplicates
    df = df[~df.index.duplicated(keep='first')]

    return df

In [501]:
import pandas as pd
import numpy as np

def rolling_corr_2a(df, window):
    df = df.rolling(window=window).corr()
    
    # Step 1: Get unique time indices (level 0 of your MultiIndex)
    unique_times = df.index.get_level_values(0).unique()
    
    # Prepare a container for non-redundant items
    non_redundant_items = []
    
    for time in unique_times:
        # Step 2: Get the slice of the dataframe corresponding to the current time
        current_df = df.xs(time, level=0)
        
        # Step 3: Extract upper triangle of the correlation matrix without the diagonal
        # Since indices and columns are the same, we can assume it's a square matrix
        upper_triangle_indices = np.triu_indices(n=current_df.shape[0], k=1)  # k=1 excludes the main diagonal
        upper_triangle_values = current_df.values[upper_triangle_indices]
        
        # Step 4: Store non-redundant items with their respective labels and time
        # We iterate directly over the upper_triangle_indices, which are pairs of (row, col) positions in the matrix
        for (row, col), value in zip(zip(*upper_triangle_indices), upper_triangle_values):
            non_redundant_items.append({
                'time': time,
                'var_a': current_df.index[row],
                'var_b': current_df.columns[col],
                'correlation': value
            })
    
    # Step 5: Convert the list of non-redundant items to a new DataFrame
    result_df = pd.DataFrame(non_redundant_items)
    return result_df

def rolling_corr_2b(df):
    # Step 1: Set 'time' as the index
    df.set_index('time', inplace=True)
    
    # Step 2: Create a unique identifier for each (row_index, col_index) pair
    # This will become the column names in the reshaped DataFrame
    df['pair'] = df['var_a'] + '_' + df['var_b']
    
    # Step 3: Pivot the table so that each 'pair' becomes a column
    # The values in the table will be the 'correlation' values
    pivoted_df = df.pivot(columns='pair', values='correlation')

    return pivoted_df

## Check perf

In [502]:
%%timeit

rc1 = rolling_corr(df, '1D')

531 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [480]:
%%timeit

res = rolling_corr_2a(df, '1D')
rc2 = rolling_corr_2b(res)

102 ms ± 829 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Check results

In [483]:
rc1 = rolling_corr(df, '1D')

In [484]:
rc1

component,generation solar_generation biomass,generation solar_generation nuclear,generation nuclear_generation biomass
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-31 23:00:00,,,
2015-01-01 00:00:00,1.0,,
2015-01-01 01:00:00,0.866025,0.5,0.0
2015-01-01 02:00:00,-0.197386,0.555556,-0.328976
2015-01-01 03:00:00,0.853583,0.153707,-0.085483
...,...,...,...
2015-02-11 10:00:00,-0.177426,0.685228,0.164548
2015-02-11 11:00:00,-0.132317,0.654058,0.143194
2015-02-11 12:00:00,0.088282,0.624602,0.239771
2015-02-11 13:00:00,0.187487,0.594382,0.218134


In [503]:
res = rolling_corr_2a(df, '1D')
rc2 = rolling_corr_2b(res)

In [637]:
rc2

pair,generation biomass_generation nuclear,generation biomass_generation solar,generation solar_generation nuclear
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-31 23:00:00,,,
2015-01-01 00:00:00,,1.000000,
2015-01-01 01:00:00,0.000000,0.866025,0.500000
2015-01-01 02:00:00,-0.328976,-0.197386,0.555556
2015-01-01 03:00:00,-0.085483,0.853583,0.153707
...,...,...,...
2015-02-11 10:00:00,0.164548,-0.177426,0.685228
2015-02-11 11:00:00,0.143194,-0.132317,0.654058
2015-02-11 12:00:00,0.239771,0.088282,0.624602
2015-02-11 13:00:00,0.218134,0.187487,0.594382


In [638]:
df = rc2

In [None]:
df