In [1]:
import pandas as pd
from pathlib import Path

In [22]:
def resample(file):
    """
    Read a CSV file, resample to 15-minute resolution by averaging all columns,
    and save with _resampled15 suffix.
    
    Parameters:
    -----------
    input_filepath : str or Path
        Path to the input CSV file
        
    Returns:
    --------
    str
        Path to the output CSV file
    """
    # Convert to Path object for easier manipulation
    input_path = Path(file)
    
    # Create output filename with suffix
    output_filename = input_path.stem + "_resampled15.csv"
    output_path = input_path.parent / output_filename

    # Convert to Path object for easier manipulation
    input_path = Path(file)

    # Create output filename with suffix
    output_filename = input_path.stem + "_resampled15.csv"
    output_path = input_path.parent / output_filename

    # Read the CSV file
    df = pd.read_csv(input_path, parse_dates=[1], index_col=1)
    try:
        df.drop("Unnamed: 0", axis=1, inplace=True)
    except:
        pass

    # Resample to 15 minutes and calculate mean for all columns
    df_resampled = df.resample('15min').mean()
    df_resampled.reset_index(inplace=True)

    # Save to CSV
    df_resampled.to_csv(output_path)
    print(f"Resampled file saved to: {output_path}")
    return df_resampled    

In [39]:
# Collect all CSV files from dfab, solace, and umar subfolders
csv_files = []

# Get current directory
current_dir = Path(".")

# Search in dfab_*, solace_*, and umar_* folders
for folder_pattern in ["dfab_*", "solace_*", "umar_*"]:
    for folder in current_dir.glob(folder_pattern):
        if folder.is_dir():
            # Find all CSV files in this folder
            csv_files.extend(folder.glob("*.csv"))

# Convert to list of strings
csv_files = [str(f) for f in sorted(csv_files)]

# Drop files containing "resampled" or "metadata"
csv_files = [f for f in csv_files if "resampled" not in f.lower() and "metadata" not in f.lower()]

print(f"Found {len(csv_files)} CSV files:", csv_files)


Found 12 CSV files: ['dfab_24551989\\dfab_2019-07-01_2020-07-01_wide_descId.csv', 'dfab_24551989\\dfab_2020-07-01_2021-07-01_wide_descId.csv', 'dfab_24551989\\dfab_2021-07-01_2022-07-01_wide_descId.csv', 'dfab_24551989\\dfab_2022-07-01_2023-07-01_wide_descId.csv', 'solace_24551971\\solace_2019-07-01_2020-07-01_wide_descId.csv', 'solace_24551971\\solace_2020-07-01_2021-07-01_wide_descId.csv', 'solace_24551971\\solace_2021-07-01_2022-07-01_wide_descId.csv', 'solace_24551971\\solace_2022-07-01_2023-07-01_wide_descId.csv', 'umar_24551992\\umar_2019-07-01_2020-07-01_wide_descId.csv', 'umar_24551992\\umar_2020-07-01_2021-07-01_wide_descId.csv', 'umar_24551992\\umar_2021-07-01_2022-07-01_wide_descId.csv', 'umar_24551992\\umar_2022-07-01_2023-07-01_wide_descId.csv']


In [46]:
# Resample all CSV files
resampled_dfs = {}
for f in csv_files:
    print(f"Processing: {f}")
    df_resampled = resample(f)
    resampled_dfs[f] = df_resampled

Processing: dfab_24551989\dfab_2019-07-01_2020-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2019-07-01_2020-07-01_wide_descId_resampled15.csv
Processing: dfab_24551989\dfab_2020-07-01_2021-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2019-07-01_2020-07-01_wide_descId_resampled15.csv
Processing: dfab_24551989\dfab_2020-07-01_2021-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2020-07-01_2021-07-01_wide_descId_resampled15.csv
Processing: dfab_24551989\dfab_2021-07-01_2022-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2020-07-01_2021-07-01_wide_descId_resampled15.csv
Processing: dfab_24551989\dfab_2021-07-01_2022-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2021-07-01_2022-07-01_wide_descId_resampled15.csv
Processing: dfab_24551989\dfab_2022-07-01_2023-07-01_wide_descId.csv
Resampled file saved to: dfab_24551989\dfab_2021-07-01_2022-07-01_wide_descId_resampled15.csv
Processing: dfab_24551

In [None]:
for key,value in resampled_dfs:
    print (df.head(1))

KeyError: 0

In [None]:
def merge_resampled_files():
    """
    Merge all resampled15 CSV files from dfab_, umar_, and solace_ subfolders.
    Groups files by prefix and saves combined files.
    
    Returns:
    --------
    dict
        Dictionary with prefix as key and combined dataframe as value
    """
    from collections import defaultdict
    
    current_dir = Path(".")
    combined_by_prefix = defaultdict(list)
    
    # Search for resampled15 files in dfab_*, solace_*, and umar_* folders
    for folder_pattern in ["dfab_*", "solace_*", "umar_*"]:
        for folder in current_dir.glob(folder_pattern):
            if folder.is_dir():
                # Find all resampled15 CSV files
                for csv_file in folder.glob("*_resampled15.csv"):
                    prefix = folder.name.split('_')[0]  # Get 'dfab', 'solace', or 'umar'
                    df = pd.read_csv(csv_file)
                    combined_by_prefix[prefix].append((str(csv_file), df))
                    print(f"Found: {csv_file}")
    
    # Combine and save by prefix
    combined_results = {}
    for prefix, file_list in combined_by_prefix.items():
        print(f"\nCombining {len(file_list)} resampled files for prefix: {prefix}")
        
        # Concatenate all dataframes for this prefix
        combined_df = pd.concat([df for _, df in file_list], ignore_index=True, sort=False)
        
        # Sort by time column if it exists
        time_cols = [col for col in combined_df.columns if 'time' in col.lower() or 'date' in col.lower()]
        if time_cols:
            combined_df[time_cols[0]] = pd.to_datetime(combined_df[time_cols[0]])
            combined_df = combined_df.sort_values(time_cols[0])
        
        combined_results[prefix] = combined_df
        
        # Save combined file
        output_filename = f"{prefix}_all_resampled15_merged.csv"
        output_path = current_dir / output_filename
        combined_df.to_csv(output_path, index=False)
        
        print(f"  Combined shape: {combined_df.shape}")
        print(f"  Saved to: {output_path}")
    
    return combined_results


# Run the merge function
merged_data = merge_resampled_files()


12

In [None]:
df = pd.read_csv(input_path, parse_dates=[1], index_col=1)


Unnamed: 0_level_0,Unnamed: 0,heating_cooling_power,irrad,kitchen_active_power,setp_371,setp_472,setp_474,setp_476,setp_571,setp_573,...,y2_574,y3_371,y3_472,y3_476,y3_571,y3_574,y4_371,y5_371,y6_371,y7_371
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-01 00:00:00,1,-3.36993,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2019-07-01 00:01:00,2,-3.26605,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2019-07-01 00:02:00,3,-3.49809,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2019-07-01 00:03:00,4,-3.44770,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2019-07-01 00:04:00,5,-3.19427,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 23:55:00,527036,-3.04173,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-30 23:56:00,527037,-3.14641,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-30 23:57:00,527038,-3.06143,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-30 23:58:00,527039,-2.27189,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# Resample to 15 minutes and calculate mean for all columns
df_resampled

Unnamed: 0_level_0,Unnamed: 0,heating_cooling_power,irrad,kitchen_active_power,setp_371,setp_472,setp_474,setp_476,setp_571,setp_573,...,y2_574,y3_371,y3_472,y3_476,y3_571,y3_574,y4_371,y5_371,y6_371,y7_371
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-01 00:00:00,8.0,-3.330995,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.000000,1.0,1.0,1.0,0.400000,1.000000,1.000000,1.000000,1.000000
2019-07-01 00:15:00,23.0,-3.087947,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.000000,1.0,1.0,1.0,0.000000,1.000000,1.000000,1.000000,1.000000
2019-07-01 00:30:00,38.0,-3.441945,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.000000,1.0,1.0,1.0,0.600000,1.000000,1.000000,1.000000,1.000000
2019-07-01 00:45:00,53.0,-3.274495,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.000000,1.0,1.0,1.0,1.000000,1.000000,1.000000,1.000000,1.000000
2019-07-01 01:00:00,68.0,-3.298245,0.0,0.0,25.0,22.0,22.0,19.0,25.5,22.0,...,1.0,1.000000,1.0,1.0,1.0,0.666667,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 22:45:00,526973.0,-2.267955,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,0.800000,1.0,1.0,1.0,1.000000,0.666667,0.666667,0.666667,0.866667
2020-06-30 23:00:00,526988.0,-2.476937,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,1.000000,1.0,1.0,1.0,0.933333,0.000000,0.000000,0.000000,0.000000
2020-06-30 23:15:00,527003.0,-2.474310,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,0.866667,1.0,1.0,1.0,0.000000,0.666667,0.666667,0.800000,0.666667
2020-06-30 23:30:00,527018.0,-2.486334,0.0,0.0,25.5,23.0,23.0,16.0,24.5,23.0,...,1.0,0.000000,1.0,1.0,1.0,0.066667,0.000000,0.000000,1.000000,0.000000
