# Import packages


In [2]:
%matplotlib inline
import pandas as pd
import os


# Processing function


In [3]:
def preprocess_datasets(dataset_map, base_dir="../data/original", save_dir="../data/raw"):
    """
    Preprocess datasets by renaming the target column, resampling datetime data to hourly,
    and saving the processed datasets.
    
    Args:
        dataset_map (dict): A dictionary where keys are dataset names and values are target column names.
        base_dir (str): Base directory where the original datasets are stored.
        save_dir (str): Directory where the processed datasets will be saved.
    """
    os.makedirs(save_dir, exist_ok=True)  # Ensure save directory exists

    for dataset_name, target_column in dataset_map.items():
        file_path = os.path.join(base_dir, dataset_name)
        
        # Check file extension (supports CSV and TXT)
        if dataset_name.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif dataset_name.endswith(".txt"):
            # Use semicolon as the delimiter for txt files
            df = pd.read_csv(file_path, delimiter=";")
        else:
            print(f"Unsupported file format for {dataset_name}. Skipping.")
            continue
        
        # Rename the target column to "conso"
        if target_column in df.columns:
            df.rename(columns={target_column: "conso"}, inplace=True)
        else:
            print(f"Target column '{target_column}' not found in {dataset_name}. Skipping.")
            continue
        
        # Handle datetime column
        if dataset_name.endswith(".txt") and 'Date' in df.columns and 'Time' in df.columns:  # For txt files
            # Combine 'Date' and 'Time' into a single 'date' column
            df['date'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format="%d/%m/%Y %H:%M:%S", errors='coerce')
            # Drop the original 'Date' and 'Time' columns
            df.drop(columns=['Date', 'Time'], inplace=True)
        elif 'date' in df.columns:  # For other files with a single 'date' column
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
        else:
            print(f"No valid datetime columns found in {dataset_name}. Skipping.")
            continue
        
        # Drop rows with invalid dates
        df.dropna(subset=['date'], inplace=True)
        
        # Resample to hourly
        df.set_index('date', inplace=True)
        df = df.resample('H').asfreq().reset_index()  # Resample to hourly without aggregating
        
        # Save the processed dataset
        # Ensure the filename always has a .csv extension
        output_name = os.path.splitext(dataset_name)[0] + ".csv"
        save_path = os.path.join(save_dir, output_name)
        df.to_csv(save_path, index=False)
        print(f"Processed dataset saved to {save_path}")


# Dataset list


In [4]:
dataset_map = {
    #"electricity.csv": "OT",
    #"energy.csv": "Appliances",
    #"household.txt": "Global_active_power",
    #"tetuancity.csv": "Power Consumption",
    "ETTh1.csv": "OT",
    "ETTh2.csv": "OT",
    "ETTm1.csv": "OT",
    "ETTm2.csv": "OT",
}


# Processing pipeline


In [6]:
preprocess_datasets(dataset_map)


  df = df.resample('H').asfreq().reset_index()  # Resample to hourly without aggregating
  df = df.resample('H').asfreq().reset_index()  # Resample to hourly without aggregating


Processed dataset saved to ../data/raw/ETTh1.csv
Processed dataset saved to ../data/raw/ETTh2.csv


  df = df.resample('H').asfreq().reset_index()  # Resample to hourly without aggregating
  df = df.resample('H').asfreq().reset_index()  # Resample to hourly without aggregating


Processed dataset saved to ../data/raw/ETTm1.csv
Processed dataset saved to ../data/raw/ETTm2.csv
