
# Preprocessing phase 1:
##    - Split each original file into its hourly data
##    - Save each hour's data to a separate file

In [1]:
import gc
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import display

# WARNING: install the packages below before running this notebook
# !conda install openpyxl
# !conda install xlrd

In [2]:
# All paths are resolved relative to the directory the notebook is run from
root_path = os.getcwd()
datasets_path = os.path.join(root_path, 'datasets')

# Where the raw files are read from, and where phase-1 output is written to
original_dataset_path = os.path.join(datasets_path, 'Original_MCP_Data')
preprocessed_dataset_path = os.path.join(datasets_path, 'Preprocess_Phase_1')

# Make sure the output directory exists (no error if it is already there)
os.makedirs(preprocessed_dataset_path, exist_ok=True)

In [3]:
def get_need_preprocessing_paths(load_path):
    """Collect the paths of all files that need preprocessing.

    The directory is walked three levels deep, ``load_path/<year>/<month>/<file>``.
    Entries are visited in sorted order at every level so the result is
    reproducible (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    load_path : str
        Root directory that contains the year directories.

    Returns
    -------
    list of str
        Paths of the regular files found at the third level, in sorted order.
    """
    need_preprocessing = []

    for year in sorted(os.listdir(path=load_path)):
        year_path = os.path.join(load_path, year)

        # Skip stray files sitting next to the year directories
        if not os.path.isdir(year_path):
            continue

        for month in sorted(os.listdir(path=year_path)):
            month_path = os.path.join(year_path, month)

            # Skip stray files sitting next to the month directories
            if not os.path.isdir(month_path):
                continue

            for day in sorted(os.listdir(path=month_path)):
                day_path = os.path.join(month_path, day)

                # Only regular files can be read later on; ignore sub-directories
                if os.path.isfile(day_path):
                    need_preprocessing.append(day_path)

    return need_preprocessing

In [4]:
def correct_time(x):
    day = x[:2]
    month = x[3:5]
    year = x[6:10]
    hour = x[11:13]
    minute = x[14:16]
    second = x[17:19]
    return f'{year}-{month}-{day} {hour}:{minute}:{second}'

In [5]:
def _is_nan(value):
    """Return True when ``value`` is a float NaN (``np.float64`` is a ``float`` subclass)."""
    return isinstance(value, float) and value != value


def _is_invalid_column_name(name, invalid_column_names):
    """Return True when ``name`` has to be skipped according to ``invalid_column_names``."""
    if name in invalid_column_names:
        return True

    # NaN never compares equal to itself, so the membership test above only
    # matches a NaN that is the very same object as one in the list.
    # Compare "NaN-ness" explicitly so that every NaN name is recognised.
    return _is_nan(name) and any(_is_nan(candidate) for candidate in invalid_column_names)


def process_one_hour_data(one_hour_original_data, time_column, invalid_column_names, save_path):
    """Turn one two-column (name, value) block into a table and save it as CSV.

    Column 0 of ``one_hour_original_data`` holds names and column 1 the matching
    values. Values are grouped by name; names that occur once are repeated so
    every column has the same length. The result is written to
    ``save_path/<YYYY>/<MM>/<DD>/<HH>/<YYYY_MM_DD_HH_MM_SS>.csv``; when that
    file already exists a ``_1``, ``_2``, ... suffix is appended instead of
    overwriting it.

    Parameters
    ----------
    one_hour_original_data : pandas.DataFrame
        Two-column frame; its first cell must equal ``time_column``.
    time_column : str
        Name of the row that carries the reference time.
    invalid_column_names : collection
        Names to skip. If it contains a NaN, every NaN name is skipped.
    save_path : str
        Root directory under which the CSV file is stored.

    Raises
    ------
    AssertionError
        If the first cell is not ``time_column``, if the cleaned time value is
        not 19 characters long, or if a name that occurs more than once does
        not occur as often as the most frequent name.
    ValueError
        If no time row was kept, or if the derived file name is malformed.
    """
    # The first cell must be the time column, i.e. column 0 really contains the names
    assert one_hour_original_data.iloc[0, 0] == time_column

    # name -> list of values collected for that name
    collected = {}

    # Both are derived from the time row; kept separate from `save_path` so a
    # second time row cannot nest the directories inside each other.
    hour_save_path = None
    original_save_name = None

    ################################################
    # Collect the (name, value) pairs row by row
    #===============================================
    for row_index in range(len(one_hour_original_data)):
        name = one_hour_original_data.iloc[row_index, 0]
        value = one_hour_original_data.iloc[row_index, 1]

        if _is_invalid_column_name(name, invalid_column_names):
            continue

        if name == time_column:
            value = value.replace(' +', '')

            assert len(value) == 19, f'Time should have 19 characters, but got {len(value)}'

            # Change time format
            value = correct_time(x=value)  # %Y-%m-%d %H:%M:%S
            # One directory level per year / month / day / hour
            hour_save_path = os.path.join(save_path, value[0:4], value[5:7], value[8:10], value[11:13])
            # File name: the timestamp with every separator replaced by '_'
            original_save_name = value.replace('-', '_').replace(' ', '_').replace(':', '_')

        collected.setdefault(name, []).append(value)
    ################################################

    # Without a time row there is no way to tell where the data belongs
    if original_save_name is None:
        raise ValueError(f'No {time_column!r} row found, cannot determine the save location!')

    ################################################
    # All columns must end up with the same length
    #===============================================
    max_count_values = max(len(values) for values in collected.values())

    for k, v in collected.items():

        # Names with several values must have exactly max_count_values of them
        if len(v) != 1:
            assert len(v) == max_count_values, f'{k} should have {max_count_values} values, but got {len(v)}'

        # Names with a single value are broadcast max_count_values times
        else:
            collected[k] = [v[0]] * max_count_values
    ################################################

    # Build the table, keeping the columns in first-seen order
    one_hour_final_data = pd.DataFrame(
        data=collected,
        columns=collected.keys()
    )
    one_hour_final_data[time_column] = pd.to_datetime(arg=one_hour_final_data[time_column], format='%Y-%m-%d %H:%M:%S')

    # Create the target directories if they do not exist yet
    os.makedirs(name=hour_save_path, exist_ok=True)

    ################################################
    # Pick a file name that does not exist yet
    # (different data can carry exactly the same date)
    #===============================================
    if len(original_save_name) != 19:
        raise ValueError(f'Invalid tmp name {original_save_name}!')

    duplicate_count = 1
    save_name = f'{original_save_name}.csv'
    while os.path.exists(path=os.path.join(hour_save_path, save_name)):
        save_name = f'{original_save_name}_{duplicate_count}.csv'
        duplicate_count += 1

    one_hour_final_data.to_csv(
        path_or_buf=os.path.join(hour_save_path, save_name),
        columns=one_hour_final_data.columns,
        index=True
    )
    ################################################

    # Drop the references to the data and collect garbage to keep RAM usage low
    del one_hour_final_data, collected
    gc.collect()

In [6]:
def process_one_file(file_path, time_column, invalid_column_names, save_path):
    """Read one Excel file and process it two columns at a time.

    The sheet is read without a header row. Each consecutive pair of columns is
    handed to ``process_one_hour_data`` as one hour of data.

    Parameters
    ----------
    file_path : str
        Path of the Excel file to read.
    time_column : str
        Forwarded to ``process_one_hour_data``.
    invalid_column_names : collection
        Forwarded to ``process_one_hour_data``.
    save_path : str
        Forwarded to ``process_one_hour_data``.

    Raises
    ------
    AssertionError
        If the sheet has an odd number of columns.
    """
    sheet = pd.read_excel(io=file_path, header=None)
    column_count = len(sheet.columns)

    # Columns come in pairs, so an odd count means the file is malformed
    assert column_count % 2 == 0, (
        f'Error in {file_path} file:\n'
        f'Number of columns should be even, but got {column_count}'
    )

    # Walk over the left-hand column index of every pair
    for first_column in range(0, column_count, 2):
        hour_block = sheet.iloc[:, first_column:first_column + 2]

        process_one_hour_data(
            one_hour_original_data=hour_block,
            time_column=time_column,
            invalid_column_names=invalid_column_names,
            save_path=save_path,
        )

In [7]:
def preprocess_phase_one(data_path, save_path, invalid_column_names, time_column):
    """Run phase-one preprocessing on every file found under ``data_path``.

    The file list comes from ``get_need_preprocessing_paths``. Each file is
    passed to ``process_one_file`` while a progress bar shows the current path.

    Parameters
    ----------
    data_path : str
        Root directory of the original files.
    save_path : str
        Root directory for the preprocessed output.
    invalid_column_names : collection
        Forwarded to ``process_one_file``.
    time_column : str
        Forwarded to ``process_one_file``.
    """
    # Arguments that are identical for every file
    shared_arguments = {
        'time_column': time_column,
        'invalid_column_names': invalid_column_names,
        'save_path': save_path,
    }

    pending_files = get_need_preprocessing_paths(load_path=data_path)

    # Progress bar over the files; its label shows the file being processed
    progress_bar = tqdm(pending_files)
    for current_file in progress_bar:
        progress_bar.set_description_str(desc=f'Preprocessing {current_file}')
        process_one_file(file_path=current_file, **shared_arguments)

In [8]:
# Name of the row that carries the reference time of each block
reference_time_column = 'Bid curve chart data (Reference time)'

# Row names that must be ignored (missing values)
ignored_row_names = [float('nan'), None, np.nan]

# Run phase one: read from the original dataset, write to the preprocessed one
preprocess_phase_one(
    data_path=original_dataset_path,
    save_path=preprocessed_dataset_path,
    invalid_column_names=ignored_row_names,
    time_column=reference_time_column,
)

Preprocessing /home/naeim_md93/Projects/NordPool_MCP_Forecasting/datasets/Original_MCP_Data/2021/07_July_2021/mcp_data_report_23-07-2021-00_00_00.xls: 100%|██████████| 3636/3636 [4:34:39<00:00,  4.53s/it]        
