In [None]:
import os
import pandas as pd

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Size of the time window kept around every fault, in days on each side
days_before_event = 3
days_after_event = 3

# Row-filter thresholds: the processing loop keeps rows whose RTD is at most
# `RTD_higher_than` and whose Mains Voltage is at least `voltage_lower_than`
RTD_higher_than = -70
voltage_lower_than = 200

# Date range for the optional exclusion step (that step is currently
# commented out in the processing loop)
remove_from_date = '2017-11-01'
remove_until_date = '2018-08-01'

# `main_fault` labels that count as a freezer problem. The processing loop
# tests membership with `in`, so a label must match the stored string exactly.
error_events = [
    # Wrong setpoint
    "setpoint_change",
    # Bad insulation
    "Bad insulation",
    # Door tightness
    "door_adjustment",
    # High condensation
    "High condensation water",
    # Data missing
    "display",
    # Unstable operation
    "instability",
    # 1st stage compressor
    "compressor_stage_1 malfunctional",
    # 2nd stage compressor
    "compressor_stage_2 malfunctional",
    # Electrical malfunction
    "electric_wiring",
    # Refrigerant leakage at stage 1
    "Refrigerant leakage at stage 1",
    # Refrigerant leakage at stage 2
    "Refrigerant leakage at stage 2",
]

# Directory holding the raw per-freezer Parquet files
# (replace with the actual path to your dataset)
path = '/Users/muhammadhussain/Desktop/Data/Revco/'

# Directory that receives the two output files written per freezer
output_dir = '/Users/muhammadhussain/Desktop/Data/filter/'
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists before saving

# Loop through all files in the directory
for file_name in os.listdir(path):
    # Only files following the '<freezer>_temp.parquet' naming are processed
    if not file_name.endswith('_temp.parquet'):
        continue

    # The freezer number is the part of the file name before the first underscore
    Freezer_number = file_name.split('_')[0]

    # Load the Parquet file
    df = pd.read_parquet(os.path.join(path, file_name))

    # Files without a 'main_fault' column cannot be labelled; skip them
    if 'main_fault' not in df.columns:
        print(f"'main_fault' column not found in {file_name}, skipping...")
        continue

    # Ensure 'Datetime' is in datetime format
    df['Datetime'] = pd.to_datetime(df['Datetime'])

    # **Step 1**: Keep only rows with RTD <= RTD_higher_than and
    # Mains Voltage >= voltage_lower_than. `.copy()` makes the result an
    # independent frame so the label assignment below does not write into a
    # slice of the loaded data.
    # NOTE(review): this runs before faults are located, so a fault whose own
    # row fails the filter is never seen below - confirm this is intended.
    df = df[(df['RTD'] <= RTD_higher_than) & (df['Mains Voltage'] >= voltage_lower_than)].copy()

    # **Step 2**: Remove rows between dates (if needed)
    # df = df[~((df['Datetime'] >= remove_from_date) & (df['Datetime'] <= remove_until_date))]

    # **Step 3**: Snapshot every recorded fault (timestamp + label) BEFORE any
    # label is propagated. Reading the label from `df` inside the loop would
    # return a value already overwritten by an earlier, overlapping window.
    fault_rows = df.loc[df['main_fault'].notnull(), ['Datetime', 'main_fault']]

    # Index labels of rows that fall inside an error / non-error fault window
    indices_to_collect_error = set()
    indices_to_collect_non_error = set()

    # **Step 4**: Build the time window around each fault
    for fault_time, fault_value in fault_rows.itertuples(index=False, name=None):
        start_time = fault_time - pd.Timedelta(days=days_before_event)
        end_time = fault_time + pd.Timedelta(days=days_after_event)

        # Rows whose timestamp lies inside the (inclusive) window
        mask = (df['Datetime'] >= start_time) & (df['Datetime'] <= end_time)
        window_indices = df.index[mask.to_numpy()]

        if fault_value in error_events:
            indices_to_collect_error.update(window_indices)

            # Label every row of the window with this fault; where error
            # windows overlap, the fault processed last wins
            df.loc[mask, 'main_fault'] = fault_value
        else:
            indices_to_collect_non_error.update(window_indices)

    # **Step 5**: Split the frame. Boolean masks keep the rows in their
    # original order (a set has no defined order).
    in_error_window = df.index.isin(indices_to_collect_error)
    in_any_window = in_error_window | df.index.isin(indices_to_collect_non_error)

    df_around_events = df[in_error_window].copy()  # Data with error events in the time window
    df_cleaned = df[~in_any_window]  # Everything outside all fault windows

    # **Step 6**: Save both DataFrames
    df_around_events.to_parquet(
        os.path.join(output_dir, f'around_events_data_{Freezer_number}.parquet'), index=False
    )
    df_cleaned.to_parquet(
        os.path.join(output_dir, f'cleaned_data_{Freezer_number}.parquet'), index=False
    )


In [10]:
import pandas as pd
import glob
import time
from sklearn.preprocessing import MinMaxScaler
import os

def _flatten_windows(group, window_size, model_number):
    """Turn one continuous sequence into a list of flattened sliding windows.

    Every window of ``window_size`` consecutive rows (stride 1) becomes one
    dict. Numeric feature columns contribute ``<column>_<offset>`` keys for
    each row of the window; non-numeric feature columns contribute only
    ``<column>_0`` (the value of the first row).
    """
    helper_columns = {'Datetime', 'time_diff', 'is_continuous', 'sequence_group', 'main_fault'}
    feature_columns = [column for column in group.columns if column not in helper_columns]

    # dtype checks and list conversion do not depend on the window position,
    # so they are done once per sequence rather than once per window
    is_numeric = {column: pd.api.types.is_numeric_dtype(group[column]) for column in feature_columns}
    column_values = {column: group[column].tolist() for column in feature_columns}
    timestamps = group['Datetime']
    faults = group['main_fault']

    group_windows = []
    for start_idx in range(len(group) - window_size + 1):
        stop_idx = start_idx + window_size

        flattened_window = {}
        for column in feature_columns:
            if is_numeric[column]:
                for offset, value in enumerate(column_values[column][start_idx:stop_idx]):
                    flattened_window[f"{column}_{offset}"] = value
            else:
                # Only take the first non-numeric value
                flattened_window[f"{column}_0"] = column_values[column][start_idx]

        # The window is stamped with its latest Datetime
        flattened_window["Datetime"] = timestamps.iloc[stop_idx - 1]

        # Label: most frequent non-null main_fault in the window, or None
        window_faults = faults.iloc[start_idx:stop_idx]
        flattened_window["main_fault"] = (
            window_faults.mode()[0] if not window_faults.isna().all() else None
        )
        flattened_window["seriesnumber"] = model_number
        group_windows.append(flattened_window)

    return group_windows


def process_parquet_files(path, output_file, window_size=60):
    """Build a flattened, min-max scaled sliding-window dataset.

    Reads every ``around_events_data_*.parquet`` file in ``path``, splits each
    file into sequences of consecutive rows (rows 59-61 seconds apart),
    discards sequences shorter than ``window_size`` and flattens every sliding
    window of the remaining sequences into a single row. All numeric columns
    of the combined frame are then scaled with ``MinMaxScaler`` and the result
    is written to ``output_file``.

    Parameters
    ----------
    path : str
        Directory containing the ``around_events_data_<model>.parquet`` files.
        A trailing path separator is optional.
    output_file : str
        Destination Parquet file.
    window_size : int, default 60
        Number of consecutive rows per window.

    Returns
    -------
    None
        The result is written to ``output_file``. If no input file matches,
        a message is printed and nothing is written.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size!r}")

    start_time = time.time()  # Track total time

    # Model numbers to exclude
    excluded_models = {"806026", "806030", "806031", "806276", "806279"}

    # Step 1: Find all parquet files matching the pattern. os.path.join copes
    # with `path` given with or without a trailing separator; sorting makes
    # the row order of the output reproducible between runs.
    all_files = sorted(glob.glob(os.path.join(path, "around_events_data_*.parquet")))
    if not all_files:
        print(f"No files found in the directory {path}. Exiting.")
        return

    # Step 2: Initialize a list to store processed windows
    time_windows = []
    total_groups = 0  # Counter for total number of groups across all models
    skipped_groups_total = 0  # Counter for total skipped groups across all models

    # Step 3: Process each file
    for file in all_files:
        # 'around_events_data_<model>.parquet' -> '<model>'
        model_number = os.path.basename(file).split('_')[3].split('.')[0]

        if model_number in excluded_models:
            print(f"Skipping model {model_number} (excluded)")
            continue

        df = pd.read_parquet(file)
        df['Datetime'] = pd.to_datetime(df['Datetime'])

        # Ensure the data is sorted by time
        df = df.sort_values('Datetime').reset_index(drop=True)

        # Step 4: Identify continuous sequences. A row continues the current
        # sequence when it follows the previous row by 59-61 seconds; every
        # other row (except the very first) starts a new sequence.
        df['time_diff'] = df['Datetime'].diff().dt.total_seconds()
        df['is_continuous'] = (df['time_diff'].between(59, 61)) | (df.index == 0)
        df['sequence_group'] = (~df['is_continuous']).cumsum()

        # Step 5: Count the sequences and keep those long enough for a window
        groups_for_model = df['sequence_group'].nunique()
        total_groups += groups_for_model
        valid_groups = [
            group for _, group in df.groupby('sequence_group') if len(group) >= window_size
        ]
        skipped_groups_for_model = groups_for_model - len(valid_groups)
        skipped_groups_total += skipped_groups_for_model

        print(f"Processing {len(valid_groups)} valid groups for model {model_number}...")
        print(f"Skipped groups for model {model_number}: {skipped_groups_for_model}")

        # Step 6: Create time-windowed data, one sequence at a time
        for group in valid_groups:
            time_windows.extend(_flatten_windows(group, window_size, model_number))

    print(f"Total groups across all models: {total_groups}")
    print(f"Total skipped groups across all models: {skipped_groups_total}")

    # Combine all flattened windows into a single DataFrame
    all_data = pd.DataFrame(time_windows)

    # Step 7: Normalize numeric columns.
    # NOTE(review): each flattened column (e.g. RTD_0, RTD_1, ...) is scaled
    # on its own, so one sensor gets a slightly different scale per window
    # offset - confirm this is intended.
    numeric_columns = all_data.select_dtypes(include=['number']).columns
    if not numeric_columns.empty:
        scaler = MinMaxScaler()
        all_data[numeric_columns] = scaler.fit_transform(all_data[numeric_columns])

    # Step 8: Save the prepared data
    all_data.to_parquet(output_file, index=False)

    elapsed_time = time.time() - start_time
    print(f"Total elapsed time: {elapsed_time:.2f} seconds")


In [11]:
# Directory holding the per-freezer event files, and the file that will
# receive the combined dataset
path = "/Users/muhammadhussain/Desktop/Data/filter/"
output_file = "/Users/muhammadhussain/Desktop/Data/filter/finished.parquet"

# Run the windowing step with 30-row windows (overrides the default of 60)
process_parquet_files(path=path, output_file=output_file, window_size=30)


Skipping model 806031 (excluded)
Processing 0 valid groups for model 806272...
Skipped groups for model 806272: 0
Processing 8 valid groups for model 806017...
Skipped groups for model 806017: 6
Processing 0 valid groups for model 806029...
Skipped groups for model 806029: 0
Processing 5 valid groups for model 806020...
Skipped groups for model 806020: 4
Skipping model 806030 (excluded)
Processing 0 valid groups for model 806273...
Skipped groups for model 806273: 0
Processing 5 valid groups for model 806278...
Skipped groups for model 806278: 4
Processing 0 valid groups for model 806023...
Skipped groups for model 806023: 0
Processing 0 valid groups for model 806033...
Skipped groups for model 806033: 0
Skipping model 806279 (excluded)
Processing 0 valid groups for model 806269...
Skipped groups for model 806269: 0
Processing 0 valid groups for model 808301...
Skipped groups for model 808301: 0
Processing 0 valid groups for model 806265...
Skipped groups for model 806265: 0
Skipping m

In [14]:
# Display the event-window file stored for freezer 806031
pd.read_parquet('/Users/muhammadhussain/Desktop/Data/filter/around_events_data_806031.parquet')

Unnamed: 0,Datetime,RTD,1st Suc.,Cond. Air In,Evap. In,Evap. Out,2nd Suc.,Chil. water In,2nd Sump,H.E.,SetPoint,Mains Voltage,State,Type,Event,main_fault


In [None]:
# Display the 'main_fault' column of the raw file for freezer 806269.
# NOTE(review): the saved output shows only distinct values, as if `.unique()`
# had been applied - presumably the cell was edited after its last run; confirm.
pd.read_parquet('/Users/muhammadhussain/Desktop/Data/Revco/806269_temp.parquet')["main_fault"]

array([None, 'relay', 'compressor_stage_2'], dtype=object)

In [9]:
# Display finished.parquet, the file path passed as `output_file` to
# process_parquet_files in the driver cell.
# NOTE(review): the saved output has columns up to offset _59, which presumably
# comes from a run with a 60-row window rather than the 30 used in the driver
# cell - re-run to confirm.
pd.read_parquet("/Users/muhammadhussain/Desktop/Data/filter/finished.parquet")

Unnamed: 0,RTD_0,RTD_1,RTD_2,RTD_3,RTD_4,RTD_5,RTD_6,RTD_7,RTD_8,RTD_9,...,State_55,State_56,State_57,State_58,State_59,Type_0,Event_0,Datetime,main_fault,seriesnumber
0,0.187500,0.198198,0.207207,0.207207,0.207207,0.207207,0.216216,0.216216,0.207207,0.189189,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:59:12,Refrigerant leakage at stage 1,806017
1,0.196429,0.207207,0.207207,0.207207,0.207207,0.216216,0.216216,0.207207,0.189189,0.153153,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 13:00:12,Refrigerant leakage at stage 1,806017
2,0.205357,0.207207,0.207207,0.207207,0.216216,0.216216,0.207207,0.189189,0.153153,0.081081,...,0.157895,0.157895,0.157895,0.157895,0.052632,,,2015-03-16 13:01:12,Refrigerant leakage at stage 1,806017
3,0.205357,0.207207,0.207207,0.216216,0.216216,0.207207,0.189189,0.153153,0.081081,0.036036,...,0.157895,0.157895,0.157895,0.052632,0.000000,,,2015-03-16 13:02:12,Refrigerant leakage at stage 1,806017
4,0.205357,0.207207,0.216216,0.216216,0.207207,0.189189,0.153153,0.081081,0.036036,0.018018,...,0.157895,0.157895,0.052632,0.000000,0.000000,,,2015-03-16 13:03:12,Refrigerant leakage at stage 1,806017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59120,0.035714,0.054054,0.072072,0.090090,0.081081,0.126126,0.144144,0.153153,0.171171,0.180180,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2022-10-27 11:55:41,instability,806277
59121,0.053571,0.072072,0.090090,0.081081,0.126126,0.144144,0.153153,0.171171,0.180180,0.180180,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2022-10-27 11:56:41,instability,806277
59122,0.071429,0.090090,0.081081,0.126126,0.144144,0.153153,0.171171,0.180180,0.180180,0.198198,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2022-10-27 11:57:41,instability,806277
59123,0.089286,0.081081,0.126126,0.144144,0.153153,0.171171,0.180180,0.180180,0.198198,0.207207,...,0.157895,0.157895,0.157895,0.157895,0.157895,,,2022-10-27 11:58:41,instability,806277


In [8]:
import os
import pandas as pd

# Directory with the raw per-freezer Parquet files
directory_path = "/Users/muhammadhussain/Desktop/Data/Revco/"

# Distinct non-null 'main_fault' labels seen across all files
unique_main_faults = set()

for file_name in os.listdir(directory_path):
    # Anything that is not a Parquet file is ignored
    if not file_name.endswith(".parquet"):
        continue

    file_path = os.path.join(directory_path, file_name)

    # Best effort: a file that cannot be read is reported and skipped so the
    # scan continues with the remaining files
    try:
        df = pd.read_parquet(file_path)

        # Files without the column contribute nothing
        if 'main_fault' not in df.columns:
            continue

        labels_in_file = df['main_fault'].dropna().unique()
        unique_main_faults.update(labels_in_file)
    except Exception as e:
        print(f"Could not process file {file_name}: {e}")

# Report how many distinct labels were found, then the labels themselves
print(f"Total unique 'main_fault' values: {len(unique_main_faults)}")
print(f"Unique values: {unique_main_faults}")


Total unique 'main_fault' values: 23
Unique values: {'main_board', 'setpoint_change', 'Refrigerant leakage at stage 2', 'fuse_blown', 'warm_alarm', 'door_adjustment', 'display', 'filter_drier', 'relay', 'battery', 'compressor_stage_2 malfunctional', 'logic_wiring', 'OS_update', 'instability', 'warm_interstage', 'compressor_stage_2', 'High condensation water', 'water_control_valve', 'electric_wiring', 'Refrigerant leakage at stage 1', 'compressor_stage_1', 'compressor_stage_1 malfunctional', 'handle_tightening'}
