In [None]:
import pandas as pd
import glob
import time
from sklearn.preprocessing import MinMaxScaler
import os

def process_parquet_files(path, output_file, window_size=60):
    start_time = time.time()  # Track total time

    # Step 1: Load all parquet files matching the pattern
    all_files = glob.glob(path + "around_events_data_*.parquet")
    if not all_files:
        print(f"No files found in the directory {path}. Exiting.")
        return

    # Step 2: Initialize a list to store processed windows
    time_windows = []

    # Step 3: Process each file
    for file in all_files:
        # Extract the model number from the file name
        model_number = os.path.basename(file).split('_')[3].split('.')[0]

        df = pd.read_parquet(file)
        df['Datetime'] = pd.to_datetime(df['Datetime'])

        # Ensure the data is sorted by time
        df = df.sort_values('Datetime').reset_index(drop=True)

        # Step 4: Identify continuous sequences
        df['time_diff'] = df['Datetime'].diff().dt.total_seconds()
        df['is_continuous'] = (df['time_diff'].between(59, 61)) | (df.index == 0)
        df['sequence_group'] = (~df['is_continuous']).cumsum()

        # Step 5: Filter valid sequences
        valid_sequences = df.groupby('sequence_group').filter(lambda x: len(x) >= window_size)

        # Step 6: Create time-windowed data
        def process_group(group):
            group_windows = []
            for start_idx in range(0, len(group) - window_size + 1):
                window = group.iloc[start_idx:start_idx + window_size]
                flattened_window = {}

                # Separate numerical and non-numerical attributes
                for column in window.columns:
                    if column not in ['Datetime', 'time_diff', 'is_continuous', 'sequence_group']:
                        if pd.api.types.is_numeric_dtype(window[column]):
                            flattened_window[column] = window[column].tolist()
                        else:
                            flattened_window[column] = window[column].iloc[0]  # Take the first non-numeric value

                # Add the window, main_fault value, and seriesnumber
                flattened_window["main_fault"] = window['main_fault'].mode()[0] if not window['main_fault'].isna().all() else None
                flattened_window["seriesnumber"] = model_number
                group_windows.append(flattened_window)

            return group_windows

        # Process each sequence group sequentially
        groups = [group for _, group in valid_sequences.groupby('sequence_group')]

        print(f"Processing {len(groups)} groups for model {model_number}...")
        for group in groups:
            time_windows.extend(process_group(group))

    # Step 7: Normalize numeric columns
    scaler = MinMaxScaler()

    # Combine all windows into a single DataFrame
    all_data = pd.DataFrame(time_windows)

    # Normalize only numeric columns if they exist
    numeric_columns = all_data.select_dtypes(include=['number']).columns
    if not numeric_columns.empty:
        all_data[numeric_columns] = scaler.fit_transform(all_data[numeric_columns])

    # Step 8: Save the prepared data
    all_data.to_parquet(output_file, index=False)

    elapsed_time = time.time() - start_time
    print(f"Total elapsed time: {elapsed_time:.2f} seconds")


In [2]:
# Angiv sti til dine data og output-fil
path = "/Users/muhammadhussain/Desktop/Data/filter/"
output_file = "/Users/muhammadhussain/Desktop/DEEPLEARNINGPROJECT/Data/finished.parquet"

# Kald funktionen
process_parquet_files(path, output_file, window_size=60)


Processing 0 groups for model 806031...
Processing 0 groups for model 806272...
Processing 8 groups for model 806017...
Processing 0 groups for model 806029...
Processing 5 groups for model 806020...
Processing 3 groups for model 806030...
Processing 0 groups for model 806273...
Processing 5 groups for model 806278...
Processing 0 groups for model 806023...
Processing 0 groups for model 806033...
Processing 6 groups for model 806279...
Processing 0 groups for model 806269...
Processing 0 groups for model 808301...
Processing 0 groups for model 806265...
Processing 3 groups for model 806026...
Processing 0 groups for model 806281...
Processing 2 groups for model 806274...
Processing 3 groups for model 806018...
Processing 14 groups for model 806276...
Processing 0 groups for model 806282...
Processing 0 groups for model 808783...
Processing 7 groups for model 806277...
Processing 0 groups for model 806024...
Processing 0 groups for model 806283...


ValueError: at least one array or dtype is required

In [5]:
pd.read_parquet('/Users/muhammadhussain/Desktop/Data/filter/around_events_data_806031.parquet')

Unnamed: 0,Datetime,RTD,1st Suc.,Cond. Air In,Evap. In,Evap. Out,2nd Suc.,Chil. water In,2nd Sump,H.E.,SetPoint,Mains Voltage,State,Type,Event,main_fault


In [None]:
pd.read_parquet('/Users/muhammadhussain/Desktop/Data/Revco/806269_temp.parquet')["main_fault"]

array([None, 'relay', 'compressor_stage_2'], dtype=object)

In [8]:
import os
import pandas as pd

# Define the directory path
directory_path = "/Users/muhammadhussain/Desktop/Data/Revco/"

# Initialize a set to store unique values of 'main_fault'
unique_main_faults = set()

# Loop through all files in the directory
for file_name in os.listdir(directory_path):
    # Check if the file is a Parquet file
    if file_name.endswith(".parquet"):
        # Construct the full file path
        file_path = os.path.join(directory_path, file_name)
        
        try:
            # Load the Parquet file
            df = pd.read_parquet(file_path)
            
            # Check if the 'main_fault' column exists
            if 'main_fault' in df.columns:
                # Add unique values from the 'main_fault' column to the set
                unique_main_faults.update(df['main_fault'].dropna().unique())
        except Exception as e:
            print(f"Could not process file {file_name}: {e}")

# Display the total number of unique values
print(f"Total unique 'main_fault' values: {len(unique_main_faults)}")
print(f"Unique values: {unique_main_faults}")


Total unique 'main_fault' values: 23
Unique values: {'main_board', 'setpoint_change', 'Refrigerant leakage at stage 2', 'fuse_blown', 'warm_alarm', 'door_adjustment', 'display', 'filter_drier', 'relay', 'battery', 'compressor_stage_2 malfunctional', 'logic_wiring', 'OS_update', 'instability', 'warm_interstage', 'compressor_stage_2', 'High condensation water', 'water_control_valve', 'electric_wiring', 'Refrigerant leakage at stage 1', 'compressor_stage_1', 'compressor_stage_1 malfunctional', 'handle_tightening'}
