In [8]:
import pandas as pd
import glob
import time
from sklearn.preprocessing import MinMaxScaler
import os

def process_parquet_files(path, output_file, window_size=60):
    start_time = time.time()  # Track total time

    # Modelnumre, der skal ekskluderes
    excluded_models = {"806026", "806030", "806031", "806276", "806279"}

    # Step 1: Load all parquet files matching the pattern
    around_event_files = glob.glob(path + "around_events_data_*.parquet")
    cleaned_data_file = os.path.join(path, "cleaned_data_806278.parquet")

    if not around_event_files:
        print(f"No 'around_events_data_*' files found in the directory {path}. Exiting.")
        return

    # Step 2: Initialize a list to store processed windows
    time_windows = []
    total_groups = 0  # Counter for total number of groups across all models
    skipped_groups_total = 0  # Counter for total skipped groups across all models

    # Step 3: Process "around_events_data_*.parquet" files
    for file in around_event_files:
        model_number = os.path.basename(file).split('_')[3].split('.')[0]

        # Check if the model number is in the excluded list
        if model_number in excluded_models:
            print(f"Skipping model {model_number} (excluded)")
            continue

        df = pd.read_parquet(file)
        df['Datetime'] = pd.to_datetime(df['Datetime'])

        # Ensure the data is sorted by time
        df = df.sort_values('Datetime').reset_index(drop=True)

        # Step 4: Identify continuous sequences
        df['time_diff'] = df['Datetime'].diff().dt.total_seconds()
        df['is_continuous'] = (df['time_diff'].between(59, 61)) | (df.index == 0)
        df['sequence_group'] = (~df['is_continuous']).cumsum()

        # Step 5: Count total groups and filter valid sequences
        groups = df.groupby('sequence_group')
        total_groups += len(groups)  # Add total number of groups for this model
        valid_sequences = groups.filter(lambda x: len(x) >= window_size)
        skipped_groups_for_model = len(groups) - len(valid_sequences.groupby('sequence_group'))
        skipped_groups_total += skipped_groups_for_model

        # Step 6: Create time-windowed data
        def process_group(group):
            group_windows = []
            for start_idx in range(0, len(group) - window_size + 1):
                window = group.iloc[start_idx:start_idx + window_size]

                # Flatten the window
                flattened_window = {}
                for column in window.columns:
                    if column not in ['Datetime', 'time_diff', 'is_continuous', 'sequence_group', 'main_fault']:
                        if pd.api.types.is_numeric_dtype(window[column]):
                            flattened_window.update({f"{column}_{i}": v for i, v in enumerate(window[column].tolist())})
                        else:
                            flattened_window.update({f"{column}_0": window[column].iloc[0]})  # Only take the first non-numeric value

                # Add the latest Datetime in the window
                flattened_window["Datetime"] = window['Datetime'].iloc[-1]

                # Add metadata
                flattened_window["main_fault"] = window['main_fault'].mode()[0] if not window['main_fault'].isna().all() else None
                flattened_window["seriesnumber"] = model_number
                group_windows.append(flattened_window)

            return group_windows

        valid_groups = [group for _, group in valid_sequences.groupby('sequence_group')]
        print(f"Processing {len(valid_groups)} valid groups for model {model_number}...")
        print(f"Skipped groups for model {model_number}: {skipped_groups_for_model}")

        for group in valid_groups:
            time_windows.extend(process_group(group))

    print(f"Total groups across all models: {total_groups}")
    print(f"Total skipped groups across all models: {skipped_groups_total}")

    # Step 4: Process "cleaned_data_806278.parquet" file
    if os.path.exists(cleaned_data_file):
        print(f"Processing cleaned data from: {cleaned_data_file}")
        df_cleaned = pd.read_parquet(cleaned_data_file)
        df_cleaned['Datetime'] = pd.to_datetime(df_cleaned['Datetime'])
        df_cleaned = df_cleaned.sort_values('Datetime').reset_index(drop=True)

        # Identify continuous sequences
        df_cleaned['time_diff'] = df_cleaned['Datetime'].diff().dt.total_seconds()
        df_cleaned['is_continuous'] = (df_cleaned['time_diff'].between(59, 61)) | (df_cleaned.index == 0)
        df_cleaned['sequence_group'] = (~df_cleaned['is_continuous']).cumsum()

        # Determine how many rows to include (10% of the total from time_windows)
        limit = int(len(time_windows) * 0.1)

        # Select rows up to the limit
        cleaned_windows = []
        for _, group in df_cleaned.groupby('sequence_group'):
            if len(group) >= window_size:
                for start_idx in range(0, len(group) - window_size + 1):
                    window = group.iloc[start_idx:start_idx + window_size]

                    # Flatten the window
                    flattened_window = {}
                    for column in window.columns:
                        if column not in ['Datetime', 'time_diff', 'is_continuous', 'sequence_group']:
                            if pd.api.types.is_numeric_dtype(window[column]):
                                flattened_window.update({f"{column}_{i}": v for i, v in enumerate(window[column].tolist())})
                            else:
                                flattened_window.update({f"{column}_0": window[column].iloc[0]})

                    # Add the latest Datetime in the window
                    flattened_window["Datetime"] = window['Datetime'].iloc[-1]
                    flattened_window["main_fault"] = None  # Always None for cleaned data
                    flattened_window["seriesnumber"] = "806278"
                    cleaned_windows.append(flattened_window)

                    # Stop if we've reached the limit
                    if len(cleaned_windows) >= limit:
                        break
            if len(cleaned_windows) >= limit:
                break
        print(f"Added {len(cleaned_windows)} rows from cleaned data.")
        time_windows.extend(cleaned_windows)

    # Combine all flattened windows into a single DataFrame
    all_data = pd.DataFrame(time_windows)

    # Step 7: Normalize numeric columns
    scaler = MinMaxScaler()
    numeric_columns = all_data.select_dtypes(include=['number']).columns
    if not numeric_columns.empty:
        all_data[numeric_columns] = scaler.fit_transform(all_data[numeric_columns])

    # Step 8: Save the prepared data
    all_data.to_parquet(output_file, index=False)

    elapsed_time = time.time() - start_time
    print(f"Total elapsed time: {elapsed_time:.2f} seconds")


In [9]:
# Angiv sti til dine data og output-fil
path = "/Users/muhammadhussain/Desktop/Data/filter/"
output_file = "/Users/muhammadhussain/Desktop/Data/filter/finished.parquet"

# Kald funktionen
process_parquet_files(path, output_file, window_size=30)


Skipping model 806031 (excluded)
Processing 0 valid groups for model 806272...
Skipped groups for model 806272: 0
Processing 8 valid groups for model 806017...
Skipped groups for model 806017: 6
Processing 0 valid groups for model 806029...
Skipped groups for model 806029: 0
Processing 5 valid groups for model 806020...
Skipped groups for model 806020: 4
Skipping model 806030 (excluded)
Processing 0 valid groups for model 806273...
Skipped groups for model 806273: 0
Processing 5 valid groups for model 806278...
Skipped groups for model 806278: 4
Processing 0 valid groups for model 806023...
Skipped groups for model 806023: 0
Processing 0 valid groups for model 806033...
Skipped groups for model 806033: 0
Skipping model 806279 (excluded)
Processing 0 valid groups for model 806269...
Skipped groups for model 806269: 0
Processing 0 valid groups for model 808301...
Skipped groups for model 808301: 0
Processing 0 valid groups for model 806265...
Skipped groups for model 806265: 0
Skipping m

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Total elapsed time: 97.72 seconds


In [10]:
pd.read_parquet("/Users/muhammadhussain/Desktop/Data/filter/finished.parquet")

Unnamed: 0,RTD_0,RTD_1,RTD_2,RTD_3,RTD_4,RTD_5,RTD_6,RTD_7,RTD_8,RTD_9,...,State_26,State_27,State_28,State_29,Type_0,Event_0,Datetime,main_fault,seriesnumber,main_fault_0
0,0.455090,0.463855,0.469880,0.469880,0.469880,0.469880,0.475904,0.475904,0.469880,0.457831,...,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:29:12,Refrigerant leakage at stage 1,806017,
1,0.461078,0.469880,0.469880,0.469880,0.469880,0.475904,0.475904,0.469880,0.457831,0.433735,...,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:30:12,Refrigerant leakage at stage 1,806017,
2,0.467066,0.469880,0.469880,0.469880,0.475904,0.475904,0.469880,0.457831,0.433735,0.385542,...,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:31:12,Refrigerant leakage at stage 1,806017,
3,0.467066,0.469880,0.469880,0.475904,0.475904,0.469880,0.457831,0.433735,0.385542,0.355422,...,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:32:12,Refrigerant leakage at stage 1,806017,
4,0.467066,0.469880,0.475904,0.475904,0.469880,0.457831,0.433735,0.385542,0.355422,0.343373,...,0.157895,0.157895,0.157895,0.157895,,,2015-03-16 12:33:12,Refrigerant leakage at stage 1,806017,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66023,0.467066,0.469880,0.469880,0.463855,0.451807,0.433735,0.397590,0.379518,0.355422,0.343373,...,0.000000,0.000000,0.000000,0.000000,,,2013-02-24 10:58:12,,806278,
66024,0.467066,0.469880,0.463855,0.451807,0.433735,0.397590,0.379518,0.355422,0.343373,0.343373,...,0.000000,0.000000,0.000000,0.000000,,,2013-02-24 10:59:12,,806278,
66025,0.467066,0.463855,0.451807,0.433735,0.397590,0.379518,0.355422,0.343373,0.343373,0.355422,...,0.000000,0.000000,0.000000,0.000000,,,2013-02-24 11:00:12,,806278,
66026,0.461078,0.451807,0.433735,0.397590,0.379518,0.355422,0.343373,0.343373,0.355422,0.367470,...,0.000000,0.000000,0.000000,0.000000,,,2013-02-24 11:01:12,,806278,
