In [1]:
import pandas as pd
import numpy as np
import os
# All relative paths in this notebook (e.g. ./data/PUMP/...) resolve against this directory.
# Set the ENCODER_PELT_WORKDIR environment variable to use your own project folder;
# when it is unset, the original author's local path below is used as the fallback.
os.chdir(os.environ.get('ENCODER_PELT_WORKDIR', r'D:\lym本科\科研相关\华为项目资料\2025-1-23\Encoder_PELT'))   # set as your own working directory

In [6]:
# --- Load the raw sensor table ------------------------------------------------
file_path = './data/PUMP/sensor.csv'   # set as your own working directory
df = pd.read_csv(file_path)

# Fraction (0-1) of missing entries above which a whole column is discarded.
missing_threshold = 0.05

# --- Per-column missing-value report -------------------------------------------
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_data = pd.DataFrame({
    '缺失数量': missing_values,
    '缺失百分比(%)': missing_percent
}).sort_values('缺失百分比(%)', ascending=False)

# Only the columns that actually contain at least one missing entry.
cols_with_missing = missing_data[missing_data['缺失数量'] > 0]

print("\n=== Missing Value Analysis ===")
if not cols_with_missing.empty:
    print("Columns with missing values:")
    print(cols_with_missing)
else:
    print("No missing values found")

# --- Drop columns whose missing share exceeds the threshold ---------------------
print(f"\n=== Removing columns with missing values exceeding {missing_threshold*100}% ===")
threshold_count = len(df) * missing_threshold
# `thresh` is the minimum number of non-null values a column needs to be kept.
# pandas documents it as an int; rounding up keeps the same columns as the raw
# float comparison because non-null counts are whole numbers.
min_non_null = int(np.ceil(len(df) - threshold_count))
df_cleaned = df.dropna(axis=1, thresh=min_non_null)

dropped_cols = set(df.columns) - set(df_cleaned.columns)
if dropped_cols:
    print(f"Removed {len(dropped_cols)} columns:")
    # Sets have no stable iteration order; sort so the report is reproducible.
    for col in sorted(dropped_cols):
        print(f"- {col} (Missing: {missing_percent[col]:.2f}%)")
else:
    print("No columns were removed")

print(f"\nData shape after removal: {df_cleaned.shape}")

# --- Drop every row that still has a missing value ------------------------------
print("\n=== Removing rows with missing values ===")
rows_before = len(df_cleaned)
df_cleaned = df_cleaned.dropna()
rows_after = len(df_cleaned)

print(f"Rows removed: {rows_before - rows_after}")
print(f"Final data shape: {df_cleaned.shape}")

# --- Binarise the status label ---------------------------------------------------
print("\n=== Transforming machine_status column ===")
print("Original value distribution:")
print(df_cleaned['machine_status'].value_counts())

# 'NORMAL' becomes 0; every other status value becomes 1.
df_cleaned['machine_status'] = np.where(
    df_cleaned['machine_status'] == 'NORMAL', 0, 1)

print("\nTransformed distribution:")
print(df_cleaned['machine_status'].value_counts())

print("\n=== Processed data sample ===")
print(df_cleaned.head())


=== Missing Value Analysis ===
Columns with missing values:
             缺失数量    缺失百分比(%)
sensor_15  220320  100.000000
sensor_50   77017   34.956881
sensor_51   15383    6.982117
sensor_00   10208    4.633261
sensor_07    5451    2.474129
sensor_08    5107    2.317992
sensor_06    4798    2.177741
sensor_09    4595    2.085603
sensor_01     369    0.167484
sensor_30     261    0.118464
sensor_29      72    0.032680
sensor_32      68    0.030864
sensor_18      46    0.020879
sensor_17      46    0.020879
sensor_22      41    0.018609
sensor_25      36    0.016340
sensor_16      31    0.014070
sensor_40      27    0.012255
sensor_41      27    0.012255
sensor_43      27    0.012255
sensor_44      27    0.012255
sensor_39      27    0.012255
sensor_38      27    0.012255
sensor_45      27    0.012255
sensor_46      27    0.012255
sensor_47      27    0.012255
sensor_48      27    0.012255
sensor_42      27    0.012255
sensor_49      27    0.012255
sensor_14      21    0.009532
sensor_26

In [3]:
def create_windows(data, window_size):
    windows = []
    for i in range(len(data) - window_size + 1):
        window = data[i:i+window_size,:]
        windows.append(window)
    return np.array(windows)

In [4]:
# Remove the first two columns of the raw table from the cleaned frame.
# They are looked up by name (taken from `df`) instead of by position, so
# re-running this cell is a no-op rather than silently stripping two more columns.
# NOTE(review): presumably these are a row index and a timestamp — confirm against sensor.csv.
leading_cols = [col for col in df.columns[:2] if col in df_cleaned.columns]
df_cleaned = df_cleaned.drop(columns=leading_cols)

# Separate the feature columns from the binary label column.
X = df_cleaned.drop(columns=['machine_status'])
y = df_cleaned['machine_status']

X_array = X.to_numpy()
y_array = y.to_numpy()

# Number of leading rows used for training; every remaining row is used for testing.
train_size = 20000

# The split is by row order (no shuffling), and the test part must be non-empty.
if len(X_array) <= train_size:
    raise ValueError(f"Do not have enough data: {len(X_array)}，At least {train_size} !!!")

X_train = X_array[:train_size, :]
X_test = X_array[train_size:, :]
y_train = y_array[:train_size]
y_test = y_array[train_size:]

print("=== Data Processing Results ===")
print(f"Original data shape: {df.shape}")
print(f"Shape after removing first two columns: {df_cleaned.shape}")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")

print("\nFirst 5 rows of training data:")
print(X_train[:5])
print("\nFirst 5 values of test labels:")
print(y_test[:5])

=== Data Processing Results ===
Original data shape: (220320, 55)
Shape after removing first two columns: (208785, 50)
Training data shape: (20000, 49)
Testing data shape: (188785, 49)
Training labels shape: (20000,)
Testing labels shape: (188785,)

First 5 rows of training data:
[[  2.465394    47.09201     53.2118      46.31076    634.375
   76.45975     13.41146     16.13136     15.56713     15.05353
   37.2274      47.52422     31.11716      1.681353   419.5747
  461.8781     466.3284       2.565284   665.3993     398.9862
  880.0001     498.8926     975.9409     627.674      741.7151
  848.0708     429.0377     785.1935     684.9443     594.4445
  682.8125     680.4416     433.7037     171.9375     341.9039
  195.0655      90.32386     40.36458     31.51042     70.57291
   30.98958     31.77083206  41.92708     39.6412      65.68287
   50.92593     38.19444    157.9861      67.70834   ]
 [  2.465394    47.09201     53.2118      46.31076    634.375
   76.45975     13.41146     16.1

In [5]:
# Number of consecutive rows combined into one sample.
window_size = 10


def _flatten_windows(rows):
    """Build sliding windows over ``rows`` and flatten each window to one vector."""
    stacked = create_windows(rows, window_size)
    return stacked.reshape(stacked.shape[0], -1)


train_window = _flatten_windows(X_train)
test_window = _flatten_windows(X_test)

# Each window is paired with the label of its last row, so the first
# (window_size - 1) test labels have no corresponding window.
test_label = y_test[window_size-1:]

for _caption, _array in (
    ("train_window:", train_window),
    ("test_window:", test_window),
    ("test_label:", test_label),
):
    print(_caption, _array.shape)

# Persist the three arrays next to the raw data.
for _path, _array in (
    (f'./data/PUMP/train_{window_size}_v3.npy', train_window),
    (f'./data/PUMP/test_{window_size}_v3.npy', test_window),
    (f'./data/PUMP/test_label_{window_size}_v3.npy', test_label),
):
    np.save(_path, _array)

train_window: (19991, 490)
test_window: (188776, 490)
test_label: (188776,)
