In [1]:
import pandas as pd
import numpy as np
from scipy.stats import linregress
from scipy.fft import fft

In [2]:
def extract_features(sensor_data):
    """Summarise one sensor's readings as a fixed-length list of nine features.

    Parameters
    ----------
    sensor_data : array-like of numbers
        One-dimensional sequence of readings for a single sensor. It is
        converted to a float array once up front, so lists, pandas Series and
        object-dtype arrays holding numbers are all accepted.

    Returns
    -------
    list
        ``[mean, std, min, max, top5_min, top5_max, min_std, max_std,
        top_freqs_max]`` -- the same order as the feature names listed in
        ``generate_column_names``:

        * ``mean`` / ``std`` / ``min`` / ``max``: basic statistics (``std`` is
          the population standard deviation, numpy's default ``ddof=0``).
        * ``top5_min`` / ``top5_max``: mean of the five smallest / five largest
          readings (of all readings when fewer than five are given).
        * ``min_std`` / ``max_std``: distance of the minimum / maximum from the
          mean, in units of ``std``; 0 when ``std`` is 0.
        * ``top_freqs_max``: largest FFT magnitude among bins 3, 4 and 5;
          0.0 when the input is too short to have any of those bins.

    Raises
    ------
    ValueError
        If ``sensor_data`` is empty.
    """
    values = np.asarray(sensor_data, dtype=float)
    if values.size == 0:
        raise ValueError("sensor_data must contain at least one reading")

    # Basic statistical features
    mean = np.mean(values)
    std = np.std(values)
    min_val = np.min(values)
    max_val = np.max(values)

    # Sort once and reuse for both tails. np.sort places NaNs last, so the
    # result does not depend on where NaNs happen to sit in the input.
    ordered = np.sort(values)
    top5_min = np.mean(ordered[:5])
    top5_max = np.mean(ordered[-5:])

    # Extremes expressed in standard deviations from the mean; a constant
    # signal (std == 0) has no spread to normalise by, so report 0.
    if std != 0:
        min_std = (min_val - mean) / std
        max_std = (max_val - mean) / std
    else:
        min_std = max_std = 0.0

    # FFT-based feature: bins 0-2 are skipped and only bins 3..5 are examined.
    # Inputs with fewer than 4 readings have no such bins; np.max would raise
    # on the empty slice, so fall back to 0.0 instead.
    band = np.abs(fft(values))[3:6]
    top_freqs_max = np.max(band) if band.size else 0.0

    return [
        mean, std, min_val, max_val, top5_min, top5_max,
        min_std, max_std, top_freqs_max,
    ]

In [3]:
def generate_column_names(sensors):
    """Build the flat list of output column labels for the given sensors.

    Each label has the form ``<sensor>_<feature>`` (e.g. ``AN311_mean``).
    Labels are emitted sensor by sensor, and within a sensor the feature
    suffixes always appear in the fixed order listed below.
    """
    suffixes = (
        'mean', 'std', 'min', 'max',
        'top5_min', 'top5_max',
        'min_std', 'max_std',
        'top_freqs_max',
    )
    return [f'{name}_{suffix}' for name in sensors for suffix in suffixes]


In [4]:
def feature_engineering_for_all_sensors(df, sensors=None, readings_per_sensor=600):
    """Turn a wide table of raw sensor readings into one feature row per input row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain, for every sensor, the columns ``<sensor>_value_1`` ..
        ``<sensor>_value_<readings_per_sensor>``.
    sensors : iterable of str, optional
        Sensor name prefixes to process, in output order. Defaults to the
        built-in list of 28 sensors.
    readings_per_sensor : int, optional
        Number of reading columns per sensor (default 600).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (fresh 0..n-1 index) and, for each sensor in
        turn, the values returned by ``extract_features``, labelled with the
        names produced by ``generate_column_names``.

    Raises
    ------
    KeyError
        If any expected reading column is missing from ``df``.
    """
    if sensors is None:
        sensors = [
            'AN311', 'AN422', 'AN423', 'TP1721', 'RH1722', 'BA1723',
            'TP1711', 'RH1712', 'BA1713', 'MM252', 'MM261', 'MM262',
            'MM263', 'MM264', 'MM256', 'MM211', 'CM861', 'CR863',
            'P_864', 'TC862', 'WM868', 'AMP1_IR', 'AMP2_IR', 'DMP3_IR',
            'DMP4_IR', 'AMP5_IR', 'F_SIDE', 'V'
        ]
    else:
        sensors = list(sensors)

    # One (initially empty) feature list per input row; each sensor appends
    # its features in turn, so the per-row order is sensor-major.
    feature_data = [[] for _ in range(len(df))]

    for sensor in sensors:
        # Build the column labels once per sensor rather than once per row,
        # and pull the whole block out as a 2-D array in a single lookup.
        sensor_columns = [
            f'{sensor}_value_{i}' for i in range(1, readings_per_sensor + 1)
        ]
        sensor_block = df[sensor_columns].to_numpy()

        for row_features, sensor_data in zip(feature_data, sensor_block):
            row_features.extend(extract_features(sensor_data))

    # Generate column names
    column_names = generate_column_names(sensors)

    # Convert the feature data into a DataFrame with proper column names
    return pd.DataFrame(feature_data, columns=column_names)


In [13]:
# Feature-engineer the test set, reading the CSV in chunks and processing
# each chunk independently.
chunksize = 1000  # Adjust this size based on available memory
testdata = '../extracted_data/testDataFull.csv'

# Collect the per-chunk results and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously processed row on each
# iteration, and starting from an empty DataFrame relies on concat behaviour
# that recent pandas versions warn about.
feature_chunks = []
for chunk in pd.read_csv(testdata, chunksize=chunksize):
    # Apply the feature engineering process to the current chunk
    feature_df = feature_engineering_for_all_sensors(chunk)
    feature_chunks.append(feature_df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input file yielded no chunks.
all_features_df = (
    pd.concat(feature_chunks, ignore_index=True) if feature_chunks else pd.DataFrame()
)


In [18]:
# Save the final feature-engineered data as a CSV file, without the index column.
# NOTE: despite the variable name, nothing here requests zip compression -- the
# target path ends in .csv and to_csv is called without a compression argument.
output_zip_path = '../extracted_data/feature_extracted_data/testData_features_01_01.csv'
all_features_df.to_csv(output_zip_path, index=False)

In [16]:
# Sanity check: one row per input row, and 28 sensors x 9 features = 252 columns
all_features_df.shape

(5076, 252)

In [None]:
# Preview the first few rows of the engineered feature table
all_features_df.head()