In [1]:
import pandas as pd
import numpy as np
from scipy.stats import linregress
from scipy.fft import fft

In [2]:
# def extract_features(sensor_data):
#     features = []

#     # Basic statistical features
#     mean = np.mean(sensor_data)
#     std = np.std(sensor_data)
#     min_val = np.min(sensor_data)
#     max_val = np.max(sensor_data)

#     # Top 5 minimal and maximal values
#     top5_min = np.mean(sorted(sensor_data)[:5])
#     top5_max = np.mean(sorted(sensor_data)[-5:])

#     # Standard deviations from the mean
#     min_std = (min_val - mean) / std if std != 0 else 0
#     max_std = (max_val - mean) / std if std != 0 else 0

#     # FFT-based features
#     fft_vals = fft(sensor_data)
#     top_freqs = np.abs(fft_vals)[3:6]  # Ignoring the first three frequencies

#     # Linear regression parameters
#     # slope, intercept, _, _, std_err = linregress(range(len(sensor_data)), sensor_data)

#     # Extend the feature list
#     features.extend([
#         mean, std, min_val, max_val, top5_min, top5_max,
#         min_std, max_std, np.max(top_freqs)
#         # , slope, intercept, std_err
#     ])
    
#     return features

In [3]:
import numpy as np
from scipy.stats import linregress
from scipy.fft import fft

def extract_features_extended(sensor_data):
    """Summarise one sensor's readings as a fixed-length list of 18 features.

    Non-finite readings (NaN, +/-Inf) are dropped before anything is computed.

    Parameters
    ----------
    sensor_data : array-like
        Readings of a single sensor, in time order. Must be convertible to
        float64.

    Returns
    -------
    list
        18 values, in this order:
        mean, std, min, max, top5_min, top5_max, min_std, max_std,
        three FFT magnitudes (bins 3, 4, 5), slope, intercept, mse,
        abs(slope), and the three second-degree polynomial coefficients.
        Features that cannot be computed because too few finite readings
        remain are NaN; if no finite reading remains, all 18 are NaN.
    """
    # Must equal the number of values assembled in the return statement below,
    # so that the "no valid data" row lines up with every other row.
    n_features = 18

    # Cast to float64 so the finiteness filter and numeric routines below
    # behave the same for integer, object or list input.
    sensor_data = np.asarray(sensor_data, dtype=np.float64)
    sensor_data = sensor_data[np.isfinite(sensor_data)]  # drop NaN and +/-Inf

    n_samples = len(sensor_data)
    if n_samples == 0:
        return [np.nan] * n_features

    # Basic statistical features
    mean = np.mean(sensor_data)
    std = np.std(sensor_data)
    min_val = np.min(sensor_data)
    max_val = np.max(sensor_data)

    # Mean of the five smallest / five largest readings
    sorted_data = np.sort(sensor_data)
    if n_samples >= 5:
        top5_min = np.mean(sorted_data[:5])
        top5_max = np.mean(sorted_data[-5:])
    else:
        top5_min = top5_max = np.nan

    # Distance of the extremes from the mean, in standard deviations.
    # A constant signal has std == 0; divide by 1 in that case (the numerators
    # are 0 anyway) while still reporting the true std of 0 as a feature.
    scale = std if std != 0 else 1
    min_std = (min_val - mean) / scale
    max_std = (max_val - mean) / scale

    # FFT-based features: magnitudes of bins 3..5 (bins 0..2 are skipped).
    # Needs at least 6 samples for those bins to exist.
    if n_samples > 5:
        top_freqs = np.abs(fft(sensor_data))[3:6]
    else:
        top_freqs = [np.nan, np.nan, np.nan]

    # Linear trend over the sample index
    x = np.arange(n_samples)
    if n_samples > 1:  # regression needs at least two points
        slope, intercept, _, _, _ = linregress(x, sensor_data)
        residuals = sensor_data - (slope * x + intercept)
        mse = np.mean(residuals ** 2)
    else:
        slope, intercept, mse = np.nan, np.nan, np.nan

    # Second-degree polynomial fit. np.polyfit returns the highest power first,
    # so a0 is the quadratic coefficient and a2 the constant term.
    if n_samples > 2:  # a parabola needs at least three points
        a0, a1, a2 = np.polyfit(x, sensor_data, 2)
    else:
        a0, a1, a2 = np.nan, np.nan, np.nan

    return [
        mean, std, min_val, max_val, top5_min, top5_max,
        min_std, max_std, *top_freqs,
        slope, intercept, mse, abs(slope),
        a0, a1, a2,
    ]


In [14]:
def generate_column_names(sensors):
    """Build the flat list of feature column names for the given sensors.

    Names have the form ``<sensor>_<feature>`` (e.g. ``AN311_mean``). They are
    grouped by sensor, in the order the sensors are given, and within each
    sensor the 18 feature suffixes keep the order listed below.
    """
    suffixes = (
        # basic statistics
        'mean', 'std', 'min', 'max',
        'top5_min', 'top5_max',
        'min_std', 'max_std',
        # FFT magnitudes
        'top_freq_1', 'top_freq_2', 'top_freq_3',
        # linear regression
        'slope', 'intercept', 'mse', 'abs_slope',
        # polynomial coefficients
        'a0', 'a1', 'a2',
    )
    return [f'{sensor}_{suffix}' for sensor in sensors for suffix in suffixes]


In [16]:
def feature_engineering_for_all_sensors(df, sensors=None, n_readings=600):
    """Compute the per-sensor feature table for every row of ``df``.

    Each sensor's readings are read from the columns
    ``<sensor>_value_1`` ... ``<sensor>_value_<n_readings>``, passed to
    ``extract_features_extended`` one row at a time, and the results for all
    sensors are laid side by side in the order given by ``sensors``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; must contain the reading columns described above
        for every requested sensor (a missing column raises ``KeyError``).
    sensors : iterable of str, optional
        Sensor name prefixes to process. Defaults to the 28 sensors listed
        below.
    n_readings : int, optional
        Number of readings per sensor per row. Defaults to 600.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index), with the columns produced
        by ``generate_column_names(sensors)``.
    """
    if sensors is None:
        sensors = [
            'AN311', 'AN422', 'AN423', 'TP1721', 'RH1722', 'BA1723',
            'TP1711', 'RH1712', 'BA1713', 'MM252', 'MM261', 'MM262',
            'MM263', 'MM264', 'MM256', 'MM211', 'CM861', 'CR863',
            'P_864', 'TC862', 'WM868', 'AMP1_IR', 'AMP2_IR', 'DMP3_IR',
            'DMP4_IR', 'AMP5_IR', 'F_SIDE', 'V'
        ]
    else:
        sensors = list(sensors)  # iterated twice below; tolerate generators

    # One growing feature list per input row.
    feature_data = [[] for _ in range(len(df))]

    for sensor in sensors:
        # Build the column labels once per sensor rather than once per row,
        # and pull the whole (rows x readings) block out in a single lookup.
        sensor_columns = [f'{sensor}_value_{i}' for i in range(1, n_readings + 1)]
        readings = df[sensor_columns].to_numpy()

        for row_features, sensor_data in zip(feature_data, readings):
            row_features.extend(extract_features_extended(sensor_data))

    column_names = generate_column_names(sensors)
    return pd.DataFrame(feature_data, columns=column_names)


In [20]:
# Feature-engineer the training data chunk by chunk so the full CSV never has
# to be held in memory at once.
chunksize = 1000  # Adjust this size based on available memory
zip_file_path = '../extracted_data/trainingDataFull.zip'

# Start from an empty frame so `all_features_df` exists even if the run is
# interrupted; the chunks finished so far remain available in `chunk_features`.
all_features_df = pd.DataFrame()
chunk_features = []

for chunk in pd.read_csv(zip_file_path, chunksize=chunksize, compression='zip'):
    # Apply the feature engineering process to the current chunk
    chunk_features.append(feature_engineering_for_all_sensors(chunk))

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# every previously accumulated row on each iteration.
if chunk_features:  # pd.concat raises ValueError on an empty list
    all_features_df = pd.concat(chunk_features, ignore_index=True)


KeyboardInterrupt: 

In [None]:
# # Save the final feature-engineered data into a compressed zip file
# output_zip_path = '../extracted_data/feature_extracted_data/trainingData_features_01_1.zip'
# all_features_df.to_csv(output_zip_path, index=False, compression='zip')

In [None]:
# Display the (rows, columns) dimensions of the combined feature table.
all_features_df.shape

In [None]:
# Preview the first rows of the combined feature table.
all_features_df.head()