In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [4]:
# Function to extract non-overlapping window features for each sensor
def extract_non_overlapping_features(sensor_data, num_windows):
    """Summarise a sensor trace with per-window mean, std, min and max.

    The readings are cut into ``num_windows`` consecutive, equally sized,
    non-overlapping windows of ``len(sensor_data) // num_windows`` readings
    each. When the length is not an exact multiple of ``num_windows`` the
    trailing remainder does not belong to any window and is ignored.

    Parameters
    ----------
    sensor_data : array-like
        Numeric readings of a single sensor, in time order. The values are
        converted to ``float`` before the statistics are computed.
    num_windows : int
        Number of windows to produce; must be between 1 and
        ``len(sensor_data)`` inclusive.

    Returns
    -------
    list of float
        ``[mean, std, min, max]`` of window 1, followed by the same four
        values for window 2, and so on (``4 * num_windows`` values in
        total). ``std`` uses NumPy's default ``ddof=0``.

    Raises
    ------
    ValueError
        If ``num_windows`` is smaller than 1 or larger than the number of
        readings, since no non-empty window could be formed.
    """
    # Cast once so object-dtype input is reduced with native float arithmetic.
    values = np.asarray(sensor_data, dtype=float)
    n_readings = len(values)

    if num_windows < 1 or num_windows > n_readings:
        raise ValueError(
            f'num_windows must be between 1 and the number of readings '
            f'({n_readings}), got {num_windows}'
        )

    window_size = n_readings // num_windows

    # Drop the trailing remainder, then view the data as one row per window.
    windows = values[:window_size * num_windows].reshape(num_windows, -1)

    # One column per statistic; flattening row by row yields
    # mean, std, min, max for window 1, then window 2, ...
    stats = np.column_stack([
        windows.mean(axis=1),
        windows.std(axis=1),
        windows.min(axis=1),
        windows.max(axis=1),
    ])
    return stats.ravel().tolist()

In [6]:
# Generate the column names based on sensors, windows, and features
def generate_column_names(sensors, num_windows):
    """Build the flat list of feature column labels.

    Every label has the form ``<sensor>_window_<k>_<stat>`` (for example
    ``AN311_window_1_mean``), with windows numbered from 1. Labels are
    ordered by sensor first, then by window, then by statistic in the
    order mean, std, min, max.
    """
    stat_suffixes = ('mean', 'std', 'min', 'max')
    return [
        f'{sensor_name}_window_{window_no}_{stat}'
        for sensor_name in sensors
        for window_no in range(1, num_windows + 1)
        for stat in stat_suffixes
    ]

In [8]:
# Apply this function to all sensors in the dataframe
def feature_engineering_for_all_sensors(df, num_windows, sensors=None,
                                        readings_per_sensor=600):
    """Compute windowed summary features for every sensor in every row.

    Each row of ``df`` is expected to hold, for every sensor, the columns
    ``<sensor>_value_1`` ... ``<sensor>_value_<readings_per_sensor>``.
    For each row and sensor those readings are passed to
    ``extract_non_overlapping_features`` and the results are laid out side
    by side, sensor after sensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings, one sample per row.
    num_windows : int
        Number of non-overlapping windows per sensor.
    sensors : iterable of str, optional
        Sensor name prefixes to process. Defaults to the 28 sensors that
        were originally hard-coded in this function.
    readings_per_sensor : int, optional
        Number of ``_value_<i>`` columns per sensor (default 600).

    Returns
    -------
    pandas.DataFrame
        One row per input row (with a fresh 0..n-1 index) and one column
        per (sensor, window, statistic), named by ``generate_column_names``.

    Raises
    ------
    KeyError
        If an expected ``<sensor>_value_<i>`` column is missing from ``df``.
    """
    if sensors is None:
        sensors = [
            'AN311', 'AN422', 'AN423', 'TP1721', 'RH1722', 'BA1723',
            'TP1711', 'RH1712', 'BA1713', 'MM252', 'MM261', 'MM262',
            'MM263', 'MM264', 'MM256', 'MM211', 'CM861', 'CR863',
            'P_864', 'TC862', 'WM868', 'AMP1_IR', 'AMP2_IR', 'DMP3_IR',
            'DMP4_IR', 'AMP5_IR', 'F_SIDE', 'V']
    else:
        # Materialise once: the names are iterated here and again when
        # the output column labels are generated.
        sensors = list(sensors)

    # One (initially empty) feature list per input row.
    feature_data = [[] for _ in range(len(df))]

    for sensor in sensors:
        # Built once per sensor rather than once per row.
        sensor_columns = [f'{sensor}_value_{i}'
                          for i in range(1, readings_per_sensor + 1)]

        # Pull the sensor's readings out as one 2-D array instead of using
        # iterrows, which creates a Series per row and does not preserve
        # column dtypes.
        sensor_block = df[sensor_columns].to_numpy()

        # Rows of sensor_block line up with feature_data by position.
        for row_features, sensor_data in zip(feature_data, sensor_block):
            row_features.extend(
                extract_non_overlapping_features(sensor_data, num_windows)
            )

    # Labels follow the same sensor -> window -> statistic order as the
    # values appended above.
    column_names = generate_column_names(sensors, num_windows)

    return pd.DataFrame(feature_data, columns=column_names)

In [16]:
# Number of non-overlapping windows per sensor: the 600 readings of each
# sensor are split into 5 windows of 120 readings.
num_windows = 5

# Rows read from the CSV per chunk; adjust based on available memory.
chunksize = 1000
testdata = '../extracted_data/testDataFull.csv'

# Process the CSV chunk by chunk and keep only the (much smaller) feature
# tables. They are collected in a list and concatenated once at the end:
# concatenating inside the loop would re-copy all accumulated rows on
# every iteration.
feature_chunks = []
for chunk in pd.read_csv(testdata, chunksize=chunksize):
    # Apply the feature engineering process to the current chunk
    feature_df = feature_engineering_for_all_sensors(chunk, num_windows)
    feature_chunks.append(feature_df)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# input produced no chunks.
if feature_chunks:
    test_features_df = pd.concat(feature_chunks, ignore_index=True)
else:
    test_features_df = pd.DataFrame()


In [20]:
# Save the feature-engineered data as a plain (uncompressed) CSV file.
# The variable keeps its historical name even though the target is a .csv
# file, not a zip archive.
output_zip_path = '../extracted_data/feature_extracted_data/testData_features.csv'

# to_csv does not create missing folders, so make sure the target
# directory exists before writing.
Path(output_zip_path).parent.mkdir(parents=True, exist_ok=True)
test_features_df.to_csv(output_zip_path, index=False)

In [22]:
# Show the (rows, columns) shape of the engineered feature table.
test_features_df.shape

(5076, 560)

In [24]:
# Preview the first rows of the engineered feature table.
test_features_df.head()

Unnamed: 0,AN311_window_1_mean,AN311_window_1_std,AN311_window_1_min,AN311_window_1_max,AN311_window_2_mean,AN311_window_2_std,AN311_window_2_min,AN311_window_2_max,AN311_window_3_mean,AN311_window_3_std,...,V_window_3_min,V_window_3_max,V_window_4_mean,V_window_4_std,V_window_4_min,V_window_4_max,V_window_5_mean,V_window_5_std,V_window_5_min,V_window_5_max
0,3.345833,0.246271,2.9,3.9,3.205,0.133448,2.8,3.7,3.274167,0.198954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.148333,0.100816,3.9,4.3,4.056667,0.098939,3.9,4.2,4.1,0.125167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.1775,0.350823,2.7,4.0,3.215833,0.31649,2.7,3.9,3.261667,0.192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.918333,0.172715,3.6,4.2,3.79,0.159896,3.6,4.2,3.873333,0.185173,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.306667,0.235136,3.0,3.9,3.346667,0.164789,3.0,3.6,3.3,0.237346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
