# Sensor Data Processing

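This notebook defines a reusable function, `generate_combined_data`, that resamples each requested sensor's temperature, humidity, and irradiance readings onto a 10-minute grid, joins them with the matching power series, and saves one Excel file per sensor.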

## Setup

Install required dependencies from `requirements.txt`:
```bash
pip install -r requirements.txt
```

Required packages (from requirements.txt):
- `pandas>=2.0.0`: Data manipulation and Excel file handling
- `numpy>=1.24.0`: Numerical operations  
- `openpyxl>=3.1.0`: Excel file reading/writing support
- `matplotlib>=3.7.0`: Plotting (optional, for visualization)


In [None]:
# Import required packages (pandas and numpy are listed in requirements.txt; os is part of the standard library)
import pandas as pd
import numpy as np
import os


In [12]:
def _resample_sensor_to_grid(sensor_rows, value_columns, grid_start, step):
    """
    Interpolate one sensor's readings onto a regular time grid.

    Parameters:
    -----------
    sensor_rows : pandas.DataFrame
        Non-empty frame with a datetime64 'datetime' column plus `value_columns`.
    value_columns : list of str
        Columns to interpolate.
    grid_start : pandas.Timestamp
        Anchor of the grid; every tick is `grid_start + k * step`.
    step : pandas.Timedelta
        Spacing between grid ticks.

    Returns:
    --------
    pandas.DataFrame : one row per grid tick between the first usable tick and
        the last reading, with columns 'datetime' + `value_columns`.
    """
    indexed = (
        sensor_rows
        .drop_duplicates(subset=['datetime'], keep='first')
        .sort_values('datetime')
        .set_index('datetime')
    )
    data_start = indexed.index.min()
    data_end = indexed.index.max()

    if data_start < grid_start:
        # Readings exist before the anchor, so the anchor itself is the first tick.
        first_tick = grid_start
    else:
        # Ceil-divide the offset so the first tick is at or after the first reading.
        # Floor-dividing the (non-positive) negated offset and flipping the sign is an
        # exact integer ceil, avoiding float rounding on the minute count.
        steps_to_first_tick = -((grid_start - data_start) // step)
        first_tick = grid_start + steps_to_first_tick * step

    grid = pd.date_range(start=first_tick, end=data_end, freq=step)

    # Interpolate on the union of real timestamps and grid ticks so the original
    # readings anchor the interpolation, then keep only the grid ticks.
    on_union = indexed.reindex(indexed.index.union(grid))
    on_union[value_columns] = on_union[value_columns].interpolate(
        method='time', limit_direction='both'
    )

    # rename_axis makes the output column name independent of whatever name the
    # union index ended up with.
    return on_union.loc[grid].rename_axis('datetime').reset_index()


def _select_power_on_grid(df_power, power_column, grid_start, step):
    """
    Extract one power column from a monthly power frame, restricted to grid ticks.

    Reads timestamps from the 'time (min)' column, keeps rows at or after
    `grid_start`, drops repeated timestamps (first wins) and keeps only rows whose
    timestamp is exactly `grid_start + k * step`.

    Returns:
    --------
    pandas.DataFrame : columns 'datetime' and 'power_W', sorted by time. Empty
        when the file has no rows at or after `grid_start`.
    """
    power = pd.DataFrame({
        'datetime': pd.to_datetime(df_power['time (min)']),
        'power_W': df_power[power_column],
    })
    power = power[power['datetime'] >= grid_start]
    power = power.drop_duplicates(subset=['datetime'], keep='first')
    if power.empty:
        # No upper bound to build a grid from; let the caller decide what to do.
        return power.reset_index(drop=True)

    grid = pd.date_range(start=grid_start, end=power['datetime'].max(), freq=step)
    on_grid = power[power['datetime'].isin(grid)]
    return on_grid.sort_values('datetime').reset_index(drop=True)


def generate_combined_data(
    sensor_ids,
    power_data_paths,
    output_dir='data',
    raw_data_path='data/raw/panel_temperature_and_light_raw_dataset_MK_2025_december_3.xlsx',
    start_time='2025-08-12 17:00:00',
):
    """
    Generate combined data (temperature, humidity, irradiance + power) for specified sensors.
    Each sensor is saved to a separate Excel file named
    'sensor_<id>_combined_data.xlsx' inside `output_dir`.

    Sensor readings are interpolated onto a 10-minute grid anchored at
    `start_time`, then inner-joined on 'datetime' with the power column
    'eu_<id> (W)' collected from every file in `power_data_paths`. When no power
    data can be loaded for a sensor, the sensor-only frame is saved instead.

    Parameters:
    -----------
    sensor_ids : list
        List of sensor IDs to process (e.g., [17, 20])
    power_data_paths : dict
        Dictionary mapping month names to file paths
        e.g., {'august': '../aug_2025/Area_1/august_fivemin_power.csv',
               'september': '../sep_2025/Area_1/september_fivemin_power.csv',
               'october': '../oct_2025/Area_1/october_fivemin_power.csv'}
        When two files contain the same timestamp, the file listed first wins.
    output_dir : str
        Directory to save output files (default: 'data')
    raw_data_path : str
        Excel workbook holding the raw sensor readings
        (default: the December 3 raw dataset under 'data/raw/')
    start_time : str or pandas.Timestamp
        Anchor of the 10-minute grid; power rows before it are ignored
        (default: '2025-08-12 17:00:00')

    Returns:
    --------
    dict : Dictionary mapping sensor_id to the combined dataframe. Sensors with
        no rows in the raw data are reported and left out of the dictionary.
    """
    value_columns = ['upper_temp_C', 'upper_humidity', 'upper_irradiance(μW/cm²)']
    grid_start = pd.Timestamp(start_time)
    step = pd.Timedelta(minutes=10)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read raw sensor data and keep only the requested sensors
    df = pd.read_excel(raw_data_path)
    sensor_data = df[df['sensor_id'].isin(sensor_ids)][
        ['datetime'] + value_columns + ['sensor_id']
    ].copy()
    sensor_data['datetime'] = pd.to_datetime(sensor_data['datetime'])

    print(f"Processing sensors: {sensor_ids}")
    print(f"Total raw data points: {len(sensor_data)}")

    sensor_results = {}

    for sensor_id in sensor_ids:
        print(f"\n{'='*60}")
        print(f"Processing sensor_id: {sensor_id}")
        print(f"{'='*60}")

        sensor_rows = sensor_data.loc[
            sensor_data['sensor_id'] == sensor_id, ['datetime'] + value_columns
        ]
        if sensor_rows.empty:
            # Without readings there is no time range to build a grid from;
            # skip instead of failing on NaT arithmetic.
            print(f"  Warning: No raw data found for sensor_id {sensor_id}, skipping")
            continue

        sensor_final = _resample_sensor_to_grid(sensor_rows, value_columns, grid_start, step)
        sensor_final = sensor_final.rename(
            columns={'upper_irradiance(μW/cm²)': 'upper_irradiance'}
        )
        sensor_final['sensor_id'] = sensor_id

        print(f"  Resampled data shape: {sensor_final.shape}")
        print(f"  Time range: {sensor_final['datetime'].min()} to {sensor_final['datetime'].max()}")

        # Process power data - use eu column matching sensor_id
        print(f"\n  Processing power data for eu_{sensor_id}...")
        eu_column_name = f'eu_{sensor_id} (W)'
        all_power_data = []

        for month, power_file in power_data_paths.items():
            # Deliberately best-effort: a missing or malformed monthly file is
            # reported and skipped so the remaining months are still used.
            try:
                df_power = pd.read_csv(power_file)

                if eu_column_name not in df_power.columns:
                    print(f"    {month}: Column '{eu_column_name}' not found, skipping")
                    continue

                power_final = _select_power_on_grid(df_power, eu_column_name, grid_start, step)
                if power_final.empty:
                    print(f"    {month}: No data points on the grid from {grid_start}, skipping")
                    continue

                all_power_data.append(power_final)
                print(f"    {month}: {len(power_final)} data points")
            except Exception as e:
                print(f"    {month}: Error - {e}")

        if all_power_data:
            # Drop timestamps repeated across files BEFORE sorting so that "first"
            # means "first file listed"; otherwise the inner merge below would
            # emit one output row per repeated power timestamp.
            combined_power = (
                pd.concat(all_power_data, ignore_index=True)
                .drop_duplicates(subset=['datetime'], keep='first')
                .sort_values('datetime')
                .reset_index(drop=True)
            )

            # Inner join: keep only grid ticks that have both sensor and power data
            combined_sensor_data = (
                pd.merge(sensor_final, combined_power, on='datetime', how='inner')
                .sort_values('datetime')
                .reset_index(drop=True)
            )
        else:
            combined_sensor_data = sensor_final.copy()
            print("    Warning: No power data available")

        # Save to file
        output_file = os.path.join(output_dir, f'sensor_{sensor_id}_combined_data.xlsx')
        combined_sensor_data.to_excel(output_file, index=False)

        sensor_results[sensor_id] = combined_sensor_data

        print(f"\n  Saved to: {output_file}")
        print(f"  Final data shape: {combined_sensor_data.shape}")
        print(f"  Columns: {combined_sensor_data.columns.tolist()}")

    print(f"\n{'='*60}")
    print(f"Processing complete!")
    print(f"{'='*60}")

    return sensor_results


In [29]:
# Example usage: generate combined data for a chosen set of sensors.
# Monthly five-minute power exports, keyed by month name.
power_data_paths = dict(
    august='../aug_2025/Area_1/august_fivemin_power.csv',
    september='../sep_2025/Area_1/september_fivemin_power.csv',
    october='../oct_2025/Area_1/october_fivemin_power.csv',
)

# Run the pipeline for sensor 24; output files go to the 'data' directory.
results = generate_combined_data([24], power_data_paths, output_dir='data')

# Print a short per-sensor summary of the returned frames.
for sensor_id, df in results.items():
    first_stamp, last_stamp = df['datetime'].min(), df['datetime'].max()
    print(
        f"\nSensor {sensor_id}:",
        f"  Rows: {len(df)}",
        f"  Time range: {first_stamp} to {last_stamp}",
        f"  Columns: {df.columns.tolist()}",
        sep="\n",
    )


Processing sensors: [24]
Total raw data points: 138

Processing sensor_id: 24
  Resampled data shape: (7546, 5)
  Time range: 2025-08-19 17:10:00 to 2025-10-11 02:40:00

  Processing power data for eu_24...
    august: 2778 data points
    september: 4320 data points
    october: 4464 data points

  Saved to: data\sensor_24_combined_data.xlsx
  Final data shape: (7546, 6)
  Columns: ['datetime', 'upper_temp_C', 'upper_humidity', 'upper_irradiance', 'sensor_id', 'power_W']

Processing complete!

Sensor 24:
  Rows: 7546
  Time range: 2025-08-19 17:10:00 to 2025-10-11 02:40:00
  Columns: ['datetime', 'upper_temp_C', 'upper_humidity', 'upper_irradiance', 'sensor_id', 'power_W']
