# Linear Interpolation for Sensor Data

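This notebook fills missing values in the `upper_temp_C`, `upper_humidity`, and `upper_irradiance(μW/cm²)` columns of the raw sensor dataset using linear interpolation, applied separately to each `sensor_id`, and saves the result to `data/interpolated_data.xlsx`.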

## Setup

Install required dependencies:
```bash
pip install -r requirements.txt
```

Required packages:
- `pandas`: Data manipulation and Excel file handling
- `openpyxl`: Excel file reading/writing support


In [None]:
import pandas as pd

# Load the raw sensor workbook into a DataFrame and show a quick overview
# (dimensions, column names, and the first rows) to confirm the read worked.
file_path = 'data/raw/panel_temperature_and_light_raw_dataset_MK_2025_december_3.xlsx'
df = pd.read_excel(file_path)

print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()


In [None]:
# Parse the 'datetime' column into real timestamps and order the rows by
# sensor first, then chronologically within each sensor.
df = (
    df
    .assign(datetime=pd.to_datetime(df['datetime']))
    .sort_values(by=['sensor_id', 'datetime'])
)

# Measurement columns whose gaps will be filled in the next step.
columns_to_interpolate = [
    'upper_temp_C',
    'upper_humidity',
    'upper_irradiance(μW/cm²)',
]

print(
    f"Columns to interpolate: {columns_to_interpolate}",
    "\nMissing values before interpolation:",
    sep="\n",
)
# Per-column count of missing cells, as a baseline to compare against later.
print(df[columns_to_interpolate].isna().sum())


In [None]:
# Fill missing readings by linear interpolation, one sensor at a time, so that
# values from one sensor are never used to fill gaps in another.
#
# The interpolation is time-weighted (method='time'): a missing value is placed
# on the straight line between its neighbours according to the elapsed time
# between the timestamps. pandas' method='linear' ignores the index and treats
# rows as equally spaced, which gives wrong weights whenever readings are not
# evenly spaced in time. For evenly spaced data both methods agree.
#
# limit_direction='both' also fills gaps at the start/end of a sensor's series.
interpolated_data = []

for sensor_id, group in df.groupby('sensor_id'):
    # Time-weighted interpolation needs a chronologically ordered DatetimeIndex.
    # sort_values/set_index return new frames, so `df` itself is not modified.
    group_indexed = group.sort_values('datetime').set_index('datetime')

    # pandas cannot interpolate against an index containing missing timestamps;
    # fail early with a message that names the offending sensor.
    if group_indexed.index.isna().any():
        raise ValueError(
            f"Sensor {sensor_id!r} has rows with a missing 'datetime'; "
            "time-weighted interpolation requires a timestamp on every row."
        )

    # Interpolate each requested column that is present in the data
    for col in columns_to_interpolate:
        if col in group_indexed.columns:
            group_indexed[col] = group_indexed[col].interpolate(method='time', limit_direction='both')

    # Reset index to get datetime back as a regular column
    interpolated_data.append(group_indexed.reset_index())

# Combine all interpolated groups into one frame with a fresh 0..n-1 index
df_interpolated = pd.concat(interpolated_data, ignore_index=True)

print(f"Interpolated dataset shape: {df_interpolated.shape}")
print(f"\nMissing values after interpolation:")
print(df_interpolated[columns_to_interpolate].isnull().sum())


In [None]:
# Build the export table in one chain:
#   1. keep the datetime/area/eu/sensor_id columns plus the three measurements,
#   2. rename the irradiance column to a plain name (the source name carries a
#      unit suffix with special characters),
#   3. order rows by datetime, area, eu, sensor_id and renumber the index.
output_df = (
    df_interpolated
    .loc[:, ['datetime', 'area', 'eu', 'sensor_id', 'upper_temp_C', 'upper_humidity', 'upper_irradiance(μW/cm²)']]
    .rename(columns={'upper_irradiance(μW/cm²)': 'upper_irradiance'})
    .sort_values(by=['datetime', 'area', 'eu', 'sensor_id'])
    .reset_index(drop=True)
)

print(
    f"Output dataset shape: {output_df.shape}",
    f"\nOutput columns: {output_df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
output_df.head(10)


In [None]:
# Write the output table to a new workbook (without the row index), then
# report the destination and print descriptive statistics for the three
# measurement columns.
output_file = 'data/interpolated_data.xlsx'
output_df.to_excel(output_file, index=False)

print(
    f"Interpolated data saved to: {output_file}",
    "\nSummary statistics:",
    sep="\n",
)
print(output_df.loc[:, ['upper_temp_C', 'upper_humidity', 'upper_irradiance']].describe())
