# Linear Interpolation for Sensor Data

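This notebook resamples irregularly timestamped sensor temperature readings onto a regular 10-minute grid, using time-based linear interpolation between the original readings to fill in the grid points.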

## Setup

Install required dependencies:
```bash
pip install -r requirements.txt
```

Required packages:
- `pandas`: Data manipulation and Excel file handling
- `openpyxl`: Excel file reading/writing support
- `matplotlib`: Plotting the original and resampled series


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Purpose: load the raw panel dataset, isolate one sensor's temperature series,
# and resample it onto a regular grid by time-weighted linear interpolation
# between the original (irregular) readings.

# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune is named here instead of inlined below.
file_path = 'data/raw/panel_temperature_and_light_raw_dataset_MK_2025_december_3.xlsx'
SENSOR_ID = 1
VALUE_COLUMN = 'upper_temp_C'
GRID_FREQ = '10min'                               # spacing of the target grid
start_time = pd.Timestamp('2025-08-12 17:00:00')  # anchor (earliest point) of the target grid

# --- Load and select ---------------------------------------------------------
# Read the Excel file
df = pd.read_excel(file_path)

# Keep only the chosen sensor's timestamp and temperature columns
sensor_1_data = df.loc[df['sensor_id'] == SENSOR_ID, ['datetime', VALUE_COLUMN]].copy()

# Fail early with a readable message; otherwise an empty selection surfaces
# further down as an opaque NaT/NaN conversion error.
if sensor_1_data.empty:
    raise ValueError(f"No rows found for sensor_id {SENSOR_ID} in {file_path}")

# Convert datetime to datetime type
sensor_1_data['datetime'] = pd.to_datetime(sensor_1_data['datetime'])

# Sort by datetime. A stable sort keeps rows that share a timestamp in file
# order, so keep='first' below deterministically keeps the earliest file row.
sensor_1_data = sensor_1_data.sort_values('datetime', kind='mergesort').reset_index(drop=True)

print("Original data:")
print(f"Data shape: {sensor_1_data.shape}")
print(f"Time range: {sensor_1_data['datetime'].min()} to {sensor_1_data['datetime'].max()}")
print("\nFirst few rows (original timestamps):")
print(sensor_1_data.head(10))

# --- Step 1: Remove duplicate datetimes (keep first occurrence) --------------
duplicates_before = len(sensor_1_data)
sensor_1_data = sensor_1_data.drop_duplicates(subset=['datetime'], keep='first')
duplicates_removed = duplicates_before - len(sensor_1_data)
if duplicates_removed > 0:
    print(f"\nRemoved {duplicates_removed} duplicate datetime entries")

# --- Step 2: Set datetime as index -------------------------------------------
sensor_1_indexed = sensor_1_data.set_index('datetime')

# --- Step 3: Build the regular target grid -----------------------------------
# The grid is anchored at start_time and never begins before it. If the data
# begins later, the grid begins at the first anchor-aligned point that is
# >= the first reading.
end_time = sensor_1_indexed.index.max()
data_start = sensor_1_indexed.index.min()
grid_step = pd.Timedelta(GRID_FREQ)

# Ceiling division done in exact integer arithmetic: -(a // b) == ceil(-a / b).
# Timedelta // Timedelta yields an int, so no float rounding is involved.
# The max(0, ...) clamp covers data that starts before the anchor.
intervals_since_start = max(0, -((start_time - data_start) // grid_step))
resample_start = start_time + intervals_since_start * grid_step

# Create date range with the grid frequency (target grid)
resample_range = pd.date_range(
    start=resample_start,
    end=end_time,
    freq=GRID_FREQ,
)

print("\nCreating 10-minute grid:")
print(f"  Start time: {resample_start}")
print(f"  End time: {end_time}")
print(f"  Number of grid points: {len(resample_range)}")
print(f"  First few grid points: {resample_range[:5].tolist()}")

# --- Step 4: Union the original timestamps (anchors) and the grid (targets) --
original_index = sensor_1_indexed.index
combined_index = original_index.union(resample_range).sort_values()

print("\nCombined index (original + grid):")
print(f"  Original timestamps: {len(original_index)}")
print(f"  Grid timestamps: {len(resample_range)}")
print(f"  Combined (union): {len(combined_index)}")

# --- Step 5: Reindex onto the combined index ---------------------------------
# Grid timestamps that are not original readings become NaN rows here.
sensor_1_combined = sensor_1_indexed.reindex(combined_index)

# --- Step 6: Interpolate on the combined index -------------------------------
# method='time' weights by the actual time gap between neighbouring readings;
# limit_direction='both' also fills NaNs at either end of the series.
missing_before = sensor_1_combined[VALUE_COLUMN].isnull().sum()
print(f"\nMissing values in combined index: {missing_before}")
sensor_1_combined[VALUE_COLUMN] = sensor_1_combined[VALUE_COLUMN].interpolate(
    method='time', limit_direction='both'
)
missing_after = sensor_1_combined[VALUE_COLUMN].isnull().sum()
print(f"Missing values after interpolation: {missing_after}")

# --- Step 7: Keep only the grid timestamps -----------------------------------
sensor_1_resampled = sensor_1_combined.loc[resample_range]

# Name the index explicitly before resetting it, so the resulting column is
# 'datetime' regardless of what name the index carried through the union.
sensor_1_final = sensor_1_resampled.rename_axis('datetime').reset_index()

print("\nResampled data:")
print(f"Data shape: {sensor_1_final.shape}")
print(f"Time range: {sensor_1_final['datetime'].min()} to {sensor_1_final['datetime'].max()}")
print("\nFirst few rows (resampled to 10-minute intervals):")
print(sensor_1_final.head(10))
print(f"\nMissing values after interpolation: {sensor_1_final[VALUE_COLUMN].isnull().sum()}")
print("\nSummary statistics:")
print(sensor_1_final[VALUE_COLUMN].describe())


Original data:
Data shape: (12252, 2)
Time range: 2025-08-12 17:07:56 to 2025-11-05 22:15:33

First few rows (original timestamps):
             datetime  upper_temp_C
0 2025-08-12 17:07:56         35.12
1 2025-08-12 17:17:57         34.68
2 2025-08-12 17:27:58         33.77
3 2025-08-12 17:37:59         31.87
4 2025-08-12 17:48:00         30.74
5 2025-08-12 17:58:01         31.53
6 2025-08-12 18:08:02         31.08
7 2025-08-12 18:18:03         30.18
8 2025-08-12 18:28:04         29.36
9 2025-08-12 18:38:05         29.10

Creating 10-minute grid:
  Start time: 2025-08-12 17:10:00
  End time: 2025-11-05 22:15:33
  Number of grid points: 12271
  First few grid points: [Timestamp('2025-08-12 17:10:00'), Timestamp('2025-08-12 17:20:00'), Timestamp('2025-08-12 17:30:00'), Timestamp('2025-08-12 17:40:00'), Timestamp('2025-08-12 17:50:00')]

Combined index (original + grid):
  Original timestamps: 12252
  Grid timestamps: 12271
  Combined (union): 24502

Missing values in combined index: 127

In [None]:
# Plot original and resampled data on the same axes so the interpolated
# 10-minute series can be checked visually against the raw readings.
fig, ax = plt.subplots(figsize=(16, 8))

# Original readings as scatter points (irregular timestamps)
ax.scatter(
    sensor_1_data['datetime'], sensor_1_data['upper_temp_C'],
    alpha=0.5, s=15, color='red', label='Original Data', marker='o',
)

# Resampled series as a line with small markers at each grid point
ax.plot(
    sensor_1_final['datetime'], sensor_1_final['upper_temp_C'],
    linewidth=1.5, color='blue', label='Resampled (10-min intervals)',
    linestyle='-', marker='.', markersize=3,
)

ax.set_xlabel('Time', fontsize=12)
# Degree sign written as an escape so the label cannot be mangled by a
# file-encoding mismatch (it previously rendered as the mojibake "Â°C").
ax.set_ylabel('Temperature (\u00b0C)', fontsize=12)
ax.set_title('Sensor ID 1: Original vs Resampled Data (10-minute intervals)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

# Format x-axis dates
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(ax.xaxis.get_major_locator()))
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

# Text summary to accompany the figure
print(f"Original data points: {len(sensor_1_data)}")
print(f"Resampled data points: {len(sensor_1_final)}")
print(f"\nOriginal time range: {sensor_1_data['datetime'].min()} to {sensor_1_data['datetime'].max()}")
print(f"Resampled time range: {sensor_1_final['datetime'].min()} to {sensor_1_final['datetime'].max()}")
