## LOESS with Tricubic Weighting
- **Span:** Approximately 20% of data.
- **Tricubic vs Weight Variables:** The tricubic weight is distinct from weight variables such as Exposure: it is determined solely by each point's distance from the point being fitted. Both an Exposure weight and the tricubic weight can be applied simultaneously.

## Validation Error
- After fitting local regressions with spans ranging from 1% to 20% of the data, we evaluate each fit on a hold-out (validation) dataset.
- We compute the Root Mean Squared Error (RMSE) on this validation dataset.
- The span yielding the smallest RMSE is deemed the optimal span.


In [None]:
# Step 1: Load the Data and Create a Copy
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Define the compute_rmse function to calculate RMSE between observed and predicted values.
def compute_rmse(observed, predicted):
    """
    Return the root-mean-square error (RMSE) of predicted values against observed values.

    Parameters:
    - observed: Reference values (Pandas Series or array-like)
    - predicted: Estimated values scored against the reference (Pandas Series or array-like)

    Returns:
    - Square root of the value returned by mean_squared_error(observed, predicted)
    """
    mse = mean_squared_error(observed, predicted)
    return np.sqrt(mse)

In [None]:
# Read the historic dataset, parse the 'UTC' column as datetimes and use it as the index.
file_path = 'Cleaned_Historic_Data.csv'
data = pd.read_csv(file_path, parse_dates=['UTC']).set_index('UTC')

# Work on an independent copy so that `data` keeps the values exactly as loaded.
data_copy = data.copy()

# Define the shift_fill_na function, which repeatedly shifts a series forward until no NaN remains.
def shift_fill_na(series):
    """
    Fill the NaNs of a series by repeatedly shifting its values one position forward.

    Each pass shifts the series by one position and then fills every position
    that is NaN after the shift with the value that follows it in the shifted
    series (i.e. the value that position held before the shift). Passes repeat
    until the series contains no NaN. Note that non-NaN values move as well:
    the returned series is not the input with only its gaps patched.

    Parameters:
    - series: Pandas Series to fill. The input object is not modified in place
      by the passes; a series without NaN is returned as-is.

    Returns:
    - Pandas Series with the same index and no NaN values.

    Raises:
    - ValueError: if the first value is NaN. A pass never changes the first
      position, so such a series can never become NaN-free and the loop
      would otherwise run forever.
    """
    if not series.empty and pd.isna(series.iloc[0]):
        raise ValueError(
            "shift_fill_na cannot fill a series whose first value is NaN: "
            "there is no earlier value to shift into that position."
        )

    while series.isna().any():
        # Move every value one step forward; position 0 becomes NaN.
        series = series.shift(fill_value=np.nan)
        mask = series.isna()
        # Positions left empty by the shift take back the value that now sits
        # one step after them.
        series[mask] = series.shift(-1)[mask]
    return series

# Column holding the outside-temperature readings used throughout this cell.
temp_col = 'Outside temp (0.1 °C)'


def _add_nan_markers(figure, positions, y_low, y_high, line_width):
    """Draw a red dash-dot vertical line from y_low to y_high on `figure` at each x in `positions`."""
    for marker_x in positions:
        figure.add_shape(
            go.layout.Shape(
                type="line",
                x0=marker_x,
                x1=marker_x,
                y0=y_low,
                y1=y_high,
                line=dict(color="Red", width=line_width, dash="dashdot")
            )
        )


# Step 1.1: Shift the Data Points to Fill NaN
data_copy[temp_col] = shift_fill_na(data_copy[temp_col])
test_dataset = data_copy.copy()

# Visualization of the data after shifting to fill NaN values.
fig = go.Figure()
fig.add_trace(go.Scatter(x=data_copy.index, y=data_copy[temp_col], mode='lines+markers', name='Time Series'))

# Identify the indices where the value was NaN in the original data.
nan_indices = data[data[temp_col].isna()].index

# The y-range is the same for every marker, so compute it once instead of per marker.
_add_nan_markers(fig, nan_indices, data_copy[temp_col].min(), data_copy[temp_col].max(), line_width=2)

# Save the (shift-filled) values at the NaN indices for later comparison.
# This must happen before the values are blanked out below.
original_values = data_copy.loc[nan_indices, temp_col]

fig.update_layout(title='Visualization After Shifting to Fill NaN',
                  xaxis_title='Time',
                  yaxis_title='Outside temp (0.1 °C)')
fig.show()

# Step 1.2: Induce Missingness Based on Observed Pattern
# Every label in nan_indices comes from `data`, whose index data_copy shares,
# so all of them can be set to NaN in a single assignment.
data_copy.loc[nan_indices, temp_col] = np.nan

# Visualization of the data after inducing missingness.
fig = go.Figure()
fig.add_trace(go.Scatter(x=data_copy.index, y=data_copy[temp_col], mode='lines+markers', name='Time Series'))
_add_nan_markers(fig, nan_indices, data_copy[temp_col].min(), data_copy[temp_col].max(), line_width=0.1)
fig.update_layout(title='Visualization After Inducing Missingness',
                  xaxis_title='Time',
                  yaxis_title='Outside temp (0.1 °C)')
fig.show()

In [None]:
# LOWESS Smoothing with Multiple frac Values

import statsmodels.api as sm
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Extract the x (time, as Julian dates) and y (temperature) values.
x = np.array(data_copy.index.to_julian_date())
y = data_copy['Outside temp (0.1 °C)'].values

# Generate a list of frac values.
# NOTE(review): the endpoints equal 3/8760 and 5/8760 — presumably 3 to 5 hourly
# samples out of one year of data; confirm against the dataset length.
num_frac_values = 100
frac_values = np.linspace(0.000342465753425, 0.000570776255708, num_frac_values)

# Apply LOWESS smoothing for each frac value and store the results keyed by frac.
# missing='drop' makes LOWESS skip the rows whose temperature is NaN.
lowess = sm.nonparametric.lowess
lowess_results = {}

for frac in frac_values:
    z = lowess(y, x, frac=frac, missing='drop')
    x_smoothed, y_smoothed = z[:, 0], z[:, 1]
    lowess_results[frac] = {
        'x_smoothed': x_smoothed,
        'y_smoothed': y_smoothed
    }

# Visualization of the LOWESS smoothed data for each frac value.
fig = go.Figure()

# Original data.
fig.add_trace(go.Scatter(x=data_copy.index, y=data_copy['Outside temp (0.1 °C)'], mode='lines+markers', name='Original Time Series', line=dict(color="blue")))

# One viridis colour per frac value, converted to the 'rgba(r,g,b,a)' strings plotly expects.
colors = plt.get_cmap('viridis')(np.linspace(0, 1, num_frac_values))
colors = [f'rgba({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)},{c[3]})' for c in colors]

for frac, color in zip(frac_values, colors):
    # Convert the smoothed Julian dates back to timestamps for the time axis.
    x_smoothed_dates = pd.to_datetime(lowess_results[frac]['x_smoothed'], origin='julian', unit='D')
    # Six decimals are needed in the label: every frac is below 0.001, so two
    # decimals would render all traces as "frac=0.00".
    fig.add_trace(go.Scatter(x=x_smoothed_dates, y=lowess_results[frac]['y_smoothed'], mode='lines', name=f'LOWESS Smoothed (frac={frac:.6f})', line=dict(color=color)))

# Vertical extent of the NaN markers; identical for every marker, so computed once.
y_min = data_copy['Outside temp (0.1 °C)'].min()
y_max = data_copy['Outside temp (0.1 °C)'].max()

# Highlighting NaNs with bright red lines.
for idx in nan_indices:
    fig.add_shape(
        go.layout.Shape(
            type="line",
            x0=idx,
            x1=idx,
            y0=y_min,
            y1=y_max,
            line=dict(color="rgb(255,0,0)", width=1, dash="dashdot")  # Bright red color
        )
    )

fig.update_layout(title='Visualization with LOWESS Smoothing (Multiple frac values)',
                  xaxis_title='Time',
                  yaxis_title='Outside temp (0.1 °C)')
fig.show()

In [None]:
# Calculate RMSE Values for Interpolated LOWESS Data

from scipy.interpolate import interp1d

# Map every timestamp in nan_indices to its row position within data_copy.
positional_indices = list(map(data_copy.index.get_loc, nan_indices))

# RMSE obtained for each frac value, keyed by frac.
rmse_values = {}

for frac in frac_values:
    smoothed = lowess_results[frac]

    # Cubic interpolant through the LOWESS output for this frac; it extrapolates
    # outside the smoothed x-range.
    f = interp1d(
        smoothed['x_smoothed'],
        smoothed['y_smoothed'],
        kind='cubic',
        fill_value='extrapolate',
    )

    # Evaluate the interpolant at every time point, then keep only the rows
    # where missingness was induced.
    entire_lowess_smoothed = f(x)
    interpolated_values_at_nan_indices = entire_lowess_smoothed[positional_indices]

    # Score the interpolated values against the values saved before blanking.
    rmse = compute_rmse(original_values, interpolated_values_at_nan_indices)
    rmse_values[frac] = rmse

# Report the RMSE obtained for each frac value.
for frac, rmse in rmse_values.items():
    print(f"frac={frac:.30f}, RMSE: {rmse:.70f}")