In [11]:
"""Step 1: Load the Data and Create a Copy"""
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d


# Read the cleaned historic measurements. The 'UTC' column is parsed as
# datetimes and then promoted to the index of the frame.
file_path = 'Cleaned_Historic_Data.csv'
data = pd.read_csv(file_path, parse_dates=['UTC']).set_index('UTC')

# Keep `data` exactly as loaded; all gap filling below happens on this copy.
data_copy = data.copy()

"""Step 2: Extract the x (time) and y (temperature) values"""
# LOWESS needs a numeric x-axis, so every timestamp of the index is expressed
# as a (float) Julian date.
x = np.array(data_copy.index.to_julian_date())
y = data_copy['Outside temp (0.1 °C)'].values

"""Step 3: Apply LOWESS smoothing"""
# `frac` is the share of the observations used for each local regression;
# smaller values follow the data more closely. Tune it to taste.
# NOTE(review): 1/8760 looks like "one sample out of a year of hourly data" --
# confirm this is the intended amount of smoothing.
frac_value = 1/8760
# missing='drop' removes observations with NaN before fitting, so only the
# observed points contribute to the smoothed curve.
z = lowess(endog=y, exog=x, frac=frac_value, missing='drop')

# Column 0 of the result holds the x values, column 1 the fitted y values.
x_smoothed, y_smoothed = z[:, 0], z[:, 1]

"""Step 4: Interpolate the NaN values using the smoothed data"""
# Cubic interpolant through the smoothed points; 'extrapolate' lets it be
# evaluated outside the range covered by the smoothed x values as well.
f = interp1d(
    x_smoothed,
    y_smoothed,
    kind='cubic',
    fill_value='extrapolate',
)

# Boolean mask of the rows whose temperature is missing, then fill exactly
# those rows with the interpolant evaluated at their Julian dates.
missing_indices = data_copy['Outside temp (0.1 °C)'].isna()
missing_julian_dates = data_copy.index[missing_indices].to_julian_date()
data_copy.loc[missing_indices, 'Outside temp (0.1 °C)'] = f(missing_julian_dates)

"""Step 5: Visualization"""
fig = go.Figure()

# Measured series as loaded (gaps included).
fig.add_trace(go.Scatter(
    x=data.index,
    y=data['Outside temp (0.1 °C)'],
    mode='lines',
    name='Original Data',
))
# Same series after the gaps were filled from the smoothed curve.
fig.add_trace(go.Scatter(
    x=data_copy.index,
    y=data_copy['Outside temp (0.1 °C)'],
    mode='lines',
    name='Interpolated Data',
    line=dict(dash='dot'),
))

# Mark every timestamp that is missing in `data` with a vertical red line that
# spans the observed temperature range. The range is identical for every
# marker, so it is computed once instead of on each loop iteration.
y_low = data['Outside temp (0.1 °C)'].min()
y_high = data['Outside temp (0.1 °C)'].max()
nan_indices = data[data['Outside temp (0.1 °C)'].isna()].index
for idx in nan_indices:
    fig.add_shape(
        type="line",
        x0=idx,
        x1=idx,
        y0=y_low,
        y1=y_high,
        line=dict(color="Red", width=2, dash="dashdot"),
    )

fig.update_layout(
    title='Original vs. Interpolated Data',
    xaxis_title='Time',
    yaxis_title='Outside temp (0.1 °C)',
)
fig.show()


# Write the gap-filled copy to 'Interpolated_data.csv'. This runs every time
# the cell is executed; comment the line out if the file should not be
# (re)written.
data_copy.to_csv('Interpolated_data.csv')



In [10]:
# Copy the interpolated temperatures from `data_copy` into the rows of `data`
# that are still NaN. The mask is built once and used on both sides so the
# selection and the assignment are guaranteed to address the same rows.
still_missing = data['Outside temp (0.1 °C)'].isna()
data.loc[still_missing, 'Outside temp (0.1 °C)'] = data_copy.loc[still_missing, 'Outside temp (0.1 °C)']

# Actually confirm that no NaNs remain in `data` (previously the cell only
# printed the filled values and never checked the column).
remaining_nans = int(data['Outside temp (0.1 °C)'].isna().sum())
assert remaining_nans == 0, f"{remaining_nans} NaN value(s) left in 'Outside temp (0.1 °C)'"

# Show the values now stored at the timestamps that were originally missing
# (`missing_indices` is the mask computed in the interpolation cell).
print(data.loc[missing_indices, 'Outside temp (0.1 °C)'])


UTC
2021-01-09 09:00:00    -1.175251
2021-01-14 22:00:00    -3.948986
2021-01-15 22:00:00    -2.013131
2021-01-16 14:00:00     1.341144
2021-01-30 22:00:00     1.838177
2021-02-06 19:00:00     1.995189
2021-03-02 05:00:00     1.245965
2021-03-05 04:00:00     0.125965
2021-03-19 04:00:00    -2.726352
2021-03-19 05:00:00    -1.271531
2021-04-06 23:00:00     2.036400
2021-04-07 06:00:00     2.552184
2021-04-08 04:00:00     4.530169
2021-04-18 04:00:00    12.073075
2021-04-23 01:00:00    -2.568387
2021-05-02 23:00:00     5.060997
2021-05-03 01:00:00     2.259952
2021-12-11 00:00:00     7.351396
2021-12-23 08:00:00     0.159568
2021-12-25 10:00:00    -0.196462
Name: Outside temp (0.1 °C), dtype: float64
