In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
if '../source' not in sys.path: sys.path.insert(0, '../source')
from utils import *

## Import data 
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [453]:
df = pd.read_csv('../saves/meteo_p_dc_22_init_impute.csv')
df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S')
df.set_index('Time', inplace=True)
df.head()

Unnamed: 0_level_0,GTI,GHI,DNI,DHI,Air_Temp,RH,Pressure,Wind_speed,Wind_dir,Wind_gust,Rain,P_DC,Imputation
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-01-01 00:00:00,0.0,0.0,0.0,0.0,10.170313,77.075,969.5,0.0,0.0,0.0,0.0,0.0,
2022-01-01 00:10:00,0.0,0.0,0.0,0.0,10.05,78.35,969.5,0.0,0.0,0.0,0.0,0.0,
2022-01-01 00:20:00,0.0,0.0,0.0,0.0,9.73125,79.0375,969.5,0.0,0.0,0.0,0.0,0.0,
2022-01-01 00:30:00,0.0,0.0,0.0,0.0,9.560938,80.275,969.5,0.0,0.0,0.0,0.0,0.0,
2022-01-01 00:40:00,0.0,0.0,0.0,0.0,9.790625,79.6625,969.5,0.0,0.0,0.0,0.0,0.0,


## Data preparation 
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [13]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# Select relevant feautures Convert the DataFrame to a numpy array
data_array = df.drop(columns=['Imputation'])[['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC']]
data_array = data_array.values

# Normalize the data
scaler = StandardScaler()
data_array_scaled = scaler.fit_transform(data_array)

# Split data into input and target
X_scaled = data_array_scaled   # Features
y_scaled = data_array_scaled[:, -1]  # Target variable

# Choose a historical window size for the input sequences
window_size = 24*6  # Example window size 1 day (assuming data is sampled at 10-minute intervals) 

# Create input sequences and corresponding targets
X_sequences = []
y_targets = []
for i in range(0, len(X_scaled) - window_size, window_size//24):
    X_sequence = X_scaled[i : i + window_size]
    y_target = y_scaled[i + window_size]
    if np.isnan(X_sequence).any() or np.isnan(y_target).any():
        continue
    X_sequences.append(X_sequence)
    y_targets.append(y_target)

X_sequences = np.array(X_sequences)
y_targets = np.array(y_targets)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_targets, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=True)

# Move data to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# X_train, X_val, y_train, y_val = [t.to(device) for t in (X_train, X_val, y_train, y_val)]

# Create PyTorch DataLoader for training
batch_size = 256

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32, device='cpu'), torch.tensor(y_train, dtype=torch.float32, device='cpu'))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32, device='cpu'), torch.tensor(y_val, dtype=torch.float32, device='cpu'))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32, device='cpu'), torch.tensor(y_test, dtype=torch.float32, device='cpu'))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [242]:
from joblib import dump, load
dump(scaler, '../saves/std_scaler.bin', compress=True)

['saves/std_scaler.bin']

In [14]:
X_train.shape

(6300, 144, 7)

## Model defenition 
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [18]:
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau


class LSTMForecasting(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMForecasting, self).__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.3, bidirectional=False)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.dropout = nn.Dropout(p=0.2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size//2, output_size)
        
    def forward(self, x):
        out, _ = self.lstm1(x)
        out = self.fc1(out[:, -1, :])  # Use only the last time step's output
        # out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Hyperparameters
input_size = X_train.shape[2]  # Number of variables
hidden_size = 64
num_layers = 2
output_size = 1  # Forecasted value

# Create the model
model = LSTMForecasting(input_size, hidden_size, num_layers, output_size)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Define the ReduceLROnPlateau scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, threshold=1e-4, min_lr=1e-5, verbose=True)

model.to(device)

LSTMForecasting(
  (lstm1): LSTM(7, 64, num_layers=2, batch_first=True, dropout=0.3)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)

## Model training
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [1565]:
train_losses, val_losses = [], []

# Training loop
num_epochs = 100
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))  # Expand targets to match output shape
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Calculate validation loss
    model.eval()
    val_loss = 0.
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets.unsqueeze(1))
            val_loss += loss.item()
    
    average_train_loss = train_loss / len(train_loader)
    average_val_loss = val_loss / len(val_loader)
    
    train_losses.append(average_train_loss)
    val_losses.append(average_val_loss)
    
    # Update the learning rate based on validation loss
    scheduler.step(average_val_loss)
    
    # Save the best model
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        best_epoch = epoch
        model_scripted = torch.jit.script(model) # Export to TorchScript
        model_scripted.save('../saves/best_model_scripted.pt') # Save
        
    if epoch == 0 or (epoch+1) % 10 == 0:
        print(f'## Epoch [{epoch+1}/{num_epochs}], \t Train loss: {average_train_loss:.6f} \t Val loss: {average_val_loss:.6f}')
        print(f'Current best model checkpoint at epoch {best_epoch+1} \t Val loss: {best_val_loss:.6f}')

## Epoch [1/100], 	 Train loss: 0.011420 	 Val loss: 0.021396
Current best model checkpoint at epoch 94 	 Val loss: 0.020994
## Epoch [10/100], 	 Train loss: 0.011272 	 Val loss: 0.020863
Current best model checkpoint at epoch 2 	 Val loss: 0.020755
## Epoch [20/100], 	 Train loss: 0.011240 	 Val loss: 0.020805
Current best model checkpoint at epoch 12 	 Val loss: 0.020648
## Epoch [30/100], 	 Train loss: 0.011277 	 Val loss: 0.021475
Current best model checkpoint at epoch 29 	 Val loss: 0.020435
Epoch 00140: reducing learning rate of group 0 to 6.2500e-05.
## Epoch [40/100], 	 Train loss: 0.010858 	 Val loss: 0.021389
Current best model checkpoint at epoch 29 	 Val loss: 0.020435
## Epoch [50/100], 	 Train loss: 0.011035 	 Val loss: 0.021256
Current best model checkpoint at epoch 29 	 Val loss: 0.020435
Epoch 00151: reducing learning rate of group 0 to 3.1250e-05.
## Epoch [60/100], 	 Train loss: 0.010621 	 Val loss: 0.020589
Current best model checkpoint at epoch 29 	 Val loss: 0.020

In [1605]:
fig = go.Figure()
fig.add_scatter(y=train_losses[:], name='Training loss', mode='lines')
fig.add_scatter(y=val_losses[:], name='Validation loss', mode='lines')
fig.update_layout(title="Loss curves LSTM - 100 epochs", width=1000, height=500, xaxis_title="epoch", yaxis_title="MSE loss")
fig.show()

## Evaluation 
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [20]:
# Load the best model checkpoint
best_model = torch.jit.load('../saves/best_model_scripted.pt')
best_model.to('cpu')

RecursiveScriptModule(
  original_name=LSTMForecasting
  (lstm1): RecursiveScriptModule(original_name=LSTM)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (dropout): RecursiveScriptModule(original_name=Dropout)
  (relu): RecursiveScriptModule(original_name=ReLU)
  (fc2): RecursiveScriptModule(original_name=Linear)
)

In [None]:
# load scaler
scaler = load('../saves/std_scaler.bin')

In [21]:
# Evaluate the model on the test set
best_model.eval()
with torch.no_grad():
    test_loss = 0
    for inputs, targets in test_loader:
        outputs = best_model(inputs)
        test_loss += criterion(outputs, targets.unsqueeze(1))
    average_test_loss = test_loss / len(test_loader)
    print(f'Average Test Loss: {average_test_loss.item():.4f}')

Average Test Loss: 0.0239


In [1386]:
len(train_dataset), len(val_dataset), len(test_dataset)

(37791, 4199, 7410)

In [1573]:
# Evaluate the model on the test set using different metrics and with unnormalized values

y_pred = []
y_true = []

best_model.eval()
with torch.no_grad():
    test_loss = 0
    for inputs, targets in zip(X_test, y_test):
        inputs = torch.from_numpy(inputs).float().unsqueeze(0) #.to(device)
        outputs = best_model(inputs).cpu().numpy().item()
        y_true.append(targets)
        y_pred.append(outputs)

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

In [40]:
def inv_transform_outputs(outputs):
    return [out * np.sqrt(scaler.var_[-1]) + scaler.mean_[-1] for out in outputs]

In [1575]:
y_true = inv_transform_outputs(y_true)
y_pred = inv_transform_outputs(y_pred)

In [1186]:
print('MSE: %.3f' % mean_squared_error(y_true, y_pred))
print('RMSE: %.3f' % mean_squared_error(y_true, y_pred, squared=False))
print('MAE: %.3f' % mean_absolute_error(y_true, y_pred))
print('R²: %.3f' % r2_score(y_true, y_pred))

Modle without historic P_DC data
MSE: 148985.210
RMSE: 385.986
MAE: 143.728
R²: 0.975


In [1663]:
gap_lengths = [24*i for i in range(1, 7)]

evaluation_metrics = {
    'mse' : mean_squared_error,
    'rmse': lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False),
    'mae' : mean_absolute_error,
    'r2' : r2_score
}

num_trials=500
avg_results = {}

for gap_len in gap_lengths:
    avg_results[gap_len] = {metric:[] for metric in evaluation_metrics.keys()}
    for n in range(num_trials):
        gap_start = np.random.randint(int(.85*df_imputed.shape[0]) + window_size, df_imputed.shape[0] - gap_len + 1)
        gap_end = gap_start + gap_len
        if df_imputed[['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC']].iloc[gap_start-window_size:gap_end].isna().values.any():
            n = n - 1
            continue
        real_values = df_imputed['P_DC'].iloc[gap_start:gap_end].values
        imputations = []
        for i in range(gap_len):
            window = df_imputed[['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC'
                                ]].iloc[gap_start-window_size+i:gap_start+i].values.copy()
            if imputations: window[-len(imputations):, -1] = imputations
            window = scaler.fit_transform(window)
            window = torch.from_numpy(window).float().unsqueeze(0)
            out = best_model(window).detach().numpy().item()
            imputations.append(inv_transform_outputs([out])[0])
            
        for metric, metric_func in evaluation_metrics.items():
            avg_results[gap_len][metric].append(metric_func(real_values, imputations))
    avg_results[gap_len] = {metric:np.mean(mesures) for metric, mesures in avg_results[gap_len].items()}        

In [1664]:
avg_results

{24: {'mse': 277936.13312890154,
  'rmse': 357.4361715069021,
  'mae': 297.0698980103132,
  'r2': -6288.121166161875},
 48: {'mse': 330550.70420214446,
  'rmse': 432.07073518563277,
  'mae': 331.95050661074555,
  'r2': -5369.526559751802},
 72: {'mse': 392548.5991999691,
  'rmse': 503.08726106599437,
  'mae': 360.0702904242969,
  'r2': -867.9016863835411},
 96: {'mse': 393844.5600438433,
  'rmse': 522.9008910832949,
  'mae': 358.0882130940446,
  'r2': 0.5364519373424771},
 120: {'mse': 446494.7142693554,
  'rmse': 574.8183242490602,
  'mae': 385.1856632606733,
  'r2': 0.7822061825790464},
 144: {'mse': 431539.1882596574,
  'rmse': 572.4588955274857,
  'mae': 385.9389096506077,
  'r2': 0.8142763420444679}}

## Filling gaps in the original data
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [36]:
best_model = torch.jit.load('../saves/best_model_scripted.pt')
best_model.to('cpu')
best_model.eval()

RecursiveScriptModule(
  original_name=LSTMForecasting
  (lstm1): RecursiveScriptModule(original_name=LSTM)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (dropout): RecursiveScriptModule(original_name=Dropout)
  (relu): RecursiveScriptModule(original_name=ReLU)
  (fc2): RecursiveScriptModule(original_name=Linear)
)

In [307]:
gaps = find_gaps(df.P_DC)
print('%d remaining gaps' % len(gaps))
gaps_len = [(gap[1] - gap[0]).seconds // 60 // 10 // 6 for gap in gaps]
print('Longest remaining gap %dh' % max(gaps_len))

18 remaining gaps
Longest remaining gap 13h


In [23]:
df.isna().sum()

GTI               0
GHI               0
DNI               0
DHI               0
Air_Temp          0
RH                0
Pressure          0
Wind_speed        0
Wind_dir          0
Wind_gust         0
Rain              0
P_DC            665
Imputation    48625
dtype: int64

In [38]:
df_imputed = df.copy(deep=True)
df_imputed['P_DC_LSTM'] = df_imputed['P_DC'].copy(deep=True)
df_imputed['Imputation_LSTM'] = df_imputed['Imputation'].copy(deep=True)

In [41]:
best_model.eval()

# Impute remaining gaps
with torch.no_grad():
    for gap in gaps:
        for i in range(df_imputed.loc[gap[0]:gap[1], 'P_DC_LSTM'].shape[0]):
            window = df_imputed.loc[df_imputed.index < gap[0] + pd.Timedelta(minutes=i*10), 
                        ['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC_LSTM']].tail(window_size).values
            window = scaler.fit_transform(window)
            window = torch.from_numpy(window).float().unsqueeze(0)
            forecast = best_model(window).cpu().numpy().item()
            df_imputed.loc[gap[0] + pd.Timedelta(minutes=i*10), 'P_DC_LSTM'] = inv_transform_outputs([forecast])[0]
            df_imputed.loc[gap[0] + pd.Timedelta(minutes=i*10), 'Imputation_LSTM'] = 3

In [42]:
df_imputed.isna().sum()

GTI                    0
GHI                    0
DNI                    0
DHI                    0
Air_Temp               0
RH                     0
Pressure               0
Wind_speed             0
Wind_dir               0
Wind_gust              0
Rain                   0
P_DC                 665
Imputation         48625
P_DC_LSTM              0
Imputation_LSTM    47960
dtype: int64

In [43]:
df_imputed.to_csv('../saves/df_meteo_p_dc_imputed_lstm.csv')

## Visualization of gap filling results
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [317]:
fig = make_subplots(rows=3, cols=2, shared_yaxes=True)

k = 10
for i in range(6):    
    start = gaps[i+k][0] - pd.Timedelta(hours=10)
    end = gaps[i+k][1] + pd.Timedelta(hours=10)
    fig.add_trace(go.Scatter(y=df_imputed.loc[start:end, 'GTI'],
                    mode='lines', line=dict(width=1), marker=dict(color='green', size=2)), row=1+i//2, col=1+i%2)
    fig.add_trace(go.Scatter(y=df_imputed.loc[start:end, 'P_DC_LSTM'],
                    mode='lines', marker=dict(color='red')), row=1+i//2, col=1+i%2)
    fig.add_trace(go.Scatter(y=df_imputed.loc[start:end, 'P_DC'],
                    mode='lines', marker=dict(color='LightSkyBlue')), row=1+i//2, col=1+i%2)

fig.update_layout(height=700, width=1100, showlegend=False,
                  title_text="Imputation on test samples")
fig.show()

In [304]:
sample = df_imputed.loc['2022-11-03 00:10:00':'2022-11-08 00:10:00'].copy(deep=True)
sample['P_DC_with_gaps'] = sample['P_DC'].copy(deep=True)
sample['Imputations'] = np.nan
sample.loc['2022-11-04 05:00:00':'2022-11-04 09:00:00', 'P_DC_with_gaps'] = np.nan
sample.loc['2022-11-06 05:30:00':'2022-11-06 18:30:00', 'P_DC_with_gaps'] = np.nan
sample.loc['2022-11-07 13:00:00':'2022-11-07 19:00:00', 'P_DC_with_gaps'] = np.nan

gaps = find_gaps(sample['P_DC_with_gaps'])
best_model.eval()

# Impute remaining gaps
with torch.no_grad():
    for gap in gaps:
        for i in range(sample.loc[gap[0]:gap[1], 'P_DC_with_gaps'].shape[0]):
            window = sample.loc[sample.index < gap[0] + pd.Timedelta(minutes=i*10), 
                        ['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC_with_gaps']].tail(window_size).values
            window = scaler.fit_transform(window)
            window = torch.from_numpy(window).float().unsqueeze(0)
            forecast = best_model(window).cpu().numpy().item()
            sample.loc[gap[0] + pd.Timedelta(minutes=i*10), 'P_DC_with_gaps'] = inv_transform_outputs([forecast])[0]      
            sample.loc[gap[0] + pd.Timedelta(minutes=i*10), 'Imputations'] = inv_transform_outputs([forecast])[0] 

fig = go.Figure()
fig.add_scatter(y=sample['P_DC'], name='Real values', mode='lines', line=dict(color='gray', width=1))
fig.add_scatter(y=sample['P_DC_with_gaps'], name='Data with gaps', mode='lines', line=dict(color='blue', width=1.5))
fig.add_scatter(y=sample['Imputations'], name='Imputations', mode='lines', line=dict(color='red', width=1.5))
fig.update_layout(title="Gaps filling Sample", width=1200, height=400)
fig.show()

# Example real values and predictions (replace with your data)
real_values = sample['P_DC']
predictions = sample['P_DC_with_gaps']

# Create a scatter plot with real values on the x-axis and predictions on the y-axis
fig = px.scatter(x=real_values, y=predictions, labels={'x': 'Real Values', 'y': 'Predictions'})

# Add a diagonal line for reference
fig.add_shape(type='line', x0=min(real_values), y0=min(predictions), x1=max(real_values), y1=max(predictions),
              line=dict(color='red', width=2))

# Customize the layout
fig.update_layout(
    title='Correlation between Sample Real data and Imputd data',
    width=1100, height=400,
    xaxis_title='Real Values',
    yaxis_title='Predictions',
    template='plotly_white'
)

# Show the plot
fig.show()

In [303]:
sample = df_imputed.loc['2022-12-03 00:10:00':'2022-12-08 00:10:00'].copy(deep=True)
sample['P_DC_with_gaps'] = sample['P_DC'].copy(deep=True)
sample['Imputations'] = np.nan
sample.loc['2022-12-04 05:30:00':'2022-12-04 18:30:00', 'P_DC_with_gaps'] = np.nan
sample.loc['2022-12-05 16:00:00':'2022-12-05 20:00:00', 'P_DC_with_gaps'] = np.nan
sample.loc['2022-12-06 07:00:00':'2022-12-06 10:30:00', 'P_DC_with_gaps'] = np.nan
sample.loc['2022-12-07 13:00:00':'2022-12-07 19:00:00', 'P_DC_with_gaps'] = np.nan

gaps = find_gaps(sample['P_DC_with_gaps'])
best_model.eval()

# Impute remaining gaps
with torch.no_grad():
    for gap in gaps:
        for i in range(sample.loc[gap[0]:gap[1], 'P_DC_with_gaps'].shape[0]):
            window = sample.loc[sample.index < gap[0] + pd.Timedelta(minutes=i*10), 
                        ['GTI', 'GHI', 'DNI', 'DHI', 'Air_Temp', 'RH', 'P_DC_with_gaps']].tail(window_size).values
            window = scaler.fit_transform(window)
            window = torch.from_numpy(window).float().unsqueeze(0)
            forecast = best_model(window).cpu().numpy().item()
            sample.loc[gap[0] + pd.Timedelta(minutes=i*10), 'P_DC_with_gaps'] = inv_transform_outputs([forecast])[0]      
            sample.loc[gap[0] + pd.Timedelta(minutes=i*10), 'Imputations'] = inv_transform_outputs([forecast])[0] 

fig = go.Figure()
fig.add_scatter(y=sample['P_DC'], name='Real values', mode='lines', line=dict(color='gray', width=1))
fig.add_scatter(y=sample['P_DC_with_gaps'], name='Data with gaps', mode='lines', line=dict(color='blue', width=1.5))
fig.add_scatter(y=sample['Imputations'], name='Imputations', mode='lines', line=dict(color='red', width=1.5))
fig.update_layout(title="Gaps filling Sample", width=1200, height=400)
fig.show()

# Example real values and predictions (replace with your data)
real_values = sample['P_DC']
predictions = sample['P_DC_with_gaps']

# Create a scatter plot with real values on the x-axis and predictions on the y-axis
fig = px.scatter(x=real_values, y=predictions, labels={'x': 'Real Values', 'y': 'Predictions'})

# Add a diagonal line for reference
fig.add_shape(type='line', x0=min(real_values), y0=min(predictions), x1=max(real_values), y1=max(predictions),
              line=dict(color='red', width=2))

# Customize the layout
fig.update_layout(
    title='Correlation between Sample Real data and Imputd data',
    width=1100, height=400,
    xaxis_title='Real Values',
    yaxis_title='Predictions',
    template='plotly_white'
)

# Show the plot
fig.show()

## #

In [103]:
df_imputed = pd.read_csv('../saves/df_meteo_p_dc_imputed_xgb.csv')
df_imputed_lstm = pd.read_csv('../saves/df_meteo_p_dc_imputed_lstm.csv')
df_imputed['Time'] = pd.to_datetime(df_imputed['Time'], format='%Y-%m-%d %H:%M:%S')
df_imputed['P_DC_LSTM'] = df_imputed_lstm['P_DC_LSTM']
df_imputed['Imputation_LSTM'] = df_imputed_lstm['Imputation_LSTM']
df_imputed.set_index('Time', inplace=True)
df_imputed.head()

Unnamed: 0_level_0,GTI,GHI,DNI,DHI,Air_Temp,RH,Pressure,Wind_speed,Wind_dir,Wind_gust,Rain,P_DC,Imputation,P_DC_XGB,Imputation_XGB,P_DC_LSTM,Imputation_LSTM
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-01 00:00:00,0.0,0.0,0.0,0.0,10.170313,77.075,969.5,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,
2022-01-01 00:10:00,0.0,0.0,0.0,0.0,10.05,78.35,969.5,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,
2022-01-01 00:20:00,0.0,0.0,0.0,0.0,9.73125,79.0375,969.5,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,
2022-01-01 00:30:00,0.0,0.0,0.0,0.0,9.560938,80.275,969.5,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,
2022-01-01 00:40:00,0.0,0.0,0.0,0.0,9.790625,79.6625,969.5,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,


In [105]:
df_imputed.to_csv('../saves/df_meteo_p_dc_imputed.csv')