In [106]:
# !pip3 install neuralprophet

# !pip3 install --upgrade neuralprophet
# !pip install pytorch-lightning==1.9.4


In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from neuralprophet import NeuralProphet
import plotly.graph_objs as go

# Fix the global NumPy RNG seed so stochastic steps are repeatable.
# `np.random.seed` returns None, so the seed value is kept under its own name
# instead of being bound to the call's return value.
# NOTE(review): the model cell trains through NeuralProphet; NumPy seeding may
# not cover its random state — confirm whether the library needs its own seed.
seed = 42
np.random.seed(seed)

In [108]:
# Read the raw diesel price table from disk.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# that has the file at exactly this location.
price_diesel = pd.read_csv('/Users/morad/Downloads/diesel.csv')

# The station is fixed by id here; it is not sampled at run time.
random_station = '72f7a2d4-1b77-4556-a45e-8abfb804fb3b'

# Keep only that station's rows and the two columns used downstream,
# then renumber the rows from zero.
price_diesel = (
    price_diesel
    .loc[lambda frame: frame['station'] == random_station, ['date', 'diesel']]
    .reset_index(drop=True)
)



In [109]:
# Inclusive UTC bounds of the window the price series is restricted to.
start_date, end_date = (
    pd.to_datetime(day, utc=True) for day in ('2023-04-30', '2023-05-31')
)

# Plain date strings intended as plot limits.
start_date_plot, end_date_plot = '2022-01-30', '2023-06-08'

In [110]:
# Parse 'date' as timezone-aware UTC timestamps. The window bounds
# (`start_date`, `end_date`) are tz-aware, and pandas refuses to compare
# tz-naive values against tz-aware ones, so the column is normalised to UTC
# before the comparison.
price_diesel['date'] = pd.to_datetime(price_diesel['date'], utc=True)

# Keep the rows inside the inclusive [start_date, end_date] window.
mask = price_diesel['date'].between(start_date, end_date)

# Apply the filter and renumber the remaining rows from zero.
price_diesel = price_diesel.loc[mask].reset_index(drop=True)

In [111]:
# Rename the columns to 'ds' (timestamp) and 'y' (value), and make sure 'ds'
# holds UTC timestamps.
price_diesel = (
    price_diesel
    .rename(columns={'date': 'ds', 'diesel': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds'], utc=True))
)


## Holiday

In [112]:

# Load the school-holiday calendar.
holiday_data = pd.read_csv('school_holiday_data_2014_2024.csv')

# Build an independent frame with the date and the 'NW' flag, cast the flag to
# integers (0/1) and expose it under the name 'holiday'.
# The cast goes through `astype`, which returns a new column with the new
# dtype; assigning through `.loc[:, 'NW'] = ...` writes into the existing
# column instead and does not reliably change its dtype.
holiday_df = (
    holiday_data[['date', 'NW']]
    .astype({'NW': int})
    .rename(columns={'NW': 'holiday'})
)

# Display information
holiday_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3557 entries, 0 to 3556
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     3557 non-null   object
 1   holiday  3557 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 55.7+ KB


## Crude oil

In [113]:
# Load the crude oil price table and print its column summary.
crude_oil = pd.read_csv(filepath_or_buffer='fill_crude_oil_2014_2023.csv')
crude_oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421 entries, 0 to 3420
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             3421 non-null   object 
 1   crude_oil_price  3421 non-null   float64
dtypes: float64(1), object(1)
memory usage: 53.6+ KB


## Merge Holiday and Crude oil

In [114]:
# Left join: keep every holiday-calendar row and attach the crude oil price
# recorded for the same 'date' (missing where crude_oil has no such date).
dataset = pd.merge(holiday_df, crude_oil, how='left', on='date')


In [115]:
# Parse 'date' and mark the resulting timestamps as UTC.
dataset['date'] = pd.to_datetime(dataset['date']).dt.tz_localize('UTC')

# Fill gaps by assigning the filled columns back to the frame. Calling
# `fillna(..., inplace=True)` on `dataset[col]` acts on a selected column
# object and is not guaranteed to update `dataset` itself.
dataset['holiday'] = dataset['holiday'].fillna(0)
# Forward-fill missing prices with the last known value.
dataset['crude_oil_price'] = dataset['crude_oil_price'].ffill()

# Use 'ds' as the timestamp column name.
dataset = dataset.rename(columns={'date': 'ds'})
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3557 entries, 0 to 3556
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   ds               3557 non-null   datetime64[ns, UTC]
 1   holiday          3557 non-null   int64              
 2   crude_oil_price  3557 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 111.2 KB


## Merge dataset to price_diesel

In [116]:
# Attach the holiday flag and crude oil price to the price series.
# The join matches on exact 'ds' values, so only price rows whose timestamp
# also occurs in `dataset` receive values; every other row starts out missing.
df = pd.merge(price_diesel, dataset, on='ds', how='left')

# Fill the missing rows forward first, so each row carries the most recent
# known value. Back-filling first would copy the *next* matching timestamp's
# values backwards, i.e. leak future information into earlier rows.
# The trailing back-fill only covers rows before the first match.
# NOTE(review): assumes `dataset` holds one row per day at midnight UTC — confirm.
exogenous_columns = ['holiday', 'crude_oil_price']
df[exogenous_columns] = df[exogenous_columns].ffill().bfill()

In [117]:
pd.set_option('display.max_rows', 10)  # Show at most 10 rows when a frame is displayed

# Print the column summary of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 745 entries, 0 to 744
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   ds               745 non-null    datetime64[ns, UTC]
 1   y                745 non-null    float64            
 2   holiday          745 non-null    float64            
 3   crude_oil_price  745 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(3)
memory usage: 29.1 KB


In [118]:
# Keep one row per 'ds' timestamp (the first occurrence) and renumber the
# index from zero so it stays contiguous.
df = (
    df
    .drop_duplicates(subset='ds')
    .reset_index(drop=True)
)



In [119]:
# Show the last rows to check where the merged series ends.
df.tail()

Unnamed: 0,ds,y,holiday,crude_oil_price
740,2023-05-30 20:00:00+00:00,1.509,0.0,86.47
741,2023-05-30 21:00:00+00:00,1.551857,0.0,86.47
742,2023-05-30 22:00:00+00:00,1.559,0.0,86.47
743,2023-05-30 23:00:00+00:00,1.559,0.0,86.47
744,2023-05-31 00:00:00+00:00,1.559,0.0,86.47


## Convert datetime (drop the timezone)

In [120]:
# Make 'ds' timezone-naive (the values stay expressed in UTC), then remove the
# two exogenous columns so that only 'ds' and 'y' remain.
# NOTE(review): the holiday / crude oil columns are built above and discarded
# here — confirm this is intended and not a leftover experiment.
df = (
    df
    .assign(ds=df['ds'].dt.tz_convert(None))
    .drop(columns=['holiday', 'crude_oil_price'])
)

## Model

In [121]:


# Model hyper-parameters, gathered in one place before construction.
# n_lags / n_forecasts are both 168 steps, i.e. 7 * 24 at the hourly frequency
# used for fitting below.
# NOTE(review): growth is "off" while n_changepoints / changepoints_range are
# still set — presumably they have no effect without a trend; confirm.
model_settings = dict(
    growth="off",
    n_changepoints=1000,
    changepoints_range=0.8,
    seasonality_mode="additive",
    yearly_seasonality=False,
    weekly_seasonality="auto",
    daily_seasonality=True,
    n_lags=168,
    n_forecasts=168,
    learning_rate=0.5,
    batch_size=50,
    newer_samples_weight=10,
    epochs=100,
)
m = NeuralProphet(**model_settings)

# Train on the complete frame at hourly frequency; `metrics` holds what
# fit() returns.
metrics = m.fit(df, freq="H")

# Alternative kept for reference: split off 30% for validation and fit with it.
# df_train, df_val = m.split_df(df, valid_p=0.3)
# metrics = m.fit(df_train, validation_df=df_val)


INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.866% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H
INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training.

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.




Training: 0it [00:00, ?it/s]

In [125]:
# Build the frame to predict on: the full history plus 168 future steps.
# The model is fitted with freq="H", so the horizon is 168 hours (7 days),
# not 168 days.
future = m.make_future_dataframe(
    df,
    periods=168,
    n_historic_predictions=len(df),
)

# Run the fitted model over that frame.
prediction = m.predict(future)
# Plotting


INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.866% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column
INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.89% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H
INFO - (NP.df_utils._infer_frequency) - Major frequency H corresponds to 99.89% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - H


Predicting: 9it [00:00, ?it/s]


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented

In [128]:
import plotly.graph_objects as go

# One line trace per forecast column (every column whose name starts with
# 'yhat'), all drawn against the shared 'ds' axis and hidden from the legend.
yhat_traces = [
    go.Scatter(
        x=prediction['ds'],
        y=prediction[name],
        name=name,
        showlegend=False,
    )
    for name in prediction.columns
    if name.startswith('yhat')
]
fig = go.Figure(data=yhat_traces)

# Title and axis labels.
fig.update_layout(
    title="Prediction",
    xaxis_title="Date",
    yaxis_title="Price",
)

# Render the figure.
fig.show()


In [123]:
# `forecast` is not assigned anywhere else in this notebook, so on a fresh
# kernel this cell would raise NameError. Build the figure here from the
# prediction frame before displaying it.
# NOTE(review): the saved output is a figure with yhat traces, which suggests
# it came from the model's plot method in a since-removed cell — confirm.
forecast = m.plot(prediction)
forecast

FigureWidgetResampler({
    'data': [{'fill': 'none',
              'line': {'color': 'rgba(45, 146, 255, 1.0)', 'width': 2},
              'mode': 'lines',
              'name': 'yhat1',
              'type': 'scatter',
              'uid': '0544c095-e4cf-4678-a1ac-088243aead7b',
              'x': array([datetime.datetime(2023, 4, 30, 0, 0),
                          datetime.datetime(2023, 4, 30, 1, 0),
                          datetime.datetime(2023, 4, 30, 2, 0), ...,
                          datetime.datetime(2023, 6, 6, 22, 0),
                          datetime.datetime(2023, 6, 6, 23, 0),
                          datetime.datetime(2023, 6, 7, 0, 0)], dtype=object),
              'y': array([nan, nan, nan, ..., nan, nan, nan])},
             {'fill': 'none',
              'line': {'color': 'rgba(45, 146, 255, 0.7714285714285714)', 'width': 2},
              'mode': 'lines',
              'name': 'yhat2',
              'type': 'scatter',
              'uid': 'bc089a42-114e-4c

In [124]:
# Display the per-epoch training metrics returned by m.fit().
metrics

Unnamed: 0,MAE,RMSE,Loss,RegLoss,epoch
0,0.121674,0.155163,0.211163,0.0,0
1,0.069556,0.088952,0.075849,0.0,1
2,0.052968,0.066871,0.042485,0.0,2
3,0.045234,0.057113,0.030951,0.0,3
4,0.042169,0.052932,0.026370,0.0,4
...,...,...,...,...,...
95,0.010899,0.014667,0.001740,0.0,95
96,0.010733,0.014400,0.001722,0.0,96
97,0.010730,0.014439,0.001717,0.0,97
98,0.010898,0.014631,0.001747,0.0,98
