<a href="https://colab.research.google.com/github/mphani/ts_prophet/blob/works/actual_as_prophet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from prophet.plot import plot_plotly, plot_components_plotly

from prophet.plot import add_changepoints_to_plot, plot_cross_validation_metric


In [None]:
# Relative path of the metrics CSV export that this notebook reads with pd.read_csv.
G_FILENAME: str = "sample_data/24122023_aerospike_stats.csv"

In [None]:
# Load the raw metrics export into memory and preview its first five rows.
original_df = pd.read_csv(filepath_or_buffer=G_FILENAME)
original_df.head(5)

In [None]:
# Turn the epoch-seconds column into timestamps, then split every timestamp
# into its calendar and clock components, one column per component.
original_df["date_from_utc"] = pd.to_datetime(original_df["utc_date_time"], unit="s")
timestamp_parts = original_df["date_from_utc"].dt
original_df["full_date"] = timestamp_parts.date
for part_name in ("year", "month", "day", "hour", "minute", "second"):
    original_df[part_name] = getattr(timestamp_parts, part_name)

In [None]:
# Print the (rows, columns) size of the raw frame, including the derived date columns.
print( original_df.shape)

In [None]:
# Keep only the rows that satisfy all three conditions at once:
#   * metric is the client read-success counter,
#   * sub-query is "simple_query",
#   * namespace is "ssd".
is_read_success = original_df["metric_name"] == "aerospike_namespace_client_read_success"
is_simple_query = original_df["sub_query_name"] == "simple_query"
is_ssd_namespace = original_df["ns"] == "ssd"
read_df = original_df[is_read_success & is_simple_query & is_ssd_namespace]
read_df.shape

In [None]:
# Narrow the selection further to a single service endpoint.
service_mask = read_df["service"] == "192.168.201.241:3000"
read_df = read_df.loc[service_mask]
read_df.shape

In [None]:
# Sanity check: list the distinct values remaining in each identifying column.
for column_name in ("service", "ns", "cluster_name", "metric_name"):
    print(read_df[column_name].unique())


In [None]:
# Build the two-column modelling frame (timestamp, observed value) as an
# independent copy, and make sure the timestamp column is datetime-typed.
prophet_read_df = read_df.loc[:, ["date_from_utc", "metric_value"]].copy()
prophet_read_df["date_from_utc"] = pd.to_datetime(
    prophet_read_df["date_from_utc"],
    format="%Y-%m-%d %H:%M:%S",
)

In [None]:
# Rename the columns to "ds" (timestamp) and "y" (value), the names the
# Prophet model is fitted on later in this notebook.
prophet_read_df = prophet_read_df.rename(columns={"date_from_utc": "ds", "metric_value": "y"})
# reindex() called without arguments leaves the index untouched, so the frame
# kept the scattered row labels inherited from filtering the source data.
# reset_index(drop=True) replaces them with a clean 0..n-1 index.
prophet_read_df = prophet_read_df.reset_index(drop=True)
prophet_read_df.info()

In [None]:
# Order the observations chronologically.
# sort_values returns a new DataFrame rather than sorting in place, so the
# result must be assigned back; otherwise every later cell still sees the
# unsorted frame.
l_sort_column_names = ["ds"]
prophet_read_df = prophet_read_df.sort_values(by=l_sort_column_names, ascending=True)
# Display the sorted frame as the cell output.
prophet_read_df

In [None]:
# Keep a single row per timestamp; when a timestamp repeats, the first
# occurrence is the one retained.
prophet_read_df = prophet_read_df.drop_duplicates(subset="ds", keep="first")


In [None]:
# Quick-look plot of the prepared frame using pandas' default plot settings.
prophet_read_df.plot()

In [None]:
# Line chart of the complete series: one blue marker per observation,
# joined by solid segments.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    prophet_read_df['ds'],
    prophet_read_df['y'],
    color='blue',
    marker='o',
    linestyle='-',
)

# The x-axis is intentionally left without a label.
ax.set_ylabel('Value')
ax.set_title('Minute-wise Line Graph')
plt.xticks(rotation=45)  # slanted tick labels so timestamps do not collide
fig.tight_layout()  # fit the axes and labels inside the figure area
plt.show()

In [None]:
# Bar chart of the metric for one day of the month.
# The day used to be a bare literal inside the filter; naming it makes the
# selection easy to change. It matches that day-of-month in any month/year
# present in read_df.
SINGLE_DAY_OF_MONTH = 15

# Select that day's rows and reduce them to the (ds, y) layout used elsewhere.
single_day_df = read_df[read_df.day == SINGLE_DAY_OF_MONTH]
single_day_df = single_day_df[["date_from_utc", "metric_value"]].copy()
single_day_df.rename(columns={"date_from_utc": "ds", "metric_value": "y"}, inplace=True)
single_day_df.info()

plt.figure(figsize=(10, 6))  # Set figure size as needed

# NOTE(review): the bar width is left at matplotlib's default. On a datetime
# axis that width is interpreted in days, so bars for closely spaced samples
# may overlap - confirm whether an explicit, narrower width is wanted.
plt.bar(single_day_df['ds'], single_day_df['y'])

plt.xlabel('Timestamp')
plt.ylabel('Value')
# The previous title called this a line graph although the cell draws bars.
plt.title('Minute-wise Bar Chart')
plt.tight_layout()  # Adjusts plot to fit into the figure area
plt.show()

In [None]:
# Fit a Prophet model with its default configuration on the prepared history.
# A tuned alternative that was tried but is not active:
#   Prophet(interval_width=0.9, yearly_seasonality=True,
#           weekly_seasonality=True, daily_seasonality=True)
m = Prophet()
m.fit(df=prophet_read_df)

In [None]:
# 1440 = 24 * 60 minutes, i.e. one day of future timestamps at one-minute frequency

#
# future = m.make_future_dataframe(periods= 1440, freq="T", include_history=True)
# future.tail()

In [None]:
# Anomaly-detection mode: predict on the historical timestamps themselves
# (no future horizon), so each observed value gets an in-sample prediction
# and uncertainty band to be compared against.
# To forecast ahead instead, predict on a future dataframe: m.predict(future)
forecast = m.predict(prophet_read_df)
# A bare expression is only rendered when it is the last statement of a cell,
# so the preview is printed explicitly; otherwise it is silently discarded.
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
forecast.info()

In [None]:
# Overlay observed values and model predictions on a shared timestamp index.
observed_series = prophet_read_df.set_index("ds")["y"]
predicted_series = forecast.set_index("ds")["yhat"]
pd.concat([observed_series, predicted_series], axis=1).plot()

In [None]:
# Join each observation with its prediction and uncertainty bounds,
# matching rows on the timestamp column.
forecast_columns = ["ds", "yhat", "yhat_lower", "yhat_upper"]
merged_df = prophet_read_df.merge(forecast[forecast_columns], on="ds")

In [None]:
# Persist observations alongside predictions for inspection outside the notebook.
merged_df.to_csv(path_or_buf="prophet_forecasted.csv")

In [None]:
# Prophet's standard forecast figure, built from the prediction frame.
figure_1 = m.plot(fcst=forecast)


In [None]:
# Prophet's component figure for the same prediction frame.
figure_2 = m.plot_components(fcst=forecast)

In [None]:
# Interactive Plotly version of the forecast figure, shown as the cell output.
interactive_forecast_fig = plot_plotly(m, forecast)
interactive_forecast_fig

In [None]:
# Add the model's changepoints to the axes of the forecast figure, then
# re-display that figure as the cell output.
forecast_axes = figure_1.gca()
fig_change_points = add_changepoints_to_plot(forecast_axes, m, forecast)
figure_1

In [None]:
# In-sample error between observed values and predictions:
# mean absolute error first, then mean absolute percentage error.
actual_values = merged_df['y']
predicted_values = merged_df['yhat']

perf_mae = mean_absolute_error(actual_values, predicted_values)
print("mae ", perf_mae)

perf_mape = mean_absolute_percentage_error(actual_values, predicted_values)
print("mape ", perf_mape)

In [None]:
# Flag an observation as an anomaly (1) when it falls outside the predicted
# uncertainty band, otherwise 0.
# The comparisons are vectorised over whole columns rather than applied row
# by row through a Python lambda: same flags, far less overhead, and it
# also behaves sensibly on an empty frame.
below_band = merged_df["y"] < merged_df["yhat_lower"]
above_band = merged_df["y"] > merged_df["yhat_upper"]
merged_df["anomaly"] = (below_band | above_band).astype(int)

# Check the number of anomalies
merged_df["anomaly"].value_counts()

In [None]:
# Collect the flagged rows in chronological order and show up to the first 100.
is_anomaly = merged_df["anomaly"] == 1
anomalies_df = merged_df.loc[is_anomaly].sort_values(by="ds")
anomalies_df.head(100)

In [None]:
# Scatter the observations, coloured by anomaly flag, and draw the
# predicted values over them as a black line.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=merged_df, x="ds", y="y", hue="anomaly")
sns.lineplot(data=merged_df, x="ds", y="yhat", color="black")

In [None]:
# Residual = observed value minus predicted value, stored per row.
merged_df["residuals"] = merged_df["y"].sub(merged_df["yhat"])

merged_df.info()

# single_merged_df =


In [None]:
# Residual-based detection: a point is anomalous when its residual lies more
# than three standard deviations away from the mean residual.
residual_series = merged_df['residuals']
mean_residual = residual_series.mean()
std_residual = residual_series.std()

threshold = 3 * std_residual

distance_from_mean = (residual_series - mean_residual).abs()
anomalies = merged_df[distance_from_mean > threshold]

# Draw the observed series and mark the anomalous points in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(merged_df['ds'], merged_df['y'], label='Original Data')
ax.scatter(anomalies['ds'], anomalies['y'], color='red', label='Anomalies')
ax.legend()
ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Anomaly Detection with Prophet')
plt.show()

In [None]:
# Cross-validate the fitted model: 6 days of initial training data, a new
# cutoff every 2 days, and a 1-day forecast horizon per cutoff.
perf_cv = cross_validation(
    m,
    horizon='1 days',
    period='2 days',
    initial='6 days',
)

In [None]:
# Summarise the cross-validation predictions into error metrics, print the
# frame's structure, then display the metrics table as the cell output.
perf_cv_metrics = performance_metrics(df=perf_cv)
perf_cv_metrics.info()
perf_cv_metrics

In [None]:
# Plot the MAPE metric computed from the cross-validation results.
fig = plot_cross_validation_metric(df_cv=perf_cv, metric='mape')