In [None]:
# Objective

The objective of this Jupyter Notebook is to analyze and forecast web traffic data using various data analysis and visualization techniques. We will use the Adobe Customer Journey Analytics (CJA) API, accessed through the `cjapy` library, to extract data, perform time series analysis, and visualize the results using different Python libraries.

# Overview

1. **Data Extraction**: We will use the `cjapy` library to extract data from Customer Journey Analytics. The data will include metrics such as visits and orders over a specified date range.

2. **Data Preprocessing**: The extracted data will be preprocessed to convert day-of-year values to actual dates and sort the data accordingly. We will also ensure that the metrics are in the correct format for analysis.

3. **Time Series Analysis**: We will perform time series analysis using the SARIMAX model to forecast future values. This will include fitting the model to historical data and generating forecasts for the next 90 days.

4. **Visualization**: Various visualization techniques will be employed to understand the data better. This includes:
    - Line plots to visualize daily orders over time.
    - Histograms to understand the distribution of daily orders.
    - Interactive plots using Plotly for better data exploration.
    - Rolling statistics to observe trends and patterns.
    - Seasonal decomposition to analyze the observed, trend, seasonal, and residual components of the time series data.

5. **Statistical Analysis**: We will use statistical methods to decompose the time series data into its components and understand the underlying patterns and trends.

By the end of this notebook, we aim to have a comprehensive understanding of the web traffic data and be able to make informed predictions about future trends.

In [8]:
import cjapy
from datetime import datetime, timedelta
import plotly.graph_objs as go
import json

# Load the cjapy configuration from the local "myconfig.json" file and create
# the CJA client object that the report requests in the cells below go through.
# (Presumably the file holds the API credentials — verify, and keep it out of version control.)
cjapy.importConfigFile("myconfig.json")
cja = cjapy.CJA()

# Data View ID that the report requests below are scoped to
data_view = "dv_677ea9291244fd082f02dd42"

In [9]:
# Function to convert day of year to date
def day_of_year_to_date(year, day_of_year):
    """Return the calendar date of a 1-based day-of-year as a 'YYYY-MM-DD' string.

    ``day_of_year`` is coerced with ``int()``, so numeric strings such as "60"
    are accepted as well as integers.
    """
    days_after_jan_first = timedelta(days=int(day_of_year) - 1)
    calendar_day = datetime(year, 1, 1) + days_after_jan_first
    return calendar_day.strftime('%Y-%m-%d')

# Report definition: one row per day-of-year, measured in visits
dimension = "variables/timepartdayofyear"
metric = "metrics/visits"
dateRange = "2024-01-01T00:00:00.000/2024-12-31T00:00:00.000"
# NOTE(review): the recorded output of this cell ends at 2024-12-30 (365 rows),
# so the end of the range appears to be exclusive — confirm whether Dec 31
# was meant to be included.

# Assemble the request against the configured data view
myRequest = cjapy.RequestCreator()
myRequest.setDataViewId(data_view)
myRequest.setDimension(dimension)
myRequest.addMetric(metric)
myRequest.addGlobalFilter(dateRange)

# Run the report in CJA
myReport = cja.getReport(myRequest)

# Work on a copy of the report frame: turn the day-of-year dimension into
# 'YYYY-MM-DD' strings, order the rows chronologically, and cast the metric
# column to whole numbers.
sorted_df = (
    myReport.dataframe.copy()
    .assign(**{dimension: lambda frame: frame[dimension].apply(
        lambda doy: day_of_year_to_date(2024, doy))})
    .sort_values(by=dimension)
    .astype({metric: int})
)

# Show only the dimension and metric columns
print(sorted_df[[dimension, metric]])

    variables/timepartdayofyear  metrics/visits
172                  2024-01-01             508
17                   2024-01-02             857
41                   2024-01-03             727
72                   2024-01-04             647
176                  2024-01-05             501
..                          ...             ...
354                  2024-12-26              78
352                  2024-12-27              86
362                  2024-12-28              43
360                  2024-12-29              56
357                  2024-12-30              63

[365 rows x 2 columns]


In [10]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
import numpy as np
import plotly.graph_objs as go

import warnings
# NOTE(review): these blanket filters silence every FutureWarning/UserWarning
# raised after this point, which can also hide library warnings about misuse
# (e.g. unrecognised arguments). Consider narrowing them.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# 'x_values' are the dates, and 'y_values' are the corresponding visits
x_values = pd.to_datetime(sorted_df[dimension])
y_values = sorted_df[metric]

# Define the SARIMAX model.
# `order` is the SARIMAX keyword for the non-seasonal (p, d, q) terms; any other
# keyword name is not recognised as the ARIMA order, so the (1, 1, 1)
# specification would not be applied.
# NOTE(review): the seasonal period is 12 although the series is daily — a
# period of 7 (weekly cycle) may be what is intended; confirm before relying
# on the forecast.
model = SARIMAX(y_values, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))

# Fit the model (disp=False suppresses the optimiser's convergence output)
fitted_model = model.fit(disp=False)

# Forecast the next 90 days
forecast_length = 90
forecast_result = fitted_model.get_forecast(steps=forecast_length)

# Define forecast index as a date range starting the day after the last date in x_values
forecast_index = pd.date_range(start=x_values.iloc[-1] + pd.Timedelta(days=1), periods=forecast_length, freq='D')

# Get the forecast and the confidence intervals
# (conf_int columns are used positionally below: column 0 = lower, column 1 = upper)
forecast = forecast_result.predicted_mean
conf_int = forecast_result.conf_int()

# Create a new figure for plotting
fig = go.Figure()

# Plot historical data
fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', name='Historical Visits'))

# Plot forecasted data
fig.add_trace(go.Scatter(x=forecast_index, y=forecast, mode='lines+markers', name='Forecast', line=dict(dash='dot')))

# Plot confidence intervals as one closed polygon: walk the lower bound forward
# in time, then the upper bound backward, and fill the enclosed area.
fig.add_trace(go.Scatter(x=np.concatenate([forecast_index, forecast_index[::-1]]),
                         y=np.concatenate([conf_int.iloc[:, 0], conf_int.iloc[:, 1][::-1]]),
                         fill='toself', fillcolor='rgba(0,100,80,0.2)',
                         line=dict(color='rgba(255,255,255,0)'), name='Confidence Band'))

# Update layout
fig.update_layout(title='Forecast of Visits Over Time with Confidence Bands',
                  xaxis_title='Date', yaxis_title='Visits', legend=dict(y=0.5, traceorder='reversed'))

# Show plot
fig.show()

In [None]:
# Function to convert day of year to date
def day_of_year_to_date(year, day_of_year):
    """Convert a 1-based day-of-year within ``year`` to a 'YYYY-MM-DD' string.

    The day value is passed through ``int()`` first, so it may be given as a
    number or a numeric string.
    """
    # NOTE(review): this function is also defined, with the same behavior, in
    # the visits cell earlier in the notebook; consider keeping a single copy.
    ordinal = int(day_of_year)
    jan_first = datetime(year, 1, 1)
    target_day = jan_first + timedelta(days=ordinal - 1)
    return target_day.strftime('%Y-%m-%d')

# Report definition: one row per day-of-year, measured in orders
dimension = "variables/timepartdayofyear"
metric = "metrics/orders"
dateRange = "2024-01-01T00:00:00.000/2024-12-31T00:00:00.000"

# Assemble the request against the configured data view
myRequest = cjapy.RequestCreator()
myRequest.setDataViewId(data_view)
myRequest.setDimension(dimension)
myRequest.addMetric(metric)
myRequest.addGlobalFilter(dateRange)

# Run the report in CJA
myReport = cja.getReport(myRequest)

# Copy the report frame so the report object's own dataframe is left untouched,
# then replace each day-of-year value with its 'YYYY-MM-DD' date in 2024.
sorted_df = myReport.dataframe.copy()
sorted_df[dimension] = sorted_df[dimension].apply(
    lambda raw_day: day_of_year_to_date(2024, raw_day)
)

# Chronological order (ISO date strings sort correctly as text)
sorted_df = sorted_df.sort_values(by=dimension)

# Store the order counts as whole numbers
sorted_df[metric] = sorted_df[metric].astype(int)

# Show only the dimension and metric columns
print(sorted_df[[dimension, metric]])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.dates as mdates  # Add this import

# Generate synthetic daily order counts covering two calendar years
# (2023-01-01 through 2024-12-31), drawn from a normal distribution with
# mean 100 and standard deviation 15. The seed makes the sample reproducible.
# NOTE: this rebinds `sorted_df`, replacing the frame built from the CJA
# orders report in the previous cell; the plots below use the synthetic data.
np.random.seed(42)
dates = pd.date_range('2023-01-01', '2024-12-31', freq='D')
orders = np.random.normal(loc=100, scale=15, size=len(dates))
sorted_df = pd.DataFrame({'Date': dates, 'Orders': orders})

# Line plot of the daily order counts over the whole date range
plt.figure(figsize=(15, 6))
sns.lineplot(x='Date', y='Orders', data=sorted_df)
plt.title('Daily Orders Over Time')
plt.xticks(rotation=45)
# One major tick per month, labelled like "Jan 2023", to keep the axis readable
orders_time_axis = plt.gca().xaxis
orders_time_axis.set_major_locator(mdates.MonthLocator())
orders_time_axis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.tight_layout()

# Histogram of daily order counts with a kernel-density estimate overlaid
plt.figure(figsize=(10, 6))
sns.histplot(sorted_df, x='Orders', kde=True)
plt.title('Distribution of Daily Orders')

# Interactive Plotly Visualization: daily orders as a zoomable line chart
fig = px.line(sorted_df, x='Date', y='Orders',
              title='Interactive Daily Orders Trend')
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of Orders",
    hovermode='x unified',
    # Monthly ticks labelled like "Jan 2023", rotated to avoid overlap
    xaxis=dict(
        tickformat="%b %Y",
        tickangle=45,
        dtick="M1"
    )
)
# Render explicitly: the figure is not the last expression of the cell, so the
# notebook would otherwise never display it.
fig.show()

# 7-day moving average of the daily orders, stored as a new column
# (the first six rows are NaN until the window is full)
sorted_df['7_day_rolling_avg'] = sorted_df['Orders'].rolling(7).mean()

# Overlay the smoothed series on the raw daily values
plt.figure(figsize=(15, 6))
plt.plot(sorted_df['Date'], sorted_df['Orders'], label='Daily Orders')
plt.plot(
    sorted_df['Date'],
    sorted_df['7_day_rolling_avg'],
    linewidth=2,
    label='7-day Moving Average',
)
plt.title('Daily Orders with Rolling Average')
plt.legend()

# One major tick per month, labelled like "Jan 2023", to keep the axis readable
rolling_time_axis = plt.gca().xaxis
rolling_time_axis.set_major_locator(mdates.MonthLocator())
rolling_time_axis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.xticks(rotation=45)
plt.tight_layout()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Perform seasonal decomposition.
# Index the series by its 'Date' column first: decomposing the bare column
# would keep the integer row index, and the axis labelled 'Date' below would
# show row positions instead of dates.
# With period=365 the decomposition needs at least two full cycles
# (730 observations) of data.
orders_by_date = sorted_df.set_index('Date')['Orders']
result = seasonal_decompose(orders_by_date, model='additive', period=365)

# Plot the decomposed components, one panel each, on a shared date axis
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(15, 12), sharex=True)
panels = (
    (ax1, result.observed, 'Observed'),
    (ax2, result.trend, 'Trend'),
    (ax3, result.seasonal, 'Seasonal'),
    (ax4, result.resid, 'Residual'),
)
for panel_ax, component, panel_label in panels:
    component.plot(ax=panel_ax)
    panel_ax.set_ylabel(panel_label)
# Only the bottom panel carries the x-axis label, since the axis is shared
ax4.set_xlabel('Date')
plt.tight_layout()
plt.show()