## Data Loading



In [457]:
from azure.storage.filedatalake import DataLakeServiceClient

from dotenv import load_dotenv
import os

# Read settings from the project's .env file; it is expected to define
# AZURE_STORAGE_KEY so the access key is not written in the notebook.
load_dotenv('notebooks/corrected/.env')

# Storage account to connect to; the key comes from the environment.
storage_account_name = "mldebugdevadls"
storage_account_key = os.environ.get('AZURE_STORAGE_KEY')

# Build the client against the account's Data Lake (dfs) endpoint.
adls_endpoint = "https://" + storage_account_name + ".dfs.core.windows.net"
service_client = DataLakeServiceClient(
    account_url=adls_endpoint,
    credential=storage_account_key,
    api_version="2023-11-03",  # service API version requested from Azure
)

# Sanity check: print the name of every file system (container) in the account.
containers = service_client.list_file_systems()
for container in containers:
    print(container.name)


data


In [458]:
# File system (container) the CSV files are read from; "data" is the
# container name printed by the listing cell above. Replace with your own.
container_name = 'data'

In [459]:
import os
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

def read_csv_from_blob(storage_account_name, container_name, file_name, storage_account_key=None):
    """
    Download a single CSV blob from Azure Blob Storage and parse it with pandas.

    :param storage_account_name: Name of the Azure storage account.
    :param container_name: Container that holds the blob.
    :param file_name: Name of the CSV blob inside the container.
    :param storage_account_key: Account access key. When omitted or empty, the
        ``AZURE_STORAGE_KEY`` environment variable is used instead.
    :return: The parsed DataFrame, or ``None`` when connecting, downloading or
        parsing fails (the error is printed, not raised).
    :raises ValueError: If no key is passed and ``AZURE_STORAGE_KEY`` is unset
        or empty.
    """
    # An explicitly passed key wins; otherwise fall back to the environment.
    storage_account_key = storage_account_key or os.environ.get('AZURE_STORAGE_KEY')
    if not storage_account_key:
        raise ValueError("Storage account key must be provided either as a parameter or as an environment variable 'AZURE_STORAGE_KEY'")

    try:
        connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"

        # Walk service -> container -> blob to reach the requested file.
        blob_client = (
            BlobServiceClient.from_connection_string(connection_string)
            .get_container_client(container_name)
            .get_blob_client(file_name)
        )

        # Pull the whole blob into memory and let pandas parse the bytes.
        raw_bytes = blob_client.download_blob().readall()
        return pd.read_csv(io.BytesIO(raw_bytes))
    except Exception as e:
        # Best-effort contract: report the problem and hand back None.
        print(f"Error reading CSV file: {e}")
        return None

In [460]:
def _load_required_csv(file_name):
    """Load one CSV from the shared container, failing loudly if it cannot be read.

    ``read_csv_from_blob`` reports problems by printing them and returning
    ``None``. Left unchecked, that ``None`` only surfaces in later cells as an
    opaque ``TypeError``, so the failure is raised here instead.

    :param file_name: Name of the CSV blob inside ``container_name``.
    :return: The loaded DataFrame.
    :raises RuntimeError: If the file could not be loaded.
    """
    frame = read_csv_from_blob(storage_account_name=storage_account_name,
                               container_name=container_name,
                               file_name=file_name,
                               storage_account_key=storage_account_key)
    if frame is None:
        raise RuntimeError(f"Failed to load '{file_name}' from container '{container_name}'")
    return frame


# Raw input tables used by the rest of the notebook.
online_marketing = _load_required_csv("OnlineMarketingData.csv")
offline_marketing = _load_required_csv("OfflineMarketingData.csv")
sales = _load_required_csv("SalesData.csv")
price = _load_required_csv("PricingData.csv")

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

## Data Pre Processing

In [None]:
# Parse the date-like column of each already-loaded table into datetime64:
# sales and online marketing are keyed by 'date', offline marketing by 'week'.
for frame, date_column in (
    (sales, 'date'),
    (offline_marketing, 'week'),
    (online_marketing, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [None]:
# Collapse sales to one row per date (summing sales_quantity) so it can be
# joined to the marketing data; the summed column is exposed as 'total_sales'.
daily_sales = (
    sales.groupby('date')['sales_quantity']
    .sum()
    .reset_index()
    .rename(columns={'sales_quantity': 'total_sales'})
)

In [None]:
# Attach each day's total sales to the online marketing rows; the left join
# keeps every marketing row even when no sales exist for that date.
digital_sales = online_marketing.merge(daily_sales, on='date', how='left')

In [None]:
# Roll daily sales up to weeks so they line up with the weekly media spend.
# Each date is mapped to the start of its 'W' period, then totals are summed.
weekly_sales = (
    daily_sales
    .assign(week=lambda d: pd.to_datetime(d['date']).dt.to_period('W').dt.start_time)
    .groupby('week')['total_sales']
    .sum()
    .reset_index()
)

In [None]:
# Attach weekly total sales to the offline media spend rows; the left join
# keeps every spend row even when no sales exist for that week.
media_sales = offline_marketing.merge(weekly_sales, on='week', how='left')

## Task 1: Flawed Adstock Transformation

In [None]:
# ----------------------------
# BUGGY PLOT 1: Flawed adstock transformation
# ----------------------------


def plot_buggy_adstock():
    """
    Plot TV spend against a deliberately flawed "adstock" series.

    Reads the notebook-level ``offline_marketing`` frame, keeps the rows whose
    channel is 'TV', and builds the adstock as the plain sum of the current
    spend and its 1- and 2-row lags. No decay is applied to the lags; that is
    the flaw this task demonstrates. The first two rows have no complete lag
    history, so their adstock is NaN.

    Returns:
    - None. The figure is shown with ``plt.show()``.
    """
    # Keep only the TV rows and the two columns the plot needs.
    is_tv = offline_marketing['channel'] == 'TV'
    tv_frame = offline_marketing.loc[is_tv, ['week', 'spend']].copy()
    tv_frame['TV'] = tv_frame['spend']

    # Flawed adstock: raw lags, added at full weight.
    for lag in (1, 2):
        tv_frame[f'TV_lag{lag}'] = tv_frame['TV'].shift(lag)
    tv_frame['TV_adstock'] = tv_frame['TV'] + tv_frame['TV_lag1'] + tv_frame['TV_lag2']

    # Original spend versus the inflated adstock series.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(tv_frame['week'], tv_frame['TV'], label='Original TV Spend')
    ax.plot(tv_frame['week'], tv_frame['TV_adstock'], label='Buggy Adstock (Simple Sum of Lags)')
    ax.set_title('Buggy Adstock Transformation')
    ax.set_xlabel('Week')
    ax.set_ylabel('Spend')
    ax.legend()
    fig.tight_layout()
    plt.show()

plot_buggy_adstock()




## Task 2: Misleading Diminishing Returns Visualization (No Saturation Effect)

In [None]:
# ----------------------------
# BUGGY PLOT 2: Misleading diminishing returns visualization (No saturation effect)
# ----------------------------


def plot_buggy_diminishing_returns():
    """
    Plot a deliberately linear sales response to TV spend.

    Reads the TV rows of the notebook-level ``offline_marketing`` frame and
    computes the response as ``500 + 0.05 * spend``. The response therefore
    never saturates; that is the flaw this task demonstrates. A degree-1
    polynomial fit is drawn over the scatter as the trend line.

    Returns:
    - None. The figure is shown with ``plt.show()``.
    """
    # TV spend as a plain array.
    tv_rows = offline_marketing['channel'] == 'TV'
    tv_spend = offline_marketing.loc[tv_rows, 'spend'].values

    # Flawed response model: constant baseline plus a fixed return per unit spend.
    baseline, return_per_spend = 500, 0.05
    response = baseline + return_per_spend * tv_spend

    # Straight-line fit through the (already linear) points.
    trend = np.poly1d(np.polyfit(tv_spend, response, 1))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(tv_spend, response, label='Linear Response')
    ax.plot(tv_spend, trend(tv_spend), "r--", label='Linear Trend Line')
    ax.set_title('Buggy Linear Response Curve (No Diminishing Returns)')
    ax.set_xlabel('TV Spend')
    ax.set_ylabel('Sales Response')
    ax.legend()
    fig.tight_layout()
    plt.show()

plot_buggy_diminishing_returns()


## Task 3: Incorrect Assessment of Interaction Term in Sales Analysis

In [None]:
# Channel names expected as spend columns in the interaction analysis below;
# any name missing from the pivoted spend table is added there as zero spend.
offline_channels = ['TV', 'Radio', 'Print', 'OOH']
online_channels = ['Facebook', 'Instagram', 'YouTube', 'Google Ads', 'Influencer Marketing']

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def buggy_interaction_analysis():
    """
    Plot actual weekly sales against a deliberately misleading "interaction" series.

    Takes no arguments; it reads the notebook-level ``media_sales`` frame and
    the ``offline_channels`` / ``online_channels`` lists. Spend is pivoted to
    one column per channel, every listed channel is summed into a single
    combined spend, and that sum is scaled by 0.1 to stand in for
    "interaction sales". Summing channels and applying an arbitrary factor is
    the flaw this task demonstrates.

    NOTE(review): ``total_sales`` is merged back from ``media_sales`` on
    'week'; if ``media_sales`` has several rows per week, each week is
    presumably repeated in the merged frame. Verify against the data.

    Returns:
    - None. The figure is shown with ``plt.show()``.
    """
    all_channels = offline_channels + online_channels

    # One row per week, one spend column per channel.
    spend_by_week = media_sales.pivot_table(index='week', columns='channel', values='spend', aggfunc='sum')

    # Channels with no rows in the data get an explicit zero-spend column.
    for channel_name in all_channels:
        if channel_name not in spend_by_week.columns:
            spend_by_week[channel_name] = 0

    # Flawed step 1: lump every channel into one total.
    spend_by_week['combined_spend'] = spend_by_week[all_channels].sum(axis=1)

    # Flawed step 2: scale the total by an arbitrary factor.
    spend_by_week['interaction_sales'] = spend_by_week['combined_spend'] * 0.1

    # Bring actual sales alongside the per-week spend columns.
    comparison = spend_by_week.merge(media_sales[['week', 'total_sales']], on='week', how='left')

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(comparison['week'], comparison['total_sales'], label='Actual Sales', color='b')
    ax.plot(comparison['week'], comparison['interaction_sales'], label='Misleading Interaction Sales', color='r', linestyle='--')
    ax.set_title('Buggy Channel Interaction Analysis')
    ax.set_xlabel('Week')
    ax.set_ylabel('Sales')
    ax.legend()
    fig.tight_layout()
    plt.show()

buggy_interaction_analysis()


## Task 4: Modeling and Visualizing the Adstock Effect

In [None]:
# ----------------------------
# BUGGY PLOT 4: Misleading adstock effect with wrong decay rate
# ----------------------------

def _geometric_adstock(spend, decay_rate, max_lag):
    """Return spend plus its first ``max_lag`` lags, lag k weighted by ``decay_rate ** k``."""
    adstock = spend.astype(float)
    for lag in range(1, max_lag + 1):
        # Periods before the first observation count as zero spend, so the
        # earliest weeks get a defined value instead of NaN.
        adstock = adstock + spend.shift(lag, fill_value=0) * (decay_rate ** lag)
    return adstock


def plot_adstock_effect(media_data=None, decay_rate=0.5, channel='TV', max_lag=2):
    """
    Plot one channel's weekly spend next to its decayed (adstocked) spend.

    The adstock for a week is that week's spend plus the spend of each of the
    previous ``max_lag`` weeks, where spend from k weeks earlier is weighted by
    ``decay_rate ** k``. Only rows of the requested channel are used, and spend
    is summed per week before lagging so that a lag always refers to the
    previous week of the same channel. Sales data is not needed for this plot.

    Parameters:
    - media_data: DataFrame with 'week', 'channel' and 'spend' columns.
      Defaults to the notebook-level ``offline_marketing`` frame.
    - decay_rate: Weight applied per week of lag, between 0 and 1 (default 0.5).
    - channel: Value of the 'channel' column to plot (default 'TV').
    - max_lag: Number of lagged weeks included in the adstock (default 2).

    Returns:
    - None. The figure is shown with ``plt.show()``.

    Raises:
    - ValueError: If ``decay_rate`` is outside [0, 1], ``max_lag`` is negative,
      or ``media_data`` has no rows for ``channel``.
    """
    if not 0 <= decay_rate <= 1:
        raise ValueError(f"decay_rate must be between 0 and 1, got {decay_rate}")
    if max_lag < 0:
        raise ValueError(f"max_lag must be non-negative, got {max_lag}")
    if media_data is None:
        media_data = offline_marketing

    # Restrict to the requested channel; without this the lags would mix
    # rows belonging to different channels.
    channel_rows = media_data.loc[media_data['channel'] == channel, ['week', 'spend']]
    if channel_rows.empty:
        raise ValueError(f"No rows found for channel '{channel}'")

    # One chronologically ordered row per week (groupby sorts by 'week').
    weekly_spend = channel_rows.groupby('week', as_index=False)['spend'].sum()
    adstock = _geometric_adstock(weekly_spend['spend'], decay_rate, max_lag)

    # Plotting the results
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(weekly_spend['week'], weekly_spend['spend'], label=f'Original {channel} Spend', marker='o')
    ax.plot(weekly_spend['week'], adstock, label='Adstock with Decay', marker='x', linestyle='--')
    ax.set_title(f'Adstock Effect with Decay Rate ({decay_rate})')
    ax.set_xlabel('Week')
    ax.set_ylabel('Spend')
    ax.legend()
    fig.tight_layout()
    plt.show()

plot_adstock_effect()
