In [None]:
# Install the third-party packages this notebook depends on (one %pip magic per package).
# NOTE(review): the PyPI distribution named dlt appears to be dltHub's "data load tool", not
# Databricks Delta Live Tables; inside a DLT pipeline the dlt module is normally supplied by the
# runtime. Confirm this install is needed and does not shadow the runtime module.
%pip install mlflow  # Package for managing and tracking machine learning experiments
%pip install dlt  # Module name used by the @dlt.table decorators below - see review note above
%pip install databricks-automl-runtime  # Package for running AutoML experiments in Databricks
%pip install holidays  # Package for working with holidays and business days
%pip install numpy  # Package for numerical computing
%pip install cloudpickle  # Package for serializing Python objects
%pip install autocorrect  # Package for spelling correction
%pip install better_profanity  # Package for detecting and filtering profanity
%pip install geopy  # Package for geocoding and reverse geocoding
%pip install category-encoders  # Package for encoding categorical variables
%pip install xgboost==1.5.2  # Package for gradient boosting
%pip install TextBlob  # Package for text processing and sentiment analysis

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting mlflow
  Using cached mlflow-2.12.1-py3-none-any.whl (20.2 MB)
Collecting sqlparse<1,>=0.4.0
  Using cached sqlparse-0.5.0-py3-none-any.whl (43 kB)
Collecting graphene<4
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 128.2/128.2 kB 3.1 MB/s eta 0:00:00
Collecting markdown<4,>=3.3
  Using cached Markdown-3.6-py3-none-any.whl (105 kB)
Collecting Flask<4
  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
Collecting cloudpickle<4
  Using cached cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting pyyaml<7,>=5.1
  Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)
Collecting alembic!=1.10.0,<2
  Using cached alembic-1.13.1-py3-none-any.whl (233 kB)
Collecting gunicorn<2

# Campaign Analytics<br/>

1. **Use case              :** Performing campaign analytics on static campaign data coming from the OneLake Bronze Layer.<br/>
2. **Notebook Summary      :** This notebook is part of the campaign analytics application, which performs `campaign analytics using various PySpark capabilities`.<br/>
3. **Notebook Description  :** Performing campaign analytics on OneLake Bronze Layer files.


### Feature List
1. Data Profiling
2. Total Revenue, Total Revenue Target & Profit 
3. Campaigns Run Per Week
4. Total Profit by Country Per Week
5. Top Loss-Making Campaign 

The bronze data received for processing is already curated. So, we will derive gold tables from bronze tables.

### Import Libraries

In [None]:
from pyspark.sql.functions import sum as _sum
from pyspark.sql.functions import mean as _mean
from pyspark.sql.functions import max as _max
from pyspark.sql.functions import min as _min
import pyspark.sql.functions as func
import pyspark.sql.functions as F
from pyspark.sql.functions import *
import dlt 
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
import random
import string

### Define the Schema for the input file

In [None]:
# Explicit schema for the campaign input file.
# Each (name, type) pair becomes one nullable StructField, in the order listed.
campaignSchema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("Region", StringType),
        ("Country", StringType),
        ("ProductCategory", StringType),
        ("Campaign_ID", IntegerType),
        ("Campaign_Name", StringType),
        ("Qualification", StringType),
        ("Qualification_Number", StringType),
        ("Response_Status", StringType),
        ("Responses", FloatType),
        ("Cost", FloatType),
        ("Revenue", FloatType),
        ("ROI", FloatType),
        ("Lead_Generation", StringType),
        ("Revenue_Target", FloatType),
        ("Campaign_Tactic", StringType),
        ("Customer_Segment", StringType),
        ("Status", StringType),
        ("Profit", FloatType),
        ("Marketing_Cost", FloatType),
        ("CampaignID", IntegerType),
        ("CampDate", DateType),
        ("SORTED_ID", IntegerType),
    )
])
    

### Load the Campaign Dataset from OneLake Bronze Layer

In [None]:
# Bronze Table Setup
@dlt.table(comment="Raw data")
def bronze_campaign_data():
  """Bronze table: the campaign CSV file read with the explicit ``campaignSchema``.

  The file is read with the ``header`` option enabled. An alternative source,
  ``spark.table("campaign.campaign_source")``, was previously sketched here
  but is not used.

  Returns:
      A DataFrame loaded from the campaign CSV file.
  """
  campaign_csv_path = "/Volumes/litware_unity_catalog/rag/documents_store/MktData/CampaignData.csv"
  csv_reader = (
      spark.read.format("csv")
      .option("header", True)
      .schema(campaignSchema)
  )
  return csv_reader.load(campaign_csv_path)

Name,Type
Region,string
Country,string
ProductCategory,string
Campaign_ID,int
Campaign_Name,string
Qualification,string
Qualification_Number,string
Response_Status,string
Responses,float
Cost,float


### Total Revenue, Total Revenue Target & Profit

In [None]:
# Gold Table Setup
@dlt.table(comment="Aggregated data")
def gold_country_wise_revenue():
    """Gold table: revenue, revenue target, profit and cost extremes per Country/Region.

    The three summed columns are rounded to 2 decimal places; ``Max_Cost`` and
    ``Min_Cost`` are returned unrounded.

    Returns:
        A DataFrame with columns Country, Region, Total_Revenue,
        Total_Revenue_Target, Total_Profit, Max_Cost and Min_Cost.
    """
    campaigns = spark.table("LIVE.bronze_campaign_data")
    by_country_region = campaigns.groupBy("Country", "Region")
    return by_country_region.agg(
        func.round(_sum("Revenue"), 2).alias("Total_Revenue"),
        func.round(_sum("Revenue_Target"), 2).alias("Total_Revenue_Target"),
        func.round(_sum("Profit"), 2).alias("Total_Profit"),
        _max("Cost").alias("Max_Cost"),
        _min("Cost").alias("Min_Cost"),
    )

Name,Type
Country,string
Region,string
Total_Revenue,double
Total_Revenue_Target,double
Total_Profit,double
Max_Cost,float
Min_Cost,float


### Top Loss-Making Campaign

In [None]:
# Gold Table Setup
@dlt.table(comment="Aggregated data")
def gold_Top_Loss_Making_Campaign():
    """Gold table: one row per bronze campaign record whose Profit is negative.

    Returns:
        A DataFrame with columns Campaign_Name, Profit and Loss_Count.
    """
    is_loss = F.col("Profit") < 0
    losing_rows = (
        spark.table("LIVE.bronze_campaign_data")
        .select("Campaign_Name", "Profit")
        .filter(is_loss)
    )
    # The rows are already restricted to Profit < 0, so this flag evaluates to 1 for each of them.
    # No per-campaign aggregation (groupBy Campaign_Name / sum of Loss_Count) is applied here.
    loss_flag = F.when(is_loss, F.lit(1)).otherwise(F.lit(0))
    return losing_rows.withColumn("Loss_Count", loss_flag)

Name,Type
Campaign_Name,string
Profit,float
Loss_Count,int


#  Retail Sales Data Preparation using Spark

Preparing retail data for training a regression model to predict total sales revenue of a product from a store using the following features: 
- Brand (The brand of the product)
- Quantity (Quantity of product purchased)
- Advert (Whether the product had an advertisement or not)
- Price (How much the product costs)

## Importing Libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from io import BytesIO
from copy import deepcopy
from datetime import datetime
from dateutil import parser
import logging
from pyspark.sql.types import *

## Defining the schema for the data

In [None]:
# Explicit schema for the retail sales transactions file.
# Fields are appended in order; nullability is left at the StructField default.
Dataschema = (
    StructType()
    .add("ID", StringType())
    .add("WeekStarting", DateType())
    .add("Store", IntegerType())
    .add("Brand", StringType())
    .add("Quantity", IntegerType())
    .add("Advert", IntegerType())
    .add("Price", FloatType())
    .add("Revenue", FloatType())
)


## Load the data from the source and perform the transformations

In [None]:
@dlt.table(comment="Raw data")
def bronze_SalesTrans():
  """Bronze table: the sales transactions file read as CSV with ``Dataschema``.

  No header option is set, so the reader's default header handling applies.

  Returns:
      A DataFrame loaded from the sales transactions file.
  """
  sales_file_path = '/Volumes/litware_unity_catalog/rag/documents_store/MktData/SalesTransData.txt'
  return spark.read.schema(Dataschema).csv(sales_file_path)

Name,Type
ID,string
WeekStarting,date
Store,int
Brand,string
Quantity,int
Advert,int
Price,float
Revenue,float


In [None]:
@dlt.table(comment="Silver data")
def silver_SalesTrans_data():
    """Silver table: bronze sales rows plus a ``rank`` column.

    ``rank`` is the percent rank of each row when all rows are ordered by
    ``WeekStarting``.

    Returns:
        The bronze sales DataFrame with an added ``rank`` column.
    """
    # partitionBy() is given no columns, so a single window covers every row.
    # NOTE(review): an unpartitioned window may funnel all rows through one task - confirm data volume.
    week_ordered_window = Window.partitionBy().orderBy("WeekStarting")
    bronze_sales = spark.table('LIVE.bronze_SalesTrans')
    return bronze_sales.withColumn("rank", percent_rank().over(week_ordered_window))

Name,Type
ID,string
WeekStarting,date
Store,int
Brand,string
Quantity,int
Advert,int
Price,float
Revenue,float
rank,double


In [None]:
@dlt.table(comment="Gold data: sales rows with WeekStarting percent rank <= 0.8")
def gold_SalesTrans_toprank():
    """Gold table: silver sales rows whose ``rank`` is at most 0.8.

    The helper ``rank`` column is dropped from the result. The table comment
    previously read "Silver data" although this is a gold table; it now
    describes the rows actually selected.

    Returns:
        The filtered DataFrame without the ``rank`` column.
    """
    ranked_sales = spark.table('LIVE.silver_SalesTrans_data')
    return ranked_sales.where("rank <= .8").drop("rank")
    
@dlt.table(comment="Gold data: sales rows with WeekStarting percent rank > 0.8")
def gold_SalesTrans_lowerrank():
    """Gold table: silver sales rows whose ``rank`` is greater than 0.8.

    The helper ``rank`` column is dropped from the result. The table comment
    previously read "Silver data" although this is a gold table; it now
    describes the rows actually selected.

    Returns:
        The filtered DataFrame without the ``rank`` column.
    """
    ranked_sales = spark.table('LIVE.silver_SalesTrans_data')
    return ranked_sales.where("rank > .8").drop("rank")

Name,Type
ID,string
WeekStarting,date
Store,int
Brand,string
Quantity,int
Advert,int
Price,float
Revenue,float


#  Customer Churn Data Preparation using Spark

## Load the data from the source and perform the transformations

In [None]:
@dlt.table(comment="Raw data")
def bronze_CustomerChurnTrans():
  """Bronze table: the customer churn CSV file read with the header option enabled.

  No schema is supplied to the reader.

  Returns:
      A DataFrame loaded from the customer churn CSV file.
  """
  churn_csv_path = '/Volumes/litware_unity_catalog/rag/documents_store/MktData/CustomerChurnData.csv'
  return spark.read.option("header", True).csv(churn_csv_path)

Name,Type
CustomerID,string
Gender,string
SeniorCitizen,string
Partner,string
Dependents,string
tenure,string
Discount,string
OutletSize,string
OnlineDelivery,string
OrderStatus,string


In [None]:
@dlt.table(comment="Silver data")
def silver_CustomerChurn_data():
    """Silver table: the bronze customer churn table passed through without changes.

    Returns:
        The DataFrame read from ``LIVE.bronze_CustomerChurnTrans``.
    """
    return spark.table('LIVE.bronze_CustomerChurnTrans')

Name,Type
ID,string
WeekStarting,date
Store,int
Brand,string
Quantity,int
Advert,int
Price,float
Revenue,float


After running the DLT pipeline, the result should look similar to the following screenshot.

![](https://stmsftbuild2024.blob.core.windows.net/dltimage/task-2.2.7.png)