## *****For demonstration purposes only. Please customize as per your enterprise security and compliance needs.*****  

## Please do not run this notebook or click "Run all":
At the time of writing, the core limit is 200 cores per workspace. Depending on the number of concurrent users, you may hit a "core capacity exceeded" or "maximum number of parallel jobs exceeded" error. 
## Fetch Marketing Campaigns data into DataFrame and Calculate Revenue Variance        

In [1]:
%%pyspark
# Load the marketing campaign CSV (first row is the header) from the ADLS Gen2
# container into a Spark DataFrame. #STORAGE_ACCOUNT_NAME# is a template
# placeholder in the path. No schema is supplied here, so the columns
# presumably arrive as strings -- confirm before relying on numeric types.
data_path = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://marketing-data@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/CampaignData.csv')
)

# Show only the first 10 rows as a preview.
display(data_path.limit(10))

## Load into Pandas and Perform Cleansing Operations


In [2]:
%%pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np

# Convert the Spark DataFrame loaded in the previous cell into a pandas
# DataFrame so the cleansing below can use pandas string/numeric helpers.
pd_df = data_path.select("*").toPandas()

# Cleansing operations:
# 1. Revenue, Revenue_Target: strip '$' and ',' characters and cast to float.
# 2. Revenue, Revenue_Target: replace missing values with 0.
# 3. Region, Country, Campaign_Name: convert to Title Case.
for money_col in ('Revenue', 'Revenue_Target'):
    # The pattern is a raw string because '\$' is not a valid Python escape
    # sequence; the non-raw form triggers an invalid-escape warning.
    # The result is assigned back to the frame: calling fillna(inplace=True)
    # on a column selection may only modify a temporary copy (pandas
    # Copy-on-Write), leaving the missing values in pd_df untouched.
    pd_df[money_col] = (
        pd_df[money_col]
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
        .fillna(0)
    )

for text_col in ('Region', 'Country', 'Campaign_Name'):
    pd_df[text_col] = pd_df[text_col].str.title()

## Data Transformation - Calculate Revenue Variance


In [3]:
# Derive Revenue_Variance as target minus actual revenue: a positive value
# means revenue fell short of its target, a negative value means the target
# was exceeded.
pd_df['Revenue_Variance'] = pd_df['Revenue_Target'].sub(pd_df['Revenue'])

# Preview the rows at positions 1 through 4 (the slice end is exclusive).
print(pd_df.iloc[1:5])

Region Country  ... RoleID  Revenue_Variance
1  Southern California      Us  ...   NULL            -350.0
2           South East      Us  ...     20            5610.0
3           South East      Us  ...   NULL            8414.0
4  Southern California      Us  ...   NULL           -1097.0

[4 rows x 9 columns]

## Move data to Azure Data Lake Gen2


In [5]:
%%pyspark
# Convert the cleansed pandas frame back into a Spark DataFrame and preview
# the first 5 rows.
df = spark.createDataFrame(pd_df)
df.show(5)

# Write the result to ADLS Gen2 as CSV with a header row, overwriting any
# previous output in the target folder. coalesce(1) collapses the data to a
# single partition before the write.
# The storage account uses the #STORAGE_ACCOUNT_NAME# template placeholder,
# matching the read path used to load CampaignData.csv, instead of a
# hard-coded account name. The built-in "csv" source replaces the legacy
# "com.databricks.spark.csv" alias.
(df
 .coalesce(1)
 .write
 .mode("overwrite")
 .option("header", "true")
 .format("csv")
 .save('abfss://marketing-data@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/Campaignsdata'))

+-------------------+-------+---------------+-------+--------------+-----------+----------+------+----------------+
|             Region|Country|  Campaign_Name|Revenue|Revenue_Target|       City|     State|RoleID|Revenue_Variance|
+-------------------+-------+---------------+-------+--------------+-----------+----------+------+----------------+
|         South East|     Us|Patient Stories|11564.0|       19306.0|      Miami|   Florida|  NULL|          7742.0|
|Southern California|     Us|  For Your Life| 6497.0|        6147.0|Los Angeles|California|  NULL|          -350.0|
|         South East|     Us|  Hit The Track|11620.0|       17230.0|      Miami|   Florida|    20|          5610.0|
|         South East|     Us|Patient Stories| 9963.0|       18377.0|      Miami|   Florida|  NULL|          8414.0|
|Southern California|     Us|  For Your Life|16850.0|       15753.0|Los Angeles|California|  NULL|         -1097.0|
+-------------------+-------+---------------+-------+--------------+----