## Please do not run the whole notebook (do not click "Run all"):
At the time of writing, the core limit is 200 cores per workspace. Depending on the number of concurrent users, running every cell may fail with a "core capacity exceeded" or "maximum number of parallel jobs exceeded" error.
## Fetch Public Gov data into a DataFrame and Calculate VAT Variance

In [2]:
%%pyspark
# Load the PublicGov CSV from the storage account, using the first row as the
# column header. NOTE: despite its name, `data_path` holds the loaded Spark
# DataFrame (not a path); later cells refer to it by this name.
data_path = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://file1@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/PublicGov-data/PublicGovData.csv')
)

# Preview the first 10 rows.
data_path.show(10)

StatementMeta(demofintaxuser, 6, 2, Finished, Available)

+---------+-------+---------+-----------+-----------+---------+------+
|   Region|Country|   Sector|        VAT| VAT_Target|    State|RoleID|
+---------+-------+---------+-----------+-----------+---------+------+
|  Statiso|  South|   Retail|$11,564.00 |$19,306.00 |  Statiso|  NULL|
|Sur Datum|  South|Financial| $6,497.00 | $6,147.00 |Sur Datum|  NULL|
|  Statiso|  South|   Retail|$11,620.00 |$17,230.00 |  Statiso|    20|
|  Statiso|  South|   Retail| $9,963.00 |$18,377.00 |  Statiso|  NULL|
|Sur Datum|  South|Financial|$16,850.00 |$15,753.00 |Sur Datum|  NULL|
|  Statiso|  South|   Retail| $5,333.00 | $7,346.00 |  Statiso|    20|
|  Statiso|  South|   Retail|$17,488.00 | $9,941.00 |  Statiso|  NULL|
|Sur Datum|  South|Financial| $7,264.00 | $9,868.00 |Sur Datum|  NULL|
|  Statiso|  South|   Retail|$14,114.00 |$15,482.00 |  Statiso|    20|
|  Statiso|  South|   Retail| $8,628.00 |$12,622.00 |  Statiso|  NULL|
+---------+-------+---------+-----------+-----------+---------+------+
only showing top 10 rows

## Load into Pandas and Perform Cleansing Operations


In [3]:
%%pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np

# Materialize the Spark DataFrame as a pandas DataFrame for cleansing.
pd_df = data_path.select("*").toPandas()

# Cleansing operations:
# 1. Columns VAT, VAT_Target: remove the '$' symbol and ',' separators, then
#    convert the datatype to float.
# 2. Columns VAT, VAT_Target: replace null values with 0.
# 3. Columns Region, Country, Sector: convert values to Title Case.
for currency_col in ('VAT', 'VAT_Target'):
    pd_df[currency_col] = (
        pd_df[currency_col]
        # Raw string: '\$' in a normal string literal is an invalid escape sequence.
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
        # Assign the result back instead of `fillna(inplace=True)` on a column
        # selection, which is chained assignment and is not guaranteed to
        # update pd_df on newer pandas versions.
        .fillna(0)
    )

for text_col in ('Region', 'Country', 'Sector'):
    pd_df[text_col] = pd_df[text_col].str.title()

StatementMeta(demofintaxuser, 6, 3, Finished, Available)

## Data Transformation - Calculate VAT Variance

In [5]:
# Add the VAT_Variance column: target minus actual VAT, so a positive value
# means the collected VAT fell short of the target.
pd_df['VAT_Variance'] = pd_df['VAT_Target'].sub(pd_df['VAT'])

# Spot-check the result on the rows at positions 1 through 4.
print(pd_df.iloc[1:5])

StatementMeta(demofintaxuser, 6, 5, Finished, Available)

      Region Country     Sector  ...      State  RoleID VAT_Variance
1  Sur Datum   South  Financial  ...  Sur Datum    NULL       -350.0
2    Statiso   South     Retail  ...    Statiso      20       5610.0
3    Statiso   South     Retail  ...    Statiso    NULL       8414.0
4  Sur Datum   South  Financial  ...  Sur Datum    NULL      -1097.0

[4 rows x 8 columns]

## Move data to Azure Data Lake Gen2


In [6]:
%%pyspark
# Convert the cleansed pandas DataFrame back into a Spark DataFrame and
# preview the first 5 rows.
df = spark.createDataFrame(pd_df)
df.show(5)

# Write the result to the data lake as CSV with a header row.
# - coalesce(1) reduces the data to one partition before writing.
# - mode("overwrite") replaces any existing output at the target folder.
# - format("csv") uses Spark's built-in CSV source, matching the read cell,
#   instead of the legacy "com.databricks.spark.csv" package alias.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save('abfss://file1@#STORAGE_ACCOUNT_NAME#.dfs.core.windows.net/PublicGov-data/PublicGovData/')
)

StatementMeta(demofintaxuser, 6, 6, Finished, Available)

+---------+-------+---------+-------+----------+---------+------+------------+
|   Region|Country|   Sector|    VAT|VAT_Target|    State|RoleID|VAT_Variance|
+---------+-------+---------+-------+----------+---------+------+------------+
|  Statiso|  South|   Retail|11564.0|   19306.0|  Statiso|  NULL|      7742.0|
|Sur Datum|  South|Financial| 6497.0|    6147.0|Sur Datum|  NULL|      -350.0|
|  Statiso|  South|   Retail|11620.0|   17230.0|  Statiso|    20|      5610.0|
|  Statiso|  South|   Retail| 9963.0|   18377.0|  Statiso|  NULL|      8414.0|
|Sur Datum|  South|Financial|16850.0|   15753.0|Sur Datum|  NULL|     -1097.0|
+---------+-------+---------+-------+----------+---------+------+------------+
only showing top 5 rows

  'JavaPackage' object is not callable
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.