#### libraries

In [0]:
# sharepoint modules
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.user_credential import UserCredential
from office365.sharepoint.files.file import File

# local files handling when reading input
import os
# in-memory handling for writing files for export
# from io import BytesIO

# to read/write CSV in sharepoint
import pandas as pd
'''
If using spark.read.csv() -> AnalysisException: Incompatible format detected.
Delta format not working either.
Eg:
  dfr = spark.read \
      .format("csv") \
      .option("header", "true") \
      .load(download_path)
'''

from pyspark.sql import functions as F

#### `general settings` to work with sharepoint

In [0]:
# SharePoint site the client context is bound to
site_url = "https://test-company.sharepoint.com/sites/SASIPAutomationProject/"

# Authenticated client context: the user name is fixed here, while the password
# is looked up through dbutils.secrets when the cell runs.
sharepoint_credential = UserCredential(
  "admin-user@test-company.com",
  dbutils.secrets.get('secret-scope-name', 'secret-key'),
)
client = ClientContext(site_url).with_credentials(sharepoint_credential)

# fold_link = "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/EoAbcsDcoSJCpvyTPSRBzscBwYb-lowtdDwUKh7kdTfHDg?e=l13thZ"

# fold_obj = client.web.get_folder_by_guest_url_extended(fold_link).execute_query()
# sharepoint_files = fold_obj.files   
# client.load(sharepoint_files).execute_query()

#### create individual `dataframes from CSV files` (comma separated)

In [0]:
# Maps the name of the Spark DataFrame to create -> sharing link of the
# SharePoint folder whose first file is read as that DataFrame's CSV input.
input_files = {
  "exclusion__df": "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/ErUH64YzYENHtWrA02EEDV0BH9SUUQ1p5i1qlkUPnPkWLg?e=20GjiH",
  "role_map__df": "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/EknYQFKVeTdDnEEvysgkUKAB2QnZdAM8kpHsDxJxYbpUVw?e=lY8zSW",
  "audit_detail_mtd__df": "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/Eq2mqMvKe1FIlVr5aKvM00oBo_1lehXBue_vGL9tW-ciTA?e=sx3wdh",
  "account_list__df": "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/EnzRQOSFaSdOrcJaYXbxGTkBdzcvIXaK9WIrjAQdgKvRqg?e=GZne0M",
}

In [0]:
# For every entry of input_files: resolve the folder sharing link, download the
# first file of that folder, parse it as CSV with pandas and publish the result
# as a Spark DataFrame under the global name given by the dictionary key.
for key, fold_link in input_files.items():

  print(f'creating dataframe: {key}')

  fold_obj = client.web.get_folder_by_guest_url_extended(fold_link).execute_query()
  sharepoint_files = fold_obj.files
  # client to work with sharepoint_files
  client.load(sharepoint_files).execute_query()

  # Only the first file of the folder is used. An empty folder is the one
  # genuine "no file" case, so it is detected explicitly instead of being
  # inferred from whatever exception happens later.
  working_file = next(iter(sharepoint_files), None)
  if working_file is None:
    print(f"no file for {key}")
    continue

  # Local scratch copy for pandas; the name comes from str() of the file object.
  download_path = os.path.join(os.getcwd(), str(working_file))
  try:
    # apply sharepoint library method
    file_response = File.read(working_file)
    # Save file locally for pandas to read next
    with open(download_path, 'wb') as output_file:
      output_file.write(file_response)

    # read file with pandas and convert to spark dataframe.
    # "low_memory=False" avoids data type error, alternatively use a dictionary and the dtype option to define data types.
    # eliminate empty rows if any.
    df = pd.read_csv(download_path, low_memory=False).dropna(how='all')
    globals()[key] = spark.createDataFrame(df)
    del df

  except Exception as error:
    # Keep the best-effort behaviour (continue with the next input), but report
    # the real cause instead of claiming that the folder had no file.
    print(f"could not create dataframe {key}: {type(error).__name__}: {error}")

  finally:
    # delete the scratch file even when parsing or conversion failed
    if os.path.exists(download_path):
      os.remove(download_path)

In [0]:
# Print the first 10 rows of the exclusion table; truncate=False shows full cell values.
exclusion__df.show(10, truncate=False)
# display(exclusion__df.limit(10))

In [0]:
# Print the first 10 rows of the role map table; truncate=False shows full cell values.
# display(role_map__df.limit(10))
role_map__df.show(10, truncate=False)

In [0]:
# Print the first 10 rows of the audit detail table; truncate=False shows full cell values.
# display(audit_detail_mtd__df.limit(10))
audit_detail_mtd__df.show(10, truncate=False)

In [0]:
# Print the first 10 rows of the account list table; truncate=False shows full cell values.
# display(account_list__df.limit(10))
account_list__df.show(10, truncate=False)

#### original tables `Transformation`

In [0]:
# Add two helper columns to the audit detail table:
#   - "Date": the first 10 characters of "Audit Datetime"
#   - "Audit Complete": the constant 1 on every row (every listed audit counts as done)
audit_detail_mtd__df = (
  audit_detail_mtd__df
  .withColumn("Date", F.col("Audit Datetime").substr(1, 10))
  .withColumn("Audit Complete", F.lit(1))
)
# display(audit_detail_mtd__df.limit(10))
audit_detail_mtd__df.show(10, truncate=False)

In [0]:
# Register the four source DataFrames as temp views so they can be queried with spark.sql.
# NOTE: the role map view is named "role_map__df__view" (with the extra "__df"),
# unlike the other three; the join query refers to it by this exact name.
account_list__df.createOrReplaceTempView("account_list__view")
role_map__df.createOrReplaceTempView("role_map__df__view")
audit_detail_mtd__df.createOrReplaceTempView("audit_detail_mtd__view")
exclusion__df.createOrReplaceTempView("exclusion__view")

In [0]:
# Build the enriched account list: every account row (ac.*) plus
#   - the area sales director and account owner manager looked up in the role map,
#   - `Audit Complete`: 1 when a matching audit row carries Audit Complete = 1, else 0,
#   - `Exclusions`: 1 when a matching exclusion row has Exclude = 1, else 0.
# LEFT JOINs keep accounts that have no match in a lookup view.
# NOTE(review): if the audit or exclusion view holds several rows for one account id,
# the joins repeat that account row — confirm the keys are unique or de-duplicate first.
account_list_final__df = \
  spark.sql('''
    SELECT 
      ac.*,
      ro.`Area Sales Director` ,
      ro.`Account Owner Manager/Sales Manager` AS `Account Owner Manager`,
      CASE
        WHEN au.`Audit Complete` = 1 THEN 1
        ELSE 0
      END `Audit Complete`,
      CASE
        WHEN ex.Exclude = 1 THEN 1
        ELSE 0
      END Exclusions
    FROM account_list__view ac
    LEFT JOIN role_map__df__view ro
      ON ac.`Owner: Full Name` = ro.`Account Owner/Sales Rep`
    LEFT JOIN audit_detail_mtd__view au
      ON ac.`Account 18 ID` = au.`Account 18 ID`
    LEFT JOIN exclusion__view ex
      ON ac.`Account 18 ID` = ex.`Account Id`
  ''')

In [0]:
# Print the first 10 rows of the enriched account list; truncate=False shows full cell values.
# display(account_list_final__df.limit(10))
account_list_final__df.show(10, truncate=False)

In [0]:
# Print the column names and data types of the enriched account list.
account_list_final__df.printSchema()

#### `New tables` for export: "Audit gap list" series

In [0]:
# Expose the enriched account list to spark.sql; the export queries below read from this view.
account_list_final__df.createOrReplaceTempView("account_list_final__view")

`Previous` tab, from "audit gap list" excel
<br>
Pending:
<br>
- Does not refer to the model, confirm data source.
<br>
- Here we assume that "MTD Audit %" column is calculated as proposed below.
<br>
- COUNT or SUM, add DISTINCT?
<br>
- "Owner: Full Name" = "Account (EU): Owner Name" ?
<br>
Note: consider that in the template there are no formulas for these fields.

In [0]:
# "Previous" tab: one row per (area sales director, account owner manager, owner)
# with the number of account names, the sum of the audit flags and the resulting
# audit percentage.
previous__df = spark.sql('''
  SELECT
    *,
    -- NULLIF guards the ratio the same way the audit summary query does: a group
    -- without any account name yields NULL instead of a division by zero.
    ROUND(`Sum of Audits Completed` / NULLIF(`Count of Account Name`, 0) * 100, 0) AS `MTD Audit %`
  FROM (
    SELECT
      `Area Sales Director` , -- regional director
      `Account Owner Manager`, -- sales manager
      `Owner: Full Name` AS `Account (EU): Owner Name`, -- sales rep
      CONCAT(`Account Owner Manager`,`Owner: Full Name`) AS concatenated_names,
      COUNT(`Account Name`) AS `Count of Account Name`,
      SUM(`Audit Complete`) AS `Sum of Audits Completed`
    FROM account_list_final__view
    GROUP BY
      `Area Sales Director` ,
      `Account Owner Manager`,
      `Owner: Full Name`,
      CONCAT(`Account Owner Manager`,`Owner: Full Name`)
    ) t
  ''')

# display(previous__df)
previous__df.show(10, truncate=False)

In [0]:
# Expose the "previous" table to spark.sql; later queries join against this view.
previous__df.createOrReplaceTempView("previous__view")

%md
`Pivot Audit Report` tab, from "audit gap list" excel.
<br>
Pending:
- "Previous # of Accounts" = "Count of Account 18 ID" due to the need to review the "previous" tab data source
<br>

In [0]:
# "Pivot Audit Report" tab: account counts (total and distinct) per
# (area sales director, account owner manager, owner), plus the account count
# of the matching group in the "previous" table.
pivot_audit_report__df = spark.sql('''

  SELECT
   t.*,
   p.`Count of Account Name` AS `Previous # of Accounts`
  FROM (
    SELECT
      `Area Sales Director` , -- regional director
      `Account Owner Manager`, -- sales manager
      `Owner: Full Name` AS `Account (EU): Owner Name`, -- sales rep
      CONCAT(`Account Owner Manager`, `Owner: Full Name`) AS concatenated_names,
      COUNT(`Account 18 ID`) AS `Count of Account 18 ID`,
      COUNT(DISTINCT `Account 18 ID`) AS `Distinct Count of Account 18 ID`
    FROM account_list_final__view
    GROUP BY
      `Area Sales Director` ,
      `Account Owner Manager`,
      `Owner: Full Name`,
      CONCAT(`Account Owner Manager`, `Owner: Full Name`)
    ) t
  LEFT JOIN previous__view p
    -- Match on the grouping columns themselves with the null-safe operator.
    -- concatenated_names is NULL whenever the manager is NULL (owner missing
    -- from the role map) and NULL = NULL never matches, so those groups used to
    -- lose their previous count; a separator-less CONCAT can also collide.
    ON  t.`Area Sales Director`      <=> p.`Area Sales Director`
    AND t.`Account Owner Manager`    <=> p.`Account Owner Manager`
    AND t.`Account (EU): Owner Name` <=> p.`Account (EU): Owner Name`
  ''')

# display(pivot_audit_report__df)
pivot_audit_report__df.show(10, truncate=False)

%md
`Audit Gap List` tab, from "audit gap list" excel.
<br>
Pending:
- TODO: placeholder (lorem ipsum) — list the pending items for this tab
<br>

In [0]:
# "Audit Gap List" tab: one row per distinct combination of the listed account
# attributes, with the exclusion flags of that combination summed into `Sum of Exclusions`.
audit_gap_list__df = spark.sql('''
  SELECT
    `Account Name`,
    `Audit Complete`,
    Channel,
    `Account 18 ID`,
    `Area Sales Director`,
    `Account Owner Manager`,
    `Owner: Full Name`,
    `Region`,
    `Territory: Territory Name`,
    Chain,
    `Taxonomy Level 1`,
    `Visit Frequency: Visit Frequency Name`,
    `Key Account Manager: Full Name`,
    `Outlet Classification`,
    `PRS Group Outlet Type (EU) name`,
    `PRS Outlet Type (EU) name`,
    `PRS Image Level Name`,
    `PRS Volume Potential Name`,
    SUM(Exclusions) AS `Sum of Exclusions`
  FROM account_list_final__view
  GROUP BY
    `Account Name`,
    `Audit Complete`,
    Channel,
    `Account 18 ID`,
    `Area Sales Director`,
    `Account Owner Manager`,
    `Owner: Full Name`,
    `Region`,
    `Territory: Territory Name`,
    Chain,
    `Taxonomy Level 1`,
    `Visit Frequency: Visit Frequency Name`,
    `Key Account Manager: Full Name`,
    `Outlet Classification`,
    `PRS Group Outlet Type (EU) name`,
    `PRS Outlet Type (EU) name`,
    `PRS Image Level Name`,
    `PRS Volume Potential Name`  
  ''')

# display(audit_gap_list__df)
audit_gap_list__df.show(10, truncate=False)

%md
`Audit Summary` tab, from "audit gap list" excel.
<br>
Pending:
- review "previous" table source
<br>

In [0]:
# "Audit Summary" tab: combines the pivot report counts with the figures of the
# matching group in the "previous" table.
pivot_audit_report__df.createOrReplaceTempView("pivot_audit_report__view")

# NOTE(review): the distinct account count is published as `Sum of Audits Completed`
# and drives `MTD Audit %` and `No of Accounts Not Audited` — confirm this is the
# intended measure (the sum of the `Audit Complete` flag is not used here).
audit_summary__df = spark.sql('''
  SELECT
    piv.`Area Sales Director` , -- regional director
    piv.`Account Owner Manager`, -- sales manager
    piv.`Account (EU): Owner Name`, -- sales rep
    piv.`Count of Account 18 ID` AS `Count of Account Name`,
    piv.`Distinct Count of Account 18 ID` AS `Sum of Audits Completed`,
    ROUND(
      COALESCE(
        piv.`Distinct Count of Account 18 ID` / NULLIF(piv.`Count of Account 18 ID`, 0)
        , 0)
        *100
      ,0)
      AS `MTD Audit %`,
    pre.`MTD Audit %` AS `Prior Week`,
    pre.`Count of Account Name` AS `Previous # of Accounts`,
    piv.`Count of Account 18 ID`- pre.`Count of Account Name` AS `Database Movement`,
    piv.`Count of Account 18 ID`- piv.`Distinct Count of Account 18 ID` AS `No of Accounts Not Audited`
  FROM pivot_audit_report__view piv
  LEFT JOIN previous__view pre
    -- Match on the grouping columns themselves with the null-safe operator.
    -- concatenated_names is NULL whenever the manager is NULL and NULL = NULL
    -- never matches, so those groups used to lose every "previous" figure;
    -- a separator-less CONCAT can also collide between different name pairs.
    ON  piv.`Area Sales Director`      <=> pre.`Area Sales Director`
    AND piv.`Account Owner Manager`    <=> pre.`Account Owner Manager`
    AND piv.`Account (EU): Owner Name` <=> pre.`Account (EU): Owner Name`
  ''')

# display(audit_summary__df)
audit_summary__df.show(10, truncate=False)


#### `Move` files between SharePoint folders

##### Still under investigation: `move_to` / `moveto` do not work in this module version (error: access denied)

https://github.com/vgrem/Office365-REST-Python-Client/issues/157
<br>
https://github.com/vgrem/Office365-REST-Python-Client/issues/703

Reported in the issues above: before June 30th, 2023, the moveTo function moved files between SharePoint folders. Since July 1st, 2023, it no longer moves the files and fails with the error ('-2147024891, System.UnauthorizedAccessException', 'Access denied.', "403 Client Error: Forbidden for url:.....). The required access is in place, since other operations such as delete still work, so the issue appears to be in the office365 Python package.

#### `Write` in sharepoint

In [0]:
# Folder link and object
# Sharing link of the SharePoint folder that receives the exported files
output_folder = "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/Ehr1xSw61FRCnUk8aCvZ4NQB_qOJe7eqGjYdd0jXq9zrsg?e=MsHZxK"
# Resolve the link to a folder object; the upload cells call upload_file() on it
output_folder_obj = client.web.get_folder_by_guest_url_extended(output_folder).execute_query()

#### Write in sharepoint - `A: Pandas`
##### simpler, but less scalable for large datasets

In [0]:
# Export option A: serialise the role map through pandas and upload it as role_map.csv.

# Convert Spark DataFrame to Pandas DataFrame
# (toPandas() brings the whole DataFrame to the driver, so keep this for small tables)
role_map__pdf = role_map__df.toPandas()

# Convert the Pandas DataFrame to a CSV string (index=False leaves the row index out)
csv_data = role_map__pdf.to_csv(index=False)

# Upload CSV to SharePoint directly from the CSV string, encoded as UTF-8 bytes
output_file_name = "role_map.csv"
output_folder_obj.upload_file(output_file_name, csv_data.encode('utf-8')).execute_query()

#### Write in sharepoint - B: `just PySpark`
##### use A.. 

In [0]:
import csv
from io import StringIO

# Export option B: build the CSV without pandas and upload it as role_map.csv.

# Collect the DataFrame to a list of Row objects (all rows are pulled to the driver)
rows = role_map__df.collect()

# Get the schema (column names)
columns = role_map__df.columns

# Create the CSV text with the csv module so that values containing commas,
# quotes or line breaks are quoted/escaped; a plain ','.join() would shift or
# split columns for such values. None is written as an empty field.
csv_output = StringIO()
csv_writer = csv.writer(csv_output, lineterminator='\n')
csv_writer.writerow(columns)  # Write header
csv_writer.writerows(rows)    # Write each row (a Row behaves like a tuple of values)

# Get the CSV string from the StringIO object
csv_data = csv_output.getvalue()

# Upload CSV to SharePoint directly from the CSV string
output_file_name = "role_map.csv"
output_folder_obj.upload_file(output_file_name, csv_data.encode('utf-8')).execute_query()


#### `Delete` files in a Sharepoint Folder

In [0]:
# Folder link
# Delete every file of one SharePoint folder.
# NOTE: this is the same sharing link as the output folder used by the write cells.
deleteFolder_link = "https://test-company.sharepoint.com/:f:/s/SASIPAutomationProject/Ehr1xSw61FRCnUk8aCvZ4NQB_qOJe7eqGjYdd0jXq9zrsg?e=MsHZxK"

# Retrieve the folder object
deleteFolder_obj = client.web.get_folder_by_guest_url_extended(deleteFolder_link).execute_query()

# Load the files in the folder
sharepoint_files_toDelete = deleteFolder_obj.files
client.load(sharepoint_files_toDelete).execute_query()

# Delete each file in the folder.
# Iterate over a snapshot: delete_object() may detach the file from its parent
# collection, and removing items from a collection while looping over it makes
# the loop skip files.
for file in list(sharepoint_files_toDelete):
    print(f"Deleting file: {file.properties['Name']}")
    file.delete_object()
    client.execute_query()

print("All files deleted.")