Copyright (c) Microsoft Corporation.

Licensed under the MIT License.

In [4]:
# Notebook parameters -- fill these in before running.
# Storage account name; used as the account part of the abfss:// URI.
data_lake_account_name = '' # Synapse Workspace ADLS
# File system (container) name; used as the container part of the abfss:// URI.
file_system_name = ''
# Appended as the last path segment of the input and output folders below.
user_group_name = ''

In [5]:
# Root abfss:// URI of the data lake file system. It ends with "/", so
# relative folders must be appended WITHOUT a leading slash.
base_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/"

In [6]:
# Process user data: convert the raw user JSON files to parquet.

from pyspark.sql.functions import *
from pyspark.sql.types import *

def load_users_json_file(usersPath):
    """Convert one raw users JSON file to parquet.

    Reads the JSON at ``usersPath``, keeps only the user columns listed
    below, and appends the rows to the parquet data set located at
    ``<base_path>o365data_processed/users/<user_group_name>``.

    Relies on the notebook globals ``spark``, ``base_path`` and
    ``user_group_name``. Returns nothing.
    """
    kept_columns = (
        "id",
        "userPrincipalName",
        "mail",
        "mailNickname",
        "puser",
        "ptenant",
        "usageLocation",
    )
    selected = spark.read.load(usersPath, format='json').select(*kept_columns)

    # Append mode: every call adds rows to the same output data set.
    destination = base_path + "o365data_processed/users/" + user_group_name
    selected.write.format("parquet").mode("append").save(destination)

def get_users_subfolder_files(folder):
    """Recursively process every users file found under ``folder``.

    Lists ``folder`` with ``mssparkutils.fs.ls``. Any entry named
    'metadata' (file or directory) is skipped; directories are descended
    into, and every other entry is handed to ``load_users_json_file``.
    """
    for entry in mssparkutils.fs.ls(folder):
        if entry.name == 'metadata':
            continue
        # Directories recurse; anything else is treated as a data file.
        handle = get_users_subfolder_files if entry.isDir else load_users_json_file
        handle(entry.path)
            
# Walk the raw users folder for this user group and convert every file in it.
usersPath = "".join([base_path, "o365data/users/", user_group_name])
get_users_subfolder_files(usersPath)

In [7]:
# Process mail folder data: convert the raw mail folder JSON files to parquet.

from pyspark.sql.functions import *
from pyspark.sql.types import *

def load_mailfolders_json_file(mailfodlersPath):
    """Convert one raw mail-folders JSON file to parquet.

    Reads the JSON at ``mailfodlersPath`` (parameter spelling kept as-is for
    backward compatibility), keeps the folder id, display name, parent
    folder id, ``puser`` and ``ptenant`` columns, and appends the rows to
    the parquet data set located at
    ``<base_path>o365data_processed/mailfolders/<user_group_name>``.

    Relies on the notebook globals ``spark``, ``base_path`` and
    ``user_group_name``. Returns nothing.
    """
    mailfolders_df = spark.read.load(mailfodlersPath, format='json')
    mailfolders_df = mailfolders_df.select(
        "id", "displayName", "parentFolderId", "puser", "ptenant"
    )

    # base_path already ends with "/", so the relative folder must not start
    # with one; a leading slash produced ".../net//o365data_processed/...",
    # which differs from the path the downstream cell reads from.
    mailfolder_path = base_path + "o365data_processed/mailfolders/" + user_group_name
    mailfolders_df.write.format("parquet").mode("append").save(mailfolder_path)

def get_mailfolders_subfolder_files(folder):
    """Recursively process every mail-folders file found under ``folder``.

    Lists ``folder`` with ``mssparkutils.fs.ls``. Any entry named
    'metadata' (file or directory) is skipped; directories are descended
    into, and every other entry is handed to ``load_mailfolders_json_file``.
    """
    for entry in mssparkutils.fs.ls(folder):
        if entry.name != 'metadata':
            if entry.isDir:
                # Sub-folder: keep walking down the tree.
                get_mailfolders_subfolder_files(entry.path)
            else:
                load_mailfolders_json_file(entry.path)
            
# Walk the raw mail-folders folder for this user group and convert every file in it.
mailfoldersPath = "".join([base_path, "o365data/mailfolders/", user_group_name])
get_mailfolders_subfolder_files(mailfoldersPath)

In [8]:
# Root abfss:// URI of the data lake file system (ends with "/").
# NOTE(review): same expression as the earlier base_path cell; presumably
# repeated so the cells below can run on their own -- confirm.
base_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/"

In [9]:
# Locations of the processed parquet data sets for this user group.
users_path = "".join([base_path, "o365data_processed/users/", user_group_name])
mailfolder_path = "".join([base_path, "o365data_processed/mailfolders/", user_group_name])


from pyspark.sql.functions import *
# Read the "users" container through the "CosmosDB" linked service.
df_folder_config = (
    spark.read.format("cosmos.oltp")
    .option("spark.synapse.linkedService", "CosmosDB")
    .option("spark.cosmos.container", "users")
    .load()
)

# Uncomment this if your Linked Service is enabled with a private endpoint
# df_folder_config = (
#     spark.read.format("cosmos.oltp")
#     .option("spark.cosmos.useGatewayMode", True)
#     .option("spark.synapse.linkedService", "CosmosDB")
#     .option("spark.cosmos.container", "users")
#     .load()
# )

# Flatten to one row per (email, FolderName): explode emits one row per
# element of "folders", then FolderName is pulled out of each element.
df_folder_config = (
    df_folder_config.select("email", "folders")
    .select(explode(col("folders")).alias("folders"), "email")
    .select("email", col("folders.FolderName").alias("FolderName"))
)


# Load the processed users and mail-folders parquet data sets.
# No reader options are passed: "header" is a CSV option and has no
# meaning for parquet, so it was dropped.
df_users = spark.read.format("parquet").load(users_path)
df_mailfolder = spark.read.format("parquet").load(mailfolder_path)

# Keep only the mail folders that are configured for each user:
#   1. match configured e-mail addresses to users, ignoring case;
#   2. match configured folder names to that same user's mail folders.
# The column is referenced as "userPrincipalName", the exact name it is
# written with, so the join does not depend on case-insensitive resolution.
df_folder_filter = (
    df_folder_config
    .join(
        df_users,
        lower(df_folder_config["email"]) == lower(df_users["userPrincipalName"]),
    )
    .join(
        df_mailfolder,
        (df_folder_config["FolderName"] == df_mailfolder["displayName"])
        & (df_mailfolder["puser"] == df_users["puser"]),
    )
    .select(
        df_users.puser,
        df_users.ptenant,
        df_mailfolder.id,
        df_mailfolder.parentFolderId,
    )
)

# Unlike the other outputs, this path is not suffixed with user_group_name.
# Append mode: every run adds rows to the existing data set.
folderfilter_path = base_path + "o365data_processed/folderfiltersdata/"
df_folder_filter.write.format("parquet").mode("append").save(folderfilter_path)