In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=7799c9f0161405b4e6b90f034fb786021de582acd557f60bf6c43108b28eff08
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:

!pip install pyaml

Collecting pyaml
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml
Successfully installed pyaml-23.12.0


In [None]:
!pip install msal

Collecting msal
  Downloading msal-1.26.0-py2.py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: msal
Successfully installed msal-1.26.0


# databricks_utils.py

In [None]:
# Functions to support connector with databricks

# Get secrets from Databricks

# Extract config variables from Databricks
def get_secret_variables(secret_scope='scope'):
    """Fetch the SharePoint app credentials from a Databricks secret scope.

    Args:
        secret_scope: Name of the Databricks secret scope to read from.
            Defaults to ``'scope'``, the value that used to be hard-coded.

    Returns:
        tuple: ``(client_secret, client_id, authority, scope)``. Note that the
        client secret comes first.

    Raises:
        RuntimeError: If any of the four secrets cannot be read, including the
            case where ``dbutils`` is not defined. The underlying exception is
            chained as ``__cause__``. Previously the function printed a message
            and returned ``None``, which made callers fail later while
            unpacking the result.
    """
    secret_keys = ('client_secret', 'client_id', 'authority', 'scope')
    try:
        # `dbutils` is a global provided by the Databricks runtime
        return tuple(
            dbutils.secrets.get(scope=secret_scope, key=secret_key)
            for secret_key in secret_keys
        )
    except Exception as e:
        raise RuntimeError(
            f'''
        Databricks secrets were not fetched correctly from secret scope "{secret_scope}".
        Please make sure the variable names are:
        - Client Secret = "client_secret"
        - Client ID = "client_id"
        - Authority = "authority"
        - Scope = "scope"

        Original error: {e}
        '''
        ) from e



# general_utils.py

In [None]:
# Flatten JSON
def flatten_json(json_input, prefix=''):
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with an
    underscore, and every resulting key starts with ``prefix``. Only ``dict``
    values are descended into; any other value (lists included) is stored
    unchanged.
    """

    def _leaf_items(node, path):
        # Depth-first walk that yields (flattened_key, value) for every non-dict value
        for name, item in node.items():
            if isinstance(item, dict):
                yield from _leaf_items(item, f"{path}{name}_")
            else:
                yield f"{path}{name}", item

    return dict(_leaf_items(json_input, prefix))

# Get Variables within the same row
def get_targetVariable_with_sourceVariable_from_pandas_df(df, source_column, source_value, target_column_to_fetch_value):
    """Return the target-column value of the first row whose source column equals ``source_value``.

    Example: a sites dataframe carries many metadata columns, but only the id
    is needed, so the id is looked up through the site name:
    ``get_targetVariable_with_sourceVariable_from_pandas_df(df, 'site_name', 'file_system', 'site_id')``

    Any failure (unknown column, no matching row, ...) is printed and ``None``
    is returned instead of raising.
    """
    try:
        # Boolean mask of the rows that carry the reference value
        row_matches = df[source_column] == source_value
        candidates = df.loc[row_matches, target_column_to_fetch_value]
        # Only the first match is of interest
        return candidates.iloc[0]
    except Exception as e:
        print(f'Value not founded, please check the input variables. Check the traceback below:{e}')
        return None


def get_target_variable_with_source_variable_from_pyspark_df(spark, df, source_column, source_value, target_column_to_fetch_value):
    """PySpark counterpart of the pandas lookup helper.

    Filters ``df`` to the rows where ``source_column`` equals ``source_value``
    and returns the ``target_column_to_fetch_value`` entry of the first such
    row. ``spark`` is accepted but not used. Any failure is printed and
    ``None`` is returned instead of raising.
    """
    try:
        rows_with_value = df.filter(col(source_column) == source_value)
        first_match = rows_with_value.select(target_column_to_fetch_value).first()
        # `first_match` is None when nothing matched; indexing it then lands in the handler below
        return first_match[0]
    except Exception as e:
        print(f'Value not found, please check the input variables. Check the traceback below:{e}')
        return None




# sharepoint_res_utils.py

In [None]:
import pandas as pd
import json
#from utils.general_utils import flatten_json
import requests


# Get sites data in a flatten pandas dataframe (use for nested JSONS)
def flatten_sites_metadata_to_pandas(json_data):
    """Turn the ``value`` records of a sites response into a pandas DataFrame.

    Each record is flattened with ``flatten_json`` using the ``site_`` prefix,
    so every record becomes one row of flattened columns.
    """
    flat_rows = []
    for site_record in json_data['value']:
        flat_rows.append(flatten_json(site_record, 'site_'))
    return pd.DataFrame(flat_rows)


# Get sites data in a flatten pyspark dataframe (use for nested JSONS)
def flatten_sites_metadata_to_spark(spark, json_data):
    """Build a Spark DataFrame from a sites response and prefix its columns with ``site_``.

    Args:
        spark: Spark session used to create the DataFrame.
        json_data: Decoded response whose ``'value'`` entry holds the records.

    Returns:
        A DataFrame with one row per record and every column renamed to
        ``site_<original name>``. Nested values are NOT expanded; they are kept
        in whatever type Spark infers for them.
    """
    # Create the df using the JSON response
    spark_df = spark.createDataFrame(json_data['value'])

    # The former per-column check `isinstance(dataType, (dict, list))` compared a Spark
    # DataType object against Python containers, so it never matched and no column was
    # ever exploded. The dead branch is dropped; the result is the same as before.

    # Prefix every column in one projection. Renaming one column at a time would prefix a
    # column twice whenever an earlier rename produced a name equal to a later column
    # (for example columns "x" and "site_x").
    return spark_df.toDF(*[f"site_{column}" for column in spark_df.columns])

# Get sites files data in a flatten pandas dataframe (use for nested JSONS)
def flatten_files_metadata_to_pandas(json_data, prefix):
    """Turn the ``value`` records of a files response into a pandas DataFrame.

    Works like the sites variant, except that the column prefix handed to
    ``flatten_json`` is chosen by the caller.
    """
    records = json_data['value']
    flattened_records = list(map(lambda record: flatten_json(record, prefix), records))
    # One row per record, one column per flattened key
    return pd.DataFrame(flattened_records)

# Get sites files data in a flatten pyspark dataframe (use for nested JSONS)
def flatten_files_metadata_to_pyspark(spark, json_data, prefix):
    """Read a files response into a Spark DataFrame and prefix its columns.

    Args:
        spark: Spark session used to read the JSON.
        json_data: The response, either already decoded (dict/list) or as a
            JSON string.
        prefix: Text put in front of every column name.

    Returns:
        The DataFrame Spark infers from the JSON document, with every column
        renamed to ``<prefix><original name>``. Nested values are NOT expanded.
    """
    # spark.read.json expects JSON text. Handing it a Python dict makes Spark parse the
    # dict's string form, which is not valid JSON (True/False/None, quoting), so the
    # payload is serialised explicitly first.
    json_text = json_data if isinstance(json_data, str) else json.dumps(json_data)
    spark_df = spark.read.json(spark.sparkContext.parallelize([json_text]))

    # The former per-column check `isinstance(dataType, (dict, list))` never matched a
    # Spark DataType, so its `inline_outer` branch never ran; it is dropped here.

    # Prefix every column in one projection; renaming one at a time could prefix a column
    # twice when an earlier rename collides with a later column name.
    return spark_df.toDF(*[f"{prefix}{column}" for column in spark_df.columns])


# Upload files to specific site and folder in Sharepoint
def upload_file_to_sharepoint(access_token, site_id, filename,folder_path = ''):
    """Upload a local file to the default drive of a SharePoint site.

    Args:
        access_token: Value sent as the ``Authorization`` header.
        site_id: Id of the target site.
        filename: Path of the local file to read; the same text is used as the
            file's path below ``folder_path`` in the drive.
        folder_path: Destination folder inside the drive. Leading and trailing
            slashes are optional; an empty string targets the drive root.

    Returns:
        The response object returned by ``requests.put``. A success or failure
        message is printed as before.
    """
    from urllib.parse import quote

    # Normalise the folder so 'docs', '/docs' and '/docs/' all address the same place;
    # previously a folder without a leading slash produced a malformed 'root:docs/...' path.
    folder = folder_path.strip('/')
    remote_path = f"{folder}/{filename}" if folder else filename

    # Percent-encode the path so characters such as spaces, '#' or '?' stay part of the name
    upload_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{quote(remote_path)}:/content"

    # Define header for request including bytes Content-Type
    headers = {
        'Authorization': access_token,
        'Content-Type': 'application/octet-stream',
    }

    # The file is streamed as the request body and closed as soon as the request returns
    with open(filename, 'rb') as file:
        response = requests.put(upload_url, headers=headers, data=file)

    if response.status_code in (200, 201):
        print(f"File '{filename}' uploaded successfully.")
    else:
        print(f"Failed to upload {filename}. please check Traceback below: {response.text}")

    return response


# Get sites metadata as response
def get_sites_metadata_as_dataframe(spark,access_token, is_pyspark = True):
    """Fetch the sites listing from Microsoft Graph and return it as a DataFrame.

    Args:
        spark: Spark session; only used when ``is_pyspark`` is true.
        access_token: Value sent as the ``Authorization`` header.
        is_pyspark: When true a Spark DataFrame is returned, otherwise a
            pandas DataFrame.

    Returns:
        The result of ``flatten_sites_metadata_to_spark`` or
        ``flatten_sites_metadata_to_pandas`` for the decoded response.

    Raises:
        requests.HTTPError: If Graph answers with an error status.
    """
    # The request is issued here directly: this module has no free function named
    # `get_graph_response` (it only exists as a method of the connector class), so the
    # former call raised NameError.
    response = requests.get(
        'https://graph.microsoft.com/v1.0/sites',
        headers={'Authorization': access_token},
    )
    # Fail on an error status instead of on a missing 'value' key further down
    response.raise_for_status()
    sites_json = response.json()

    if is_pyspark:
        return flatten_sites_metadata_to_spark(spark, sites_json)
    return flatten_sites_metadata_to_pandas(sites_json)

# yaml_utils.py

In [None]:
import yaml
import os
# Functions to support the library
# Load config from YAML
def load_config(config_file_path=None):
    """Load the YAML configuration file.

    Args:
        config_file_path: Path of the YAML file to read. When omitted,
            ``config.yml`` is looked up next to this module, or in the current
            working directory when ``__file__`` is not defined (as is the case
            inside a notebook cell).

    Returns:
        The parsed document as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
    """
    if config_file_path is None:
        # `__file__` does not exist in a notebook, so referencing it directly raised NameError there
        module_file = globals().get('__file__')
        if module_file:
            base_directory = os.path.dirname(os.path.abspath(module_file))
        else:
            base_directory = os.getcwd()
        config_file_path = os.path.join(base_directory, 'config.yml')

    with open(config_file_path, 'r') as config_file:
        return yaml.safe_load(config_file)

# Extract config variables from file
def get_config_variables_from_file(config):
    """Pick the SharePoint credentials out of a loaded configuration.

    Reads the ``share_point`` section of ``config`` and returns
    ``(client_secret, client_id, authority, scope)`` -- client secret first.
    """
    share_point_settings = config['share_point']
    wanted_keys = ('client_secret', 'client_id', 'authority', 'scope')
    return tuple(share_point_settings[key] for key in wanted_keys)


# main.py

In [None]:
import msal
import yaml
import json
import requests
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import col, explode, coalesce, lit

# Create class


class Sharepoint_connector_API:
    """Read SharePoint site and drive metadata through Microsoft Graph into Spark.

    Constructing an instance resolves the app-registration credentials,
    creates or reuses a Spark session and acquires a bearer token with MSAL.
    The ``get_*`` methods then query Microsoft Graph and return Spark
    DataFrames.
    """

    GRAPH_BASE_URL = 'https://graph.microsoft.com/v1.0'
    DEFAULT_SCOPE = 'https://graph.microsoft.com/.default'

    def __init__(self, connector=None, config_file_path=None):
        """Resolve credentials, start Spark and authenticate.

        Args:
            connector: Where the credentials come from (case-insensitive):
                ``'databricks'`` reads Databricks secrets, ``'yaml'`` reads a
                YAML config file, anything else (including ``None``) reads the
                environment variables ``SHAREPOINT_CLIENT_ID``,
                ``SHAREPOINT_CLIENT_SECRET``, ``SHAREPOINT_AUTHORITY`` and the
                optional ``SHAREPOINT_SCOPE``.
            config_file_path: YAML file used by the ``'yaml'`` connector. When
                ``None`` the default location of ``load_config()`` is used.

        Raises:
            ValueError: If the environment fallback is used and a required
                variable is missing.
            RuntimeError: If no access token could be obtained.
        """
        # Resolve credentials first so a configuration problem fails before Spark is started
        (self.client_id, self.client_secret,
         self.authority, self.scope) = self._resolve_credentials(connector, config_file_path)

        # Create spark Session
        self.spark = SparkSession.builder \
            .appName("Sharepoint_Connector_App") \
            .getOrCreate()

        self.access_token = self.get_authenticate_token(
            self.client_id, self.client_secret, self.authority, self.scope)

    # Functions to support the init function

    def _resolve_credentials(self, connector, config_file_path):
        """Return ``(client_id, client_secret, authority, scope)`` for the requested connector."""
        # `connector` defaults to None, on which `.lower()` used to raise AttributeError
        connector_name = str(connector or '').lower()

        if connector_name == 'databricks':
            # The helper returns the client secret FIRST; the previous unpacking order
            # swapped client id and client secret.
            client_secret, client_id, authority, scope = get_secret_variables()
            print('fetched variables from script')
        elif connector_name == 'yaml':
            if config_file_path is not None:
                # Read the requested file directly so an explicit path is honoured
                with open(config_file_path, 'r') as config_file:
                    self.config = yaml.safe_load(config_file)
            else:
                self.config = load_config()
            # Same ordering as above: client secret first
            client_secret, client_id, authority, scope = get_config_variables_from_file(self.config)
        else:
            # SECURITY: a client id and client secret used to be hard-coded in this branch.
            # They are now read from the environment. The secret that was embedded in the
            # notebook must be treated as compromised and rotated.
            client_id = os.environ.get('SHAREPOINT_CLIENT_ID')
            client_secret = os.environ.get('SHAREPOINT_CLIENT_SECRET')
            authority = os.environ.get('SHAREPOINT_AUTHORITY')
            scope = [os.environ.get('SHAREPOINT_SCOPE', self.DEFAULT_SCOPE)]

            required = {
                'SHAREPOINT_CLIENT_ID': client_id,
                'SHAREPOINT_CLIENT_SECRET': client_secret,
                'SHAREPOINT_AUTHORITY': authority,
            }
            missing = [name for name, value in required.items() if not value]
            if missing:
                raise ValueError(
                    "No credentials available: use connector='databricks' or 'yaml', "
                    f"or set the environment variable(s) {', '.join(missing)}.")

        return client_id, client_secret, authority, scope

    def get_authenticate_token(self, client_id, client_secret, authority, scope):
        """Acquire an access token for the app and return it as a ``'Bearer <token>'`` string.

        Args:
            client_id: Application (client) id.
            client_secret: Client secret of the application.
            authority: Authority URL passed to MSAL.
            scope: A single scope string or a collection of scope strings.

        Raises:
            RuntimeError: If MSAL does not return an access token; the error
                reported by MSAL is included in the message.
        """
        # A bare string (for example one read from a secret store) is wrapped into a list
        scopes = [scope] if isinstance(scope, str) else scope

        client = msal.ConfidentialClientApplication(
            client_id, authority=authority, client_credential=client_secret)

        # Try to lookup an access token in cache
        token = client.acquire_token_silent(scopes, account=None)
        if token:
            source_message = 'Access token fetched from Cache'
        else:
            token = client.acquire_token_for_client(scopes=scopes)
            source_message = 'Access token created using Azure AD'

        if not token or 'access_token' not in token:
            # Report what MSAL said instead of failing with a bare KeyError
            details = token or {}
            raise RuntimeError(
                "Could not acquire an access token: "
                f"{details.get('error')} - {details.get('error_description')}")

        print(source_message)
        return 'Bearer ' + token['access_token']

    def get_graph_response(self, url, access_token):
        """Send a GET request to ``url`` with ``access_token`` as the Authorization header.

        Returns the response object as-is; the status code is not checked here.
        """
        headers = {
            'Authorization': access_token
        }
        return requests.get(url=url, headers=headers)

    def _get_json(self, url):
        """GET ``url`` with the instance token and return the decoded JSON body.

        Raises the HTTP error of the response when the status indicates failure.
        """
        response = self.get_graph_response(url, self.access_token)
        response.raise_for_status()
        return response.json()

    def _get_site_id(self, site_name):
        """Return the id of the first listed site whose ``name`` equals ``site_name``.

        Raises:
            ValueError: If no listed site has that name.
        """
        sites = self._get_json(f'{self.GRAPH_BASE_URL}/sites').get('value', [])
        for site in sites:
            if site.get('name') == site_name:
                return site['id']
        raise ValueError(f"No SharePoint site named '{site_name}' was found in the sites listing.")

    def _get_drive_root_children_json(self, site_id):
        """Return the decoded listing of the items in the root of the site's default drive."""
        return self._get_json(f"{self.GRAPH_BASE_URL}/sites/{site_id}/drive/root/children")

    # Get site metadata as pyspark df
    def get_sites_metadata(self):
        """Return the sites listing as a Spark DataFrame with one row per site."""
        sites_json = self._get_json(f'{self.GRAPH_BASE_URL}/sites')
        return self.flatten_sublevel_metadata_to_spark(sites_json, 'value')

    # Get flattened data in a PySpark DataFrame (use for nested JSONS)
    def flatten_sublevel_metadata_to_spark(self, json_data, key_to_convert):
        """Turn the list of records stored under ``key_to_convert`` into a Spark DataFrame.

        Every column is declared as a nullable string column. Columns are the
        keys of the first record followed by any additional key seen in later
        records (previously such extra keys were dropped); a record that lacks
        a key gets a null. An empty list yields an empty DataFrame without
        columns instead of an IndexError.

        Raises:
            KeyError: If ``key_to_convert`` is not present in ``json_data``.
        """
        records = json_data[key_to_convert]

        # Ordered union of the keys of all records
        field_names = list(dict.fromkeys(key for record in records for key in record))
        schema = StructType([StructField(name, StringType(), True) for name in field_names])

        # Build all rows first and create the DataFrame once; creating a one-row
        # DataFrame per record and chaining union() grows the query plan with every row.
        rows = [tuple(record.get(name) for name in field_names) for record in records]
        return self.spark.createDataFrame(rows, schema=schema)

    def flatten_json_response_metadata_to_spark(self, json_response):
        """Read a single JSON document into a Spark DataFrame.

        Args:
            json_response: Decoded JSON (dict/list) or a JSON string.

        Returns:
            The DataFrame, or ``None`` when reading fails (the error is printed).
        """
        try:
            # spark.read.json expects JSON text; a Python dict would be parsed from its
            # string form, which is not valid JSON (True/False/None, quoting).
            if isinstance(json_response, str):
                json_text = json_response
            else:
                json_text = json.dumps(json_response)
            return self.spark.read.json(
                self.spark.sparkContext.parallelize([json_text]))
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    # Get specific site metadata df with site Name
    def get_metadata_dataframe_with_site_name(self, df, site_name):
        """Return the metadata of the site called ``site_name`` as a Spark DataFrame.

        Args:
            df: Ignored. The sites listing is always fetched again; the
                parameter is kept so existing callers keep working.
            site_name: Value of the site's ``name`` field.

        Raises:
            ValueError: If no site with that name is listed.
        """
        site_id = self._get_site_id(site_name)
        site_json = self._get_json(f"{self.GRAPH_BASE_URL}/sites/{site_id}")
        return self.flatten_json_response_metadata_to_spark(site_json)

    # Get files metadata df
    def get_all_files_metadata_with_site_name(self, df, site_name):
        """Return the items in the drive root of ``site_name`` as a Spark DataFrame.

        Args:
            df: Ignored; kept for backward compatibility.
            site_name: Value of the site's ``name`` field.

        Raises:
            ValueError: If no site with that name is listed.
        """
        # The id from the sites listing is used directly. Fetching the site by that id
        # and reading the id back from the answer, as done before, adds a request but
        # presumably no information.
        site_id = self._get_site_id(site_name)
        children_json = self._get_drive_root_children_json(site_id)
        return self.flatten_sublevel_metadata_to_spark(children_json, 'value')

    # Get specific file metadata with site_name
    def get_specific_file_metadata_with_site_name(self, df, site_name, file_name):
        """Return the metadata of one drive-root item as a Spark DataFrame.

        Args:
            df: Ignored; kept for backward compatibility.
            site_name: Value of the site's ``name`` field.
            file_name: Name of the item in the drive root. Previously this
                argument was ignored and the item named ``'folder'`` was
                always used.

        Returns:
            The DataFrame, or ``None`` (after printing a message) when no item
            with that name exists in the drive root.

        Raises:
            ValueError: If no site with that name is listed.
        """
        site_id = self._get_site_id(site_name)
        children = self._get_drive_root_children_json(site_id).get('value', [])

        # Map item name -> item id
        file_ids_by_name = {item['name']: item['id'] for item in children}
        file_id = file_ids_by_name.get(file_name)
        if file_id is None:
            print(f"File '{file_name}' was not found in the drive root of site '{site_name}'.")
            return None

        file_json = self._get_json(
            f"{self.GRAPH_BASE_URL}/sites/{site_id}/drive/items/{file_id}")
        return self.flatten_json_response_metadata_to_spark(file_json)

    # Get the metadata of every item in the drive root
    def get_all_specific_file_metadata_with_site_name(self, df, site_name):
        """Return the individual metadata of every drive-root item as one Spark DataFrame.

        One request is made per item. Previously only the item named
        ``'folder'`` was requested, and its raw payload (which contains user
        details) was printed; the payload is no longer printed.

        Args:
            df: Ignored; kept for backward compatibility.
            site_name: Value of the site's ``name`` field.

        Raises:
            ValueError: If no site with that name is listed.
        """
        site_id = self._get_site_id(site_name)
        children = self._get_drive_root_children_json(site_id).get('value', [])

        # Collect one metadata document per item
        response_list = [
            self._get_json(f"{self.GRAPH_BASE_URL}/sites/{site_id}/drive/items/{item['id']}")
            for item in children
        ]
        print(len(response_list))

        if not response_list:
            # A schema cannot be inferred from an empty list
            return self.spark.createDataFrame([], schema=StructType([]))
        return self.spark.createDataFrame(response_list)

    # run function to be created accordingly
    def test_run(self):
        """Call every lookup once against the site named ``'file_system'``.

        Returns:
            A 6-tuple: sites DataFrame, specific file metadata, site metadata,
            drive-root listing, specific file metadata (again) and the metadata
            of all drive-root items. The duplicate entry is kept so existing
            unpacking code keeps working.
        """
        # get sites metadata df
        df = self.get_sites_metadata()
        # get site metadata df with site name
        metadata_dataframe_with_site_name = self.get_metadata_dataframe_with_site_name(
            df, 'file_system')
        # get the drive-root listing of that site
        files_metadata_dataframe_with_site_name = self.get_all_files_metadata_with_site_name(
            df, 'file_system')
        # Get specific file metadata df
        specific_file_metadata = self.get_specific_file_metadata_with_site_name(
            df, 'file_system', 'file_system_files_metadata.csv')
        # Get all specific file metadata df
        all_specific_file_metadata = self.get_all_specific_file_metadata_with_site_name(
            df, 'file_system')

        return df, specific_file_metadata, metadata_dataframe_with_site_name, files_metadata_dataframe_with_site_name, specific_file_metadata, all_specific_file_metadata


# Smoke test: build the connector defined above and run its test_run() method.
# 'dummy_connector' is neither 'databricks' nor 'yaml', so the constructor takes its
# fallback credentials branch.
sp_connector = Sharepoint_connector_API('dummy_connector', None)
# test_run() returns six values; `specific_file_metadata` occurs twice in that tuple,
# which is why the name is repeated on the left-hand side.
df, specific_file_metadata, metadata_dataframe_with_site_name, files_metadata_dataframe_with_site_name, specific_file_metadata, all_specific_file_metadata = sp_connector.test_run()
# Preview the first five rows of the drive-root listing
files_metadata_dataframe_with_site_name.show(5)


Access token created using Azure AD
{'@odata.context': "https://graph.microsoft.com/v1.0/$metadata#sites('xbpmf.sharepoint.com%2Cd8b00d65-d0c3-4b04-9a11-bb738275aa47%2C57216035-9a89-4f6f-8517-51cfdc6be081')/drive/items/$entity", 'createdDateTime': '2023-12-19T09:43:58Z', 'eTag': '"{5ADBB01F-63A2-48F0-B7FC-6F524CD7A461},1"', 'id': '01SCS67TQ7WDNVVITD6BELP7DPKJGNPJDB', 'lastModifiedDateTime': '2023-12-19T09:43:58Z', 'name': 'folder', 'webUrl': 'https://xbpmf.sharepoint.com/sites/file_system/Shared%20Documents/folder', 'cTag': '"c:{5ADBB01F-63A2-48F0-B7FC-6F524CD7A461},0"', 'size': 8803, 'createdBy': {'user': {'email': 'Maxi@xbpmf.onmicrosoft.com', 'id': '8a763747-9377-4d44-aa0b-adc5c9e55010', 'displayName': 'Maxi'}}, 'lastModifiedBy': {'user': {'email': 'Maxi@xbpmf.onmicrosoft.com', 'id': '8a763747-9377-4d44-aa0b-adc5c9e55010', 'displayName': 'Maxi'}}, 'parentReference': {'driveType': 'documentLibrary', 'driveId': 'b!ZQ2w2MPQBEuaEbtzgnWqRzVgIVeJmm9PhRdRz9xr4IF7-qCj4Lf5Q64yaUZgyiv_', 'id'

In [None]:
# Preview the first five rows of the drive-root listing produced by the previous cell
# (same call as the last statement of that cell).
files_metadata_dataframe_with_site_name.show(5)


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------+-------------+
|     createdDateTime|                eTag|                  id|lastModifiedDateTime|                name|              webUrl|                cTag|size|           createdBy|      lastModifiedBy|     parentReference|      fileSystemInfo|        folder|       shared|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------+-------------+
|2023-12-19T09:43:58Z|"{5ADBB01F-63A2-4...|01SCS67TQ7WDNVVIT...|2023-12-19T09:43:58Z|              folder|https://xbpmf.sha...|"c:{5ADBB01F-63A2...|8803|{user={id=8a76374...|{user={id=8a76374...|{pat