In [1]:
from capfourpy.authentication import get_access_token_interactive, get_azure_db_token_api
from capfourpy.sharepoint import SharePoint
from urllib.parse import urlparse

import os
import pandas as pd
import platform
import time

**Subclassing**: CustomSharePoint inherits from SharePoint, so it will have all the same methods and attributes.

**Method Override**: By defining fetch_list_data within CustomSharePoint, it overrides the fetch_list_data method of the base SharePoint class.

In [None]:
class CustomSharePoint(SharePoint):
    """
    Subclass that inherits from capfourpy SharePoint
    """

    # Class variable to store the token
    idp_token = None

    def _generate_token(self, token: str = "missing idp token"):
        """
        Retrieves an authentication token using different methods based on the environment,
        and caches it as a class-level variable to avoid repeated authentications.

        Parameters
        ----------
        token (str, optional): Default token value, used when deployed. Defaults to "missing idp token".

        Returns
        -------
        str: Authentication token for accessing SharePoint API.
        """
        # Check if the token is already cached
        if CustomSharePoint.idp_token is not None:
            return CustomSharePoint.idp_token

        # Generate the token - different methods for hosted and local
        if platform.system() == "Linux":
            try:
                token = get_azure_db_token_api(scope=self.Scope)
            except:
                token = token  # Should always have a value when deployed, otherwise it will fail
        else:
            print("get_access_token_interactive")
            token = get_access_token_interactive(self.Client_Id, self.Tenant_Id, self.Scope)

        # Cache the token at the class level
        CustomSharePoint.idp_token = token
        return token


    def fetch_list_data(self, ListId: str = None, SiteUrl: str = None) -> pd.DataFrame:
        """
        Retrieves all data from a specified SharePoint list and converts it to a DataFrame.

        Parameters
        ----------
        ListId (str, optional): The unique identifier of the SharePoint list to retrieve.

        Returns
        -------
        pd.DataFrame: DataFrame containing all items from the specified SharePoint list.
        """
        large_list = self.ctx.web.lists.get_by_id(list_id=ListId)

        # items = large_list.items.get().execute_query()
        items = large_list.items.get_all().execute_query()
        # items = large_list.items.get_all().execute_query(500) # adding some number makes it run faster dunno why
        data = [item.properties for item in items]

        return pd.DataFrame(data)


    def get_files_metadata(self, folder_url: str):
        """
        Retrieves files in the specified SharePoint folder along with their metadata.

        Parameters
        ----------
        folder_url : str
            The relative URL of the target SharePoint folder.

        Returns
        -------
        List
            List of files with metadata in the specified folder.
        """
        folder = self.ctx.web.get_folder_by_server_relative_url(folder_url)
        # Expand to include ListItemAllFields to access metadata
        files_metadata = folder.files.expand(["ListItemAllFields"]).get().execute_query()
        return files_metadata

In [3]:
ListId = "6ba7678f-2b65-4ad4-8759-21b68035c8c8"
SiteUrl = "https://c4.sharepoint.com/sites/IMP"

sp = CustomSharePoint(site_url=SiteUrl)

get_access_token_interactive
No accounts found in cache.
No cached token found or expired. Initiating interactive authentication...


In [4]:
sp_data = sp.fetch_list_data(SiteUrl=SiteUrl, ListId=ListId)
print(sp_data.shape)
print(sp_data.columns)
print(sp_data.iloc[0].values)
print(sp_data.EB_SPWebUrl.values[0])
print(sp_data.EB_Url.values[0])

(3865, 27)
Index(['FileSystemObjectType', 'Id', 'ServerRedirectedEmbedUri',
       'ServerRedirectedEmbedUrl', 'ID', 'ContentTypeId', 'Title', 'Modified',
       'Created', 'AuthorId', 'EditorId', 'OData__UIVersionString',
       'Attachments', 'GUID', 'ComplianceAssetId', 'EB_Created', 'EB_ID',
       'EB_ListID', 'EB_SiteID', 'EB_SiteTitle', 'EB_SPWebUrl', 'EB_Url',
       'FlowLog', 'EB_UniqueID', 'EB_Author', 'OData__ColorTag',
       'EB_NoteType'],
      dtype='object')
[np.int64(0) np.int64(1) None '' np.int64(1)
 '0x0100F85F83B9B7B2164B971E241601880D090012394ABE830548438DEE171159507E12'
 'Faurecia FY 2020' '2021-12-01T12:22:22Z' '2021-12-01T12:22:22Z'
 np.int64(1073741822) np.int64(1073741822) '1.0' np.False_
 '6aa973a0-d18c-4c15-9b4f-01f187e1f20d' None '2021-07-21T04:57:14Z'
 np.int64(12) 'cc3155c9-9c23-4de3-a701-624e2d25feab'
 'b689d508-5290-4291-aa89-61470e5c2413' 'Faurecia'
 'https://c4.sharepoint.com/sites/Faurecia73'
 'https://c4.sharepoint.com/sites/Faurecia73/SitePages/

In [None]:
# Dictionary to store CustomSharePoint instances for each site URL to avoid redundant initializations
sp_instances = {}


def download_specific_files_for_row(row, desired_document_type, desired_document_subtype):
    """
    Downloads files with specific metadata values from the '/Reorg/' document library for a given site.
    Saves the files into the specified output_folder.

    Parameters
    ----------
    row : pd.Series
        A row from the dataframe containing 'EB_SPWebUrl'.
    desired_document_type : str
        The desired value for the "Document Type" column.
    desired_document_subtype : str
        The desired value for the "Document SubType" column.
    output_folder : str
        The folder path where files will be saved.
    """
    site_url = row['EB_SPWebUrl']
    rms_id = row['RmsId']
    if pd.isna(site_url):
        return

    # Use or create a CustomSharePoint instance for this site URL
    if site_url not in sp_instances:
        sp_instances[site_url] = CustomSharePoint(site_url=site_url)
    sp = sp_instances[site_url]

    # Construct the server-relative URL for the '/Reorg/' folder
    parsed_url = urlparse(site_url)
    server_relative_url = parsed_url.path.rstrip('/') + '/Reorg/'

    # Initialize a flag to check if the folder has been created
    folder_created = False
    files_downloaded = False  # Flag to check if any files were downloaded

    # Get files with metadata in the folder
    try:
        files = sp.get_files_metadata(server_relative_url)
        for file in files:
            list_item_properties = file.listItemAllFields.properties
            document_type = list_item_properties.get("DocumentType", None)
            document_subtype = list_item_properties.get("DocumentSubType", None)
            
            if document_type == desired_document_type and document_subtype == desired_document_subtype:
                # Get the file name
                file_name = file.name

                # Define the output folder path
                output_folder = os.path.join('../data/raw/sharepoint_reorg_files/', str(rms_id))

                # Check if the file already exists
                file_path = os.path.join(output_folder, file_name)
                if not os.path.exists(file_path):
                    # Create the folder if it hasn't been created yet
                    if not folder_created:
                        os.makedirs(output_folder, exist_ok=True)
                        folder_created = True

                    # Download the file
                    file_url = file.serverRelativeUrl
                    file_stream = sp.download_file(file_url)
                    # Save the file into the specified output folder
                    with open(file_path, 'wb') as f:
                        f.write(file_stream.read())
                    print(f"Downloaded file {file_name} from {site_url} to {output_folder}")
                    files_downloaded = True
                else:
                    pass
                    #print(f"File {file_name} already exists in {output_folder}, skipping download.")
        if not files_downloaded:
            print(f"No new files to download for RmsId {rms_id} at {server_relative_url}")
    except Exception as e:
        print(f"Failed to download files from {server_relative_url}: {e}")

### SQL Connection
- **fundamental_score**: `CfRms_prod` Fundamental Scores Data
- **rms_issuer**: `CfRms_prod` Linking RmsId to SharePoint Pages

In [None]:
from capfourpy.databases import Database
#import pickle as pkl


# To retrieve data from the Azure database:
db = Database(database="CfRms_prod", azure=True)
sql_query_fundamental_score = """
WITH tbl1 AS(
	SELECT r.ScoringId,
		   r.RmsId,
		   t.TemplateName AS ScoringType,
		   r.ScoringDate,
		   cat.Grouping AS CategoryGroup,
		   cat.Name AS Category,
		   rc.Score,
		   rc.Text,
		   (
			   SELECT c.Description AS CharacteristicText,
					  c.Influence AS CharacteristicInfluence
			   FROM Scoring.ResultCharacteristic AS rca
				   LEFT JOIN Scoring.Characteristic AS c ON c.CategoryId = rca.CategoryId AND c.CharacteristicId = rca.CharacteristicId
			   WHERE rca.ScoringId = rc.ScoringId AND rca.CategoryId = rc.CategoryId
			   FOR JSON PATH
		   ) AS TaggedCharacteristics
	FROM Scoring.Result AS r
		INNER JOIN Scoring.Template AS t ON t.TemplateId = r.TemplateId
		INNER JOIN Scoring.ResultCategory AS rc ON rc.ScoringId = r.ScoringId
		INNER JOIN Scoring.Category AS cat ON cat.CategoryId = rc.CategoryId
	WHERE t.TemplateName = 'Corporate'
)
SELECT * FROM tbl1 WHERE TaggedCharacteristics IS NOT NULL
"""

sql_query_rms_issuer = """
SELECT *
FROM [CfRms_prod].[Core].[RmsIssuer]
WHERE SharePointLink IS NOT NULL
"""

# Define dataframes from sql queries
fundamental_score = db.read_sql(sql_query_fundamental_score)
rms_issuer = db.read_sql(sql_query_rms_issuer)

# Keep only relevant columns
columns_to_remove_fundamental_score = ['ScoringId', 'ScoringType', 'Text']
columns_to_remove_rms_issuer = ['PrimaryAnalystId', 'SecondaryAnalystId', 'ResearchTeam',
       'CompanyDescription', 'BondTicker',
       'OperatingCountryIso', 'Industry', 'Sponsor', 'MajorityOwnership',
       'MinorityOwnership', 'WhyInvested', 'CreditPositives',
       'CreditNegatives', 'CreditView', 'BookType', 
       'UpdateUser', 'SharePointExcelModel', 'SharePointSiteName',
       'SharePointProvisioningStatus', 'SubIndustry',
       'SharePointProvisioningMessage'] # 'Status',

fundamental_score = fundamental_score.drop(columns=columns_to_remove_fundamental_score)
rms_issuer = rms_issuer.drop(columns=columns_to_remove_rms_issuer)

# Get a DataFrame with only names that have Fundamental Score
rms_with_fundamental_score = fundamental_score.merge(rms_issuer, on='RmsId', how='left')
rms_with_fundamental_score["SharePointLinkTruncated"] = rms_with_fundamental_score["SharePointLink"].apply(lambda x: x[:-1] if str(x).endswith('/') else x)
#rms_with_fundamental_score.to_pickle("./rms_with_fundamental_score.pkl")
rms_with_fundamental_score.head(2)

Unnamed: 0,RmsId,ScoringDate,CategoryGroup,Category,Score,TaggedCharacteristics,CompanyName,Status,SharePointLink,SharePointLinkTruncated
0,194,2021-04-15,Industry,Market Dynamics,2.0,"[{""CharacteristicText"":""Positive demographic, ...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
1,194,2021-04-15,Industry,Intra-Industry Competition,2.0,"[{""CharacteristicText"":""Market share is consol...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194


In [7]:
# Save subset of rms_with_fundamental_score df just to test it
df = rms_with_fundamental_score.head(25)
#df.to_pickle("./df_rms_with_fundamental_score.pkl")

In [8]:
# # Let's see how many SharePoint Sites we can link from our rms_with_fundamental_score to sp_data
# unique_site_urls_sp_data = set(sp_data["EB_SPWebUrl"])
# unique_site_urls_rms_data = set(rms_with_fundamental_score["SharePointLinkTruncated"])

# count_unique_sp_data_site_urls = len(unique_site_urls_sp_data)
# count_rms_site_urls = len(unique_site_urls_rms_data)
# common_elements_count = len(unique_site_urls_sp_data.intersection(unique_site_urls_rms_data))

# print(count_unique_sp_data_site_urls, count_rms_site_urls, common_elements_count)
# unique_to_rms_site_urls = unique_site_urls_rms_data - unique_site_urls_sp_data
# unique_to_rms_site_url

In [9]:
sp_data_unique = sp_data.drop_duplicates(subset='EB_SPWebUrl')
# sp_data_unique_50 = sp_data_unique.head(50)

# Clean and prepare URLs for matching
sp_data_unique['EB_SPWebUrl_cleaned'] = sp_data_unique['EB_SPWebUrl'].astype(str).str.rstrip('/')
rms_with_fundamental_score['SharePointLinkTruncated_cleaned'] = rms_with_fundamental_score['SharePointLinkTruncated'].astype(str).str.rstrip('/')

# Merge dataframes on cleaned URLs
merged_data = sp_data_unique.merge(
    rms_with_fundamental_score[['SharePointLinkTruncated_cleaned', 'RmsId', 'Status']],
    left_on='EB_SPWebUrl_cleaned',
    right_on='SharePointLinkTruncated_cleaned',
    how='left')

# Cast RMS and drop rows with missing RmsId
merged_data = merged_data.dropna(subset=["RmsId"])
merged_data["RmsId"] = merged_data["RmsId"].astype(int)

# Remove duplicates if necessary
merged_data_unique = merged_data.drop_duplicates(subset='EB_SPWebUrl_cleaned')
merged_data_unique.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sp_data_unique['EB_SPWebUrl_cleaned'] = sp_data_unique['EB_SPWebUrl'].astype(str).str.rstrip('/')


Unnamed: 0,FileSystemObjectType,Id,ServerRedirectedEmbedUri,ServerRedirectedEmbedUrl,ID,ContentTypeId,Title,Modified,Created,AuthorId,...,EB_Url,FlowLog,EB_UniqueID,EB_Author,OData__ColorTag,EB_NoteType,EB_SPWebUrl_cleaned,SharePointLinkTruncated_cleaned,RmsId,Status
0,0,1,,,1,0x0100F85F83B9B7B2164B971E241601880D090012394A...,Faurecia FY 2020,2021-12-01T12:22:22Z,2021-12-01T12:22:22Z,1073741822,...,https://c4.sharepoint.com/sites/Faurecia73/Sit...,,a2f533c1-1683-48b4-9d26-a712c114d2e3,Aske Taastrøm,,,https://c4.sharepoint.com/sites/Faurecia73,https://c4.sharepoint.com/sites/Faurecia73,127,Active
7,0,5,,,5,0x0100F85F83B9B7B2164B971E241601880D090012394A...,Techem - 3Q21 Results - Positive,2021-12-01T12:22:35Z,2021-12-01T12:22:35Z,1073741822,...,https://c4.sharepoint.com/sites/4692/SitePages...,,4c6d3508-2663-4158-b37b-c1260b73f742,Andreas Dahl Jensen,,,https://c4.sharepoint.com/sites/4692,https://c4.sharepoint.com/sites/4692,287,Active


In [10]:
merged_data_unique.columns

Index(['FileSystemObjectType', 'Id', 'ServerRedirectedEmbedUri',
       'ServerRedirectedEmbedUrl', 'ID', 'ContentTypeId', 'Title', 'Modified',
       'Created', 'AuthorId', 'EditorId', 'OData__UIVersionString',
       'Attachments', 'GUID', 'ComplianceAssetId', 'EB_Created', 'EB_ID',
       'EB_ListID', 'EB_SiteID', 'EB_SiteTitle', 'EB_SPWebUrl', 'EB_Url',
       'FlowLog', 'EB_UniqueID', 'EB_Author', 'OData__ColorTag', 'EB_NoteType',
       'EB_SPWebUrl_cleaned', 'SharePointLinkTruncated_cleaned', 'RmsId',
       'Status'],
      dtype='object')

### Download Offerings (Prospectuses) From SharePoint Pages

In [11]:
# Define the desired metadata values
desired_document_type = "Legal"
desired_document_subtype = "Offerings"

# Create a dictionary to store CustomSharePoint instances
sp_instances = {}

# Process each row in the merged dataframe
for index, row in merged_data_unique.iterrows():
    if pd.isna(row['RmsId']):
        continue  # Skip rows where there is no matching RmsId
    # Download files; the function will create the folder only if files are downloaded
    download_specific_files_for_row(row, desired_document_type, desired_document_subtype)

No new files to download for RmsId 127 at /sites/Faurecia73/Reorg/
No new files to download for RmsId 287 at /sites/4692/Reorg/
No new files to download for RmsId 12 at /sites/12/Reorg/
No new files to download for RmsId 312 at /sites/Axilone77/Reorg/
No new files to download for RmsId 135 at /sites/135/Reorg/


KeyboardInterrupt: 

**List of `RmsId` that does have Fundamental Score but does not have Findox Offering PDF** 

In [12]:
# Print the list of RmsId that does not have Prospectus
sorted_rms_id_df = merged_data_unique[["RmsId"]].sort_values(by="RmsId").reset_index(drop=True)
rms_id_list = sorted_rms_id_df["RmsId"].to_list()

directory_path = "../data/raw/sharepoint_reorg_files/"
# Get list of RmsId folders that actually exist in the directory
existing_folders = [int(folder) for folder in os.listdir(directory_path) if folder.isdigit() and int(folder) in rms_id_list]

# List of RmsId that do not have an associated folder
rms_id_without_folders = [rms_id for rms_id in rms_id_list if rms_id not in existing_folders]
print(rms_id_without_folders)

[1, 2, 5, 10, 12, 13, 14, 17, 20, 24, 32, 33, 35, 37, 40, 41, 46, 51, 52, 55, 56, 63, 65, 66, 68, 74, 75, 78, 80, 81, 83, 84, 86, 89, 95, 98, 105, 112, 116, 120, 128, 132, 134, 135, 140, 143, 145, 147, 149, 153, 159, 160, 161, 168, 171, 174, 178, 183, 185, 196, 200, 209, 218, 223, 224, 225, 227, 229, 232, 236, 238, 241, 242, 246, 249, 257, 263, 265, 268, 271, 273, 274, 275, 278, 279, 280, 282, 291, 293, 295, 296, 300, 301, 302, 305, 307, 309, 311, 312, 316, 319, 323, 324, 327, 329, 331, 333, 335, 339, 342, 344, 345, 349, 355, 356, 358, 360, 362, 370, 372, 374, 376, 384, 385, 400, 413, 414, 420, 428, 431, 432, 438, 451, 453, 454, 488, 489, 496, 499, 508, 512, 517, 518, 593, 613, 614, 615, 616, 621, 622, 626, 634, 639, 645, 646, 647, 648, 649, 653, 654, 655, 659, 662, 663, 664, 672, 673, 674, 675, 679, 680, 682, 683, 684, 765, 767, 768, 772, 816, 839, 845, 884, 901, 904, 905, 906, 907, 908, 911, 913, 917, 920, 924, 935, 936, 945, 946, 947, 948, 949, 950, 951, 953, 954, 976, 986, 987, 988