In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [71]:

def normalize_fund_name(fund_name):
    """Turn a fund name into a lowercase, hyphen-separated slug.

    The slug is what ``get_aum`` interpolates into the firm URL.

    Steps:
      1. lowercase the name;
      2. drop every character that is not an ASCII letter, digit or whitespace
         (punctuation such as ``,``, ``.``, ``&`` and existing hyphens is removed,
         not replaced);
      3. collapse each run of whitespace into a single ``-``;
      4. strip hyphens left at either end by leading/trailing whitespace.

    Parameters
    ----------
    fund_name : str
        Fund name as it appears in the source data.

    Returns
    -------
    str
        The slug; an empty string when the name holds no letters or digits.
    """
    slug = fund_name.lower()
    # Keep whitespace of any kind (not only ' ') so tabs and similar still act
    # as word separators instead of silently fusing two words together.
    slug = re.sub(r'[^a-z0-9\s]', '', slug)
    slug = re.sub(r'\s+', '-', slug)
    # Leading/trailing whitespace would otherwise yield slugs such as "-foo-".
    return slug.strip('-')

# Text of the form "AUM $<number> <BB|MM> (<YYYY-MM-DD>)". The number may carry
# comma thousands separators, which are stripped before conversion to float.
_AUM_PATTERN = re.compile(
    r'AUM\s*\$(?P<value>\d[\d,]*(?:\.\d+)?)\s*(?P<unit>BB|MM)\s*\((?P<date>\d{4}-\d{2}-\d{2})\)',
    re.IGNORECASE,
)

# Factor applied to the scraped number for each unit suffix.
_AUM_UNIT_MULTIPLIERS = {'MM': 1e6, 'BB': 1e9}

# Seconds to wait for the server; without a timeout one stalled connection
# would block a whole batch run indefinitely.
_AUM_REQUEST_TIMEOUT = 10


def _parse_aum_text(aum_text):
    """Return ``(aum_value, aum_date)`` parsed from *aum_text*, or ``None`` if no AUM entry matches."""
    match = _AUM_PATTERN.search(aum_text)
    if match is None:
        return None
    value = float(match.group('value').replace(',', ''))
    value *= _AUM_UNIT_MULTIPLIERS[match.group('unit').upper()]
    return value, match.group('date')


def get_aum(normalized_name):
    """Fetch the aum13f.com firm page for *normalized_name* and scrape its AUM.

    Parameters
    ----------
    normalized_name : str
        URL slug of the firm (see ``normalize_fund_name``).

    Returns
    -------
    tuple
        ``(aum_value, aum_date, status)`` where

        * ``aum_value`` is the AUM as a float (unit suffix already applied),
          or ``0`` when nothing could be scraped;
        * ``aum_date`` is the ``YYYY-MM-DD`` string found next to the AUM,
          or ``0`` when nothing could be scraped;
        * ``status`` is ``1`` when the page and its ``resp-lg-div`` table were
          found (even if the AUM text did not match - check ``aum_value``),
          ``2`` when the table is missing, ``3`` when the HTTP request failed.

    Request errors (including timeouts and non-2xx responses) are caught,
    reported with ``print`` and signalled through ``status``; they are not raised.
    """
    url = f'https://aum13f.com/firm/{normalized_name}'
    aum_value, aum_date = 0, 0

    try:
        response = requests.get(url, timeout=_AUM_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving the URL for {normalized_name}: {e}")
        return aum_value, aum_date, 3

    soup = BeautifulSoup(response.text, 'html.parser')
    aum_field = soup.find("table", {"class": "resp-lg-div"})
    if not aum_field:
        print(f"AUM field not found on the page for {normalized_name}")
        return aum_value, aum_date, 2

    parsed = _parse_aum_text(aum_field.get_text(strip=True))
    if parsed is None:
        print(f"AUM format not found in the text for {normalized_name}")
        return aum_value, aum_date, 1

    aum_value, aum_date = parsed
    return aum_value, aum_date, 1



def aum_dataset(fund_list):
    """Scrape the AUM of every fund in *fund_list* and split the results.

    Each name is normalized with ``normalize_fund_name``; names that normalize
    to an already-processed slug are skipped so every firm page is requested
    at most once. Entries that are not strings (e.g. ``None`` or ``NaN`` for a
    missing name) cannot be looked up and are skipped.

    Parameters
    ----------
    fund_list : iterable
        Fund names, e.g. a list or a pandas Series.

    Returns
    -------
    (aum_df, failed_df) : tuple of pandas.DataFrame
        ``aum_df`` has columns ``fund_name``, ``aum``, ``aum_date`` for funds
        whose AUM was scraped. ``failed_df`` has columns ``fund_name``,
        ``normalized_name``, ``reason`` where ``reason`` is ``0`` when the URL
        could not be retrieved and ``1`` when the page was retrieved but no AUM
        was extracted. Both frames keep these columns even when empty.
    """
    aum_columns = ['fund_name', 'aum', 'aum_date']
    failed_columns = ['fund_name', 'normalized_name', 'reason']
    # get_aum reports a failed request with status 3; 0 is accepted as well
    # because that is the value this function historically checked for.
    url_failed_statuses = (0, 3)

    aum_data = []
    failed_data = []
    processed_names = set()

    for fund_name in fund_list:
        # Missing names would crash normalization; there is nothing to look up.
        if not isinstance(fund_name, str):
            continue

        normalized_name = normalize_fund_name(fund_name)
        if normalized_name in processed_names:
            continue
        processed_names.add(normalized_name)

        aum, aum_date, status = get_aum(normalized_name)

        if status in url_failed_statuses:
            failed_data.append({
                'fund_name': fund_name,
                'normalized_name': normalized_name,
                'reason': 0  # url failed
            })
        elif aum == 0:
            failed_data.append({
                'fund_name': fund_name,
                'normalized_name': normalized_name,
                'reason': 1  # url good, aum not found
            })
        else:
            aum_data.append({
                'fund_name': fund_name,
                'aum': aum,
                'aum_date': aum_date
            })

    # Explicit columns keep the schema stable when a list is empty.
    aum_df = pd.DataFrame(aum_data, columns=aum_columns)
    failed_df = pd.DataFrame(failed_data, columns=failed_columns)

    return aum_df, failed_df
        
    
# Fund names to look up: the NAME column of `test_df` (loaded in a separate cell).
fund_list = test_df.loc[:, "NAME"]

# Scrape each distinct fund; successes and failures come back as separate frames.
aum_df, failed_df = aum_dataset(fund_list=fund_list)


AUM format not found in the text for desjardins-global-asset-management-inc
AUM format not found in the text for ladenburg-thalmann-asset-management
AUM format not found in the text for zacks-investment-management
AUM format not found in the text for meritage-portfolio-management
AUM format not found in the text for davidson-investment-advisors
AUM format not found in the text for consolidated-press-international-holdings-ltd
AUM field not found on the page for the-glenmede-trust-company-na
AUM field not found on the page for alps-advisors
AUM format not found in the text for arden-trust-co
AUM format not found in the text for franklin-resources-inc
AUM format not found in the text for spx-gestao-de-recursos-ltda
AUM format not found in the text for ci-investments-inc
AUM field not found on the page for russell-investments-group-ltd
AUM field not found on the page for altrinsic-global-advisor-llc
AUM field not found on the page for kennedy-capital-management
AUM format not found in the

In [54]:
# Read only the NAME column from the tab-separated OTHERMANAGER.tsv file.
test_df = pd.read_csv(
    'Data/01jun2024-31aug2024_form13f/OTHERMANAGER.tsv',
    sep="\t",
    usecols=["NAME"],
)

In [72]:
# Funds for which aum_dataset extracted an AUM value and date.
aum_df

Unnamed: 0,fund_name,aum,aum_date
0,TOWNSQUARE CAPITAL LLC,6.200000e+09,2023-03-31
1,"Granite Investment Partners, LLC",2.600000e+09,2023-03-24
2,"OSAIC WEALTH, INC.",9.460000e+10,2024-08-26
3,WRIGHT INVESTORS SERVICE INC,1.300000e+09,2023-03-07
4,LOGAN CAPITAL MANAGEMENT INC,2.300000e+09,2023-03-30
...,...,...,...
706,ZAZOVE ASSOCIATES LLC,2.300000e+09,2023-03-23
707,"SECURIAN ASSET MANAGEMENT, INC",4.110000e+10,2023-03-30
708,"Muzinich & Co., Inc.",3.090000e+10,2023-03-30
709,"CWM, LLC",1.990000e+10,2023-03-31


In [73]:
# Funds for which aum_dataset could not extract an AUM, with the recorded reason code.
failed_df

Unnamed: 0,fund_name,normalized_name,reason
0,Desjardins Global Asset Management Inc.,desjardins-global-asset-management-inc,1
1,LADENBURG THALMANN ASSET MANAGEMENT,ladenburg-thalmann-asset-management,1
2,ZACKS INVESTMENT MANAGEMENT,zacks-investment-management,1
3,MERITAGE PORTFOLIO MANAGEMENT,meritage-portfolio-management,1
4,DAVIDSON INVESTMENT ADVISORS,davidson-investment-advisors,1
...,...,...,...
976,Wellington,wellington,1
977,AllianceBernstein,alliancebernstein,1
978,Brandywine,brandywine,1
979,Smith Group,smith-group,1


In [60]:

# Scratch check: run the AUM lookup inline for one fund to debug the scraper.
# The last number printed is 1 when the page was retrieved and 0 when the
# request failed.
fund_name = "Desjardins Global Asset Management Inc."
normalized_name = normalize_fund_name(fund_name)
url = f'https://aum13f.com/firm/{normalized_name}'
aum_value, aum_date = 0, 0

try:
    # Bounded wait so a stalled connection cannot hang the cell.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    aum_field = soup.find("table", {"class": "resp-lg-div"})
    if aum_field:
        aum_text = aum_field.get_text(strip=True)
        # The number may contain comma thousands separators; they are stripped below.
        aum_match = re.search(
            r'AUM\s*\$(\d[\d,]*(?:\.\d+)?)\s*(BB|MM)\s*\((\d{4}-\d{2}-\d{2})\)',
            aum_text,
            re.IGNORECASE,
        )
        if aum_match:
            aum_value = float(aum_match.group(1).replace(',', ''))
            aum_unit = aum_match.group(2).upper()
            aum_date = aum_match.group(3)

            # numerical aum
            if aum_unit == 'MM':
                aum_value *= 1e6
            elif aum_unit == 'BB':
                aum_value *= 1e9

            print(aum_value, aum_date, 1)
        else:
            print(f"AUM format not found in the text for {fund_name}")
            print(aum_value, aum_date, 1)
    else:
        # The page was retrieved, so report status 1 exactly once (a second,
        # contradictory "0" line used to follow here).
        print(f"AUM field not found on the page for {fund_name}")
        print(aum_value, aum_date, 1)
except requests.exceptions.RequestException as e:
    print(f"Error retrieving the URL for {fund_name}: {e}")
    print(aum_value, aum_date, 0)


AUM format not found in the text for Desjardins Global Asset Management Inc.
0 0 1
