In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [23]:
## Read in all fund name data fro web scrapping
directory = 'Data'

fund_names = set()

for subdir, _, files in os.walk(directory):
    for filename in files:
        if filename in ["OTHERMANAGER.tsv", "OTHERMANAGER2.tsv"]:
            file_path = os.path.join(subdir, filename)
            try:
                df = pd.read_csv(file_path, sep='\t')
                # make sure name col exist
                if 'NAME' in df.columns:
                    fund_names.update(df['NAME'].dropna().unique())
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

fund_name_list = sorted(fund_names)

print(len(fund_name_list))

4486


In [24]:

# Function to normalize the fund names
def normalize_fund_name(fund_name):
    fund_name = fund_name.lower()
    fund_name = re.sub(r'[^a-zA-Z0-9 ]', '', fund_name)  # no NONE ALPHANUM
    fund_name = fund_name.replace(' ', '-') 
    fund_name = re.sub(r'--+', '-', fund_name)
    return fund_name

# Function to perform the search and scrape the AUM
def get_aum(normalized_name):
    url = f'https://aum13f.com/firm/{normalized_name}'
    aum_value, aum_date = 0, 0

    try:
        response = requests.get(url)
        response.raise_for_status()
        

        soup = BeautifulSoup(response.text, 'html.parser')
        
        aum_field = soup.find("table", {"class": "resp-lg-div"})
        if aum_field:
            aum_text = aum_field.get_text(strip=True)
            aum_match = re.search(r'AUM\s*\$(\d+(\.\d+)?)\s*(BB|MM)\s*\((\d{4}-\d{2}-\d{2})\)', aum_text, re.IGNORECASE)
            if aum_match:
                aum_value = float(aum_match.group(1).replace(',', ''))
                aum_unit = aum_match.group(3).upper()
                aum_date = aum_match.group(4)

                # numerical aum
                if aum_unit == 'MM':  
                    aum_value *= 1e6
                elif aum_unit == 'BB':  
                    aum_value *= 1e9

                return aum_value, aum_date, 1
            else:
                print(f"AUM format not found in the text for {normalized_name}")
                return aum_value, aum_date, 1
        else:
            print(f"AUM field not found on the page for {normalized_name}")
            return aum_value, aum_date, 2

    except requests.exceptions.RequestException as e:
        print(f"Error retrieving the URL for {normalized_name}: {e}")
        return aum_value, aum_date, 3



def aum_dataset(fund_list):
    aum_data = []
    failed_data = []
    processed_names = set()


    
    for fund_name in fund_list:
        normalized_name = normalize_fund_name(fund_name)
        
        if normalized_name in processed_names:
            continue

        processed_names.add(normalized_name)
        aum, aum_date, reason= get_aum(normalized_name)

        if reason == 0:
            failed_data.append({
                'fund_name': fund_name, 
                'normalized_name': normalized_name, 
                'reason': 0  # url failed
            })
        else:
            if aum == 0:
                failed_data.append({
                    'fund_name': fund_name, 
                    'normalized_name': normalized_name, 
                    'reason': 1  # url good aum not founf
                })
            else:
                aum_data.append({
                    'fund_name': fund_name, 
                    'aum': aum, 
                    'aum_date': aum_date
                })
    aum_df = pd.DataFrame(aum_data)
    failed_df = pd.DataFrame(failed_data)

    return aum_df, failed_df
        
    
# Example list of fund names (in capitalized format)
fund_list = fund_name_list

# Generate the AUM dataset
aum_df, failed_df = aum_dataset(fund_list)


AUM field not found on the page for -arrow-financial-corporation
AUM field not found on the page for -artemis-investment-management-llp
AUM field not found on the page for -bp-plc
AUM field not found on the page for -burgundy-asset-management-ltd
AUM field not found on the page for -first-commonwealth-financial-corp-pa
AUM field not found on the page for -gradient-investments-llc
AUM field not found on the page for -hills-bank-trust-co
AUM field not found on the page for -lsv-asset-management
AUM field not found on the page for -mg-investment-management-limited
AUM field not found on the page for -overseachinese-banking-corporation-limited
AUM field not found on the page for 02806460-st-bank
AUM field not found on the page for 12th-street-asset-mgmt
AUM field not found on the page for 13f-manager
AUM format not found in the text for 140-summer-partners-master-fund-lp
AUM field not found on the page for 1832-asset-management-lp
AUM field not found on the page for 1919-investment-council

KeyboardInterrupt: 

In [14]:
# aum_df.to_csv("Data/aum_df", index=False)

In [None]:
aum_df

In [11]:
failed_df

Unnamed: 0,fund_name,normalized_name,reason
0,Desjardins Global Asset Management Inc.,desjardins-global-asset-management-inc,1
1,LADENBURG THALMANN ASSET MANAGEMENT,ladenburg-thalmann-asset-management,1
2,ZACKS INVESTMENT MANAGEMENT,zacks-investment-management,1
3,MERITAGE PORTFOLIO MANAGEMENT,meritage-portfolio-management,1
4,DAVIDSON INVESTMENT ADVISORS,davidson-investment-advisors,1
...,...,...,...
976,Wellington,wellington,1
977,AllianceBernstein,alliancebernstein,1
978,Brandywine,brandywine,1
979,Smith Group,smith-group,1


In [60]:

# # Function to perform the search and scrape the AUM
# fund_name = "Desjardins Global Asset Management Inc."
# normalized_name = normalize_fund_name(fund_name)
# url = f'https://aum13f.com/firm/{normalized_name}'
# aum_value, aum_date = 0, 0

# try:
#     response = requests.get(url)
#     response.raise_for_status()
    

#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     aum_field = soup.find("table", {"class": "resp-lg-div"})
#     if aum_field:
#         aum_text = aum_field.get_text(strip=True)
#         aum_match = re.search(r'AUM\s*\$(\d+(\.\d+)?)\s*(BB|MM)\s*\((\d{4}-\d{2}-\d{2})\)', aum_text, re.IGNORECASE)
#         if aum_match:
#             aum_value = float(aum_match.group(1).replace(',', ''))
#             aum_unit = aum_match.group(3).upper()
#             aum_date = aum_match.group(4)

#             # numerical aum
#             if aum_unit == 'MM':  
#                 aum_value *= 1e6
#             elif aum_unit == 'BB':  
#                 aum_value *= 1e9

#             print(aum_value, aum_date, 1)
#         else:
#             print(f"AUM format not found in the text for {fund_name}")
#             print(aum_value, aum_date, 1)
#     else:
#         print(f"AUM field not found on the page for {fund_name}")
#         print(aum_value, aum_date, 1)

#         print(aum_value, aum_date, 0) 
# except requests.exceptions.RequestException as e:
#     print(f"Error retrieving the URL for {fund_name}: {e}")
#     print(aum_value, aum_date, 0) 


AUM format not found in the text for Desjardins Global Asset Management Inc.
0 0 1
