In [1]:
####fidelity 

In [2]:
import pandas as pd

df = pd.read_csv("xmllinksfidelity.csv")

# Convert filing_date column to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Sort by filing_date ascending (oldest first)
df_sorted = df.sort_values('filing_date').reset_index(drop=True)

# Save back to CSV (optional)
df_sorted.to_csv('sortedfidelityxmllinks.csv', index=False)

print(df_sorted.head())


  filing_date filing_month                                           xml_link
0  2016-06-07      2016-06  https://www.sec.gov/Archives/edgar/data/917286...
1  2016-07-08      2016-07  https://www.sec.gov/Archives/edgar/data/917286...
2  2016-08-04      2016-08  https://www.sec.gov/Archives/edgar/data/917286...
3  2016-09-08      2016-09  https://www.sec.gov/Archives/edgar/data/917286...
4  2016-10-07      2016-10  https://www.sec.gov/Archives/edgar/data/917286...


In [3]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

def find_local_name(elem, name):
    """Find the text content of a tag by its local name."""
    for el in elem.iter():
        if el.tag.endswith('}' + name) or el.tag == name:
            return el.text
    return None

def find_series_info(root):
    """Find series-level info node that might contain averagePortfolioMaturity."""
    for tag in ['seriesLevelInformation', 'seriesLevelInfo']:
        for el in root.iter():
            if el.tag.endswith('}' + tag) or el.tag == tag:
                return el
    return None

# Load your sorted links CSV file
df_links = pd.read_csv("sortedfidelityxmllinkstest.csv")

# Ensure links are sorted by filing_date ascending
df_links['filing_date'] = pd.to_datetime(df_links['filing_date'], errors='coerce')
df_links.sort_values('filing_date', inplace=True)
df_links.reset_index(drop=True, inplace=True)

xml_links = df_links['xml_link'].dropna().tolist()

data = []

headers = {
    'User-Agent': 'Nathrah (Nathrah.Sharul-Nizam@bayes.city.ac.uk)'
}

for idx, url in enumerate(xml_links):
    try:
        time.sleep(1)  # respectful delay
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        root = ET.fromstring(response.content)

        cik = url.split('/')[6]
        filing_id = url.split('/')[7]

        # Extract seriesId and averagePortfolioMaturity
        series_id = find_local_name(root, 'seriesId')
        fund_name = f"Fund {series_id}" if series_id else "Unknown Fund"

        series_info = find_series_info(root)
        wam_text = find_local_name(series_info, 'averagePortfolioMaturity') if series_info else find_local_name(root, 'averagePortfolioMaturity')

        wam = None
        if wam_text:
            try:
                wam_cleaned = wam_text.strip().replace(',', '')
                wam = float(wam_cleaned)
            except ValueError:
                print(f"Failed to parse WAM '{wam_text}' from {url}")

        # 🆕 Extract netAssetOfSeries (AUM)
        net_assets_text = find_local_name(root, 'netAssetOfSeries')
        net_assets = None
        if net_assets_text:
            try:
                net_assets = float(net_assets_text.strip().replace(',', ''))
            except ValueError:
                print(f"Failed to parse Net Assets '{net_assets_text}' from {url}")

        doc_period_end = find_local_name(root, 'documentPeriodEndDate')

        # Get filing date from df_links (same index as url)
        filing_date_val = df_links.loc[idx, 'filing_date']

        print(f"✓ Processed: {url}")
        print(f"CIK: {cik}, Filing ID: {filing_id}")
        print(f"Fund Name: {fund_name}")
        print(f"WAM: {wam} days, Net Assets: {net_assets}, Period End: {doc_period_end}, Filing Date: {filing_date_val}")
        print('-' * 40)

        data.append({
            'CIK': cik,
            'Filing ID': filing_id,
            'Fund Name': fund_name,
            'WAM (days)': wam,
            'Net Assets': net_assets,
            'Document Period End': doc_period_end,
            'Filing Date': filing_date_val,
            'URL': url
        })

    except Exception as e:
        print(f"✗ Failed to process {url}: {e}")
        continue

# Create DataFrame
df = pd.DataFrame(data)

# Convert date columns to datetime
df['Document Period End'] = pd.to_datetime(df['Document Period End'], errors='coerce')
df['Filing Date'] = pd.to_datetime(df['Filing Date'], errors='coerce')

# Sort and save
df.sort_values('Filing Date', inplace=True)
df.reset_index(drop=True, inplace=True)

df.to_csv('n_mfp_fidelitytest.csv', index=False)
print("✅ Data saved to 'n_mfp_fidelitytest.csv'")
print(df.head())


✓ Processed: https://www.sec.gov/Archives/edgar/data/917286/000091728616000226/primary_doc.xml
CIK: 917286, Filing ID: 000091728616000226
Fund Name: Fund S000007052
WAM: 36.0 days, Net Assets: 6029429812.73, Period End: None, Filing Date: 2016-06-07 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/917286/000091728616000238/primary_doc.xml
CIK: 917286, Filing ID: 000091728616000238
Fund Name: Fund S000007052
WAM: 37.0 days, Net Assets: 5543204683.39, Period End: None, Filing Date: 2016-07-08 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/917286/000091728616000244/primary_doc.xml
CIK: 917286, Filing ID: 000091728616000244
Fund Name: Fund S000007052
WAM: 29.0 days, Net Assets: 5726992901.01, Period End: None, Filing Date: 2016-08-04 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/917286/000091728616000251/primary_doc.xml
CIK: 

In [4]:
#### jpmorgan

In [3]:
import pandas as pd

df = pd.read_csv("xmllinksjpmorgantest.csv")

# Convert filing_date column to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Sort by filing_date ascending (oldest first)
df_sorted = df.sort_values('filing_date').reset_index(drop=True)

# Save back to CSV (optional)
df_sorted.to_csv('sortedjpmorganxmllinks.csv', index=False)

print(df_sorted.head())


  filing_date filing_month                                           xml_link
0  2016-07-08      2016-07  https://www.sec.gov/Archives/edgar/data/121728...
1  2016-08-05      2016-08  https://www.sec.gov/Archives/edgar/data/121728...
2  2016-09-08      2016-09  https://www.sec.gov/Archives/edgar/data/121728...
3  2016-10-14      2016-10  https://www.sec.gov/Archives/edgar/data/121728...
4  2016-11-07      2016-11  https://www.sec.gov/Archives/edgar/data/121728...


In [4]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

def find_local_name(elem, name):
    """Find the text content of a tag by its local name."""
    for el in elem.iter():
        if el.tag.endswith('}' + name) or el.tag == name:
            return el.text
    return None

def find_series_info(root):
    """Find series-level info node that might contain averagePortfolioMaturity."""
    for tag in ['seriesLevelInformation', 'seriesLevelInfo']:
        for el in root.iter():
            if el.tag.endswith('}' + tag) or el.tag == tag:
                return el
    return None

# Load your sorted links CSV file
df_links = pd.read_csv("sortedjpmorganxmllinks.csv")

# Ensure links are sorted by filing_date ascending
df_links['filing_date'] = pd.to_datetime(df_links['filing_date'], errors='coerce')
df_links.sort_values('filing_date', inplace=True)
df_links.reset_index(drop=True, inplace=True)

xml_links = df_links['xml_link'].dropna().tolist()

data = []

headers = {
    'User-Agent': 'Nathrah (Nathrah.Sharul-Nizam@bayes.city.ac.uk)'
}

for idx, url in enumerate(xml_links):
    try:
        time.sleep(1)  # respectful delay
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        root = ET.fromstring(response.content)

        cik = url.split('/')[6]
        filing_id = url.split('/')[7]

        # Extract seriesId and averagePortfolioMaturity
        series_id = find_local_name(root, 'seriesId')
        fund_name = f"Fund {series_id}" if series_id else "Unknown Fund"

        series_info = find_series_info(root)
        wam_text = find_local_name(series_info, 'averagePortfolioMaturity') if series_info else find_local_name(root, 'averagePortfolioMaturity')

        wam = None
        if wam_text:
            try:
                wam_cleaned = wam_text.strip().replace(',', '')
                wam = float(wam_cleaned)
            except ValueError:
                print(f"Failed to parse WAM '{wam_text}' from {url}")

        # 🆕 Extract netAssetOfSeries (AUM)
        net_assets_text = find_local_name(root, 'netAssetOfSeries')
        net_assets = None
        if net_assets_text:
            try:
                net_assets = float(net_assets_text.strip().replace(',', ''))
            except ValueError:
                print(f"Failed to parse Net Assets '{net_assets_text}' from {url}")

        doc_period_end = find_local_name(root, 'documentPeriodEndDate')

        # Get filing date from df_links (same index as url)
        filing_date_val = df_links.loc[idx, 'filing_date']

        print(f"✓ Processed: {url}")
        print(f"CIK: {cik}, Filing ID: {filing_id}")
        print(f"Fund Name: {fund_name}")
        print(f"WAM: {wam} days, Net Assets: {net_assets}, Period End: {doc_period_end}, Filing Date: {filing_date_val}")
        print('-' * 40)

        data.append({
            'CIK': cik,
            'Filing ID': filing_id,
            'Fund Name': fund_name,
            'WAM (days)': wam,
            'Net Assets': net_assets,
            'Document Period End': doc_period_end,
            'Filing Date': filing_date_val,
            'URL': url
        })

    except Exception as e:
        print(f"✗ Failed to process {url}: {e}")
        continue

# Create DataFrame
df = pd.DataFrame(data)

# Convert date columns to datetime
df['Document Period End'] = pd.to_datetime(df['Document Period End'], errors='coerce')
df['Filing Date'] = pd.to_datetime(df['Filing Date'], errors='coerce')

# Sort and save
df.sort_values('Filing Date', inplace=True)
df.reset_index(drop=True, inplace=True)

df.to_csv('n_mfp_jpmorgan.csv', index=False)
print("✅ Data saved to 'n_mfp_jpmorgan.csv'")
print(df.head())


✓ Processed: https://www.sec.gov/Archives/edgar/data/1217286/000152535016000338/primary_doc.xml
CIK: 1217286, Filing ID: 000152535016000338
Fund Name: Fund S000002965
WAM: 44.0 days, Net Assets: 20364259489.23, Period End: None, Filing Date: 2016-07-08 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/1217286/000094936516000235/primary_doc.xml
CIK: 1217286, Filing ID: 000094936516000235
Fund Name: Fund S000002966
WAM: 9.0 days, Net Assets: 1579344499.07, Period End: None, Filing Date: 2016-08-05 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/1217286/000094936516000258/primary_doc.xml
CIK: 1217286, Filing ID: 000094936516000258
Fund Name: Fund S000002968
WAM: 6.0 days, Net Assets: 962696744.73, Period End: None, Filing Date: 2016-09-08 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/1217286/000094936516000315/primary_doc.xml


In [7]:
######  goldman

In [13]:
import pandas as pd

df = pd.read_csv("xmllinksgoldman.csv")

# Convert filing_date column to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Sort by filing_date ascending (oldest first)
df_sorted = df.sort_values('filing_date').reset_index(drop=True)

# Save back to CSV (optional)
df_sorted.to_csv('sortedgoldmanxmllinks.csv', index=False)

print(df_sorted.head())


  filing_date filing_month                                           xml_link
0  2016-07-08      2016-07  https://www.sec.gov/Archives/edgar/data/822977...
1  2016-08-25      2016-08  https://www.sec.gov/Archives/edgar/data/822977...
2  2016-09-08      2016-09  https://www.sec.gov/Archives/edgar/data/822977...
3  2016-10-07      2016-10  https://www.sec.gov/Archives/edgar/data/822977...
4  2016-11-07      2016-11  https://www.sec.gov/Archives/edgar/data/822977...


In [14]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

def find_local_name(elem, name):
    """Find the text content of a tag by its local name."""
    for el in elem.iter():
        if el.tag.endswith('}' + name) or el.tag == name:
            return el.text
    return None

def find_series_info(root):
    """Find series-level info node that might contain averagePortfolioMaturity."""
    for tag in ['seriesLevelInformation', 'seriesLevelInfo']:
        for el in root.iter():
            if el.tag.endswith('}' + tag) or el.tag == tag:
                return el
    return None

# Load your sorted links CSV file
df_links = pd.read_csv("sortedgoldmanxmllinks.csv")

# Ensure links are sorted by filing_date ascending
df_links['filing_date'] = pd.to_datetime(df_links['filing_date'], errors='coerce')
df_links.sort_values('filing_date', inplace=True)
df_links.reset_index(drop=True, inplace=True)

xml_links = df_links['xml_link'].dropna().tolist()

data = []

headers = {
    'User-Agent': 'Nathrah (Nathrah.Sharul-Nizam@bayes.city.ac.uk)'
}

for idx, url in enumerate(xml_links):
    try:
        time.sleep(1)  # respectful delay
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        root = ET.fromstring(response.content)

        cik = url.split('/')[6]
        filing_id = url.split('/')[7]

        # Extract seriesId and averagePortfolioMaturity
        series_id = find_local_name(root, 'seriesId')
        fund_name = f"Fund {series_id}" if series_id else "Unknown Fund"

        series_info = find_series_info(root)
        wam_text = find_local_name(series_info, 'averagePortfolioMaturity') if series_info else find_local_name(root, 'averagePortfolioMaturity')

        wam = None
        if wam_text:
            try:
                wam_cleaned = wam_text.strip().replace(',', '')
                wam = float(wam_cleaned)
            except ValueError:
                print(f"Failed to parse WAM '{wam_text}' from {url}")

        # 🆕 Extract netAssetOfSeries (AUM)
        net_assets_text = find_local_name(root, 'netAssetOfSeries')
        net_assets = None
        if net_assets_text:
            try:
                net_assets = float(net_assets_text.strip().replace(',', ''))
            except ValueError:
                print(f"Failed to parse Net Assets '{net_assets_text}' from {url}")

        doc_period_end = find_local_name(root, 'documentPeriodEndDate')

        # Get filing date from df_links (same index as url)
        filing_date_val = df_links.loc[idx, 'filing_date']

        print(f"✓ Processed: {url}")
        print(f"CIK: {cik}, Filing ID: {filing_id}")
        print(f"Fund Name: {fund_name}")
        print(f"WAM: {wam} days, Net Assets: {net_assets}, Period End: {doc_period_end}, Filing Date: {filing_date_val}")
        print('-' * 40)

        data.append({
            'CIK': cik,
            'Filing ID': filing_id,
            'Fund Name': fund_name,
            'WAM (days)': wam,
            'Net Assets': net_assets,
            'Document Period End': doc_period_end,
            'Filing Date': filing_date_val,
            'URL': url
        })

    except Exception as e:
        print(f"✗ Failed to process {url}: {e}")
        continue

# Create DataFrame
df = pd.DataFrame(data)

# Convert date columns to datetime
df['Document Period End'] = pd.to_datetime(df['Document Period End'], errors='coerce')
df['Filing Date'] = pd.to_datetime(df['Filing Date'], errors='coerce')

# Sort and save
df.sort_values('Filing Date', inplace=True)
df.reset_index(drop=True, inplace=True)

df.to_csv('n_mfp_goldman.csv', index=False)
print("✅ Data saved to 'n_mfp_goldman.csv'")
print(df.head())


✓ Processed: https://www.sec.gov/Archives/edgar/data/822977/000114554916015780/primary_doc.xml
CIK: 822977, Filing ID: 000114554916015780
Fund Name: Fund S000009244
WAM: 10.0 days, Net Assets: 2228832032.44, Period End: None, Filing Date: 2016-07-08 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/822977/000114554916016952/primary_doc.xml
CIK: 822977, Filing ID: 000114554916016952
Fund Name: Fund S000009260
WAM: 39.0 days, Net Assets: 18329872097.33, Period End: None, Filing Date: 2016-08-25 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/822977/000114554916017367/primary_doc.xml
CIK: 822977, Filing ID: 000114554916017367
Fund Name: Fund S000009244
WAM: 6.0 days, Net Assets: 1466049197.03, Period End: None, Filing Date: 2016-09-08 00:00:00
----------------------------------------
✓ Processed: https://www.sec.gov/Archives/edgar/data/822977/000114554916018162/primary_doc.xml
CIK: 

In [10]:
data = pd.read_csv('n_mfp_fidelity.csv') 
data.head()

Unnamed: 0,CIK,Filing ID,Fund Name,WAM (days),Net Assets,Document Period End,Filing Date,URL
0,917286,91728616000238,Fund S000007052,37.0,5543205000.0,,2016-07-08,https://www.sec.gov/Archives/edgar/data/917286...
1,917286,91728616000244,Fund S000007052,29.0,5726993000.0,,2016-08-04,https://www.sec.gov/Archives/edgar/data/917286...
2,917286,91728616000251,Fund S000007051,43.0,66026020000.0,,2016-09-08,https://www.sec.gov/Archives/edgar/data/917286...
3,917286,91728616000257,Fund S000007051,49.0,70935200000.0,,2016-10-07,https://www.sec.gov/Archives/edgar/data/917286...
4,917286,91728616000267,Fund S000007051,48.0,73632330000.0,,2016-11-07,https://www.sec.gov/Archives/edgar/data/917286...
