In [1]:
#Import modules
import xml.etree.ElementTree as ET
import pandas as pd

In [7]:
#CONSTANTS
#This dictionary lists the ticker names, and the series ID numbers of investment instruments
fundHash = {'VEMIX': 'S000005786', 'VIIIX': 'S000002853', 'VTIVX': 'S000002574',
           'VMCPX': 'S000002844', 'VSCPX': 'S000002845', 'FSMDX': 'S000033637',
           'FSSNX': 'S000033638', 'VTSPX': 'S000038501', 'FXAIX': 'S000006027'}

#This dictionary lists the number of shares owned of each investment instrument
sharesHash = {'VEMIX': 62.01, 'VIIIX': 7.065, 'VTIVX': 0.045,
           'VMCPX': 4.66, 'VSCPX': 5.041, 'FSMDX': 91.872,
           'FSSNX': 112.97, 'VTSPX': 1197.552, 'FXAIX': 33.225}

#This is a prefix that seems to be built into all of the xml tag names
pT = "{http://www.sec.gov/edgar/nport}"

#This is an empty dictionary that describes what data elements should be extracted from the xml file
recordFeatures={'name': [], 'lei': [], 'title':[], 'cusip': [], 
                'balance':[], 'units':[], 'currencyConditional': ['curCd','exchangeRt'], 
                'valUSD': [], 'pctVal': [], 'payoffProfile': [], 'assetCat': [], 'issuerCat': [], 
                'invCountry': [], 'isRestrictedSec': [], 'fairValLevel': []}

In [4]:
#parseRecord(aNode, rF = recordFeatures)
#aNode: XML node that represents an individual investment instrument (XML tag invstOrSec)
#rF: Empty dictionary describing what data elements to extract from the XML records  
#Returns a dictionary of data values for the individual investment record
def parseRecord(aNode, rF = recordFeatures):
    #parseValue(k, v, rH, partStr = "")
    #k: Key value that designates either the tag name or the next-level node
    #v: Empty list (if it's the tag name) or list of 2nd-level tags to extract
    #rH: Dictionary to return, will populate with data values
    #partStr: partial string - not currently implemented, but would be needed for deeper nodes
    #No return value
    def parseValue(k, v, rH, partStr = ""):
        #Empty list means the key is the XML tag name
        if len(v) == 0:
            try:
                #Extract the node text
                rH[k] = aNode.find(partStr+pT+k).text
            except AttributeError: #This item is missing
                try:
                    if k == 'issuerCat': #The issuer category had a backup field
                        rH[k] = aNode.find(pT+'issuerConditional').get('issuerCat')
                except KeyError: #Otherwise it's not found
                    #print(f"Attribute not found {rH[k]}: {partStr+pT+k}")
                    rH[k] = ""
        #If the list is not empty, we need to go down a level and extract the items
        else:
            #Each item in the list is a sub-value
            for sV in v:
                try:
                    #Get the value from the sub-node
                    rH[sV] = aNode.find(partStr+pT+k).get(sV)
                except AttributeError: #Otherwise it's not found
                    #print(f"Attribute not found {rH['name']}: {partStr+pT+k}")
                    rH[sV] = ""
    #Initialize an empty dictionary            
    returnHash = {}

    #The ID record is unique in that it has several different potential tag types
    idRecord = aNode.find(pT+'identifiers')[0]
    returnHash['IDtype'] = idRecord.tag.split("}")[1]
    returnHash['ID'] = idRecord.attrib['value']

    #Call parseValue for each value in the record features dictionary
    for k, v in rF.items():
        parseValue(k, v, returnHash)
    
    return returnHash

In [14]:
df['valUSD']

0        5389130.74000000
1         914577.74000000
2        6676856.33000000
3        1090847.53000000
4       57719140.64000000
              ...        
5926    -1620723.05000000
5927      530766.74000000
5928    13388321.53000000
5929     5355798.58000000
5930     4347208.51000000
Name: valUSD, Length: 5931, dtype: object

In [11]:
fundDFhash = {}
#For each fund in the list
for aFund, sID in fundHash.items():
    xmlFN = f"dataFiles/{sID}.xml" #Filename
    xmlTree = ET.parse(xmlFN)
    rootNode = xmlTree.getroot()
    #Get a list of all the investment instruments in the XML file
    allRecs = rootNode.findall("./"+pT+"formData/"+pT+"invstOrSecs/")
    parsedRecs = [parseRecord(aRec) for aRec in allRecs]
    df = pd.DataFrame({k: [rec[k] for rec in parsedRecs] for k in parsedRecs[0].keys()})
    df['avgPricePerShare'] = df['valUSD']/df['balance']
    df['amtInvested'] = df['avgPricePerShare']*df['pctVal']*sharesHash[aFund]
    fundDFhash[aFund] = df

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [10]:
fundDFhash['VEMIX']

Unnamed: 0,IDtype,ID,name,lei,title,cusip,balance,units,curCd,exchangeRt,valUSD,pctVal,payoffProfile,assetCat,issuerCat,invCountry,isRestrictedSec,fairValLevel
0,isin,CNE000000M72,Wingtech Technology Co Ltd,,WINGTECH TECH-A,,1159831.00000000,NS,CNY,0.13765500,5389130.74000000,0.004919912166,Long,EC,CORP,CN,N,2
1,isin,CNE000001L07,LianChuang Electronic Technology Co Ltd,,LIANCHUANG ELE-A,,747661.00000000,NS,CNY,0.13765500,914577.74000000,0.000834947669,Long,EC,CORP,CN,N,2
2,isin,INE133A01011,Akzo Nobel India Ltd,335800Z6FCJYII12VJ88,AKZO NOBEL INDIA,,152844.00000000,NS,INR,0.01154500,6676856.33000000,0.006095518604,Long,EC,CORP,IN,N,2
3,isin,CNE100000JH1,Gaona Aero Material Co Ltd,,GAONA AERO-A,,530560.00000000,NS,CNY,0.13765500,1090847.53000000,0.000995870075,Long,EC,CORP,CN,N,2
4,isin,INE647A01010,SRF Ltd,335800436F28GT8ZW506,SRF LTD,,1784858.00000000,NS,INR,0.01154500,57719140.64000000,0.052693674722,Long,EC,CORP,IN,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5926,ticker,INR,,,INR/USD FWD 20250319,,1.00000000,NC,INR,,-1620723.05000000,-0.00147961061,,DFE,OTHER,,N,2
5927,isin,CNE000001CN3,Shinva Medical Instrument Co Ltd,300300517GYTH3UJ9T68,SHINVA MEDICAL-A,,239522.00000000,NS,CNY,0.13765500,530766.74000000,0.000484554164,Long,EC,CORP,CN,N,2
5928,isin,CNE100000767,China Shenhua Energy Co Ltd,529900N9JOX4C108MA40,CHINA SHENHUA-A,,2429648.00000000,NS,CNY,0.13765500,13388321.53000000,0.012222632769,Long,EC,CORP,CN,N,2
5929,isin,CNE100002GQ4,Bank of Hangzhou Co Ltd,300300C1092033000075,BANK OF HANGZH-A,,2629388.00000000,NS,CNY,0.13765500,5355798.58000000,0.004889482156,Long,EC,CORP,CN,N,2


In [58]:
df

Unnamed: 0,IDtype,ID,name,lei,title,cusip,balance,units,curCd,exchangeRt,valUSD,pctVal,payoffProfile,assetCat,issuerCat,invCountry,isRestrictedSec,fairValLevel
0,isin,CNE000000M72,Wingtech Technology Co Ltd,,WINGTECH TECH-A,,1159831.00000000,NS,CNY,0.13765500,5389130.74000000,0.004919912166,Long,EC,CORP,CN,N,2
1,isin,CNE000001L07,LianChuang Electronic Technology Co Ltd,,LIANCHUANG ELE-A,,747661.00000000,NS,CNY,0.13765500,914577.74000000,0.000834947669,Long,EC,CORP,CN,N,2
2,isin,INE133A01011,Akzo Nobel India Ltd,335800Z6FCJYII12VJ88,AKZO NOBEL INDIA,,152844.00000000,NS,INR,0.01154500,6676856.33000000,0.006095518604,Long,EC,CORP,IN,N,2
3,isin,CNE100000JH1,Gaona Aero Material Co Ltd,,GAONA AERO-A,,530560.00000000,NS,CNY,0.13765500,1090847.53000000,0.000995870075,Long,EC,CORP,CN,N,2
4,isin,INE647A01010,SRF Ltd,335800436F28GT8ZW506,SRF LTD,,1784858.00000000,NS,INR,0.01154500,57719140.64000000,0.052693674722,Long,EC,CORP,IN,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5926,ticker,INR,,,INR/USD FWD 20250319,,1.00000000,NC,INR,,-1620723.05000000,-0.00147961061,,DFE,OTHER,,N,2
5927,isin,CNE000001CN3,Shinva Medical Instrument Co Ltd,300300517GYTH3UJ9T68,SHINVA MEDICAL-A,,239522.00000000,NS,CNY,0.13765500,530766.74000000,0.000484554164,Long,EC,CORP,CN,N,2
5928,isin,CNE100000767,China Shenhua Energy Co Ltd,529900N9JOX4C108MA40,CHINA SHENHUA-A,,2429648.00000000,NS,CNY,0.13765500,13388321.53000000,0.012222632769,Long,EC,CORP,CN,N,2
5929,isin,CNE100002GQ4,Bank of Hangzhou Co Ltd,300300C1092033000075,BANK OF HANGZH-A,,2629388.00000000,NS,CNY,0.13765500,5355798.58000000,0.004889482156,Long,EC,CORP,CN,N,2
