In [196]:
import os
import numpy as np
import re
from bs4 import BeautifulSoup
import pandas as pd
import locale
import datetime
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )

base_dir = '/Users/shuhao/Dropbox/SEC_MF_Holdings/filings/'

###  1. Regex Patterns

In [256]:
# Different patterns to use later on
table_of_contents_pattern = re.compile(r'table\s+of\s+contents', re.I)
statements_of_assets_pattern = re.compile(r'statements?\s+of\s+assets?', re.I)  # May be not necessary
schedules_of_investment = re.compile(r'^schedules?\sof\sinvestments?',  re.I)
statements_of_net_assets_pattern = re.compile(r'^statements\sof\snet\sassets', re.I)
portfolio_names_pattern = re.compile(r'^\s+(.+)\s+\.{2,}')

table_pattern = re.compile(r'</Table>', re.I)
split_pattern = re.compile(r'[#*]*(.*\S.*)[\.\s]+\$?\s*(\b[\d,]+\b)\s+\$?\s*(\b[\d,]+\b)')
split_only_value_pattern = re.compile(r'[#*]*(.*\S.*)\.*\s{3,}\$?\s*(\b[\d,]+\b)')


# use for bond-like types
split_percent_pattern  = re.compile(r'([\d\.]+%,\s\d{2}/\d{2}/\d{2})\s+\.{2,}\s+\$?\s*(\b[\d,]+\b)\s+\$?\s*(\b[\d,]+\b)')

total_investment_pattern = re.compile(r'total\s+investments?.+\(100\.?0*%\)', re.I)

# investment in currency style read pattern

# only_name_split_pattern = re.compile(r'[#*]*(\w+)\n')
# dash_split_pattern = re.compile(r'-{3,}\n')


# shares_value_pattern = re.compile(r'shares\s+value\+', re.I)

common_stock_pattern = re.compile(r'common\sstocks?[\s-]+\([\d\.]+%\)', re.I)
end_of_common_stock_pattern = re.compile(r'total\scommon\sstocks?', re.I)

preferred_stocks_pattern = re.compile(r'preferred\sstocks?[\s-]+\([\d\.]+%\)', re.I)
end_of_preferred_stocks_pattern = re.compile(r'total\spreferred\sstocks?', re.I)

right_warrant_pattern = re.compile(r'rights?/warrants?[\s-]+\([\d\.]+%\)', re.I)
end_of_right_warrant_pattern = re.compile(r'total\srights?/warrants?', re.I)

investment_in_currency_pattern = re.compile(r'investment\sin\scurrency[\s-]+\([\d\.]+%\)', re.I)

temporary_cash_investment_pattern = re.compile(r'temporary\scash\sinvestments?[\s-]+', re.I)  

bonds_pattern = re.compile(r'bonds[\s-]+\([\d\.]+%\)', re.I)
end_of_bonds_pattern = re.compile(r'total\sbonds?', re.I)

us_treasury_obligation_pattern = re.compile(r'u\.s\.\streasury\sobligations?[\s-]+\([\d\.]+%\)', re.I)
end_of_us_treasury_obligation_pattern = re.compile(r'total\su\.s\.\streasury\sobligations?', re.I)

us_treasury_bonds_pattern = re.compile(r'u\.s\.\streasury\sbonds?', re.I)
us_treasury_notes_pattern = re.compile(r'u\.s\.\streasury\snotes?', re.I)

agency_obligation_pattern = re.compile(r'agency\sobligations?[\s-]+\([\d\.]+%\)', re.I)
end_of_agency_obligation_pattern = re.compile(r'total\sagency\sobligations?', re.I)

supranational_obligation_pattern = re.compile(r'supranational\sobligations?[\s-]+\([\d\.]+%\)', re.I)

bonds_and_certificates_of_deposit_pattern = re.compile(r'bonds\sand\scertificates\sof\sdeposit[\s-]+\([\d\.]+%\)', re.I)
end_of_bonds_and_certificates_of_deposit_pattern = re.compile(r'total\sbonds\sand\scertificates\sof\sdeposit', re.I)

commercial_paper_pattern = re.compile(r'commercial\spaper[\s-]+\([\d\.]+%\)', re.I)
end_of_commercial_paper_pattern = re.compile(r'total\scommercial\spaper', re.I)

# face_amount_pattern = re.compile(r'\s+face\s+\n\s+amount\s+', re.I)

### 2. File names and fund names within the file

In [252]:
# extract the names of the funds reported and save into a list

def total_files():

    # Get all the file names as strings
    file_list = []
    for root, dirs, files in os.walk('/Users/shuhao/Dropbox/SEC_MF_Holdings/filings/'):  
        if files:
            for f in files:
                file_list.append(base_dir + f)
    read = np.array(file_list)
    data = read.astype(np.string_)

    return data

def fund_names(lines):
    port_name_list = []
    line_num = -1

    for line in lines:
        line_num += 1
        if table_of_contents_pattern.search(line):
            start_line = line_num + 1

            for line in lines[start_line:]:
                start_line += 1
                if schedules_of_investment.search(line):

                    for line in lines[start_line:]:
                        if portfolio_names_pattern.search(line):
                            match = portfolio_names_pattern.search(line)
                            port_name = match.group(1)

                            row = {'port_name': port_name}
                            port_name_list.append(row)
                        else:
                            break
                elif statements_of_net_assets_pattern.search(line):
                    for line in lines[start_line:]:
                        if portfolio_names_pattern.search(line):
                            match = portfolio_names_pattern.search(line)
                            port_name = match.group(1)

                            row = {'port_name': port_name}
                            port_name_list.append(row)
                        else:
                            break
    port_name_list = pd.DataFrame(port_name_list)
    return port_name_list

### 3. Extract the holding information of each fund from the text file

In [253]:
start = datetime.datetime.now()

def extract_holdings(contents, fund_name_list):

    """
    pinpoint to the start and end of each fund
    with 'Total Investment' and 'Value+', and save the holdings
    under each fund name.

    """
#     types_of_invesment = ['Common_Stock', 'Agency_Obligation', 'Bonds', 'Bonds_And_Certificates_Of_Deposit',
#                          'Commercial_Paper', 'Investment_In_Currency', 'Preferred_Stock', 'Rights_Warrants',
#                          'Supranational_Obligation', 'Temporary_Cash_Investment', 'US_Treasury_Bonds', 
#                           'US_Treasury_Notes']
    split_lines = re.split(total_investment_pattern, contents)


    data = []  # save all the data
    port_name_index = -1
    for case in split_lines[:-1]:
        
        port_name_index += 1  # count the number of the name of the fund
        fund_name = fund_name_list['port_name'][port_name_index]
        print(str(port_name_index) + fund_name)
        case_list = case.split('\n')



        num = -1   # count the number of the lines read
        for line in case_list:
            num += 1
             
            # Common Stocks
            if common_stock_pattern.search(line):

                holding_type = 'Common_Stock'
                name_line_count = 0
                
                for line in case_list[num+1:]:
                    name_line_count += 1  # count from the beginning of the bonds pattern, starting at 0
                    
                    if bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_common_stock_pattern.search(line):
                        break
                    elif common_stock_pattern.search(line):
                        break

                    else:
                        if split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check - 1:num - 1:-1]:
                                reverse_num += 1

                                if common_stock_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)

                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)
                                    break

                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string
                                name += match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)


            # Preferred Stocks
            elif preferred_stocks_pattern.search(line):

                holding_type = 'Preferred_Stock'
                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1  # count from the beginning of the bonds pattern, starting at 0

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_preferred_stocks_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break

                    else:
                        if split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check - 1:num - 1:-1]:
                                reverse_num += 1

                                if preferred_stocks_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)

                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)
                                    break

                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name += match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

            # Rights/Warrants
            elif right_warrant_pattern.search(line):

                holding_type = 'Rights_Warrants'
                name_line_count = 0
                for line in case_list[num+1:]:

                    name_line_count += 1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_right_warrant_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break

                    else:
                        if split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check - 1:num - 1:-1]:
                                reverse_num += 1

                                if right_warrant_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)

                                        break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(reverse_num - 1):
#                                             string = case_list[name_check - i - 1].lstrip()
#                                             name_str.append(string)

#                                         break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)
                                        break

                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string
                                name += match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
            # Investments in Currency
            elif investment_in_currency_pattern.search(line):
                holding_type = 'Investment_In_Currency'
                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count +=1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif re.compile(r'Total', re.I).search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    else:
                        if split_only_value_pattern.search(line):
                            match = split_only_value_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check - 1:num - 1:-1]:
                                reverse_num += 1

                                if investment_in_currency_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)

                                        break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(reverse_num - 1):
#                                             string = case_list[name_check - i - 1].lstrip()
#                                             name_str.append(string)

#                                         break
                                elif split_only_value_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(reverse_num - 1):
                                            string = case_list[name_check - i - 1].lstrip()
                                            name_str.append(string)
                                        break

                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string
                                name += match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            value = match.group(2)

                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'value': value}
                            data.append(row)
            # Temporary Cash Investment
            elif temporary_cash_investment_pattern.search(line):

                holding_type = 'Temporary_Cash_Investment'

                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if temporary_cash_investment_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
                            
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if temporary_cash_investment_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
                            
                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if temporary_cash_investment_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
            # Bonds
            elif bonds_pattern.search(line):

                holding_type = 'Bonds'

                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1  # count from the beginning of the bonds pattern, starting at 0

                    if common_stock_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_bonds_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break

                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if bonds_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if bonds_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if bonds_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

            # U.S. Treasury Obligations: Treasury Bonds and Notes
            elif us_treasury_obligation_pattern.search(line):

                holding_type = 'US_Treasury_Obligation'

                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1  # count from the beginning of the bonds pattern, starting at 0

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_us_treasury_obligation_pattern.search(line):
                        break
                    elif us_treasury_obligation_pattern.search(line):
                        break

                    else:
                        if us_treasury_bonds_pattern.search(line):
                            
                            base_name = 'U.S. Treasury Bonds '
                            
                        elif us_treasury_notes_pattern.search(line):
                            base_name = 'U.S. Treasury Notes '

                        elif split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)
                            name = base_name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()

                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
                        
            # Agency Obligation
            elif agency_obligation_pattern.search(line):

                holding_type = 'Agency_Obligation'

                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_agency_obligation_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if agency_obligation_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if agency_obligation_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if agency_obligation_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

            # Supranational Obligation
            elif supranational_obligation_pattern.search(line):

                holding_type = 'Supranational_Obligation'

                name_line_count = 0
                for line in case_list[num+1:]:

                    name_line_count += 1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if supranational_obligation_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if supranational_obligation_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if supranational_obligation_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
                            
            # Bonds and Certificates of Deposit
            elif bonds_and_certificates_of_deposit_pattern.search(line):

                holding_type = 'Bonds_And_Certificates_Of_Deposit'

                name_line_count = 0
                for line in case_list[num+1:]:
                    name_line_count += 1  # count from the beginning of the bonds pattern, starting at 0

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    elif end_of_bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break

                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if bonds_and_certificates_of_deposit_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if bonds_and_certificates_of_deposit_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if bonds_and_certificates_of_deposit_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)   
            # Commercial Paper
            elif commercial_paper_pattern.search(line):

                name_line_count = 0
                holding_type = 'Commercial_Paper'

                for line in case_list[num+1:]:
                    name_line_count += 1

                    if common_stock_pattern.search(line):
                        break
                    elif bonds_pattern.search(line):
                        break
                    elif right_warrant_pattern.search(line):
                        break
                    elif temporary_cash_investment_pattern.search(line):
                        break
                    elif investment_in_currency_pattern.search(line):
                        break
                    elif preferred_stocks_pattern.search(line):
                        break
                    elif us_treasury_bonds_pattern.search(line):
                        break
                    elif us_treasury_notes_pattern.search(line):
                        break
                    elif agency_obligation_pattern.search(line):
                        break
                    elif supranational_obligation_pattern.search(line):
                        break
                    elif bonds_and_certificates_of_deposit_pattern.search(line):
                        break
                    elif end_of_commercial_paper_pattern.search(line):
                        break
                    elif commercial_paper_pattern.search(line):
                        break
                    else:
                        if split_percent_pattern.search(line):
                            match = split_percent_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            
                            # the loop is looking back to get the name in multi-line
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1

                                if commercial_paper_pattern.search(line):
                                    for i in range(0, reverse_num - 1):
                                        string = case_list[name_check - i - 1].strip() + ' '
                                        name_str.append(string)
                                    break
#                                 elif re.compile(r'<s>', re.I).search(line):
#                                     if reverse_num == 1:
#                                         break
#                                     else:
#                                         for i in range(0, reverse_num - 1):
#                                             string = case_list[name_check - i - 1].strip() + ' '
#                                             name_str.append(string)
#                                         break
                                elif split_percent_pattern.search(line):
                                    if reverse_num != 1:
                                        for i in range(reverse_num-1):
                                            string = case_list[name_check - i-1].strip() + ' '
                                            name_str.append(string)
                                        break
                                    else:
                                        # the line above is also a percent pattern. 
                                        # find the end of the name
                                        reverse_num_find_end = 0
                                        
                                        # the loop is to looking up for the start and end of the name lines
                                        for line in case_list[name_check-1:num-1:-1]:
                                            reverse_num_find_end += 1

                                            if not split_percent_pattern.search(line): # meet the end of the name line
                                                reverse_num_find_start = 0
                                                
                                                # the loop is to find the start of the name line and save the name_str
                                                for line in case_list[name_check-reverse_num_find_end:num-1:-1]:
                                                    reverse_num_find_start += 1
                                                    if commercial_paper_pattern.search(line): # find the top 
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        # if find the start of the name and save the name_str, then break the loop
                                                        break
                                                    elif split_percent_pattern.search(line):
                                                        for i in range(reverse_num_find_start-1):
                                                            string = case_list[name_check-reverse_num_find_end-i].strip() + ' '
                                                            name_str.append(string)
                                                        break
                                                break
                                        break
                                                
                            name = ''
                            for string in name_str[::-1]:
                                name += string

                            name = name + match.group(1)
                            name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                            name = name.lstrip()
                            name = name.split('<C>')[-1]
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)

                        elif split_pattern.search(line):
                            match = split_pattern.search(line)

                            # if name is multi-line, look back to check the last line saved
                            name_check = name_line_count + num  # the index of the line in total lines (case_list)
                            reverse_num = 0  # look back how many lines

                            name_str = []  # save the line when we look back
                            for line in case_list[name_check-1:num-1:-1]:
                                reverse_num += 1
                                if commercial_paper_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif split_pattern.search(line):

                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                                elif re.compile(r'<s>', re.I).search(line):
                                    if reverse_num == 1:
                                        break
                                    else:
                                        for i in range(0, reverse_num-1):
                                            string = case_list[name_check - i - 1].strip() + ' '
                                            name_str.append(string)
                                        break
                            if reverse_num == 1:
                                name = match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                
                            else:
                                name = ''
                                for string in name_str[::-1]:
                                    name += string

                                name = name + match.group(1)
                                name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                                name = name.lstrip()
                                name = name.split('<C>')[-1]
                                
                            shares = match.group(2)
                            value = match.group(3)

                            shares = locale.atoi(shares)
                            value = locale.atoi(value)
                            # print(company, shares, value)

                            row = {'fund_name': fund_name,
                                   'holding_type': holding_type,
                                   'name': name,
                                   'shares': shares,
                                   'value': value}
                            data.append(row)
    data = pd.DataFrame(data)
#     data.to_csv('/Users/shuhao/PycharmProjects/Learning/data/SEC_EDGAR_Extract.csv')
    return data
end = datetime.datetime.now()
print(end-start)

0:00:00.013677


### 4. Extract holdings information for the rest of funds without traditional holding types

In [254]:
def extract_other_holdings(contents, fund_name_list, left_index):

    """
    pinpoint to the start and end of each fund
    with 'Total Investment' and 'Value+', and save the holdings
    under each fund name.

    """
#     types_of_invesment = ['Common_Stock', 'Agency_Obligation', 'Bonds', 'Bonds_And_Certificates_Of_Deposit',
#                          'Commercial_Paper', 'Investment_In_Currency', 'Preferred_Stock', 'Rights_Warrants',
#                          'Supranational_Obligation', 'Temporary_Cash_Investment', 'US_Treasury_Bonds', 
#                           'US_Treasury_Notes']
    split_lines = re.split(total_investment_pattern, contents)


    data_left = []  # save all the data
    for case_index in left_index:
        fund_name = fund_name_list['port_name'][case_index]
        print(str(case_index) + fund_name)
        case_list = split_lines[case_index].split('\n')

        num = 1   # count the number of the lines read
        holding_type = 'Other'
        
        for line in case_list[2:]:
            num += 1
            
            # Other
            if split_pattern.search(line):
                match = split_pattern.search(line)

                # if name is multi-line, look back to check the last line saved
                reverse_num = 0  # look back how many lines

                name_str = []  # save the line when we look back
                for line in case_list[num - 1:2:-1]:
                    reverse_num += 1

                    if split_pattern.search(line):

                        if reverse_num == 1:
                            break
                        else:
                            for i in range(0, reverse_num - 1):
                                string = case_list[num - i - 1].lstrip()
                                name_str.append(string)
                        break
                    elif re.compile(r'<S>', re.I).search(line):
                        if reverse_num == 1:
                            break
                        else:
                            for i in range(0, reverse_num - 1):
                                string = case_list[num - i - 1].lstrip()
                                name_str.append(string)
                        break

                if reverse_num == 1:
                    name = match.group(1)
                    name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                    name = name.lstrip()
                else:
                    name = ''
                    for string in name_str[::-1]:
                        name += string
                    name += match.group(1)
                    name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                    name = name.lstrip()
                    name = name.split('<C>')[-1]

                shares = match.group(2)
                value = match.group(3)

                shares = locale.atoi(shares)
                value = locale.atoi(value)

                row = {'fund_name': fund_name,
                       'holding_type': holding_type,
                       'name': name,
                       'shares': shares,
                       'value': value}
                data_left.append(row)
            elif split_only_value_pattern.search(line):
                
                match = split_only_value_pattern.search(line)

                # if name is multi-line, look back to check the last line saved
                reverse_num = 0  # look back how many lines

                name_str = []  # save the line when we look back
                for line in case_list[num - 1:2:-1]:
                    reverse_num += 1
                    
                    if re.compile(r'<S>', re.I).search(line):
                        
                        if reverse_num == 1:
                            break
                        else:
                            for i in range(reverse_num - 1):
                                string = case_list[num - i - 1].lstrip()
                                name_str.append(string)

                            break

                    elif split_only_value_pattern.search(line):

                        if reverse_num == 1:
                            break
                        else:
                            for i in range(reverse_num - 1):
                                string = case_list[num - i - 1].lstrip()
                                name_str.append(string)
                            break

                if reverse_num == 1:
                    name = match.group(1)
                    name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                    name = name.lstrip()
                else:
                    name = ''
                    for string in name_str[::-1]:
                        name += string
                    name += match.group(1)
                    name = re.sub(r'\.{2,}', '', re.sub(r'\.+$', '', re.sub(r'\$\s*$', '', name)))
                    name = name.lstrip()
                    name = name.split('<C>')[-1]

                value = match.group(2)

                value = locale.atoi(value)

                row = {'fund_name': fund_name,
                       'holding_type': holding_type,
                       'name': name,
                       'value': value}
                data_left.append(row)
    data_left = pd.DataFrame(data_left)
    return data_left

### 5. Execute the extraction and save the holding information 

In [259]:
test = base_dir + '355437_N30D_19950626_0000950116_95_000262.txt'
# test = '/Users/shuhao/Downloads/dfa_test_2.txt'

start = datetime.datetime.now()

with open(test, 'r') as f:

    lines = f.readlines()
    fund_name_list = fund_names(lines)
print(fund_name_list)
with open(test, 'r') as f_2:

    contents = f_2.read()

    holdings = extract_holdings(contents, fund_name_list)

end = datetime.datetime.now()

print(end-start)


                                            port_name
0              The U.S. 9-10 Small Company Portfolio 
1              The U.S. 6-10 Small Company Portfolio 
2                   The U.S. Large Company Portfolio 
3                 The U.S. Small Cap Value Portfolio 
4                 The U.S. Large Cap Value Portfolio 
5       The DFA/AEW Real Estate Securities Portfolio 
6               The Japanese Small Company Portfolio 
7            The Pacific Rim Small Company Portfolio 
8         The United Kingdom Small Company Portfolio 
9                     The Emerging Markets Portfolio 
10           The Continental Small Company Portfolio 
11             The Large Cap International Portfolio 
12  The DFA International High Book to Market Port...
13           The DFA One-Year Fixed Income Portfolio 
14            The DFA Five-Year Government Portfolio 
15             The DFA Global Fixed Income Portfolio 
16  The DFA Intermediate Government Fixed Income P...
17                The U.S. 6

### 5. Execute the 3. and save the reset of the fund holdings

In [260]:

with open(test, 'r') as f_3:
    contents_f_3 = f_3.read()
    left_fund_name = list(set(fund_name_list['port_name']) - set(holdings['fund_name']))

    left_index = []
    for name in left_fund_name:
        print(name)
        left_index.append(fund_name_list[fund_name_list['port_name'] == name].index.tolist()[0])
    print(left_index)
    holdings_left = extract_other_holdings(contents_f_3, fund_name_list, left_index)

The U.S. Small Cap Value Portfolio 
The DFA International High Book to Market Portfolio 
The DFA One-Year Fixed Income Portfolio 
The U.S. Large Cap Value Portfolio 
The Emerging Markets Portfolio 
The U.S. Large Company Portfolio 
The U.S. 6-10 Small Company Portfolio 
[3, 12, 13, 4, 9, 2, 1]
3The U.S. Small Cap Value Portfolio 
12The DFA International High Book to Market Portfolio 
13The DFA One-Year Fixed Income Portfolio 
4The U.S. Large Cap Value Portfolio 
9The Emerging Markets Portfolio 
2The U.S. Large Company Portfolio 
1The U.S. 6-10 Small Company Portfolio 


### 6. merge the two holdings and save to CSV

In [261]:
holdings_all = holdings.append(holdings_left)
holdings_all.reset_index(inplace=True, drop=True)
holdings_all.head()
holdings_all.to_csv('/Users/shuhao/PycharmProjects/Learning/data/SEC_EDGAR_Extract.csv')

In [216]:
# holdings[(holdings['fund_name'] == 'The DFA Global Fixed Income Portfolio ') & (holdings['holding_type'] == 'Bonds')]['name'][-5:]

5903     New South Wales Treasury Corp. 11.500%, 07/01/99
5904           Queensland Treasury Corp. 8.000%, 05/14/97
5905           Queensland Treasury Corp. 8.000%, 07/14/99
5906    Eurofima (Societe Europeene pour  le Financeme...
5907               France (Treasury of) 10.000%, 05/27/00
Name: name, dtype: object

### 7. Check whether the data extrected matched the real data(Use sum of value as the indicator)

In [263]:
read = pd.read_csv('/Users/shuhao/PycharmProjects/Learning/Data/SEC_EDGAR_Extract.csv')
read.groupby(read['fund_name'])['value'].sum()

fund_name
The Continental Small Company Portfolio                    347214213
The DFA Five-Year Government Portfolio                     238743484
The DFA Global Fixed Income Portfolio                      130761523
The DFA Intermediate Government Fixed Income Portfolio      59714818
The DFA International High Book to Market Portfolio        112937915
The DFA International Value Series                         350050218
The DFA One-Year Fixed Income Portfolio                    592285633
The DFA One-Year Fixed Income Series                       588473150
The DFA/AEW Real Estate Securities Portfolio                30436895
The Emerging Markets Portfolio                              15696170
The Emerging Markets Series                                 14865866
The Japanese Small Company Portfolio                       348349990
The Large Cap International Portfolio                       55331124
The Pacific Rim Small Company Portfolio                    212682322
The U.S. 6-10 Small Comp

In [262]:
holdings_all.groupby(holdings_all['fund_name'])['value'].sum()

fund_name
The Continental Small Company Portfolio                    347214213
The DFA Five-Year Government Portfolio                     238743484
The DFA Global Fixed Income Portfolio                      130761523
The DFA Intermediate Government Fixed Income Portfolio      59714818
The DFA International High Book to Market Portfolio        112937915
The DFA International Value Series                         350050218
The DFA One-Year Fixed Income Portfolio                    592285633
The DFA One-Year Fixed Income Series                       588473150
The DFA/AEW Real Estate Securities Portfolio                30436895
The Emerging Markets Portfolio                              15696170
The Emerging Markets Series                                 14865866
The Japanese Small Company Portfolio                       348349990
The Large Cap International Portfolio                       55331124
The Pacific Rim Small Company Portfolio                    212682322
The U.S. 6-10 Small Comp

In [225]:
test_1 = holdings_all[holdings_all['fund_name'] == 'The Continental Small Company Portfolio ']
# test_1['value'].sum()
test_1.groupby(test_1['holding_type']).sum()

Unnamed: 0_level_0,shares,value
holding_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Bonds,115507.0,121688
Common_Stock,35884065.0,329700185
Investment_In_Currency,0.0,16990352
Preferred_Stock,417.0,109809
Rights_Warrants,1117583.0,85179
Temporary_Cash_Investment,207.0,207000
