In [1]:
import pandas as pd
import re
import xmltodict
import flatdict
import os
import sys

In [2]:
def flatdict_toDF(table_d):
    """
    This function takes a flat dictionary object and process it as follows: 
    If there is only 1 item, it is a dictionary. Convert it to a pandas DF object
    If there are more than 1 item, it will be a list. We flat it further and convert it to a pandas DF object
    
    table_d: could be list or a pandas DataFrame that has two rows (one of them contains column names)
    return:  pandas DataFrame with column names
    """
    
    if isinstance(table_d, list):
        d_list = []
        for i in table_d:
            d_list.append(flatdict.FlatDict(i, delimiter='.'))
        
        return pd.DataFrame(d_list)
    
    else:
        tmp_df   = pd.DataFrame(table_d.items()).T
        col_name = tmp_df.iloc[0] 
    
        return tmp_df.drop([0]).reset_index(drop=True).rename(col_name, axis=1)
        

In [3]:
def concat_abtoC(a, b, c):
    """
    This function takes three pandas DF objects a b and c (a and b has one row and c may have multiple rows) to:
    1. Duplicate lines of a and b to the same number of rows in c
    2. Merge a b and c to a large DF along axis=1
    
    a:      pandas DataFrame, one row only
    b:      pandas DataFrame, one row only
    c:      pandas DataFrame, may have multiple rows
    return: pandas DataFrame
    """
    
    n   = len(c.index)
    t1  = pd.concat([a]*n,ignore_index=True)
    t2  = pd.concat([b]*n,ignore_index=True)
    t12 = pd.concat([t1.reset_index(drop=True),t2.reset_index(drop=True)], axis=1)
    
    return pd.concat([t12.reset_index(drop=True),c.reset_index(drop=True)], axis=1)

In [4]:
def save_dftocsv(filepath, filename, df):
    """
    This function takes a pandas DF and save to specific filename in filepath.
    Append if file already exists.
    
    filepath: string, directory for file
    filename: string, full filename for csv output   
    """
    
    fileloc = filepath+filename
    file_present = os.path.isfile(fileloc) 
    if file_present:
        df.to_csv(fileloc, index=False, mode='a', header=False)    
    else:
        df.to_csv(fileloc, index=False)
        
    return

In [5]:
def proc_form4txt(filepath, filename):
    """
    This function processes the xml text for correct reading later using flatdict:
    1. Add a few lines to xml, so that flatdict can process things correctly
    2. Merge "Holding" to "Transaction", so that no separate form is needed
    3. Edit <footnotes> for flatdict to read
    
    filepath: string, directory for file
    filename: string, full filename of Form-4.txt for pre-processing
    """
    
    fileloc = filepath+filename
    infile  = open(fileloc, 'r')
    lines   = infile.readlines()
    
    outfile = open(fileloc+'.mod','w')
    for line in lines:
        # add line so that flatdic can process all as a list
        if r'</nonDerivativeTable>' in line and r'<nonDerivativeTable></nonDerivativeTable>' not in line:
            outfile.write(r'<nonDerivativeTransaction></nonDerivativeTransaction>' + "\n") 
        if r'</derivativeTable>' in line and r'<derivativeTable></derivativeTable>' not in line:
            outfile.write(r'<derivativeTransaction></derivativeTransaction>' + "\n") 
        if r'</footnotes>' in line and r'<footnotes></footnotes>' not in line:
            outfile.write(line.replace('</footnotes>', '<footnote><footnote_>  </footnote_></footnote></footnotes>'))
            continue
            
        # "Holding" and "Transaction" are slight variation of same table
        if 'nonDerivativeHolding' in line:
            outfile.write(line.replace('nonDerivativeHolding', 'nonDerivativeTransaction'))
            continue
        if 'derivativeHolding' in line:
            outfile.write(line.replace('derivativeHolding', 'derivativeTransaction'))
            continue
        
        # add additional nesting in footnote, so that flatdic process and separate the notes
        if r'<footnote ' in line:
            outfile.write(line.replace(' id', '><footnote_>id').replace('</footnote>', '</footnote_></footnote>'))
            continue
        if r'</footnote>' in line:
            outfile.write(line.replace('</footnote>', '</footnote_></footnote>'))
            continue
              
        outfile.write(line)
        
    outfile.close()
    infile.close()
    
    return


In [6]:
class f4data:
    """
    Create a class for holding formatted Form-4 data
    
    """
       
    issuer_list = ["issuerCik", "issuerName", "issuerTradingSymbol"]
    
    reporting_list = [
        "reportingOwnerId.rptOwnerCik",
        "reportingOwnerId.rptOwnerName",
        "reportingOwnerAddress.rptOwnerStreet1",
        "reportingOwnerAddress.rptOwnerStreet2",
        "reportingOwnerAddress.rptOwnerCity",
        "reportingOwnerAddress.rptOwnerState",
        "reportingOwnerAddress.rptOwnerZipCode",
        "reportingOwnerAddress.rptOwnerStateDescription",
        "reportingOwnerRelationship.isDirector",
        "reportingOwnerRelationship.isOfficer",
        "reportingOwnerRelationship.isTenPercentOwner",
        "reportingOwnerRelationship.isOther",
        "reportingOwnerRelationship.officerTitle",
        "reportingOwnerRelationship.otherText"
        ]

    nonDerivative_list = [
        "securityTitle.value",
        "transactionDate.value",
        "deemedExecutionDate.value",
        "transactionCoding.transactionCode",
        "transactionTimeliness.value",
        "transactionAmounts.transactionShares.value",
        "transactionAmounts.transactionAcquiredDisposedCode.value",
        "transactionAmounts.transactionPricePerShare.value",
        "postTransactionAmounts.sharesOwnedFollowingTransaction.value",
        "ownershipNature.directOrIndirectOwnership.value",
        "ownershipNature.natureOfOwnership.value"
        ]
        
    derivative_list = [
        "securityTitle.value",
        "conversionOrExercisePrice.value",
        "transactionDate.value",
        "deemedExecutionDate.value",
        "transactionCoding.transactionCode",
        "transactionTimeliness.value",
        "transactionAmounts.transactionAcquiredDisposedCode.value",
        "transactionAmounts.transactionShares.value",
        "exerciseDate.value",
        "expirationDate.value",
        "underlyingSecurity.underlyingSecurityTitle.value",
        "underlyingSecurity.underlyingSecurityShares.value",
        "transactionAmounts.transactionPricePerShare.value",
        "postTransactionAmounts.sharesOwnedFollowingTransaction.value",
        "ownershipNature.directOrIndirectOwnership.value",
        "ownershipNature.natureOfOwnership.value"
        ]
   
    footnotes_list = ["footnote_"]
    
    
    def __init__(self, table_name, orig_df):
        """
        This function creates a DataFrame, with standardized column names, and dropped redundant entries
        
        table_name: string, name of database to create
        orig_df:    pandas DataFrame, full table contains all the data columns        
        """
        if table_name == "nonDerivative":
            column_list = self.issuer_list + self.reporting_list + self.nonDerivative_list
        elif table_name == "derivative":
            column_list = self.issuer_list + self.reporting_list + self.derivative_list
        elif table_name == "footnotes":
            column_list = self.issuer_list + self.reporting_list + self.footnotes_list
        else:
            raise ValueError("Unknown table name!")
            
        # could do some more checking with column_names after concat         
        empty_df = pd.DataFrame(columns=column_list)
        self.df  = pd.concat([empty_df,orig_df])[column_list]

        self.check_colname(empty_df, orig_df)


    def check_colname(self, empty_df, orig_df):
        """
        This function checks if the code is reading new unknown column names
        
        empty_df: pandas DataFrame, contains empty DataFrame with columns that have standard names
        orig_df:  pandas DataFrame, full table contains all the data columns 
        """

        # Exclude already known list of outliers:
        #         footnote
        #         transactionCoding.transactionFormType
        #         transactionCoding.equitySwapInvolved
        #         transactionTimeliness (if it's still here, it's a duplicate)
        #         deemedExecutionDate, empty field, populated field should be deemedExecutionDate.value
        #         transactionTimeliness, empty field, pupulated field should be transactionTimeliness.value

        out_list = list(set(orig_df.columns.values) - set(empty_df.columns.values))
        for coln in out_list:
            i = coln.lower()
            if ("footnote" not in i and "equityswap" not in i
            and "formtype" not in i and "transactiontimeliness" not in i
            and "transactiontimeliness" not in i and "deemedexecutiondate" not in i):
                print("Unmatched column name: "+coln)
        
        return


In [7]:
def form4xml_toflatdict(filepath, filename):
    
    with open(filepath+filename) as f:
        data = f.read()

    # extract file around ownershipDocument 
    matcher = re.compile('<\?xml.*ownershipDocument>', flags=re.MULTILINE|re.DOTALL)
    matches = matcher.search(data)
    xml     = matches.group(0)

    # load entire xml to dict object
    xmldict = xmltodict.parse(xml)

    # use flatdict tool to flatten levels of dictionary for easy indexing
    return flatdict.FlatDict(xmldict["ownershipDocument"], delimiter='.')


In [8]:
def form4_tocsv(filepath, filename):
    """
    This is the main function that reads form 4 file and process it and save it to .csv database
    
    filepath: string, directory for file
    filename: string, full filename of Form-4.txt after pre-processing
    """
    
    # pre-processing .txt file, so that xml can be formatted properly with flatdict
    proc_form4txt(filepath, filename)
    
    # extract xml information to flatdict object
    full_dict = form4xml_toflatdict(filepath, filename+'.mod')

    # create subsections from the full dictionary
    # issuer and reportingOwner first; hopefully these fields are populated
    issuer_df = flatdict_toDF(full_dict["issuer"])
          
    if isinstance(full_dict["reportingOwner"], list):
        for item in full_dict["reportingOwner"]:
            tmp = flatdict.FlatDict(item, delimiter='.')
            reportingOwner_df = flatdict_toDF(tmp)
            form4df_tocsv(filepath, full_dict, issuer_df, reportingOwner_df)
        
        # # DEBUG only: use only one reporting Owner for multiple owner cases
        # item=full_dict["reportingOwner"][0]
        # tmp = flatdict.FlatDict(item, delimiter='.')
        # reportingOwner_df = flatdict_toDF(tmp)
        # form4df_tocsv(filepath, full_dict, issuer_df, reportingOwner_df)
        # # DEBUG only

    else:
        reportingOwner_df = flatdict_toDF(full_dict["reportingOwner"])
        form4df_tocsv(filepath, full_dict, issuer_df, reportingOwner_df)
        
    return


In [9]:
def form4df_tocsv(filepath, full_dict, issuer_df, reportingOwner_df):
    """
    This function takes read-in information and save it to .csv database
    
    filepath:          string, directory for file
    full_dict:         flatdict, contains full flatdic read from xml 
    issuer_df:         pandas DataFrame, contains issuer info
    reportingOwner_df: pandas DataFrame, contains reporting owner info
    """
    # work on nonDerivativeTable
    if "nonDerivativeTable.nonDerivativeTransaction" in full_dict.keys():
        nonDerivativeTable_df = flatdict_toDF(full_dict["nonDerivativeTable.nonDerivativeTransaction"])
        # add information about issuer and owner to the tables
        nonDerivative_cDF = concat_abtoC(issuer_df, reportingOwner_df, nonDerivativeTable_df)
        # remove the last row that was added for flatdict reading
        f4_nonDerivative  = f4data("nonDerivative", nonDerivative_cDF.iloc[:-1])
        save_dftocsv(filepath, "nonDerivative.csv", f4_nonDerivative.df)
        
    # work on derivativeTable
    if "derivativeTable.derivativeTransaction" in full_dict.keys():
        derivativeTable_df= flatdict_toDF(full_dict["derivativeTable.derivativeTransaction"])
        derivative_cDF    = concat_abtoC(issuer_df, reportingOwner_df, derivativeTable_df)
        f4_derivative     = f4data("derivative", derivative_cDF.iloc[:-1])
        save_dftocsv(filepath, "derivative.csv", f4_derivative.df)
    
    # work on footnotes
    if "footnotes.footnote" in full_dict.keys():
        footnotes_df  = flatdict_toDF(full_dict["footnotes.footnote"])
        footnotes_cDF = concat_abtoC(issuer_df, reportingOwner_df, footnotes_df)
        f4_footnotes  = f4data("footnotes", footnotes_cDF.iloc[:-1])
        save_dftocsv(filepath, "footnotes.csv", f4_footnotes.df)
    
    return

In [10]:
# read_form4txt("tmp.txt")

filepath="./test-jup/test_data"
directory = os.fsencode(filepath)

filename="1047122_1_0001047122-20-000051.txt"
filename = "1023844_1_0001437749-20-000181.txt"
filename = "1023844_1_0001437749-20-000181.txt.mod"
filename = "1192933_2_0001179110-20-005642.txt"
filename ="tmp.txt.mod"
filename ="1192933_2_0001179110-20-005642.txt"
filename ="tmp.txt"
filename ="1501695_3_0000950123-20-007937.txt"
filename ="104169_4_0001127602-20-030022.txt"
# form4_tocsv(filepath, filename)



In [11]:
filepath="./test-jup/test_data/"
directory = os.fsencode(filepath)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"): 
        print(filename)
        form4_tocsv(filepath, filename)
         



1656557_3_0001493152-20-018537.txt
1632944_3_0001567619-20-015688.txt
1395942_1_0001395942-20-000033.txt
1480691_2_0001567619-20-008697.txt
1505512_4_0001209191-20-061962.txt
1403161_2_0001127602-20-018860.txt
750686_2_0000750686-20-000165.txt
354707_1_0000354707-20-000046.txt
1487371_1_0001487371-20-000092.txt
1555280_4_0001555280-20-000303.txt
1136554_4_0001209191-20-061181.txt
1637459_2_0001637459-20-000087.txt
1692819_4_0001209191-20-054087.txt
1336745_4_0001213900-20-040637.txt
723646_3_0001638509-20-000008.txt
1047122_1_0001047122-20-000051.txt
1412408_1_0000899243-20-001780.txt
1664232_3_0001474506-20-000267.txt
1363364_2_0001638599-20-000500.txt
1619644_1_0001209191-20-001524.txt
1797374_2_0001225208-20-008176.txt
1686807_1_0000947871-20-000166.txt
912728_4_0000912728-20-000168.txt
1034604_2_0001209191-20-036702.txt
1642376_2_0001140361-20-013703.txt
1474838_1_0001123292-20-000425.txt
315054_2_0001140361-20-011342.txt
1109354_3_0001179110-20-009902.txt
19411_1_0001209191-20-016

In [12]:
nd = pd.read_csv("./test-jup/test_data/nonDerivative.csv")
d = pd.read_csv("./test-jup/test_data/derivative.csv")
f = pd.read_csv("./test-jup/test_data/footnotes.csv")
display(nd)
# print(f["footnote_"][0])

Unnamed: 0,issuerCik,issuerName,issuerTradingSymbol,reportingOwnerId.rptOwnerCik,reportingOwnerId.rptOwnerName,reportingOwnerAddress.rptOwnerStreet1,reportingOwnerAddress.rptOwnerStreet2,reportingOwnerAddress.rptOwnerCity,reportingOwnerAddress.rptOwnerState,reportingOwnerAddress.rptOwnerZipCode,...,transactionDate.value,deemedExecutionDate.value,transactionCoding.transactionCode,transactionTimeliness.value,transactionAmounts.transactionShares.value,transactionAmounts.transactionAcquiredDisposedCode.value,transactionAmounts.transactionPricePerShare.value,postTransactionAmounts.sharesOwnedFollowingTransaction.value,ownershipNature.directOrIndirectOwnership.value,ownershipNature.natureOfOwnership.value
0,1076682,"POLARITYTE, INC.",PTE,1656557,Hague Richard,123 WRIGHT BROTHERS DRIVE,,SALT LAKE CITY,UT,84116,...,2020-09-28,,F,,5283.0,D,1.0400,720529.0,D,
1,1564708,NEWS CORP,NWS,1632944,Pitofsky David B,C/O NEWS CORPORATION,1211 AVENUE OF THE AMERICAS,NEW YORK,NY,10036,...,2020-08-15,,M,,81217.0,A,,104083.0,D,
2,1564708,NEWS CORP,NWS,1632944,Pitofsky David B,C/O NEWS CORPORATION,1211 AVENUE OF THE AMERICAS,NEW YORK,NY,10036,...,2020-08-15,,F,,40778.0,D,15.3300,63305.0,D,
3,1395942,"KAR Auction Services, Inc.",KAR,1597236,Fisher Thomas J,"C/O: KAR AUCTION SERVICES, INC.",11299 NORTH ILLINOIS STREET,CARMEL,IN,46032,...,2020-02-22,,M,,753.0,A,0.0000,2073.0,D,
4,1395942,"KAR Auction Services, Inc.",KAR,1597236,Fisher Thomas J,"C/O: KAR AUCTION SERVICES, INC.",11299 NORTH ILLINOIS STREET,CARMEL,IN,46032,...,2020-02-22,,F,,216.0,D,22.2500,1857.0,D,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,766421,"ALASKA AIR GROUP, INC.",ALK,1715974,SCHNEIDER ANDREA L,19300 INTERNATIONAL BLVD,,SEATTLE,WA,98188,...,,,,,,,,12979.0,D,
269,766421,"ALASKA AIR GROUP, INC.",ALK,1715974,SCHNEIDER ANDREA L,19300 INTERNATIONAL BLVD,,SEATTLE,WA,98188,...,,,,,,,,15.0,I,BY SPOUSE
270,1717452,"Odonate Therapeutics, Inc.",ODT,1723966,O'Connell Joseph P,"C/O ODONATE THERAPEUTICS, INC.","4747 EXECUTIVE DRIVE, SUITE 210",SAN DIEGO,CA,92121,...,2020-07-15,,A,,27.0,A,34.9900,2101.0,D,
271,1759509,"Lyft, Inc.",LYFT,1760231,Makavy Ran I.,"C/O LYFT, INC.","185 BERRY STREET, SUITE 5000",SAN FRANCISCO,CA,94107,...,2020-02-26,,S,,9099.0,D,40.1611,574579.0,D,
