In [1]:
import requests as r
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep

In [6]:
def getTradeInfo(html):
    cik = html.find('rptOwnerCik'.lower()).text
    name = html.find('rptOwnerName'.lower()).text
    dates, sales, amounts, rsus = [], [], [], []
    footnotes = html.find('footnotes')
    for transaction in html.findAll('nonDerivativeTransaction'.lower()):
        dates.append(pd.to_datetime( transaction.find('transactionDate'.lower()).text.strip() ))
        sales.append(transaction.find('transactionAcquiredDisposedCode'.lower()).text.strip() == 'D')
        
        shares = transaction.find('transactionShares'.lower())
        amounts.append(float( shares.text.strip() ))
        footnoteNm = shares.find('footnoteId'.lower()).attrs['id']
        footnoteTxt = footnotes.find(id=footnoteNm).text.lower()
        rsus.append(isRSUTrade(footnoteTxt))
    return cik, name, dates, sales, amounts, rsus

def isRSUTrade(text):
    rsuStrings = ['restricted', 'stock', 'units']
    return all(text.find(rsuString) > -1 for rsuString in rsuStrings)

def parsePathSuffixsFromCompanyFile(companyFile, companyName, formIndex=2, formName='4'):
    pathSuffixs = []
    foundCompany = False
    for line in companyFile:
        ls = line.split()
        if len(ls) > 3:
            if ls[:formIndex] == companyName.split() and ls[formIndex] == formName:
                pathSuffixs.append(ls[-1])
                foundCompany = True
            elif foundCompany is True:
                break #sorted by company
    return pathSuffixs

In [22]:
#parameters
currYear = 2020
fileNameZip, fileName = 'company.zip', 'company.idx'
companyName = 'BOEING CO'
formName = '4'
years = [i for i in range(1993, currYear+1)]
#years = [i for i in range(2018, 2021)] #test
quarters = ['QTR{}'.format(i+1) for i in range(4)]

In [28]:
#read local form 4's for company companyName as indexed
#by company.idx in year/quarter/ local files into memory
#build as
#dict(([dates], [sales], [amounts], [ownedAfters]))[cik]
#later transform to
#dict(pd.dataframe[date, sale, amount, ownedAfter])[cik]
transactions = dict()
cikToName = dict()
cikToPaths = dict()
for year in years:
    for quarter in quarters:
        path = '/'.join(('./data', str(year), quarter, ''))
        try:
            with open(path + fileName, 'r') as f:
                pathSuffixs = parsePathSuffixsFromCompanyFile(f, companyName, formName=formName)
        except FileNotFoundError:
            print('no file {} found'.format(path + fileName))
        else:
            #read from files into memory
            for pathSuffix in pathSuffixs:
                #pathSuffix form: 'edgar/data/.../....txt'
                path = '/'.join(('.', pathSuffix[6:]))
                with open(path, 'r') as f:
                    fileText = ''.join(line for line in f)
                html = BeautifulSoup(fileText, 'html.parser')
                #print(html)
                cik, name, dates, sales, amounts, rsus = getTradeInfo(html)
                if len(dates) > 0:
                    if cik not in transactions:
                        cikToName[cik] = name
                        cikToPaths[cik] = []
                        transactions[cik] = {'dates': [],
                                             'sales': [],
                                             'amounts': [],
                                             'rsus': []}
                    cikToPaths[cik].append(path)
                    transactions[cik]['dates'] += dates
                    transactions[cik]['sales'] += sales
                    transactions[cik]['amounts'] += amounts
                    transactions[cik]['rsus'] += rsus

no file ./data/2018/QTR4/company.idx found
no file ./data/2019/QTR1/company.idx found
no file ./data/2019/QTR2/company.idx found
no file ./data/2019/QTR3/company.idx found
no file ./data/2019/QTR4/company.idx found
no file ./data/2020/QTR2/company.idx found
no file ./data/2020/QTR3/company.idx found
no file ./data/2020/QTR4/company.idx found


In [29]:
print(len(transactions),
    sum(len(transactions[key]) for key in transactions))
transactions

1 4


{'0001798479': {'dates': [Timestamp('2020-01-02 00:00:00')],
  'sales': [False],
  'amounts': [1000.0],
  'rsus': [True]}}

In [None]:
#convert to dataframes & change amounts to account for transaction direction
for cik in transactions:
    transactions[cik] = pd.DataFrame(transactions[cik])
    transactions[cik]['amounts'] = transactions[cik]['amounts'] * (1. - 2. * transactions[cik]['sales'])

In [None]:
print(cikToUrls['0001671300'])
transactions['0001671300']

In [None]:
#todo:
##fix url fails; add loop to walk thru failed urls until completion?
##why sum(amounts) != ownedAfter?
##add ability to pull data from multiple years/quarters
##(1) entirely separate: walk thru index files, populate list with urls, plug into this code
##(2) outer loop: in new outer loop, read in an index file, then run this code as inner loop
##(3) ??
###I like the idea of having separate parts of code do separate things [ie. like "(1)" above]
##okay, let's go w/ "(1)" for now. 
##ideally want to have either a database or file storage system so that I don't have to grab
##all the files from SEC every time; should just update w/ any new ones
##(a) replace existing calls to online files with download of online files, calls to local files
##(b) 