#### Notes
* quarterly
* institutional investment managers with holdings over $100M
* Form 13F is required to be filed within 45 days of the end of a calendar quarter (which should be considered as significant information latency)
* only reports long positions (not short)
** different investment managers pursue different strategies, which may bias results
** however, the vast majority of investment managers rely significantly on long positions for a significant portion of fund performance
* 13F does not reveal international holdings (except for American depositary receipts).
* Section 13(f) securities generally include equity securities that trade on an exchange (including Nasdaq), certain equity options and warrants, shares of closed-end investment companies, and certain convertible debt securities.
* shares of open-end investment companies (i.e. mutual funds) are not Section 13(f) securities
* official list of qualifying securities: https://www.sec.gov/divisions/investment/13flists.htm
* excludes total portfolio value and percentage allocation of each stock listed
* Money managers allocate the most capital to their best ideas. Pay attention to "new positions" in their disclosures as these are their most recent ideas
* Keep in mind that a 13F is not their whole portfolio and that it is a past snapshot

In [None]:
#  Notebook setup: data libraries, display helpers and plotting mode.
import pandas as pd
import numpy as np
#  html5lib is not referenced by name in the visible code; presumably it is
#  imported so that a missing HTML parser backend for pd.read_html fails
#  early -- TODO confirm.
import html5lib
#  Turn off the pandas 'display.notebook_repr_html' option so DataFrames are
#  shown as plain text instead of HTML tables.
pd.set_option( 'display.notebook_repr_html', False )

from IPython.display import HTML # useful for snippets
#  e.g. HTML('<iframe src=http://en.mobile.wikipedia.org/?useformat=mobile width=700 height=350></iframe>')
from IPython.display import Image 
#  e.g. Image(filename='holt-winters-equations.png', embed=True) # url= also works
from IPython.display import YouTubeVideo
#  e.g. YouTubeVideo('1j_HxD4iLn8', start='43', width=600, height=400)
from IPython.core import page
#  Register page.display_page (wrapped by page.as_hook) as the 'show_in_pager'
#  hook, passing 0 as the third argument -- presumably so that pager output
#  is rendered in the notebook itself; confirm against the IPython docs.
get_ipython().set_hook('show_in_pager', page.as_hook(page.display_page), 0)

#  Generate PLOTS inside notebook, "inline" generates static png:
%matplotlib inline   
#          "notebook" argument allows interactive zoom and resize.


# note: https cannot be read by lxml


In [None]:
# Load the Q3 2018 13F filing index (one row per report, including its URL).
# Despite the .tsv extension the file is comma-separated, so the default
# read_csv delimiter applies.
Q3Y18_index_df = pd.read_csv(
    '13f_Q3Y18_index.tsv',
    index_col=False,
    encoding='latin-1',
)


In [None]:
# Display the dimensions of the filing index as (rows, columns)
row_count, column_count = Q3Y18_index_df.shape
(row_count, column_count)

In [None]:
# Carve out a small test set: the first `percentage_sample` percent of rows
percentage_sample = 5 # 5% of the dataset is used as the test dataset

# Number of leading rows that make up the sample (rounded to a whole row)
sample_size = int(np.round(len(Q3Y18_index_df) * percentage_sample / 100))
test_df = Q3Y18_index_df.head(sample_size)
test_df

In [None]:
# Sanity check: display the filing URL suffix of the first sampled report
first_report = test_df.iloc[0]
first_report['Filing URL .html']

In [None]:
# Collect one normalized top-holdings dataframe per investor; the list is
# meant to be concatenated into a single table once all filings are processed.
appended_data = []

# --- Settings that are identical for every filing (hoisted out of the loop) ---

### SET TO RETURN TOP 20 STOCKS PER INVESTOR (BY SIZE OF INVESTMENT)
num_stocks_returned = 20

stem = 'http://www.sec.gov/Archives/'
xml_suffix = '/xslForm13F_X01/'

#  Labels assigned to the columns of the parsed information table, in order:
holding_columns = ['stock', 'class', 'cusip', 'usd', 'size', 'sh_prin',
                   'putcall', 'discret', 'manager', 'vote1', 'vote2', 'vote3']
#  Columns that are not needed in the per-investor summary
#  (what remains is: stock, cusip, usd, putcall):
irrelevant_columns = ['class', 'size', 'sh_prin', 'discret', 'manager',
                      'vote1', 'vote2', 'vote3']

# Loop through all reports, filter relevant data, create a normalized
# dataframe per investor and add it to the list of dataframes.
for _, row in test_df.iterrows():

    report_suffix = row['Filing URL .html']
    investor = row['Company Name']
    date = row['Filing Date']

    #  Parse the filing's index page first: it names the html file that
    #  holds the investment data.
    url = 'https://www.sec.gov/Archives/' + report_suffix
    #  `tables` (not `page`) so the imported IPython `page` module is not shadowed.
    tables = pd.read_html( url )
    df = tables[0]
    #  NOTE(review): the file name is read from a fixed position (column 2,
    #  5th row) of the first table -- this assumes every index page shares
    #  that layout; confirm.
    table_url_suffix = df[2].iloc[4]

    #  Turn '<...>-index.html' into the dash-free directory part of the URL:
    report_suffix = report_suffix.replace('-index.html', '').replace('-', '')

    #  Build URL to html file with investment data
    url = stem + report_suffix + xml_suffix + table_url_suffix
    print(url)

    #  Turn HTML file into dataframes;
    #  the last element contains the relevant investment data.
    tables = pd.read_html( url )
    df = tables[-1]

    #  Rename columns:
    df.columns = holding_columns

    #  The first three rows are SEC labels, not data, so drop them and start
    #  a new index from 0 instead of 3.  reset_index returns a new dataframe,
    #  so its result must be kept (previously it was discarded).
    df = df[3:].reset_index( drop=True )

    #  Delete irrelevant columns:
    dflite = df.drop( columns=irrelevant_columns )

    #  usd needs float type since usd was read as string:
    dflite['usd'] = dflite['usd'].astype( float )
    #                  NOTE: int as type will fail for NaN

    #  Type change allows proper sort (largest positions first):
    dfusd = dflite.sort_values( by='usd', ascending=False )

    #  Portfolio total in USD.  Series.sum() skips NaN, whereas the builtin
    #  sum() would return NaN and turn every percentage below into NaN.
    usdsum = dfusd['usd'].sum()

    #  New column for percentage of total portfolio:
    dfusd['pcent'] = np.round(( dfusd['usd'] / usdsum ) * 100, 2)

    #  New column for date of report filing
    dfusd.insert(0, 'date', date)

    #  New column for investor
    dfusd.insert(0, 'investor', investor)

    #  Dataframe per investor with top num_stocks_returned positions
    appended_data.append(dfusd.head( num_stocks_returned ))

# show list of dataframes    
#appended_data



In [None]:
# Stack the per-investor dataframes row-wise into a single table
# (axis 0 is the default concatenation axis)
appended_data = pd.concat(objs=appended_data)

# Write the combined table to disk as CSV
appended_data.to_csv(path_or_buf='test_results.csv')