# Load Into Neo4j


In [73]:
%%capture
%pip install neo4j python-dotenv pandas

In [74]:
import pandas as pd
from dotenv import load_dotenv
import os

pd.set_option('display.width', 0)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)


In [77]:
from neo4j import GraphDatabase

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_AUTH = (os.getenv('NEO4J_USERNAME'), os.getenv('NEO4J_PASSWORD'))

reserve_labels = ['Document', 'Company', 'Manager']
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for label in reserve_labels:
        driver.execute_query(f'''
            MATCH (n:{label}) WHERE n.id IS NOT NULL
            SET n:Ext{label}
            REMOVE n:{label}''', database='neo4j')

In [78]:
## Get cusip6
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    res,s,k = driver.execute_query('MATCH(n:Document) RETURN n.fileName AS fileName')
cusip6_list = [r.data()['fileName'][:6] for r in res]
cusip6_list

['06643P',
 '03957W',
 '00912N',
 '00164V',
 '08772B',
 '01626L',
 '09609G',
 '10807Q',
 '13645T',
 '10922N',
 '09075X',
 '30212W',
 '11135B',
 '35953D',
 '36260F',
 '037598',
 '30234E',
 '36162J',
 '53223X',
 '65342V',
 '50077B',
 '70319R',
 '74727A',
 '70432V',
 '92337U',
 '84863T',
 '97382D',
 '120076',
 '132152',
 '75382E',
 '164651',
 '171604',
 '227046',
 '228368',
 '398905',
 '320817',
 '523768',
 '482506',
 '679580',
 '687793',
 '577933',
 '423452',
 '589378',
 '630087',
 '909214',
 '831865',
 '925815',
 '882681',
 '844741']

In [79]:
max_num_assets = 50

In [80]:
form13_df = pd.read_csv('data/form13.csv')
form13_df

Unnamed: 0,source,managerCik,managerName,reportCalendarOrQuarter,cusip6,cusip,companyName,value,shares
0,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,03769M,03769M106,APOLLO GLOBAL MGMT INC,9.568500e+09,150000
1,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,03852U,03852U106,ARAMARK,1.240200e+10,300000
2,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,05605H,05605H100,BWX TECHNOLOGIES INC,1.951488e+10,336000
3,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,060505,060505104,BANK AMERICA CORP,1.324800e+10,400000
4,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,075887,075887109,BECTON DICKINSON & CO,2.265813e+10,89100
...,...,...,...,...,...,...,...,...,...
2651317,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G29183,G29183103,Eaton,1.492448e+10,69976
2651318,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G7709Q,G7709Q104,Royalty Pharma Plc,1.119886e+10,412633
2651319,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G7T16G,G7T16G103,Sapiens,1.227408e+09,43173
2651320,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,H1467J,H1467J104,Chubb Limited,1.110786e+10,53357


In [81]:
form13_df[[ 'managerCik']].groupby(['managerCik']).size().reset_index().rename(columns={0:'numAssets'})[['numAssets']].groupby(['numAssets']).size()

numAssets
1         70
2         44
3         37
4        145
5         36
        ... 
14001      1
14622      1
14720      1
17710      1
22419      1
Length: 1322, dtype: int64

In [82]:
manager_ciks_w_cusip = form13_df.managerCik[form13_df.cusip6.isin(cusip6_list)].unique()
manager_ciks_w_cusip

array([1000097, 1000275, 1000742, ...,    9631,    9634,   98758])

In [83]:
asset_count_df = form13_df[['managerCik', 'managerName']].groupby(['managerCik']).size().reset_index().rename(columns={0:'numAssets'})
asset_count_df

Unnamed: 0,managerCik,numAssets
0,2230,365
1,4977,12
2,5272,10564
3,7195,374
4,7773,39
...,...,...
6014,2002654,493
6015,2002843,1
6016,2003074,9
6017,2003112,23


In [84]:
filtered_manager_list = asset_count_df.managerCik[(asset_count_df.numAssets <= max_num_assets) & asset_count_df.managerCik.isin(manager_ciks_w_cusip)].tolist()
print(len(filtered_manager_list ))
filtered_manager_list

102


[860662,
 921669,
 1027451,
 1103887,
 1159159,
 1218735,
 1259927,
 1275218,
 1275893,
 1278235,
 1301050,
 1336528,
 1337851,
 1351351,
 1357993,
 1387303,
 1420995,
 1426857,
 1427119,
 1447228,
 1464790,
 1484040,
 1492815,
 1505791,
 1509874,
 1510677,
 1512415,
 1512566,
 1515070,
 1517137,
 1524493,
 1531611,
 1532262,
 1536630,
 1536799,
 1539436,
 1540531,
 1551867,
 1567752,
 1580162,
 1596510,
 1598843,
 1599562,
 1604867,
 1635342,
 1647251,
 1649339,
 1649994,
 1665012,
 1666667,
 1683073,
 1688382,
 1693141,
 1710593,
 1716476,
 1721508,
 1722967,
 1727353,
 1728120,
 1728449,
 1730774,
 1743937,
 1748269,
 1753875,
 1759115,
 1762718,
 1766752,
 1769062,
 1770525,
 1771067,
 1773994,
 1782124,
 1786767,
 1801264,
 1802290,
 1804007,
 1805591,
 1807092,
 1808696,
 1808919,
 1825034,
 1833486,
 1839930,
 1842766,
 1844567,
 1844645,
 1847921,
 1850901,
 1859691,
 1878063,
 1896379,
 1906003,
 1909739,
 1910321,
 1912187,
 1930346,
 1932998,
 1939480,
 1949877,
 1967205,
 1

In [85]:
#form13_df[form13_df.cusip6.isin(cusip6_list) & form13_df.managerCik.isin(filtered_manager_list)]
filtered_form13_df = form13_df[form13_df.managerCik.isin(filtered_manager_list)]

In [86]:
filtered_form13s = filtered_form13_df.to_dict(orient='records')
filtered_form13s

[{'source': 'https://sec.gov/Archives/edgar/data/1027451/0000919574-23-001354.txt',
  'managerCik': 1027451,
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': '2022-12-31',
  'cusip6': '023135',
  'cusip': '023135106',
  'companyName': 'AMAZON COM INC',
  'value': 473760000.0,
  'shares': 5640},
 {'source': 'https://sec.gov/Archives/edgar/data/1027451/0000919574-23-001354.txt',
  'managerCik': 1027451,
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': '2022-12-31',
  'cusip6': '053332',
  'cusip': '053332102',
  'companyName': 'AUTOZONE INC',
  'value': 15783552000.0,
  'shares': 6400},
 {'source': 'https://sec.gov/Archives/edgar/data/1027451/0000919574-23-001354.txt',
  'managerCik': 1027451,
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': '2022-12-31',
  'cusip6': '166764',
  'cusip': '166764100',
  'companyName': 'CHEVRON CORP NEW',
  'value': 215388000.0,
  'shares': 1200},
 {'source': 'https://sec.gov/Archives/

In [87]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    driver.execute_query('CREATE CONSTRAINT unique_manager IF NOT EXISTS FOR (n:Manager) REQUIRE (n.cik) IS NODE KEY')
    driver.execute_query('CREATE CONSTRAINT unique_company_id IF NOT EXISTS FOR (n:Company) REQUIRE (n.cusip) IS NODE KEY')

In [88]:
# As the dataset gets bigger we will want to chunk up the filings we send to Neo4j
def chunks(xs, n=10_000):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MERGE (c:Manager {cik: record.managerCik})
        SET c.managerName = record.managerName
        RETURN count(c) AS manager_node_merge_count
        ''', parameters_={'records': d})
        print(res)
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MERGE (c:Company {cusip: record.cusip6})
        SET c.companyName = record.companyName
        RETURN count(c) AS company_node_merge_count
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='06643P-BANKFINANCIAL_CORP-form10k-item1-2023.pdf'>, <Record fileName='03957W-ARCHROCK_INC-form10k-item1-2023.pdf'>, <Record fileName='00912N-AIR_INDS_GROUP-form10k-item1-2023.pdf'>, <Record fileName='00164V-AMC_NETWORKS_INC-form10k-item1-2023.pdf'>, <Record fileName="08772B-ONE,_'BETTER_WORLD_ACQUISITION_COR-form10k-item1-2023.pdf">, <Record fileName='01626L-ALIGOS_THERAPEUTICS_INC-form10k-item1-2023.pdf'>, <Record fileName='09609G-BLUEBIRD_BIO_INC-form10k-item1-2023.pdf'>, <Record fileName='10807Q-BRIDGELINE_DIGITAL_INC-form10k-item1-2023.pdf'>, <Record fileName='13645T-CANADIAN_PAC_RY_LTD-form10k-item1-2023.pdf'>, <Record fileName='10922N-BRIGHTHOUSE_FINL_PFD-form10k-item1-2023.pdf'>, <Record fileName='09075X-BIODESIX_INC-form10k-item1-2023.pdf'>, <Record fileName='30212W-EXP_WORLD_HLDGS_INC-form10k-item1-2023.pdf'>, <Record fileName='11135B-BROADMARK_REALTY_CAP_INC-form10k-item1-2023.pdf'>, <Record fileName='35953D-FUBOTV_INC-form10k-item1-2023.pdf'>, <Record file

In [89]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MATCH (m:Manager {cik: record.managerCik})
        MATCH (c:Company {cusip: record.cusip6})
        MERGE(m)-[r:OWNS]->(c)
        SET r.reportCalendarOrQuarter = record.reportCalendarOrQuarter,
            r.value = record.value,
            r.shares = record.shares,
            r.source = record.source
        RETURN count(r) AS owns_relationship_merge_count
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='06643P-BANKFINANCIAL_CORP-form10k-item1-2023.pdf'>, <Record fileName='03957W-ARCHROCK_INC-form10k-item1-2023.pdf'>, <Record fileName='00912N-AIR_INDS_GROUP-form10k-item1-2023.pdf'>, <Record fileName='00164V-AMC_NETWORKS_INC-form10k-item1-2023.pdf'>, <Record fileName="08772B-ONE,_'BETTER_WORLD_ACQUISITION_COR-form10k-item1-2023.pdf">, <Record fileName='01626L-ALIGOS_THERAPEUTICS_INC-form10k-item1-2023.pdf'>, <Record fileName='09609G-BLUEBIRD_BIO_INC-form10k-item1-2023.pdf'>, <Record fileName='10807Q-BRIDGELINE_DIGITAL_INC-form10k-item1-2023.pdf'>, <Record fileName='13645T-CANADIAN_PAC_RY_LTD-form10k-item1-2023.pdf'>, <Record fileName='10922N-BRIGHTHOUSE_FINL_PFD-form10k-item1-2023.pdf'>, <Record fileName='09075X-BIODESIX_INC-form10k-item1-2023.pdf'>, <Record fileName='30212W-EXP_WORLD_HLDGS_INC-form10k-item1-2023.pdf'>, <Record fileName='11135B-BROADMARK_REALTY_CAP_INC-form10k-item1-2023.pdf'>, <Record fileName='35953D-FUBOTV_INC-form10k-item1-2023.pdf'>, <Record file

In [90]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        MATCH(n:Document)
        WITH left(n.fileName,6) AS cusip, n
        MATCH(c:Company {cusip:cusip})
        MERGE (c)-[:HAS_DOCUMENT]->(n)
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='06643P-BANKFINANCIAL_CORP-form10k-item1-2023.pdf'>, <Record fileName='03957W-ARCHROCK_INC-form10k-item1-2023.pdf'>, <Record fileName='00912N-AIR_INDS_GROUP-form10k-item1-2023.pdf'>, <Record fileName='00164V-AMC_NETWORKS_INC-form10k-item1-2023.pdf'>, <Record fileName="08772B-ONE,_'BETTER_WORLD_ACQUISITION_COR-form10k-item1-2023.pdf">, <Record fileName='01626L-ALIGOS_THERAPEUTICS_INC-form10k-item1-2023.pdf'>, <Record fileName='09609G-BLUEBIRD_BIO_INC-form10k-item1-2023.pdf'>, <Record fileName='10807Q-BRIDGELINE_DIGITAL_INC-form10k-item1-2023.pdf'>, <Record fileName='13645T-CANADIAN_PAC_RY_LTD-form10k-item1-2023.pdf'>, <Record fileName='10922N-BRIGHTHOUSE_FINL_PFD-form10k-item1-2023.pdf'>, <Record fileName='09075X-BIODESIX_INC-form10k-item1-2023.pdf'>, <Record fileName='30212W-EXP_WORLD_HLDGS_INC-form10k-item1-2023.pdf'>, <Record fileName='11135B-BROADMARK_REALTY_CAP_INC-form10k-item1-2023.pdf'>, <Record fileName='35953D-FUBOTV_INC-form10k-item1-2023.pdf'>, <Record file

In [91]:
# Create structured info params
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    driver.execute_query('''
    MATCH(:Manager)-[r:OWNS]->()
    SET r.info= '(report period: '+ toString(coalesce(r.reportCalendarOrQuarter, '')) + ') owned '+ toString(coalesce(r.shares,'')) + ' shares of'
    RETURN r.info
    ''')
    driver.execute_query('''
    MATCH(n:Manager)
    SET n.info='Asset Manager ' + n.managerName + ' (cik ' + toString(n.cik) + ')'
    RETURN n.info
    ''')
    driver.execute_query('''
    MATCH(n:Company)
    SET n.info=n.companyName + ' (cusip ' + toString(n.cusip) + ')'
    RETURN n.info
    ''')

In [None]:
# Create openai embeddings and vector index
'''
MATCH (n:Chunk) WHERE size(n.text) <> 0
WITH collect(n) AS nodes, toInteger(rand()*20) AS partition
CALL {
    WITH nodes
    CALL genai.vector.encodeBatch([node IN nodes| node.text], "OpenAI", { token: $openAIKey})
    YIELD index, vector
    CALL db.create.setNodeVectorProperty(nodes[index], "openAIEmbedding", vector)
} IN TRANSACTIONS OF 1 ROW;
CREATE VECTOR INDEX openai_embeddings IF NOT EXISTS FOR (n:Chunk) ON (n.openAIEmbedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger(1536),
 `vector.similarity_function`: 'cosine'
}};
'''