# Load Into Neo4j


In [3]:
%%capture
%pip install neo4j python-dotenv pandas

In [2]:
import pandas as pd
from dotenv import load_dotenv
import os

pd.set_option('display.width', 0)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)


In [35]:
from neo4j import GraphDatabase

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEW_NEO4J_URI')
NEO4J_AUTH = (os.getenv('NEW_NEO4J_USERNAME'), os.getenv('NEW_NEO4J_PASSWORD'))

reserve_labels = ['Document', 'Company', 'Manager']
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for label in reserve_labels:
        driver.execute_query(f'''
            MATCH (n:{label}) WHERE n.id IS NOT NULL
            SET n:Ext{label}
            REMOVE n:{label}''', database='neo4j')

In [36]:
## Get cusip6
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    res,s,k = driver.execute_query('MATCH(n:Document) RETURN n.fileName AS fileName')
cusip6_list = [r.data()['fileName'][:6] for r in res]
cusip6_list

['00123Q', '001055', '001228']

In [37]:
max_num_assets = 100

In [38]:
form13_df = pd.read_csv('data/form13.csv')
form13_df

Unnamed: 0,source,managerCik,managerName,reportCalendarOrQuarter,cusip6,cusip,companyName,value,shares
0,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,03769M,03769M106,APOLLO GLOBAL MGMT INC,9.568500e+09,150000
1,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,03852U,03852U106,ARAMARK,1.240200e+10,300000
2,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,05605H,05605H100,BWX TECHNOLOGIES INC,1.951488e+10,336000
3,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,060505,060505104,BANK AMERICA CORP,1.324800e+10,400000
4,https://sec.gov/Archives/edgar/data/1000097/0001000097-23-000002.txt,1000097,"KINGDON CAPITAL MANAGEMENT, L.L.C.",2022-12-31,075887,075887109,BECTON DICKINSON & CO,2.265813e+10,89100
...,...,...,...,...,...,...,...,...,...
2651317,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G29183,G29183103,Eaton,1.492448e+10,69976
2651318,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G7709Q,G7709Q104,Royalty Pharma Plc,1.119886e+10,412633
2651319,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,G7T16G,G7T16G103,Sapiens,1.227408e+09,43173
2651320,https://sec.gov/Archives/edgar/data/98758/0000950123-23-009574.txt,98758,Torray Investment Partners LLC,2023-09-30,H1467J,H1467J104,Chubb Limited,1.110786e+10,53357


In [16]:
form13_df[[ 'managerCik']].groupby(['managerCik']).size().reset_index().rename(columns={0:'numAssets'})[['numAssets']].groupby(['numAssets']).size()

numAssets
1         70
2         44
3         37
4        145
5         36
        ... 
14001      1
14622      1
14720      1
17710      1
22419      1
Length: 1322, dtype: int64

In [27]:
manager_ciks_w_cusip = form13_df.managerCik[form13_df.cusip6.isin(cusip6_list)].unique()
manager_ciks_w_cusip

[1000275,
 1001085,
 1002672,
 1002784,
 1003518,
 1005354,
 1005441,
 1005817,
 1006378,
 1007280,
 1007524,
 1008322,
 1008929,
 1009076,
 1009207,
 1009209,
 1010873,
 1011443,
 1013701,
 1016021,
 1016287,
 1018331,
 1018674,
 1020317,
 1020580,
 1020585,
 1021258,
 1021926,
 102909,
 1033505,
 1034771,
 1034886,
 1035350,
 1035463,
 1037389,
 1037763,
 1039765,
 1040188,
 1044905,
 1044929,
 1046192,
 1050068,
 1050470,
 1053055,
 1054677,
 105495,
 1055964,
 1055980,
 1056053,
 1056288,
 1056527,
 1056559,
 1056823,
 1062938,
 1066816,
 1068837,
 1068855,
 107136,
 1071640,
 1074272,
 1078013,
 1079736,
 1080107,
 1080369,
 1080493,
 1082339,
 1083190,
 1083340,
 1084208,
 1086318,
 1086483,
 1086619,
 1088731,
 1089877,
 1090413,
 1091923,
 1092838,
 1092903,
 1095836,
 1102578,
 1105467,
 1105837,
 1105863,
 1107261,
 1107314,
 1108969,
 1109228,
 1109448,
 1114618,
 1115055,
 1115418,
 1120927,
 1123812,
 1125816,
 1126328,
 1128213,
 1129919,
 1130344,
 1132716,
 1133014,
 11

In [29]:
asset_count_df = form13_df[['managerCik', 'managerName']].groupby(['managerCik']).size().reset_index().rename(columns={0:'numAssets'})
asset_count_df

Unnamed: 0,managerCik,numAssets
0,2230,365
1,4977,12
2,5272,10564
3,7195,374
4,7773,39
...,...,...
6014,2002654,493
6015,2002843,1
6016,2003074,9
6017,2003112,23


In [31]:
filtered_manager_list = asset_count_df.managerCik[(asset_count_df.numAssets <= max_num_assets) & asset_count_df.managerCik.isin(manager_ciks_w_cusip)].tolist()
print(len(filtered_manager_list ))
filtered_manager_list

49


[897599,
 924171,
 945625,
 1091923,
 1140771,
 1218199,
 1278951,
 1351063,
 1364725,
 1373017,
 1499066,
 1534949,
 1536063,
 1575301,
 1602020,
 1702490,
 1703081,
 1722053,
 1730525,
 1775053,
 1775391,
 1781948,
 1791555,
 1803294,
 1803523,
 1808361,
 1812093,
 1841633,
 1845617,
 1849055,
 1859474,
 1867570,
 1894203,
 1896419,
 1905106,
 1905644,
 1911087,
 1911264,
 1911735,
 1940823,
 1942361,
 1964829,
 1966171,
 1967205,
 1989941,
 1994333,
 1999827,
 2000571,
 2001016]

In [40]:
#form13_df[form13_df.cusip6.isin(cusip6_list) & form13_df.managerCik.isin(filtered_manager_list)]
filtered_form13_df = form13_df[form13_df.managerCik.isin(filtered_manager_list)]

In [45]:
filtered_form13s = filtered_form13_df.to_dict(orient='records')
filtered_form13s

[{'source': 'https://sec.gov/Archives/edgar/data/1091923/0001091923-23-000001.txt',
  'managerCik': 1091923,
  'managerName': 'PENINSULA ASSET MANAGEMENT INC',
  'reportCalendarOrQuarter': '2022-12-31',
  'cusip6': '464287',
  'cusip': '464287507',
  'companyName': 'S&P Mid Cap 400 Ishares',
  'value': 17297000.0,
  'shares': 71508},
 {'source': 'https://sec.gov/Archives/edgar/data/1091923/0001091923-23-000001.txt',
  'managerCik': 1091923,
  'managerName': 'PENINSULA ASSET MANAGEMENT INC',
  'reportCalendarOrQuarter': '2022-12-31',
  'cusip6': '464287',
  'cusip': '464287804',
  'companyName': 'S&P Small Cap 600 Ishares',
  'value': 10084000.0,
  'shares': 106555},
 {'source': 'https://sec.gov/Archives/edgar/data/1091923/0001091923-23-000007.txt',
  'managerCik': 1091923,
  'managerName': 'PENINSULA ASSET MANAGEMENT INC',
  'reportCalendarOrQuarter': '2023-03-31',
  'cusip6': '001055',
  'cusip': '001055102',
  'companyName': 'AFLAC INC',
  'value': 4191542000.0,
  'shares': 64965},
 

In [46]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    driver.execute_query('CREATE CONSTRAINT unique_manager IF NOT EXISTS FOR (n:Manager) REQUIRE (n.cik) IS NODE KEY')
    driver.execute_query('CREATE CONSTRAINT unique_company_id IF NOT EXISTS FOR (n:Company) REQUIRE (n.cusip) IS NODE KEY')

In [50]:
# As the dataset gets bigger we will want to chunk up the filings we send to Neo4j
def chunks(xs, n=10_000):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MERGE (c:Manager {cik: record.managerCik})
        SET c.managerName = record.managerName
        RETURN count(c) AS manager_node_merge_count
        ''', parameters_={'records': d})
        print(res)
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MERGE (c:Company {cusip: record.cusip6})
        SET c.companyName = record.companyName
        RETURN count(c) AS company_node_merge_count
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='00123Q-AGNC_INVT_CORP-form10k-item1-2023.pdf'>, <Record fileName='001055-AFLAC_INC-form10k-item1-2023.pdf'>, <Record fileName='001228-AG_MORTGAGE_INVESTMENT_TRUST-form10k-item1-2023.pdf'>]
[<Record fileName='00123Q-AGNC_INVT_CORP-form10k-item1-2023.pdf'>, <Record fileName='001055-AFLAC_INC-form10k-item1-2023.pdf'>, <Record fileName='001228-AG_MORTGAGE_INVESTMENT_TRUST-form10k-item1-2023.pdf'>]


In [51]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        UNWIND $records AS record
        MATCH (m:Manager {cik: record.managerCik})
        MATCH (c:Company {cusip: record.cusip6})
        MERGE(m)-[r:OWNS]->(c)
        SET r.reportCalendarOrQuarter = record.reportCalendarOrQuarter,
            r.value = record.value,
            r.shares = record.shares,
            r.source = record.source
        RETURN count(r) AS owns_relationship_merge_count
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='00123Q-AGNC_INVT_CORP-form10k-item1-2023.pdf'>, <Record fileName='001055-AFLAC_INC-form10k-item1-2023.pdf'>, <Record fileName='001228-AG_MORTGAGE_INVESTMENT_TRUST-form10k-item1-2023.pdf'>]


In [52]:
with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
    for d in chunks(filtered_form13_df.to_dict(orient='records')):
        driver.execute_query('''
        MATCH(n:Document)
        WITH left(n.fileName,6) AS cusip, n
        MATCH(c:Company {cusip:cusip})
        MERGE (c)-[:HAS_DOCUMENT]->(n)
        ''', parameters_={'records': d})
        print(res)

[<Record fileName='00123Q-AGNC_INVT_CORP-form10k-item1-2023.pdf'>, <Record fileName='001055-AFLAC_INC-form10k-item1-2023.pdf'>, <Record fileName='001228-AG_MORTGAGE_INVESTMENT_TRUST-form10k-item1-2023.pdf'>]


In [None]:
"""MATCH(:Manager)-[r:OWNS]->()
SET r.info= '(report period: '+ toString(coalesce(r.reportCalendarOrQuarter, '')) + ') owned '+ toString(coalesce(r.shares,'')) + ' shares of'
RETURN r.info

MATCH(n:Manager)
SET n.info='Asset Manager ' + n.managerName + ' (cik ' + toString(n.cik) + ')'
RETURN n.info

MATCH(n:Company)
SET n.info=n.companyName + ' (cusip ' + toString(n.cusip) + ')'
RETURN n.info"""