# KG Creation

In [1]:
%%capture
%pip install "google-cloud-aiplatform>=1.25.0" --upgrade
%pip install "google-cloud-aiplatform[pipelines]>=1.25.0"
%pip install "langchain>=0.0.237" graphdatascience pydantic gradio IProgress tqdm python-dotenv xmltodict

## Load Environment Variables

In [2]:
from dotenv import load_dotenv
import os

load_dotenv('credentials.env', override=True)

GCP_PROJECT_ID=os.getenv('GCP_PROJECT_ID')
GCP_REGION=os.getenv('GCP_REGION')
NEO4J_URI=os.getenv('NEO4J_URI')
NEO4J_USERNAME=os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD=os.getenv('NEO4J_PASSWORD')
AURA_DS=eval(os.getenv('AURA_DS').title()) #True/False

In [3]:
print(GCP_REGION)

us-central1


## Simple Parsing

In [4]:
# Import Test Case
import json
inp_text = ''
with open('./data/raw_2023-05-15_archives_edgar_data_1027451_0000919574-23-003245.txt') as f:
#with open('./data/raw_2023-05-15_archives_edgar_data_1748305_0001748305-23-000010.txt') as f:
    inp_text = f.read()

In [5]:
# Simple functions to extract data
import datetime
import xmltodict

# function to strip namespaces post xmltodict trnasformation
def strip_ns(x):
    if isinstance(x, dict):
        x_striped = dict()
        for k, v in x.items():
            x_striped[k.split(':')[-1]] = strip_ns(v)
    elif isinstance(x, list):
        x_striped = [strip_ns(i) for i in x]
    else:
        x_striped = x
    return x_striped


def extract_submission_info(contents):
    xml = contents[1].split('</XML>')[0].strip()
    return strip_ns(xmltodict.parse(xml))['edgarSubmission']

def extract_investment_info(contents):
    xml = contents[2].split('</XML>')[0].strip()
    return strip_ns(xmltodict.parse(xml))['informationTable']['infoTable'] 

def filter_and_format(info_tables, manager_cik, manager_name, report_period):
    res = []
    if isinstance(info_tables, dict):
        info_tables = [info_tables]
    for info_table in info_tables:
        # Skip none to report incidencnes
        if info_table['cusip'] == '000000000':
            pass
        # Only want stock holdings, not options
        if info_table['shrsOrPrnAmt']['sshPrnamtType'] != 'SH':
            pass
        # Only want holdings over $10m
        elif (float(info_table['value']) * 1000) < 10000000:
            pass
        # Only want common stock
        elif info_table['titleOfClass'] != 'COM':
            pass
        else:
            res.append({'managerCik': manager_cik, 
                        'managerName': manager_name, 
                        'reportCalendarOrQuarter': datetime.datetime.strptime(report_period, '%m-%d-%Y').date(),
                        'cusip': info_table['cusip'], 
                        'companyName':info_table['nameOfIssuer'], 
                        'value': float(info_table['value'].replace(' ', '') + '000'),
                        'shares': int(info_table['shrsOrPrnAmt']['sshPrnamt'])})
    return res

def extract_dict(txt):
    contents = txt.split('<XML>')
    submt_dict = extract_submission_info(contents)
    mng_cik = submt_dict['headerData']['filerInfo']['filer']['credentials']['cik']
    mng_name = submt_dict['formData']['coverPage']['filingManager']['name']
    report_period = submt_dict['formData']['coverPage']['reportCalendarOrQuarter']
    info_dict = extract_investment_info(contents)
    return filter_and_format(info_dict, mng_cik, mng_name, report_period)

In [6]:
# test to see
extract_dict(inp_text)

[{'managerCik': '0001027451',
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': datetime.date(2023, 3, 31),
  'cusip': '023135106',
  'companyName': 'AMAZON COM INC',
  'value': 582556000.0,
  'shares': 5640},
 {'managerCik': '0001027451',
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': datetime.date(2023, 3, 31),
  'cusip': '053332102',
  'companyName': 'AUTOZONE INC',
  'value': 9218063000.0,
  'shares': 3750},
 {'managerCik': '0001027451',
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': datetime.date(2023, 3, 31),
  'cusip': '256677105',
  'companyName': 'DOLLAR GEN CORP NEW',
  'value': 7110391000.0,
  'shares': 33785},
 {'managerCik': '0001027451',
  'managerName': 'TIGER MANAGEMENT L.L.C.',
  'reportCalendarOrQuarter': datetime.date(2023, 3, 31),
  'cusip': 'N3167Y103',
  'companyName': 'FERRARI N V',
  'value': 357641000.0,
  'shares': 1320},
 {'managerCik': '0001027451',
  'managerName': 'TIGER MANAGEMENT 

In [7]:
%%time
# Go through all files and concatinate to dataframe
import pandas as pd

filing_dfs = []
for file_name in os.listdir('./data'):
        if file_name.endswith('.txt'):
            file_path = os.path.join('./data', file_name)
            with open(file_path, 'r') as file:
                filing = extract_dict(file.read())
                tmp_filing_df = pd.DataFrame(filing)
                tmp_filing_df['sourceFilingId'] = file_name
                filing_dfs.append(tmp_filing_df)
filing_df = pd.concat(filing_dfs, ignore_index=True)
filing_df

CPU times: user 52.3 s, sys: 361 ms, total: 52.6 s
Wall time: 52.6 s


Unnamed: 0,managerCik,managerName,reportCalendarOrQuarter,cusip,companyName,value,shares,sourceFilingId
0,0001574010,"Lowe Wealth Advisors, LLC",2023-06-30,002824100,ABBOTT LABORATORIES,1.798800e+07,165.0,raw_2023-07-07_archives_edgar_data_1574010_000...
1,0001574010,"Lowe Wealth Advisors, LLC",2023-06-30,007903107,Advanced Micro Devices Inc,1.139100e+07,100.0,raw_2023-07-07_archives_edgar_data_1574010_000...
2,0001574010,"Lowe Wealth Advisors, LLC",2023-06-30,008474108,Agnico Eagle Mines Ltd,1.499400e+07,300.0,raw_2023-07-07_archives_edgar_data_1574010_000...
3,0001574010,"Lowe Wealth Advisors, LLC",2023-06-30,009158106,Air Products & Chemicals Inc,5.990600e+07,200.0,raw_2023-07-07_archives_edgar_data_1574010_000...
4,0001574010,"Lowe Wealth Advisors, LLC",2023-06-30,016255101,Align Technology Inc,1.060900e+07,30.0,raw_2023-07-07_archives_edgar_data_1574010_000...
...,...,...,...,...,...,...,...,...
292778,0001004140,PENNSYLVANIA CAPITAL MANAGEMENT INC /ADV,2023-06-30,88160R101,TESLA INC,1.237649e+09,4728.0,raw_2023-07-18_archives_edgar_data_1004140_000...
292779,0001004140,PENNSYLVANIA CAPITAL MANAGEMENT INC /ADV,2023-06-30,882508104,TEXAS INSTRS INC,5.672430e+08,3151.0,raw_2023-07-18_archives_edgar_data_1004140_000...
292780,0001004140,PENNSYLVANIA CAPITAL MANAGEMENT INC /ADV,2023-06-30,883556102,THERMO FISHER SCIENTIFIC INC,8.254090e+08,1582.0,raw_2023-07-18_archives_edgar_data_1004140_000...
292781,0001004140,PENNSYLVANIA CAPITAL MANAGEMENT INC /ADV,2023-06-30,91324P102,UNITEDHEALTH GROUP INC,1.344350e+09,2797.0,raw_2023-07-18_archives_edgar_data_1004140_000...


## Using LLM
Hitting Input size limits

In [8]:
import vertexai
vertexai.init(project=GCP_PROJECT_ID, location=GCP_REGION)

In [9]:
from vertexai.preview.language_models import TextGenerationModel

def run_text_model(
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    prompt: str,
    project_id: str = GCP_PROJECT_ID,
    location: str = GCP_REGION,
    tuned_model_name: str = None,
    ) :
    """Text Completion Use a Large Language Model."""
    vertexai.init(project=GCP_PROJECT_ID, location=location)
    model = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name is not None:
        model = model.get_tuned_model(tuned_model_name)
    response = model.predict(
        prompt,
        temperature=temperature,
        max_output_tokens=max_decode_steps,
        top_k=top_k,
        top_p=top_p)
    return response

In [10]:
def extract_entities_relationships(prompt, tuned_model_name: str = None):
    try:
        res = run_text_model("text-bison@001", 0, 1024, 0.8, 1, prompt, tuned_model_name=tuned_model_name)
        return res
    except Exception as e:
        print(e)

In [11]:
mgr_info_tpl="""From the text below, extract the following as json. Do not miss any of these information.
* The tags mentioned below may or may not namespaced. So extract accordingly. Eg: <ns1:tag> is equal to <tag>
* "name" - The name from the <name> tag under <filingManager> tag
* "street1" - The manager's street1 address from the <com:street1> tag under <address> tag
* "street2" - The manager's street2 address from the <com:street2> tag under <address> tag
* "city" - The manager's city address from the <com:city> tag under <address> tag
* "stateOrCounty" - The manager's stateOrCounty address from the <com:stateOrCountry> tag under <address> tag
* "zipCode" - The manager's zipCode from the <com:zipCode> tag under <address> tag
* "reportCalendarOrQuarter" - The reportCalendarOrQuarter from the <reportCalendarOrQuarter> tag under <address> tag
* Just return me the JSON enclosed by 3 backticks. No other text in the response

Text:
$ctext
"""

In [12]:
from string import Template

_prompt = Template(mgr_info_tpl).substitute(ctext=inp_text)

In [13]:
_extraction = extract_entities_relationships(_prompt)

400 Request contains an invalid argument.


In [14]:
print(_extraction)

None


In [15]:
filing_info_tpl = """From the text below, extract the following as a list of json enclosed by 3 back ticks. You will find many filing information under the <infoTable> tag. Extract all of them.
* "nameOfIssuer" - The name from the <nameOfIssuer> tag under <infoTable> tag
* "cusip" - The cusip from the <cusip> tag under <infoTable> tag
* "value" - The value from the <value> tag under <infoTable> tag
* "sshPrnamt" - The sshPrnamt from the <sshPrnamt> tag under <infoTable> tag
* "sshPrnamtType" - The sshPrnamtType from the <sshPrnamtType> tag under <infoTable> tag
* "investmentDiscretion" - The investmentDiscretion from the <investmentDiscretion> tag under <infoTable> tag
* "votingSole" - The votingSole from the <votingSole> tag under <infoTable> tag
* "votingShared" - The votingShared from the <votingShared> tag under <infoTable> tag
* "votingNone" - The votingNone from the <votingNone> tag under <infoTable> tag

Text:
$ctext
"""

In [16]:
_prompt = Template(filing_info_tpl).substitute(ctext=inp_text)
_extraction = extract_entities_relationships(_prompt)

400 Request contains an invalid argument.


In [17]:
print(_extraction)

None


In [18]:
inp_text

'<SEC-DOCUMENT>0000919574-23-003245.txt : 20230515\n<SEC-HEADER>0000919574-23-003245.hdr.sgml : 20230515\n<ACCEPTANCE-DATETIME>20230515103943\nACCESSION NUMBER:\t\t0000919574-23-003245\nCONFORMED SUBMISSION TYPE:\t13F-HR\nPUBLIC DOCUMENT COUNT:\t\t2\nCONFORMED PERIOD OF REPORT:\t20230331\nFILED AS OF DATE:\t\t20230515\nDATE AS OF CHANGE:\t\t20230515\nEFFECTIVENESS DATE:\t\t20230515\n\nFILER:\n\n\tCOMPANY DATA:\t\n\t\tCOMPANY CONFORMED NAME:\t\t\tTIGER MANAGEMENT L.L.C.\n\t\tCENTRAL INDEX KEY:\t\t\t0001027451\n\t\tIRS NUMBER:\t\t\t\t000000000\n\t\tSTATE OF INCORPORATION:\t\t\tDE\n\n\tFILING VALUES:\n\t\tFORM TYPE:\t\t13F-HR\n\t\tSEC ACT:\t\t1934 Act\n\t\tSEC FILE NUMBER:\t028-05892\n\t\tFILM NUMBER:\t\t23919492\n\n\tBUSINESS ADDRESS:\t\n\t\tSTREET 1:\t\t101 PARK AVENUE\n\t\tCITY:\t\t\tNEW YORK\n\t\tSTATE:\t\t\tNY\n\t\tZIP:\t\t\t10178\n\t\tBUSINESS PHONE:\t\t212-984-2500\n\n\tMAIL ADDRESS:\t\n\t\tSTREET 1:\t\t101 PARK AVENUE\n\t\tCITY:\t\t\tNEW YORK\n\t\tSTATE:\t\t\tNY\n\t\tZIP:\t\t\t101