# Bridging the Gaps in LLM Interpretability of Unstructured Financial Earnings Data with Knowledge Graphs
- Use case: answer questions about companies' financial performance based on the transcripts of their earnings calls.
- Uses StrictJSON to parse the Knowledge Graph: https://github.com/tanchongmin/strictjson

## Import packages

In [47]:
!pip install strictjson pandas



In [48]:
from strictjson import *
import os
import openai
from openai import OpenAI
import networkx as nx
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import json
import pandas as pd
import numpy as np

In [49]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Fail early with an actionable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = api_key

## Load dataset

In [50]:
# Function to parse JSONL file
def parse_jsonl(file_path):
    '''Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the JSONL file; every non-blank line must hold one
            JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    '''
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]

# Path is relative to the notebook's working directory
file_path = 'data/earnings-transcripts.jsonl'
# One decoded JSON record per line of the file
earnings_data = parse_jsonl(file_path)


In [51]:
# Build a DataFrame from the parsed records
df = pd.DataFrame(earnings_data)

In [52]:
# Preview the first rows to check the columns loaded as expected
df.head()

Unnamed: 0,date,exchange,q,ticker,transcript
0,"Aug 27, 2020, 9:00 p.m. ET",NASDAQ: BILI,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,"Jul 30, 2020, 4:30 p.m. ET",NYSE: GFF,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,"Oct 23, 2019, 5:00 p.m. ET",NASDAQ: LRCX,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,"Nov 6, 2019, 12:00 p.m. ET",NASDAQ: BBSI,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,"Aug 7, 2019, 8:30 a.m. ET",NASDAQ: CSTE,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...


In [53]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        1000 non-null   object
 1   exchange    1000 non-null   object
 2   q           1000 non-null   object
 3   ticker      1000 non-null   object
 4   transcript  1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [54]:
# Count, number of unique values, most frequent value and its frequency per column
df.describe()

Unnamed: 0,date,exchange,q,ticker,transcript
count,1000,1000,1000,1000,1000
unique,783,703,22,703,962
top,"May 27, 2020, 9:00 p.m. ET",NASDAQ: TSLA,2020-Q4,TSLA,"Prepared Remarks:\nOperator\nGood morning, and..."
freq,7,8,210,8,7


In [55]:
# Remove the date and exchange columns as they are not needed
# (df is rebound here, so it holds only q, ticker and transcript from this cell on)
df = df[['q', 'ticker', 'transcript']]
df.head()

Unnamed: 0,q,ticker,transcript
0,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...


## Utility functions

In [56]:
def chat(system_prompt, user_prompt = '', model = 'gpt-4', temperature = 0, **kwargs):
    '''Send a system/user prompt pair to the OpenAI Chat Completions API and return the reply text.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message (empty string by default).
        model: Name of the chat model to call.
        temperature: Sampling temperature passed to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``chat.completions.create``.

    Returns:
        The ``content`` of the first choice in the API response.
    '''
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    # A fresh client is created on every call.
    completion = OpenAI().chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
        **kwargs,
    )
    return completion.choices[0].message.content

In [57]:
def plot_graph(kg):
    '''Draw a knowledge graph as a directed network with labelled edges.

    Args:
        kg: Iterable of (source, relation, target) triples.
    '''
    # Assemble the directed graph, storing each relation as an edge attribute
    graph = nx.DiGraph()
    for head, label, tail in kg:
        graph.add_edge(head, tail, relation=label)

    # Open the canvas and compute node positions (fixed seed for the layout)
    plt.figure(figsize=(10,6), dpi=300)
    layout = nx.spring_layout(graph, k=3, seed=0)

    # Nodes, edges and node names
    nx.draw_networkx_nodes(graph, layout, node_size=1500)
    nx.draw_networkx_edges(graph, layout, edge_color='gray')
    nx.draw_networkx_labels(graph, layout, font_size=12)

    # Relation names along the edges
    nx.draw_networkx_edge_labels(
        graph,
        layout,
        edge_labels=nx.get_edge_attributes(graph, 'relation'),
        font_size=10,
    )

    # Hide the axes and show the figure
    plt.axis('off')
    plt.show()

In [58]:
# Summarization helper
def summarize_text(text):
    '''Ask gpt-3.5-turbo for a summary of ``text`` and return the reply.

    Args:
        text: The text to summarize; it is embedded directly in the prompt.

    Returns:
        The ``content`` of the first choice in the API response.
    '''
    prompt = f"Summarize the following text: {text}"
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,  # upper bound on the length of the reply
        temperature=0,
    )
    return response.choices[0].message.content

In [59]:
import json
from string import Template

# Knowledge-graph extraction helper
def extract_nodes_edges(transcript_summary, schema):
    '''Ask gpt-4 to list the nodes and edges found in a summary, guided by a schema.

    Args:
        transcript_summary: Summarized text to extract entities and relationships from.
        schema: JSON-serializable description of the node and edge types; it is
            embedded in the prompt via ``json.dumps``.

    Returns:
        The ``content`` of the first choice in the API response.
    '''
    system_message = {"role": "system", "content": "Extract nodes and edges for a knowledge graph."}
    user_message = {
        "role": "user",
        "content": f"Extract entities and relationships using the following schema: {json.dumps(schema)}. Here's the text: {transcript_summary}",
    }
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
        max_tokens=1500,
        temperature=0,
    )
    return response.choices[0].message.content

In [60]:
import pandas as pd

# Summarize the first transcript only (row label 0 of df)
summary = summarize_text(df['transcript'][0])

# Extract nodes and edges based on the schema
# (extract_nodes_edges serializes this dict with json.dumps and embeds it in the prompt)
schema = {
  "nodes": {
    "Company": {"attributes": ["name", "ticker", "industry", "exchange"]},
    "Person": {"attributes": ["name", "role", "company"]},
    "Financial Metric": {"attributes": ["name", "value", "date"]},
    "Product/Service": {"attributes": ["name", "type", "company"]},
    "Event": {"attributes": ["name", "date", "type"]}
  },
  "edges": {
    "Held": {"format": "head|relationship|tail", "connects": ["Company", "Event"]},
    "Managed By": {"format": "head|relationship|tail", "connects": ["Company", "Person"]},
    "Reports": {"format": "head|relationship|tail", "connects": ["Event", "Financial Metric"]},
    "Mentions": {"format": "head|relationship|tail", "connects": ["Event", "Person", "Product/Service", "Company"]},
    "Partnership": {"format": "head|relationship|tail", "connects": ["Company", "Company"]},
    "Launches": {"format": "head|relationship|tail", "connects": ["Company", "Product/Service"]},
    # NOTE(review): "Topic" is not declared under "nodes" above — confirm whether a Topic node type should be added.
    "Discusses": {"format": "head|relationship|tail", "connects": ["Event", "Topic", "Product/Service"]}
  }
}

nodes_and_edges = extract_nodes_edges(summary, schema)

In [61]:
# Raw text reply from the model (a single string, not yet parsed into triples)
nodes_and_edges

'Nodes:\n1. Company: {"name": "Bilibili", "ticker": null, "industry": null, "exchange": null}\n2. Event: {"name": "Bilibili 2020 Second Quarter Earnings Conference Call", "date": null, "type": "Earnings Conference Call"}\n3. Product/Service: {"name": "live broadcasting business", "type": "content ecosystem", "company": "Bilibili"}\n4. Product/Service: {"name": "game business", "type": null, "company": "Bilibili"}\n5. Product/Service: {"name": "Princess Connect!", "type": "game", "company": "Bilibili"}\n6. Financial Metric: {"name": "revenue growth", "value": "strong", "date": "2020 Second Quarter"}\n7. Financial Metric: {"name": "gross margins", "value": "improved", "date": "2020 Second Quarter"}\n\nEdges:\n1. "Bilibili 2020 Second Quarter Earnings Conference Call" | Discusses | "Bilibili"\n2. "Bilibili 2020 Second Quarter Earnings Conference Call" | Discusses | "live broadcasting business"\n3. "Bilibili 2020 Second Quarter Earnings Conference Call" | Discusses | "game business"\n4. "B