# Bridging the Gaps in LLM Interpretability of Unstructured Financial Earnings Data with Knowledge Graphs
- Use case: answer questions about companies' financial performance based on the transcripts of their earnings calls.
- Uses StrictJSON to parse the Knowledge Graph: https://github.com/tanchongmin/strictjson

## Import packages

In [2]:
!pip install datasets pandas 

843.07s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [3]:
from strictjson import *
import os
import openai
from openai import OpenAI
import networkx as nx
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import json
import pandas as pd
import numpy as np

In [4]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = api_key

## Load dataset

In [5]:
# Function to parse JSONL file
def parse_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

file_path = 'data/earnings-transcripts.jsonl'
earnings_data = parse_jsonl(file_path)


In [71]:
df = pd.DataFrame(earnings_data)

In [72]:
df.head()

Unnamed: 0,date,exchange,q,ticker,transcript
0,"Aug 27, 2020, 9:00 p.m. ET",NASDAQ: BILI,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,"Jul 30, 2020, 4:30 p.m. ET",NYSE: GFF,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,"Oct 23, 2019, 5:00 p.m. ET",NASDAQ: LRCX,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,"Nov 6, 2019, 12:00 p.m. ET",NASDAQ: BBSI,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,"Aug 7, 2019, 8:30 a.m. ET",NASDAQ: CSTE,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        1000 non-null   object
 1   exchange    1000 non-null   object
 2   q           1000 non-null   object
 3   ticker      1000 non-null   object
 4   transcript  1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [74]:
df.describe()

Unnamed: 0,date,exchange,q,ticker,transcript
count,1000,1000,1000,1000,1000
unique,783,703,22,703,962
top,"May 27, 2020, 9:00 p.m. ET",NASDAQ: TSLA,2020-Q4,TSLA,"Prepared Remarks:\nOperator\nGood morning, and..."
freq,7,8,210,8,7


In [75]:
# Remove data and exchange columns as they are not needed
df = df[['q', 'ticker', 'transcript']]
df.head()

Unnamed: 0,q,ticker,transcript
0,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...


In [83]:
# Create new df only with rows with specific tickers
tickers = ['TSM', 'COHR', 'SWKS', 'ASML', 'MTSI']
df_new = df[df['ticker'].isin(tickers)]
df_new

Unnamed: 0,q,ticker,transcript
15,2020-Q4,TSM,Prepared Remarks:\nJeff Su -- Director of Inve...
136,2023-Q1,COHR,Prepared Remarks:\nOperator\nLadies and gentle...
153,2022-Q3,TSM,"Prepared Remarks:\nJeff Su\nGood afternoon, ev..."
225,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...
506,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...
511,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha..."
591,2019-Q4,MTSI,"Prepared Remarks:\nOperator\nGood afternoon, a..."
596,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha..."
620,2020-Q2,SWKS,Prepared Remarks:\nOperator\nGood afternoon an...
924,2022-Q1,SWKS,"Prepared Remarks:\nOperator\nGood afternoon, a..."


In [84]:
# Count words for each transcript
df_new['word_count'] = df_new['transcript'].apply(lambda x: len(x.split()))
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['word_count'] = df_new['transcript'].apply(lambda x: len(x.split()))


Unnamed: 0,q,ticker,transcript,word_count
15,2020-Q4,TSM,Prepared Remarks:\nJeff Su -- Director of Inve...,12478
136,2023-Q1,COHR,Prepared Remarks:\nOperator\nLadies and gentle...,11602
153,2022-Q3,TSM,"Prepared Remarks:\nJeff Su\nGood afternoon, ev...",10069
225,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...,11095
506,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...,11095
511,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha...",10960
591,2019-Q4,MTSI,"Prepared Remarks:\nOperator\nGood afternoon, a...",7136
596,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha...",10960
620,2020-Q2,SWKS,Prepared Remarks:\nOperator\nGood afternoon an...,8089
924,2022-Q1,SWKS,"Prepared Remarks:\nOperator\nGood afternoon, a...",7562


## Utility functions

In [85]:
def chat(system_prompt, user_prompt = '', model = 'gpt-4-turbo', temperature = 0, **kwargs):
    ''' This replies the user based on a system prompt and user prompt to call OpenAI Chat Completions API '''
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        temperature = temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        **kwargs
    )
    res = response.choices[0].message.content
    return res

In [86]:
def plot_graph(kg):
    ''' Plots graph based on knowledge graph '''
    # Create graph
    G = nx.DiGraph()
    G.add_edges_from((source, target, {'relation': relation}) for source, relation, target in kg)

    # Plot the graph
    plt.figure(figsize=(10,6), dpi=300)
    pos = nx.spring_layout(G, k=3, seed=0)

    nx.draw_networkx_nodes(G, pos, node_size=1500)
    nx.draw_networkx_edges(G, pos, edge_color='gray')
    nx.draw_networkx_labels(G, pos, font_size=12)
    edge_labels = nx.get_edge_attributes(G, 'relation')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10)

    # Display the plot
    plt.axis('off')
    plt.show()

In [88]:
# Function to summarize text
def summarize_text(text):
    '''Summarize long text to a shorter format'''
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "user", "content": f"Summarize the following text to 600 words while keeping all the essential information: {text}"}],
    )
    return completion.choices[0].message.content

In [89]:
# Summarize the transcript column and create a new column with the summarized text
df_new['summary'] = df_new['transcript'].apply(summarize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['summary'] = df_new['transcript'].apply(summarize_text)


In [92]:
# Count summarized words for each transcript
df_new['summary_word_count'] = df_new['summary'].apply(lambda x: len(x.split()))
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['summary_word_count'] = df_new['summary'].apply(lambda x: len(x.split()))


Unnamed: 0,q,ticker,transcript,word_count,summary,summary_word_count
15,2020-Q4,TSM,Prepared Remarks:\nJeff Su -- Director of Inve...,12478,In a conference call hosted by TSMC for their ...,439
136,2023-Q1,COHR,Prepared Remarks:\nOperator\nLadies and gentle...,11602,Prepared Remarks:\n\nDuring Coherent Corp.'s F...,358
153,2022-Q3,TSM,"Prepared Remarks:\nJeff Su\nGood afternoon, ev...",10069,**Prepared Remarks:**\n\n**Jeff Su:**\nGood af...,451
225,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...,11095,Operator:\nWelcome to MACOM's Fourth Fiscal Qu...,402
506,2021-Q4,MTSI,Prepared Remarks:\nOperator\nWelcome to MACOM'...,11095,Operator\nWelcome to MACOM's Fiscal Q4 2021 Co...,360
511,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha...",10960,Operator:\nWelcome to Coherent Corp.'s FY '23 ...,486
591,2019-Q4,MTSI,"Prepared Remarks:\nOperator\nGood afternoon, a...",7136,Operator opened the conference call for MACOM'...,377
596,2023-Q2,COHR,"Prepared Remarks:\nOperator\nGood day, and tha...",10960,"Mary Jane Raymond, CFO of Coherent Corp., open...",335
620,2020-Q2,SWKS,Prepared Remarks:\nOperator\nGood afternoon an...,8089,Operator:\nThe earnings call for Skyworks Solu...,406
924,2022-Q1,SWKS,"Prepared Remarks:\nOperator\nGood afternoon, a...",7562,"**Operator**: Good afternoon, welcome to Skywo...",452


In [97]:
def build_knowledge_graph(transcript_summary):
    '''Extract knowledge graph from summarized text using schema'''
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": '''You are a knowledge graph builder, extract nodes and edges for a knowledge graph from the following text.
            You are to output relations between two objects in the form (object_1, relation, object_2). 
            All information about dates must be included.
            Example Input: John bought a laptop
            Example Output: [('John', 'bought', 'laptop')]
            Example Input: John built a house in 2019
            Example Output: [('John', 'built', 'house'), ('house', 'built in', '2019')]
            The final output should be in JSON as follows: {"List of triplets": "List of triplets of the form (object_1, relation, object_2), type: list"}'''},
            {"role": "user", "content": f"Here's the text: {transcript_summary}"}
        ]
    )

    answer = json.loads(completion.choices[0].message.content) if completion.choices else "No response"

    return answer

In [138]:
def parse_knowledge_graph(kg, question):
    '''Parse knowledge graph to extract relevant relations'''
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": f'''You are a knowledge graph parser for the following knowledge graph {kg}. 
            Only output the triplets that are relevant to the question.
            The final output should be in JSON as follows: {"Parsed Knowledge Graph": "List of triplets of the form (object1, relation, object2), type: list"}'''},
            {"role": "user", "content": f"Here's the question: {question}"}
        ]
    )

    answer = json.loads(completion.choices[0].message.content) if completion.choices else "No response"

    return answer