# Creating Knowledge graphs from Pdf files

In [11]:
from langchain.graphs import Neo4jGraph

url = ""
username =""
password = ""
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [12]:
import os
from dotenv import load_dotenv 

load_dotenv()

# Add OpenAI key
#os.environ['OPENAI_API_KEY']=''
api_key = os.environ['OPENAI_API_KEY']


In [13]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# class Source(BaseSource):
#     properties: Optional[List[Property]] = Field(
#         None, description="List of sources"
#     )        
        
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [14]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

# Using GPT-3.5-turbo-16k model to create the knowledge graph

In [15]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# os.environ["OPENAI_API_KEY"] = "sk-"
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [16]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
    # Store information into a graph
    print(graph_document)
    graph.add_graph_documents([graph_document])
    return graph_document

# With 200 chunk size

In [23]:

from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime

loader = PyPDFLoader("../data/Apple stock during pandemic.pdf")
start_time = datetime.now()
pages = loader.load_and_split()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

In [24]:
from tqdm import tqdm

distinct_nodes = set()
nodes = []
relations = []

for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    graph_document=extract_and_store_graph(chunk)
    
    #Get distinct nodes
    for node in graph_document.nodes :
        nodes.append(node.id)
        distinct_nodes.add(node.id)
        
    
    #Get all relations   
    for relation in graph_document.relationships :
        relations.append(relation.type)
        
print("nodes list :", len(nodes))
print("Distinct nodes :",len(distinct_nodes))
print("Relationships :",len(relations))


  0%|          | 0/23 [00:00<?, ?it/s]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Stockprice', type='Concept', properties={'name': 'Stockprice'}), Node(id='Epidemic', type='Event', properties={'name': 'Epidemic'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Stockprice', type='Concept', properties={'name': 'Stockprice'}), type='has'), Relationship(source=Node(id='Stockprice', type='Concept', properties={'name': 'Stockprice'}), target=Node(id='Epidemic', type='Event', properties={'name': 'Epidemic'}), type='during')] source=Document(page_content="The Changes of Apple’s Stock Price During the \nPandemic  \nWanting Sun1,* \n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000  \n*Corresponding author. Email:  wendysun040718@163.com  \nABSTRACT  \nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \nfluctuations and impacts. Therefore, this arti

  4%|▍         | 1/23 [00:04<01:46,  4.85s/it]

nodes=[Node(id='Economy', type='Concept', properties={'name': 'Economy'}), Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Stock Price', type='Concept', properties={'name': 'Stock Price'}), Node(id='Covid-19', type='Concept', properties={'name': 'Covid-19'})] relationships=[Relationship(source=Node(id='Economy', type='Concept', properties={'name': 'Economy'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='has been damaged'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Stock Price', type='Concept', properties={'name': 'Stock Price'}), type='fell sharply'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Covid-19', type='Concept', properties={'name': 'Covid-19'}), type='impacted'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Stock Price', type='Concept', properties={'name

  9%|▊         | 2/23 [00:13<02:34,  7.37s/it]

nodes=[Node(id='Epidemic', type='Event', properties={'name': 'Epidemic'}), Node(id='2020', type='Year', properties={'name': '2020'}), Node(id='Countries', type='Group', properties={'name': 'Countries'}), Node(id='Companies', type='Group', properties={'name': 'Companies'}), Node(id='Population Decline', type='Concept', properties={'name': 'Population Decline'}), Node(id='Economic Losses', type='Concept', properties={'name': 'Economic Losses'}), Node(id='National Unrest', type='Concept', properties={'name': 'National Unrest'}), Node(id='U.S. Economic Recession', type='Event', properties={'name': 'U.S. Economic Recession'}), Node(id='Euro Zone Economy', type='Concept', properties={'name': 'Euro Zone Economy'}), Node(id='British Economy', type='Concept', properties={'name': 'British Economy'}), Node(id='World War Ii', type='Event', properties={'name': 'World War Ii'}), Node(id="China'S Transportation", type='Concept', properties={'name': "China'S Transportation"}), Node(id='Tourism', type=

 13%|█▎        | 3/23 [00:38<05:05, 15.29s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'specializesin': 'consumer electronics, computer software, online services', 'headquarters': 'Cupertino, California', 'founded': 'April 1, 1976', 'wentpublic': 'December 12, 1980', 'marketvaluerecord': '$623.5 billion in 2012', 'largestcompanybymarketcap': 'Yes'})] relationships=[] source=Document(page_content='dominate the world been hit? Let\'s take the most \nfamilia r Apple company as an example, and analyze \nhow much impact this devastating epidemic has brought \nto Apple’s stocks in several aspects . \n \n 2. BACKGROUND INFORM ATION OF \nAPPLE COMPANY  \nApple Inc. is an American multinational technology \ncompany that specializes in consumer electronics, \ncomputer software, and online services. In it’s \nheadquartered in Cupertino, California. Apple went \npublic on December 12, 1980, and set a record of \n$623.5 billion in market value in 2012. As of June \n2014, Apple has become the world \'s largest compan

 17%|█▋        | 4/23 [00:43<03:29, 11.04s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'marketvalue': 'US$2 trillion', 'rank': '6th', 'revenue': 'largest technology company', 'pcvendorrank': '4th', 'smartphonemanufacturerrank': '4th'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Coca-Cola', type='Company', properties={'name': 'Coca-Cola'}), type='surpassed', properties={'year': '2014'}), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Google', type='Company', properties={'name': 'Google'}), type='surpassed', properties={'year': '2014'}), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='marketValueExceeded', properties={'date': 'August 19, 2020', 'value': 'US$2 trillion'})] source=Document(page_content=' Group\'s "Best \nGlobal Brand" report, Apple surpassed Coca -Cola to 

 22%|██▏       | 5/23 [00:52<03:04, 10.25s/it]

nodes=[Node(id='Cultural Industry (Icemci 2021)', type='Event', properties={'name': 'Cultural Industry (Icemci 2021)'}), Node(id='The Authors', type='Organization', properties={'name': 'The Authors'}), Node(id='Atlantis Press International B.V.', type='Organization', properties={'name': 'Atlantis Press International B.V.'})] relationships=[Relationship(source=Node(id='Cultural Industry (Icemci 2021)', type='Event', properties={'name': 'Cultural Industry (Icemci 2021)'}), target=Node(id='The Authors', type='Organization', properties={'name': 'The Authors'}), type='publishedBy'), Relationship(source=Node(id='Cultural Industry (Icemci 2021)', type='Event', properties={'name': 'Cultural Industry (Icemci 2021)'}), target=Node(id='Atlantis Press International B.V.', type='Organization', properties={'name': 'Atlantis Press International B.V.'}), type='publishedBy')] source=Document(page_content='\nCultural Industry (ICEMCI 2021)\nCopyright © 2021 The Authors. Published by Atlantis Press Inter

 26%|██▌       | 6/23 [00:57<02:24,  8.49s/it]

nodes=[Node(id='Macroperspective', type='Concept', properties={'description': 'From a macro perspective, the decline in population, the decrease in spending, and the increase in inventories have caused many industries to stagnate. This is bound to bring a huge blow on the global economy to a serious extent.', 'name': 'Macroperspective'}), Node(id='Globaleconomy', type='Concept', properties={'description': 'The global economy is expected to shrink by 3% in 2020 due to the new crown epidemic. The impact will be much more severe than the 2008-2009 financial crisis.', 'name': 'Globaleconomy'}), Node(id='Imf', type='Organization', properties={'description': 'The International Monetary Fund', 'name': 'Imf'}), Node(id='April2020Globaleconomicsurvey', type='Publication', properties={'title': 'April 2020 Global Economic Survey', 'name': 'April2020Globaleconomicsurvey'}), Node(id='Economiccontraction', type='Concept', properties={'description': 'The impact of the epidemic crisis on economic cont

 30%|███       | 7/23 [01:14<03:01, 11.36s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='China', type='Country', properties={'name': 'China'}), Node(id='Wall Street Journal', type='Publication', properties={'name': 'Wall Street Journal'}), Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='China', type='Country', properties={'name': 'China'}), type='production_location'), Relationship(source=Node(id='China', type='Country', properties={'name': 'China'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='affected_by_epidemic'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Wall Street Journal', type='Publication', properties={'name': 'Wall Street Journal'}), type='reported_by', properties={'date': 'March 24, 2020'}), Relationship(source=Node(id='Apple', type='Company', properties={'

 35%|███▍      | 8/23 [01:24<02:44, 10.98s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='February 3, 2020', type='Date', properties={'openingprice': '76.07', 'closingprice': '80.01', 'name': 'February 3, 2020'}), Node(id='March 23, 2020', type='Date', properties={'openingprice': '57.02', 'closingprice': '56.09', 'name': 'March 23, 2020'}), Node(id='September 1', type='Date', properties={'openingprice': '132.76', 'closingprice': '134.18', 'name': 'September 1'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='February 3, 2020', type='Date', properties={'name': 'February 3, 2020'}), type='priceChange'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='March 23, 2020', type='Date', properties={'name': 'March 23, 2020'}), type='priceChange'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='September 1', type='Date', properties={'n

 39%|███▉      | 9/23 [01:34<02:27, 10.53s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='September 18, 2020', type='Date', properties={'name': 'September 18, 2020'}), type='stock_price', properties={'opening_price': '110.40', 'closing_price': '106.84'})] source=Document(page_content=" \n134.18. However, after September 1, the price started to \ndrop again. There is a ver y low price on September 18, \n2020. The opening price is 110.40, the closing price is \n106.84.  \n \nFigure 1 Apple's stock price  \n \n \nAdvances in Economics, Business and Management Research, volume 203\n374", metadata={'source': '../data/Apple stock during pandemic.pdf', 'page': 1})


 43%|████▎     | 10/23 [01:38<01:52,  8.64s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Stock', type='Concept', properties={'name': 'Stock'}), Node(id='Announcement', type='Event', properties={'name': 'Announcement'}), Node(id='New Apple Products', type='Product', properties={'name': 'New Apple Products'}), Node(id='Iphone 12', type='Product', properties={'name': 'Iphone 12'}), Node(id='Iphone 12 Pro', type='Product', properties={'name': 'Iphone 12 Pro'}), Node(id='Iphone 12 Pro Max', type='Product', properties={'name': 'Iphone 12 Pro Max'}), Node(id='October 14, 2020', type='Date', properties={'name': 'October 14, 2020'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Stock', type='Concept', properties={'name': 'Stock'}), type='has'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Announcement', type='Event', properties={'name': 'Announcement'}), type='caused'), Relationsh

 48%|████▊     | 11/23 [01:55<02:15, 11.26s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='November 2', type='Date', properties={'name': 'November 2'}), Node(id='December 28', type='Date', properties={'name': 'December 28'}), Node(id='2020', type='Year', properties={'name': '2020'}), Node(id='2021', type='Year', properties={'name': '2021'}), Node(id='February 3', type='Date', properties={'name': 'February 3'}), Node(id='January 26', type='Date', properties={'name': 'January 26'})] relationships=[Relationship(source=Node(id='Buy These Products', type='Action', properties={'name': 'Buy These Products'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='caused'), Relationship(source=Node(id='November 2', type='Date', properties={'name': 'November 2'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='drop', properties={'openingPrice': '109.11', 'closingPrice': '108.77'}), Relationship(source=Node(id='December 28', type='Date', properties={'name

 52%|█████▏    | 12/23 [02:09<02:11, 12.00s/it]

nodes=[Node(id='January 26, 2021', type='Date', properties={'day': '26', 'month': 'January', 'year': '2021', 'name': 'January 26, 2021'}), Node(id='February 3, 2021', type='Date', properties={'day': '3', 'month': 'February', 'year': '2021', 'name': 'February 3, 2021'}), Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), Node(id='Bill Gates', type='Person', properties={'name': 'Bill Gates'}), Node(id='Paul Allen', type='Person', properties={'name': 'Paul Allen'})] relationships=[Relationship(source=Node(id='January 26, 2021', type='Date', properties={'name': 'January 26, 2021'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='stock_price', properties={'opening_price': '143.6', 'closing_price': '143.16'}), Relationship(source=Node(id='February 3, 2021', type='Date', properties={'name': 'February 3, 2021'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}

 57%|█████▋    | 13/23 [02:23<02:05, 12.57s/it]

nodes=[Node(id='Seattle', type='City', properties={'name': 'Seattle'}), Node(id='Microsoft', type='Company', properties={'famousproducts': 'Windows operating system, Office series software', 'marketvalue': '$2 trillion (as of June 23, 2021)', 'patents': 'More than Apple', 'r&dRatio': 'Higher than Apple', 'r&dInvestment': 'Higher than Apple', 'name': 'Microsoft'}), Node(id='Apple', type='Company', properties={'profit': 'Not specified', 'name': 'Apple'})] relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Seattle', type='City', properties={'name': 'Seattle'}), type='headquarteredIn'), Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Seattle', type='City', properties={'name': 'Seattle'}), type='focusesOn', properties={'areas': 'R&D, manufacturing, licensing, computer software services'}), Relationship(source=Node(id='Microsoft', type='Company', properties={'na

 61%|██████    | 14/23 [02:32<01:44, 11.62s/it]

nodes=[Node(id='Environment', type='Concept', properties={'name': 'Environment'}), Node(id='R&D Ratio', type='Concept', properties={'name': 'R&D Ratio'}), Node(id='R&D Investment', type='Concept', properties={'name': 'R&D Investment'}), Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), Node(id='Profit', type='Concept', properties={'name': 'Profit'}), Node(id='Profit Margin', type='Concept', properties={'name': 'Profit Margin'}), Node(id='Revenue', type='Concept', properties={'name': 'Revenue'}), Node(id='Market Value', type='Concept', properties={'name': 'Market Value'}), Node(id='Enterprise-Level Market', type='Concept', properties={'name': 'Enterprise-Level Market'}), Node(id='Customers', type='Concept', properties={'name': 'Customers'}), Node(id='Epidemic', type='Concept', properties={'name': 'Epidemic'}), Node(id='Employees', type='Concept', properties={'name': 'Employees'}), Node(id='Stock Price'

 65%|██████▌   | 15/23 [03:02<02:17, 17.17s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), Node(id='Monday', type='Date', properties={'name': 'Monday'}), Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), Node(id='February 10, 2020', type='Date', properties={'name': 'February 10, 2020'}), Node(id='March 23', type='Date', properties={'name': 'March 23'}), Node(id='March 16', type='Date', properties={'name': 'March 16'})] relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Monday', type='Date', properties={'name': 'Monday'}), type='stock_price_change', properties={'percentage_change': '-14.74%'}), Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), type='stock_price_change', properties={'percentage_change': '-15.6%'}), Relationship(source=Node(id='Microsoft', type='Company', pro

 70%|██████▉   | 16/23 [03:15<01:51, 15.86s/it]

nodes=[Node(id='Apple', type='Company', properties={'marketvalue': 'higher than Microsoft', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='volumeComparison', properties={'relation': 'higher than'})] source=Document(page_content=" but their volume is quite \nsimilar, even Apple’s volumes are higher than \nMicrosoft. It is because Apple's market value is higher \nthan Microsoft.  \n6. OVERALL TREND  \nFor Apple, th e epidemic only affected it at the \nbeginning, but it took only four months for Apple to \nreturn to its previous normal stock price. It can be seen \nthat the true hidden strength behind Apple can enable \nApple. Recovered in such a short time. The overall \nsitua tion of Apple's stock price continued to rise. It only \nfell sharply between February a

 74%|███████▍  | 17/23 [03:19<01:13, 12.20s/it]

nodes=[Node(id='Apple', type='Company', properties={'stockincrease': '67.4%', 'highestvalue': 'December', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'highestvalue': 'September', 'name': 'Microsoft'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='similarTo')] source=Document(page_content=" then fell back to normal levels. There was also \na peak in early 2021, and then it returned to normal \nlevels. The overall level of Apple's stock increased by \n67.4% over the same period.  \nFor Microsoft, his situation is actually similar to that \nof the same type of Apple. At the beginning of the \nepidemic, there was a huge decline from February to \nMarch, but it was basically fully restored to the \nprevious level before June. Like Apple, it took just four \nmonths to solve the severe damage caused by the \nepidemic, which shows th

 78%|███████▊  | 18/23 [03:23<00:48,  9.74s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), Node(id='September', type='Date', properties={'name': 'September'}), Node(id='Stock', type='Concept', properties={'name': 'Stock'}), Node(id='Apple', type='Company', properties={'name': 'Apple'})] relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Stock', type='Concept', properties={'name': 'Stock'}), type='has'), Relationship(source=Node(id='September', type='Date', properties={'name': 'September'}), target=Node(id='Stock', type='Concept', properties={'name': 'Stock'}), type='highest value in'), Relationship(source=Node(id='Stock', type='Concept', properties={'name': 'Stock'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='compared to'), Relationship(source=Node(id='Stock', type='Concept', properties={'name': 'Stock'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='co

 83%|████████▎ | 19/23 [03:36<00:43, 10.87s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'})] relationships=[] source=Document(page_content=" Apple. Instead, it only took it four \nmonths to restore its stock to its original value, or even \nrise. This is enough to show that Apple's foundation is \nstable and there are excellent strategies to enable them \nto recover their losses in time when they faced strong \nimpacts. In general, Apple's stock only declined during \nthe most severe period of the pandemic, and remained \nat a stable value for the rest of the time, and there was \nalso a phenomenon of continuous ri se. \nREFERENCES  \n[1] Yahoo is now a part of Verizon Media. (n.d.). \nYahoo. Retrieved August 29, 2021, from \nhttps://finance.yahoo.com/quote/AAPL/chart? \\ \n[2] Ziemba, W. T. (2020, June 21). The COVID -19 \nCrash in", metadata={'source': '../data/Apple stock during pandemic.pdf', 'page': 3})


 87%|████████▋ | 20/23 [03:38<00:24,  8.14s/it]

nodes=[Node(id='W. T.', type='Author', properties={'name': 'W. T.'}), Node(id='Ziemba', type='Journal', properties={'title': 'The COVID-19 Crash in the US Stock Market', 'url': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3632410', 'name': 'Ziemba'}), Node(id='B. Yan', type='Author', properties={'name': 'B. Yan'}), Node(id='Yan', type='Journal', properties={'title': 'Analysis of the Effect of COVID-19 on the Stock Market and Investing Strategies', 'url': 'https://papers.ssrn.com/sol3/Papers.cfm?abstract_id=3563380', 'name': 'Yan'}), Node(id='J. A. C. Camba', type='Author', properties={'name': 'J. A. C. Camba'}), Node(id='The Journal Of Asian Finance', type='Journal', properties={'title': 'The Effects of Restrictions in Economic Activity on the Spread of COVID-19 in the Philippines: Insights from Apple and Google Mobility Indicators', 'name': 'The Journal Of Asian Finance'})] relationships=[] source=Document(page_content=', W. T. (2020, June 21). The COVID -19 \nCrash in the US 

 91%|█████████▏| 21/23 [03:45<00:15,  7.87s/it]

nodes=[Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Topic', properties={'name': 'Economic Activity On The Spread Of Covid-19 In The Philippines'}), Node(id='Insights From Apple And Google Mobility Indicators', type='Article', properties={'source': 'The Journal of Asian Finance, Economics and Business | Korea Science', 'link': 'https://www.koreascience.or.kr/article/JAKO202034651879125.page', 'name': 'Insights From Apple And Google Mobility Indicators'}), Node(id='Comparing Apple To Amazon: Just A Matter Of Words', type='Article', properties={'author': 'Shivaani, M. V.', 'source': 'Journal of Emerging Technologies in Accounting', 'link': 'https://meridian.allenpress.com/jeta/article-abstract/doi/10.2308/JETA-2020-045/464597/Comparing-Apple-to-Amazon-Just-a-Matter-of-Words', 'name': 'Comparing Apple To Amazon: Just A Matter Of Words'})] relationships=[Relationship(source=Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Top

 96%|█████████▌| 22/23 [03:57<00:09,  9.00s/it]

nodes=[Node(id='Apple', type='Company', properties={'ticker': 'AAPL', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'ticker': 'MSFT', 'name': 'Microsoft'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='competitor')] source=Document(page_content='aring -Apple -to-Amazon -Just-a-\nMatter -of-Words  \n[6] Apple (AAPL) Vs Microsoft (MSFT): Which Is A \nBetter Tech Stock To Buy Right Now?, 28 Jan. \n2021, www.nasdaq.com/articles/apple -aapl-vs-\nmicrosoft -msft%3A -which -is-a-better -tech-stock-\nto-buy-right -now-2021 -01-28. \n[7] Osipovich Alexander. “Apple’s market capitalisation \nhas fallen below trillion, and Microsoft is the only \nremaining U.S. company in the trillion -dollar \nclub. ” Wall Street News, 24 Mar. 2020, \ncn.wsj.com/articles /  \nAdvances in Economics, Business and Management Research, volume 203\n376', meta

100%|██████████| 23/23 [04:01<00:00, 10.52s/it]

nodes list : 111
Distinct nodes : 80
Relationships : 113





# With 500 chunk size

In [17]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime
from tqdm import tqdm

loader = PyPDFLoader("../data/Apple stock during pandemic.pdf")
start_time = datetime.now()
pages = loader.load_and_split()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

distinct_nodes = set()
nodes = []
relations = []

for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    graph_document=extract_and_store_graph(chunk)
    
    #Get distinct nodes
    for node in graph_document.nodes :
        nodes.append(node.id)
        distinct_nodes.add(node.id)
         
    #Get all relations   
    for relation in graph_document.relationships :
        relations.append(relation.type)
        
print("nodes list :", len(nodes))
print("Distinct nodes :",len(distinct_nodes))
print("Relationships :",len(relations))        

  0%|          | 0/10 [00:00<?, ?it/s]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Covid-19', type='Event', properties={'name': 'Covid-19'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Covid-19', type='Event', properties={'name': 'Covid-19'}), type='stock_price_changes', properties={'description': "During the pandemic, Apple's stock price experienced fluctuations."})] source=Document(page_content="The Changes of Apple’s Stock Price During the \nPandemic  \nWanting Sun1,* \n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000  \n*Corresponding author. Email:  wendysun040718@163.com  \nABSTRACT  \nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \nfluctuations and impacts. Therefore, this article will explore the ups and downs of Apple's stock price duri ng the \nepidemic. At different points in time and severity, Apple's stock price has ris

 10%|█         | 1/10 [00:04<00:44,  4.93s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'specializesin': 'consumer electronics, computer software, online services', 'headquarters': 'Cupertino, California', 'founded': 'April 1, 1976', 'marketvalue': '$623.5 billion (2012)', 'largestcompanybymarketcap': 'Yes', 'mostvaluablebrand': 'Yes', 'marketvalue2020': '$2 trillion', 'fortuneglobal500rank': '6th (2021)', 'largesttechcompanybyrevenue': 'Yes', 'largestpcvendor': '4th (2021)', 'largestsmartphonemanufacturer': '4th (2021)'})] relationships=[] source=Document(page_content=' is the main source of power to \npromote world gr owth, once the Chinese economy is \naffected, the world economy will inevitably be affected. \nThus, in such an environment where the whole world is \naffected, to what extent have the industry giants that \ndominate the world been hit? Let\'s take the most \nfamilia r Apple company as an example, and analyze \nhow much impact this devastating epidemic has brought \nto Apple’s stocks in s

 20%|██        | 2/10 [00:13<00:54,  6.81s/it]

nodes=[Node(id='Macro_Perspective', type='Perspective', properties={'description': 'From a macro perspective, the decline in population, the decrease in spending, and the increase in inventories have caused many industries to stagnate. This is bound to bring a huge blow on the global economy to a serious extent.', 'name': 'Macro_Perspective'}), Node(id='Global_Economy', type='Concept', properties={'description': 'The global economy is expected to shrink by 3% in 2020 due to the new crown epidemic. The impact will be much more severe than the 2008-2009 financial crisis.', 'name': 'Global_Economy'}), Node(id='International_Monetary_Fund', type='Organization', properties={'description': "The International Monetary Fund stated in the 'April 2020 Global Economic Survey' that the global economy is expected to shrink by 3% in 2020 due to the new crown epidemic.", 'name': 'International_Monetary_Fund'}), Node(id='Epidemic_Impact', type='Concept', properties={'description': 'The impact of the e

 30%|███       | 3/10 [00:32<01:29, 12.73s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='09', type='Date', properties={'name': '09'}), Node(id='September 1', type='Date', properties={'name': 'September 1'}), Node(id='September 18, 2020', type='Date', properties={'name': 'September 18, 2020'})] relationships=[Relationship(source=Node(id='09', type='Date', properties={'name': '09'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='lowest_price_during_epidemic'), Relationship(source=Node(id='September 1', type='Date', properties={'name': 'September 1'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='peak', properties={'opening_price': '132.76', 'closing_price': '134.18'}), Relationship(source=Node(id='September 18, 2020', type='Date', properties={'name': 'September 18, 2020'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='very_low_price', properties={'opening_price': '110.40', 'closing_price': '106.84'})] s

 40%|████      | 4/10 [00:39<01:02, 10.50s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Iphone 12', type='Product', properties={'name': 'Iphone 12'}), Node(id='Iphone 12 Pro', type='Product', properties={'name': 'Iphone 12 Pro'}), Node(id='Iphone 12 Pro Max', type='Product', properties={'name': 'Iphone 12 Pro Max'})] relationships=[Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Iphone 12', type='Product', properties={'name': 'Iphone 12'}), type='announces'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Iphone 12 Pro', type='Product', properties={'name': 'Iphone 12 Pro'}), type='announces'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Iphone 12 Pro Max', type='Product', properties={'name': 'Iphone 12 Pro Max'}), type='announces'), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Octo

 50%|█████     | 5/10 [01:00<01:10, 14.19s/it]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'foundedby': 'Bill Gates, Paul Allen', 'foundedon': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'focus': 'R&D, manufacturing, licensing, computer software services', 'famousproducts': 'Windows operating system, Office series software', 'marketvalue': '$2 trillion (as of June 23, 2021)', 'patents': 'More than Apple', 'r&dratio': 'Higher than Apple', 'r&dinvestment': 'Higher than Apple', 'name': 'Microsoft'})] relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='competitor')] source=Document(page_content="ANY  \nAnother very famous company which can compete \nwith Apple is Microsoft Company. Microsof t is an \nAmerican multinational technology company founded \nby Bill Gates and Paul Allen on April 4, 1975. The \ncompany is headq

 60%|██████    | 6/10 [01:06<00:45, 11.44s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'stockpriceplummeted': '14.74%', 'largestsingledaydrop': 'April 24, 2000', 'stockpricefallen': '27%', 'name': 'Microsoft'}), Node(id='Monday', type='Day', properties={'name': 'Monday'}), Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), Node(id='February 10, 2020', type='Date', properties={'name': 'February 10, 2020'}), Node(id='March 23', type='Date', properties={'name': 'March 23'}), Node(id='March 16', type='Date', properties={'openingprice': '140.0', 'closingprice': '135.42', 'name': 'March 16'}), Node(id='Apple', type='Company', properties={'lowestprice': 'March 23', 'volume': 'higher than Microsoft', 'marketvalue': 'higher than Microsoft', 'name': 'Apple'}), Node(id="Microsoft'S Stock", type='Stock', properties={'name': "Microsoft'S Stock"}), Node(id="Apple'S Stock", type='Stock', properties={'name': "Apple'S Stock"}), Node(id='Epidemic', type='Event', properties={'name': 'Epidemic'}), Node(id='F

 70%|███████   | 7/10 [01:35<00:50, 16.96s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), Node(id='Apple', type='Company', properties={'name': 'Apple'})] relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), target=Node(id='Apple', type='Company', properties={'name': 'Apple'}), type='comparison', properties={'observation': "Microsoft's stock price is much higher than Apple's."}), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='comparison', properties={'observation': "Apple's number of transactions is many times greater than Microsoft's."}), Relationship(source=Node(id='Apple', type='Company', properties={'name': 'Apple'}), target=Node(id='Microsoft', type='Company', properties={'name': 'Microsoft'}), type='comparison', properties={'observation': 'The total trading volume of Apple stock and Microsoft stock is similar.'})] sour

 80%|████████  | 8/10 [01:42<00:27, 13.82s/it]

nodes=[Node(id='Yan', type='Person', properties={'name': 'Yan'}), Node(id='Covid-19', type='Virus', properties={'name': 'Covid-19'}), Node(id='Stock Market', type='Concept', properties={'name': 'Stock Market'}), Node(id='Investing Strategies', type='Concept', properties={'name': 'Investing Strategies'}), Node(id='Camba', type='Person', properties={'name': 'Camba'}), Node(id='Philippines', type='Country', properties={'name': 'Philippines'}), Node(id='Apple', type='Company', properties={'name': 'Apple'}), Node(id='Google', type='Company', properties={'name': 'Google'}), Node(id='Journal Of Asian Finance', type='Journal', properties={'name': 'Journal Of Asian Finance'})] relationships=[Relationship(source=Node(id='Yan', type='Person', properties={'name': 'Yan'}), target=Node(id='Covid-19', type='Virus', properties={'name': 'Covid-19'}), type='research', properties={'title': 'Analysis of the Effect of COVID-19 on the Stock Market and Investing Strategies', 'date': '2020-03-28', 'source': '

 90%|█████████ | 9/10 [01:52<00:12, 12.73s/it]

nodes=[Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Topic', properties={'name': 'Economic Activity On The Spread Of Covid-19 In The Philippines'}), Node(id='Insights From Apple And Google Mobility Indicators', type='Article', properties={'author': 'Shivaani, M. V.', 'publication': 'The Journal of Asian Finance, Economics and Business | Korea Science', 'url': 'https://www.koreascience.or.kr/article/JAKO202034651879125.page', 'name': 'Insights From Apple And Google Mobility Indicators'}), Node(id='Comparing Apple To Amazon: Just A Matter Of Words', type='Article', properties={'author': 'Shivaani, M. V.', 'publication': 'Journal of Emerging Technologies in Accounting', 'url': 'https://meridian.allenpress.com/jeta/article-abstract/doi/10.2308/JETA-2020-045/464597/Comparing-Apple-to-Amazon-Just-a-Matter-of-Words', 'name': 'Comparing Apple To Amazon: Just A Matter Of Words'}), Node(id='Apple (Aapl)', type='Company', properties={'name': 'Apple (Aapl)'}), Nod

100%|██████████| 10/10 [02:02<00:00, 12.21s/it]

nodes list : 52
Distinct nodes : 41
Relationships : 54





# With 1000 chunk size

In [18]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime

loader = PyPDFLoader("../data/Apple stock during pandemic.pdf")
start_time = datetime.now()
pages = loader.load_and_split()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

distinct_nodes = set()
nodes = []
relations = []

for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    graph_document=extract_and_store_graph(chunk)
    
    #Get distinct nodes
    for node in graph_document.nodes :
        nodes.append(node.id)
        distinct_nodes.add(node.id)
         
    #Get all relations   
    for relation in graph_document.relationships :
        relations.append(relation.type)

print("nodes list :", len(nodes))
print("Distinct nodes :",len(distinct_nodes))
print("Relationships :",len(relations))        

  0%|          | 0/6 [00:00<?, ?it/s]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'location': 'Cupertino, California', 'founded': 'April 1, 1976', 'industry': 'Technology', 'revenue': 'USD $274.52 billion (2020)', 'marketcap': 'USD $2.43 trillion (2021)'})] relationships=[] source=Document(page_content='The Changes of Apple’s Stock Price During the \nPandemic  \nWanting Sun1,* \n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000  \n*Corresponding author. Email:  wendysun040718@163.com  \nABSTRACT  \nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \nfluctuations and impacts. Therefore, this article will explore the ups and downs of Apple\'s stock price duri ng the \nepidemic. At different points in time and severity, Apple\'s stock price has risen and fallen. By observing the data on \nYahoo, we can summarize Apple\'s stock price changes during this period. At the beginning of the epidemic, the global \neconomy has b 

 17%|█▋        | 1/6 [00:04<00:24,  4.93s/it]

nodes=[Node(id='Macro_Perspective', type='Perspective', properties={'description': 'From a macro perspective, the decline in population, the decrease in spending, and the increase in inventories have caused many industries to stagnate. This is bound to bring a huge blow on the global economy to a serious extent.', 'name': 'Macro_Perspective'}), Node(id='Global_Economy', type='Concept', properties={'description': 'The global economy is expected to shrink by 3% in 2020 due to the new crown epidemic, with a more severe impact than the 2008-2009 financial crisis.', 'name': 'Global_Economy'}), Node(id='Economic_Survey', type='Publication', properties={'title': 'April 2020 Global Economic Survey', 'name': 'Economic_Survey'}), Node(id='Epidemic_Impact', type='Concept', properties={'description': 'The impact of the epidemic crisis on economic contraction is predicted to be three times that of the financial crisis.', 'name': 'Epidemic_Impact'}), Node(id='Global_Recession', type='Event', propert

 33%|███▎      | 2/6 [00:27<01:01, 15.28s/it]

nodes=[Node(id='Apple', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Microsoft'})] relationships=[] source=Document(page_content="Figure 2.  Apple's stock price  \nThe reasons which caused this drop might be a lot, \nthe most common answer might be the delay of the \nannouncement of new Apple Products. A numbers of \npeople waiting on this day for a long time, because \nApple supposed to announce the new products like \niPhone  12, iPhone 12 Pro and iPhone 12 Pro Max, etc. \nNevertheless, Apple delayed the publish of their new \nproducts until October 14, 2020. There is a small peak \nof rising stock price on October 12, which is two days \nbefore the publish of the new products. The in creasing \nof Apple stock might be

 50%|█████     | 3/6 [00:32<00:31, 10.41s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'stockpriceplummeted': '14.74%', 'largestsingledaydrop': 'April 24, 2000', 'stockpricefallen': '27%', 'name': 'Microsoft'}), Node(id='Monday', type='Day', properties={'name': 'Monday'}), Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), Node(id='February 10, 2020', type='Date', properties={'name': 'February 10, 2020'}), Node(id='March 23', type='Date', properties={'name': 'March 23'}), Node(id='March 16', type='Date', properties={'openingprice': '140.0', 'closingprice': '135.42', 'name': 'March 16'}), Node(id='Apple', type='Company', properties={'lowestpricedate': 'March 23', 'highestpricedate': 'December', 'overalltrendincrease': '67.4%', 'name': 'Apple'}), Node(id='August', type='Month', properties={'name': 'August'}), Node(id='November', type='Month', properties={'name': 'November'}), Node(id='2021', type='Year', properties={'name': '2021'}), Node(id='September', type='Month', properties={'name': 'S

 67%|██████▋   | 4/6 [01:09<00:42, 21.10s/it]

nodes=[Node(id='Covid-19', type='Concept', properties={'name': 'Covid-19'}), Node(id='Stock Market', type='Concept', properties={'name': 'Stock Market'}), Node(id='Investing Strategies', type='Concept', properties={'name': 'Investing Strategies'}), Node(id='Yan', type='Person', properties={'name': 'Yan'}), Node(id='Https://Papers.Ssrn.Com/Sol3/Papers.Cfm?Abstract_Id=3563380', type='Source', properties={'name': 'Https://Papers.Ssrn.Com/Sol3/Papers.Cfm?Abstract_Id=3563380'}), Node(id='Camba, J. A. C.', type='Person', properties={'name': 'Camba, J. A. C.'}), Node(id='The Journal Of Asian Finance', type='Journal', properties={'name': 'The Journal Of Asian Finance'})] relationships=[Relationship(source=Node(id='Covid-19', type='Concept', properties={'name': 'Covid-19'}), target=Node(id='Stock Market', type='Concept', properties={'name': 'Stock Market'}), type='effect'), Relationship(source=Node(id='Covid-19', type='Concept', properties={'name': 'Covid-19'}), target=Node(id='Investing Strate

 83%|████████▎ | 5/6 [01:20<00:17, 17.56s/it]

nodes=[Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Topic', properties={'name': 'Economic Activity On The Spread Of Covid-19 In The Philippines'}), Node(id='Insights From Apple And Google Mobility Indicators', type='Journal', properties={'source': 'The Journal of Asian Finance, Economics and Business | Korea Science', 'link': 'https://www.koreascience.or.kr/article/JAKO202034651879125.page', 'name': 'Insights From Apple And Google Mobility Indicators'}), Node(id='Comparing Apple To Amazon: Just A Matter Of Words', type='Article', properties={'author': 'Shivaani, M. V.', 'journal': 'Journal of Emerging Technologies in Accounting', 'source': 'Machine Learning World', 'link': 'https://meridian.allenpress.com/jeta/article-abstract/doi/10.2308/JETA-2020-045/464597/Comparing-Apple-to-Amazon-Just-a-Matter-of-Words', 'name': 'Comparing Apple To Amazon: Just A Matter Of Words'}), Node(id='Apple (Aapl)', type='Company', properties={'name': 'Apple (Aapl)'}), Nod

100%|██████████| 6/6 [01:40<00:00, 16.68s/it]

nodes list : 37
Distinct nodes : 33
Relationships : 37





# With 2000 chunk size

In [19]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime

loader = PyPDFLoader("../data/Apple stock during pandemic.pdf")
start_time = datetime.now()
pages = loader.load_and_split()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

distinct_nodes = set()
nodes = []
relations = []

for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    graph_document=extract_and_store_graph(chunk)
    
    #Get distinct nodes
    for node in graph_document.nodes :
        nodes.append(node.id)
        distinct_nodes.add(node.id)
         
    #Get all relations   
    for relation in graph_document.relationships :
        relations.append(relation.type)

print("nodes list :", len(nodes))
print("Distinct nodes :",len(distinct_nodes))
print("Relationships :",len(relations))        

  0%|          | 0/5 [00:00<?, ?it/s]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'location': 'Cupertino, California', 'founded': 'April 1, 1976', 'industry': 'Technology', 'revenue': 'USD $274.52 billion (2020)', 'marketcapitalization': 'USD $2.43 trillion (2021)'})] relationships=[] source=Document(page_content='The Changes of Apple’s Stock Price During the \nPandemic  \nWanting Sun1,* \n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000  \n*Corresponding author. Email:  wendysun040718@163.com  \nABSTRACT  \nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \nfluctuations and impacts. Therefore, this article will explore the ups and downs of Apple\'s stock price duri ng the \nepidemic. At different points in time and severity, Apple\'s stock price has risen and fallen. By observing the data on \nYahoo, we can summarize Apple\'s stock price changes during this period. At the beginning of the epidemic, the global \neco

 20%|██        | 1/5 [00:04<00:19,  4.97s/it]

nodes=[Node(id='Macroperspective', type='Perspective', properties={'description': 'From a macro perspective, the decline in population, the decrease in spending, and the increase in inventories have caused many industries to stagnate. This is bound to bring a huge blow on the global economy to a serious extent.', 'name': 'Macroperspective'}), Node(id='Globaleconomy', type='Concept', properties={'description': 'The global economy is expected to shrink by 3% in 2020 due to the new crown epidemic. The impact will be much more severe than the 2008-2009 financial crisis.', 'name': 'Globaleconomy'}), Node(id='Economicsurvey', type='Publication', properties={'title': 'April 2020 Global Economic Survey', 'name': 'Economicsurvey'}), Node(id='Experts', type='Group', properties={'description': 'Some experts have predicted that the impact of the epidemic crisis on economic contraction will be three times that of the financial crisis.', 'name': 'Experts'}), Node(id='Recession', type='Event', proper

 40%|████      | 2/5 [00:21<00:34, 11.62s/it]

nodes=[Node(id='Apple', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Microsoft'})] relationships=[] source=Document(page_content="Figure 2.  Apple's stock price  \nThe reasons which caused this drop might be a lot, \nthe most common answer might be the delay of the \nannouncement of new Apple Products. A numbers of \npeople waiting on this day for a long time, because \nApple supposed to announce the new products like \niPhone  12, iPhone 12 Pro and iPhone 12 Pro Max, etc. \nNevertheless, Apple delayed the publish of their new \nproducts until October 14, 2020. There is a small peak \nof rising stock price on October 12, which is two days \nbefore the publish of the new products. The in creasing \nof Apple stock might be

 60%|██████    | 3/5 [00:25<00:16,  8.14s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'stockpriceplummeted': '14.74%', 'largestsingledaydrop': 'April 24, 2000', 'stockpricefallen': '27%', 'name': 'Microsoft'}), Node(id='Monday', type='Day', properties={'name': 'Monday'}), Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), Node(id='February 10, 2020', type='Date', properties={'name': 'February 10, 2020'}), Node(id='March 23', type='Date', properties={'name': 'March 23'}), Node(id='March 16', type='Date', properties={'name': 'March 16'}), Node(id='140.0', type='Number', properties={'name': '140.0'}), Node(id='135.42', type='Number', properties={'name': '135.42'}), Node(id='Apple', type='Company', properties={'lowestpricedate': 'March 23', 'openingprice': '140.0', 'closingprice': '135.42', 'marketvalue': 'higher than Microsoft', 'name': 'Apple'}), Node(id='August', type='Month', properties={'name': 'August'}), Node(id='November', type='Month', properties={'name': 'November'}), Node(id='Earl

 80%|████████  | 4/5 [00:46<00:13, 13.45s/it]

nodes=[Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Topic', properties={'name': 'Economic Activity On The Spread Of Covid-19 In The Philippines'}), Node(id='Insights From Apple And Google Mobility Indicators', type='Article', properties={'source': 'The Journal of Asian Finance, Economics and Business | Korea Science', 'link': 'https://www.koreascience.or.kr/article/JAKO202034651879125.page', 'name': 'Insights From Apple And Google Mobility Indicators'}), Node(id='Comparing Apple To Amazon: Just A Matter Of Words', type='Article', properties={'author': 'Shivaani, M. V.', 'source': 'Journal of Emerging Technologies in Accounting', 'link': 'https://meridian.allenpress.com/jeta/article-abstract/doi/10.2308/JETA-2020-045/464597/Comparing-Apple-to-Amazon-Just-a-Matter-of-Words', 'name': 'Comparing Apple To Amazon: Just A Matter Of Words'}), Node(id='Apple (Aapl) Vs Microsoft (Msft): Which Is A Better Tech Stock To Buy Right Now?', type='Article', properties

100%|██████████| 5/5 [00:58<00:00, 11.64s/it]

nodes list : 30
Distinct nodes : 26
Relationships : 17





# With 5000 chunk size

In [20]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime

loader = PyPDFLoader("../data/Apple stock during pandemic.pdf")
start_time = datetime.now()
pages = loader.load_and_split()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

distinct_nodes = set()
nodes = []
relations = []

for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    graph_document=extract_and_store_graph(chunk)
    
    #Get distinct nodes
    for node in graph_document.nodes :
        nodes.append(node.id)
        distinct_nodes.add(node.id)
         
    #Get all relations   
    for relation in graph_document.relationships :
        relations.append(relation.type)

  0%|          | 0/5 [00:00<?, ?it/s]

nodes=[Node(id='Apple', type='Company', properties={'name': 'Apple', 'location': 'Cupertino, California', 'founded': 'April 1, 1976', 'industry': 'Technology', 'revenue': 'USD $274.52 billion (2020)', 'marketcapitalization': 'USD $2.43 trillion (2021)'})] relationships=[] source=Document(page_content='The Changes of Apple’s Stock Price During the \nPandemic  \nWanting Sun1,* \n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000  \n*Corresponding author. Email:  wendysun040718@163.com  \nABSTRACT  \nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \nfluctuations and impacts. Therefore, this article will explore the ups and downs of Apple\'s stock price duri ng the \nepidemic. At different points in time and severity, Apple\'s stock price has risen and fallen. By observing the data on \nYahoo, we can summarize Apple\'s stock price changes during this period. At the beginning of the epidemic, the global \neco

 20%|██        | 1/5 [00:05<00:21,  5.38s/it]

nodes=[Node(id='Macroperspective', type='Perspective', properties={'description': 'From a macro perspective, the decline in population, the decrease in spending, and the increase in inventories have caused many industries to stagnate. This is bound to bring a huge blow on the global economy to a serious extent.', 'name': 'Macroperspective'}), Node(id='Globaleconomy', type='Economy', properties={'impact': "The International Monetary Fund stated in the 'April 2020 Global Economic Survey' that due to the new crown epidemic, the global economy is expected to shrink by 3% in 2020, and the impact will be much more severe than the 2008-2009 financial crisis.", 'name': 'Globaleconomy'}), Node(id='Epidemicimpact', type='Impact', properties={'comparisontofinancialcrisis': 'Some experts have predicted that the impact of the epidemic crisis on economic contraction will be three times that of the financial crisis.', 'name': 'Epidemicimpact'}), Node(id='Globalrecession', type='Recession', properties

 40%|████      | 2/5 [00:20<00:32, 10.84s/it]

nodes=[Node(id='Apple', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Apple'}), Node(id='Microsoft', type='Company', properties={'founded': 'April 4, 1975', 'headquarters': 'Redmond, Washington', 'founders': 'Bill Gates, Paul Allen', 'name': 'Microsoft'})] relationships=[] source=Document(page_content="Figure 2.  Apple's stock price  \nThe reasons which caused this drop might be a lot, \nthe most common answer might be the delay of the \nannouncement of new Apple Products. A numbers of \npeople waiting on this day for a long time, because \nApple supposed to announce the new products like \niPhone  12, iPhone 12 Pro and iPhone 12 Pro Max, etc. \nNevertheless, Apple delayed the publish of their new \nproducts until October 14, 2020. There is a small peak \nof rising stock price on October 12, which is two days \nbefore the publish of the new products. The in creasing \nof Apple stock might be

 60%|██████    | 3/5 [00:24<00:15,  7.98s/it]

nodes=[Node(id='Microsoft', type='Company', properties={'stockpriceplummeted': '14.74%', 'largestsingledaydrop': 'April 24, 2000', 'stockpricefallen': '27%', 'name': 'Microsoft'}), Node(id='Monday', type='Day', properties={'name': 'Monday'}), Node(id='April 24, 2000', type='Date', properties={'name': 'April 24, 2000'}), Node(id='February 10, 2020', type='Date', properties={'name': 'February 10, 2020'}), Node(id='March 23', type='Date', properties={'name': 'March 23'}), Node(id='March 16', type='Date', properties={'name': 'March 16'}), Node(id='140.0', type='Price', properties={'name': '140.0'}), Node(id='135.42', type='Price', properties={'name': '135.42'}), Node(id='Apple', type='Company', properties={'lowestpricedate': 'March 23', 'openingprice': '140.0', 'closingprice': '135.42', 'marketvalue': 'higher than Microsoft', 'name': 'Apple'}), Node(id='August', type='Month', properties={'name': 'August'}), Node(id='November', type='Month', properties={'name': 'November'}), Node(id='Early 

 80%|████████  | 4/5 [00:44<00:12, 12.67s/it]

nodes=[Node(id='Economic Activity On The Spread Of Covid-19 In The Philippines', type='Topic', properties={'name': 'Economic Activity On The Spread Of Covid-19 In The Philippines'}), Node(id='Insights From Apple And Google Mobility Indicators', type='Article', properties={'source': 'The Journal of Asian Finance, Economics and Business | Korea Science', 'link': 'https://www.koreascience.or.kr/article/JAKO202034651879125.page', 'name': 'Insights From Apple And Google Mobility Indicators'}), Node(id='Comparing Apple To Amazon: Just A Matter Of Words', type='Article', properties={'author': 'Shivaani, M. V.', 'source': 'Journal of Emerging Technologies in Accounting', 'link': 'https://meridian.allenpress.com/jeta/article-abstract/doi/10.2308/JETA-2020-045/464597/Comparing-Apple-to-Amazon-Just-a-Matter-of-Words', 'name': 'Comparing Apple To Amazon: Just A Matter Of Words'}), Node(id='Apple (Aapl) Vs Microsoft (Msft): Which Is A Better Tech Stock To Buy Right Now?', type='Article', properties

100%|██████████| 5/5 [00:53<00:00, 10.74s/it]


In [21]:
print("nodes list :", len(nodes))
print("Distinct nodes :",len(distinct_nodes))
print("Relationships :",len(relations))

nodes list : 28
Distinct nodes : 24
Relationships : 16
