In [31]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [32]:
# Read the text files

# Read the "Moby Dick" extract
with open("mobydick.txt", "r", encoding="utf-8", errors="replace") as file:
    moby_dick_text = file.read()

# Read the ai_forecast1.txt file
with open("ai_forecast1.txt", "r", encoding="utf-8", errors="replace") as file:
    ai_forecast1_text = file.read()

# Read the ai_forecast2.txt file
with open("ai_forecast2.txt", "r", encoding="utf-8", errors="replace") as file:
    ai_forecast2_text = file.read()


In [33]:
# Process the text with SpaCy

# Increase the maximum length allowed by SpaCy
nlp.max_length = len(moby_dick_text)

doc_moby_dick = nlp(moby_dick_text)
doc_ai_forecast1 = nlp(ai_forecast1_text)
doc_ai_forecast2 = nlp(ai_forecast2_text)


In [34]:
# Print named entities in the "Moby Dick" extract
for ent in doc_moby_dick.ents:
    print(ent.text, ent.label_)

WHALE ORG
Herman Melville PERSON
CHAPTER 1 LAW
CHAPTER 2 LAW
Carpet-Bag ORG
CHAPTER 3 LAW
The Spouter-Inn ORG
CHAPTER 4 LAW
Counterpane ORG
CHAPTER 5 LAW
CHAPTER 6 LAW
CHAPTER 7 LAW
CHAPTER 8 LAW
CHAPTER 9 LAW
Sermon PERSON
CHAPTER 10 LAW
Bosom Friend ORG
CHAPTER 11 LAW
CHAPTER 12 LAW
CHAPTER 13 LAW
CHAPTER 14 LAW
CHAPTER 15 LAW
CHAPTER 16 LAW
CHAPTER 17 LAW
CHAPTER 18 LAW
Mark PERSON
CHAPTER 19 LAW
CHAPTER 20 LAW
CHAPTER 21 LAW
Going Aboard PERSON
CHAPTER 22 LAW
Christmas DATE
CHAPTER 23 LAW
The Lee Shore ORG
CHAPTER 24 LAW
Advocate ORG
CHAPTER 25 LAW
CHAPTER 26 LAW
Knights and Squires ORG
CHAPTER 27 LAW
Knights and Squires ORG
CHAPTER 28 LAW
CHAPTER 29 LAW
Ahab PERSON
CHAPTER 30 LAW
Pipe FAC
CHAPTER 31 LAW
Queen Mab PERSON
CHAPTER 32 LAW
CHAPTER 33 LAW
Specksnyder ORG
CHAPTER 34 LAW
The Cabin-Table ORG
CHAPTER 35 LAW
CHAPTER 36 LAW
The Quarter-Deck ORG
CHAPTER 37 LAW
CHAPTER 38 LAW
CHAPTER 39 LAW
First Night-Watch ORG
CHAPTER 40 LAW
Midnight TIME
Forecastle PERSON
CHAPTER 41 LAW
Moby

In [35]:
# Perform tokenization, POS tagging, sentence parsing (dep) and named entity recognition (NER) on ai_forecast1
print("Analysis of ai_forecast1.txt")
for sent in doc_ai_forecast1.sents:
    print(f"\nSentence: {sent.text}\n")

    for token in sent: 
        print(f"Token: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

# Named entities in ai_forecast1
print("\nNamed Entities in ai_forecast1:")
for ent in doc_ai_forecast1.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Analysis of ai_forecast1.txt

Sentence: Pune, India, Sept. 13, 2022 (GLOBE NEWSWIRE) --

Token: Pune, POS: PROPN, Dep: ROOT, Head: Pune
Token: ,, POS: PUNCT, Dep: punct, Head: Pune
Token: India, POS: PROPN, Dep: appos, Head: Pune
Token: ,, POS: PUNCT, Dep: punct, Head: Pune
Token: Sept., POS: PROPN, Dep: npadvmod, Head: Pune
Token: 13, POS: NUM, Dep: nummod, Head: Sept.
Token: ,, POS: PUNCT, Dep: punct, Head: Sept.
Token: 2022, POS: NUM, Dep: nummod, Head: Sept.
Token: (, POS: PUNCT, Dep: punct, Head: Pune
Token: GLOBE, POS: PROPN, Dep: compound, Head: NEWSWIRE
Token: NEWSWIRE, POS: PROPN, Dep: appos, Head: Pune
Token: ), POS: PUNCT, Dep: punct, Head: Pune
Token: --, POS: PUNCT, Dep: punct, Head: Pune

Sentence: The global AI market size is projected to grow from USD 387.45 billion in 2022 to USD 1394.30 billion in 2029 at a CAGR of 20.1% in the forecast period.

Token: The, POS: DET, Dep: det, Head: size
Token: global, POS: ADJ, Dep: amod, Head: size
Token: AI, POS: PROPN, Dep: compou

In [36]:
# Perform tokenization, POS tagging, sentence parsing (dep) and named entity recognition (NER) on ai_forecast2
print("\n\nAnalysis of ai_forecast2.txt")
for sent in doc_ai_forecast2.sents:
    print(f"\nSentence: {sent.text}\n")

    for token in sent:
        print(f"Token: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

# Named entities in ai_forecast2
print("\nNamed Entities in ai_forecast2:")
for ent in doc_ai_forecast2.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")



Analysis of ai_forecast2.txt

Sentence: The global artificial intelligence market size was $93.5 billion in 2021.

Token: The, POS: DET, Dep: det, Head: size
Token: global, POS: ADJ, Dep: amod, Head: size
Token: artificial, POS: ADJ, Dep: amod, Head: size
Token: intelligence, POS: NOUN, Dep: compound, Head: market
Token: market, POS: NOUN, Dep: compound, Head: size
Token: size, POS: NOUN, Dep: nsubj, Head: was
Token: was, POS: AUX, Dep: ROOT, Head: was
Token: $, POS: SYM, Dep: quantmod, Head: billion
Token: 93.5, POS: NUM, Dep: compound, Head: billion
Token: billion, POS: NUM, Dep: attr, Head: was
Token: in, POS: ADP, Dep: prep, Head: was
Token: 2021, POS: NUM, Dep: pobj, Head: in
Token: ., POS: PUNCT, Dep: punct, Head: was

Sentence: And according to Grand View Research, Inc., it is projected to expand at a compound annual growth rate (CAGR) of 38.1% from 2022 to 2030.

Token: And, POS: CCONJ, Dep: cc, Head: projected
Token: according, POS: VERB, Dep: prep, Head: projected
Token: to

In [37]:
# Explain POS and dependency tag
print(spacy.explain("VERB")) 
print(spacy.explain("nsubj")) 

verb
nominal subject


In [44]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Create a pattern for "Artificial Intelligence"
pattern = [{"LOWER": "artificial"}, {"LOWER": "intelligence"}]

matcher.add("AI_PHRASE", [pattern])
doc = nlp(ai_forecast2_text)
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Matched: '{matched_span.text}' at position {matched_span.start_char} to {matched_span.end_char}")

Matched: 'artificial intelligence' at position 11 to 34
Matched: 'Artificial intelligence' at position 510 to 533
Matched: 'artificial intelligence' at position 930 to 953
Matched: 'Artificial Intelligence' at position 2111 to 2134
Matched: 'Artificial Intelligence' at position 2662 to 2685
Matched: 'artificial intelligence' at position 2744 to 2767
Matched: 'Artificial Intelligence' at position 3761 to 3784
Matched: 'artificial intelligence' at position 4606 to 4629
Matched: 'Artificial Intelligence' at position 7144 to 7167
Matched: 'artificial intelligence' at position 7866 to 7889
Matched: 'artificial intelligence' at position 8341 to 8364
Matched: 'artificial intelligence' at position 9199 to 9222


In [45]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Create a pattern for "AI" followed by a verb
pattern = [{"LOWER": "ai"}, {"POS": "VERB"}]

matcher.add("AI_VERB", [pattern])
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Matched: '{matched_span.text}' at position {matched_span.start_char} to {matched_span.end_char}")

Matched: 'AI grew' at position 6191 to 6198


In [46]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Create a pattern for numbers followed by "%"
pattern = [{"LIKE_NUM": True}, {"TEXT": "%"}]

matcher.add("PERCENTAGE", [pattern])
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Matched: '{matched_span.text}' at position {matched_span.start_char} to {matched_span.end_char}")

Matched: '38.1%' at position 187 to 192
Matched: '66%' at position 1152 to 1155
Matched: '115%' at position 7259 to 7263
Matched: '42%' at position 8070 to 8073


In [47]:
# Extract organizations from the text
for ent in doc.ents:
    if ent.label_ == "ORG":
        print(f"Company Name: '{ent.text}' found at position {ent.start_char} to {ent.end_char}")

Company Name: 'Grand View Research, Inc.' found at position 91 to 116
Company Name: 'AI' found at position 505 to 507
Company Name: 'AI' found at position 817 to 819
Company Name: 'AI' found at position 1193 to 1195
Company Name: 'AI' found at position 1513 to 1515
Company Name: 'NVIDIA Corporation' found at position 1763 to 1781
Company Name: 'NVDA' found at position 1783 to 1787
Company Name: 'Intel Corporation' found at position 1790 to 1807
Company Name: 'Alphabet Inc.' found at position 1816 to 1829
Company Name: 'Amazon Web Services Inc.' found at position 1843 to 1867
Company Name: 'Artificial Intelligence Market by Component Analysis' found at position 2111 to 2163
Company Name: 'AI' found at position 2352 to 2354
Company Name: 'AI' found at position 2511 to 2513
Company Name: 'Artificial Intelligence Market by Technology  

' found at position 2662 to 2710
Company Name: 'AI' found at position 2860 to 2862
Company Name: 'NLP' found at position 2935 to 2938
Company Name: 'NLP' f

In [48]:
# Create a pattern to look for proper nouns (assuming they are typically multi-word proper nouns)
pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]

matcher.add("COMPANY_NAME", [pattern])
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Matched Company: '{matched_span.text}' at position {matched_span.start_char} to {matched_span.end_char}")

Matched Company: 'Grand View' at position 91 to 101
Matched Company: 'View Research' at position 97 to 110
Matched Company: '38.1%' at position 187 to 192
Matched Company: '66%' at position 1152 to 1155
Matched Company: 'NVIDIA Corporation' at position 1763 to 1781
Matched Company: 'Intel Corporation' at position 1790 to 1807
Matched Company: 'Alphabet Inc.' at position 1816 to 1829
Matched Company: 'Amazon Web' at position 1843 to 1853
Matched Company: 'Web Services' at position 1850 to 1862
Matched Company: 'Services Inc.' at position 1854 to 1867
Matched Company: 'Artificial Intelligence' at position 2111 to 2134
Matched Company: 'Intelligence Market' at position 2122 to 2141
Matched Company: 'Component Analysis' at position 2145 to 2163
Matched Company: 'Artificial Intelligence' at position 2662 to 2685
Matched Company: 'Intelligence Market' at position 2673 to 2692
Matched Company: 'Artificial Intelligence' at position 3761 to 3784
Matched Company: 'Intelligence Market' at positio