In [12]:
import spacy
import os
from spacy.matcher import Matcher

In [13]:
def check_word_validity(word):
    if word.is_alpha and word.text.lower() not in nlp.Defaults.stop_words:
        return True
    else:
        return False


def extract_sample_from_file(file_name, character_count, start_position=0):
    possible_encodings = ['utf-8', 'latin-1', 'windows-1252']
    unprocessed_text = ''
    for encoding in possible_encodings:
        try:
            with open(file_name, 'r', encoding=encoding) as f:
                f.seek(start_position)
                text = f.read(character_count)
                unprocessed_text += text
            doc = nlp(text)
            # Check and remove the first token if it's not a valid word
            if check_word_validity(doc[0]):
                doc = doc[1:]

            # Check and remove the last token if it's not a valid word
            if check_word_validity(doc[-1]):
                doc = doc[:-1]
            return doc, unprocessed_text
        except UnicodeDecodeError:
            continue

In [14]:


nlp = spacy.load('en_core_web_sm')
#  You read a small  extract of the file mobydick.txt
mobydick_sample, unprocessed_mobydick_sample = extract_sample_from_file(os.path.join("../data","mobydick.txt"), 1000, 1000)
# take a small extraact from the file ai_forecast1.txt
ai_forecast1_sample, unprocessed_ai_forecast1_sample = extract_sample_from_file(os.path.join("../data","ai_forecast1.txt"), 5000)
# take a small extraact from the file ai_forecast2.txt
ai_forecast2_sample, unprocessed_ai_forecast2_sample = extract_sample_from_file(os.path.join("../data","ai_forecast2.txt"), 5000)

In [15]:
# print out the grammatical structure of the sentences 
print("Mobydick POS \n")
for sentence in mobydick_sample.sents:
    print(sentence)
    for token in sentence:
        print(token.text, token.pos_)
    print('\n')
print("************************")
print("ai_forecast1 POS \n")
for sentence in ai_forecast1_sample.sents:
    print(sentence)
    for token in sentence:
        print(token.text, token.pos_)
    print('\n')
print("************************")
print("ai_forecast2 POS \n")
for sentence in ai_forecast2_sample.sents:
    print(sentence)
    for token in sentence:
        print(token.text, token.pos_)
    print('\n')

Mobydick POS 

.
. PUNCT


Enter Ahab; to Him, Stubb.

CHAPTER 30.
Enter VERB
Ahab PROPN
; PUNCT
to ADP
Him PRON
, PUNCT
Stubb PROPN
. PUNCT


 SPACE
CHAPTER NOUN
30 NUM
. PUNCT


The Pipe.


The DET
Pipe PROPN
. PUNCT


 SPACE


CHAPTER 31.
CHAPTER NOUN
31 NUM
. PUNCT


Queen Mab.

CHAPTER 32.
Queen PROPN
Mab PROPN
. PUNCT


 SPACE
CHAPTER PROPN
32 NUM
. PUNCT


Cetology.


Cetology NOUN
. PUNCT


 SPACE


CHAPTER 33.
CHAPTER NOUN
33 NUM
. PUNCT


The Specksnyder.


The DET
Specksnyder PROPN
. PUNCT


 SPACE


CHAPTER 34.
CHAPTER NOUN
34 NUM
. PUNCT


The Cabin-Table.


The DET
Cabin PROPN
- PUNCT
Table PROPN
. PUNCT


 SPACE


CHAPTER 35.
CHAPTER NOUN
35 NUM
. PUNCT


The Mast-Head.


The DET
Mast PROPN
- PUNCT
Head PROPN
. PUNCT


 SPACE


CHAPTER 36.
CHAPTER NOUN
36 NUM
. PUNCT


The Quarter-Deck.

CHAPTER 37.
The DET
Quarter PROPN
- PUNCT
Deck PROPN
. PUNCT


 SPACE
CHAPTER NOUN
37 NUM
. PUNCT


Sunset.

CHAPTER 38.
Sunset PROPN
. PUNCT


 SPACE
CHAPTER PROPN
38 NUM
. PUNCT


Dusk

In [16]:
# Get the named entities from the samples

# Print named entities
print("MobyDick Named Entities:")
for ent in mobydick_sample.ents:
    print(f"{ent.text} ({ent.label_})")
print("************************")

MobyDick Named Entities:
Ahab (PERSON)
CHAPTER 30 (LAW)
Pipe (FAC)
CHAPTER 31 (LAW)
Queen Mab (PERSON)
CHAPTER 32 (LAW)
CHAPTER 33 (LAW)
Specksnyder (ORG)
CHAPTER 34 (LAW)
The Cabin-Table (ORG)
CHAPTER 35 (LAW)
CHAPTER 36 (LAW)
The Quarter-Deck (ORG)
CHAPTER 37 (LAW)
CHAPTER 38 (LAW)
CHAPTER 39 (LAW)
First Night-Watch (ORG)
CHAPTER 40 (LAW)
Midnight (TIME)
Forecastle (PERSON)
CHAPTER 41 (LAW)
Moby Dick (PERSON)
CHAPTER 42 (LAW)
The Whiteness of the Whale (ORG)
CHAPTER 43 (LAW)
CHAPTER 44 (LAW)
Chart (ORG)
CHAPTER 45 (LAW)
Affidavit (GPE)
CHAPTER 46 (LAW)
CHAPTER 47 (LAW)
The Mat-Maker (ORG)
CHAPTER 48 (LAW)
The First Lowering (ORG)
CHAPTER 49 (LAW)
Hyena (GPE)
CHAPTER 50 (LAW)
Ahab’s Boat (PERSON)
Crew (PERSON)
Fedallah (PERSON)
CHAPTER 51 (LAW)
CHAPTER 52 (LAW)
Albatross (LOC)
CHAPTER 53 (LAW)
CHAPTER 54 (LAW)
The Town-Ho’s Story (ORG)
CHAPTER 55 (LAW)
the Monstrous Pictures of Whales (ORG)
CHAPTER 56 (LAW)
the Less Erroneous Pictures (ORG)
the True
Pictures of Whaling Scenes (ORG)
CH

In [17]:
# Get the named entities from the samples

# Print named entities
print("ai_forecast1 Named Entities:")
for ent in ai_forecast1_sample.ents:
    print(f"{ent.text} ({ent.label_})")
print("************************")

ai_forecast1 Named Entities:
Sept. 13, 2022 (DATE)
GLOBE NEWSWIRE (ORG)
AI (ORG)
USD 387.45 billion (MONEY)
2022 (DATE)
USD 1394.30 billion (MONEY)
2029 (DATE)
20.1% (PERCENT)
AI (ORG)
the next several years (DATE)
Fortune Business Insights (ORG)
2022-2029. (DATE)
USD 328.34 billion (MONEY)
2021 (DATE)
BFSI (ORG)
Request a Sample Copy of the Research Report (WORK_OF_ART)
Microsoft (ORG)
Nuance (GPE)
April 12 (DATE)
2021 (DATE)
Microsoft News Center	

Share on Facebook (ORG)
LinkedIn (GPE)
Twitter (PERSON)
AI (ORG)
Microsoft (ORG)
Nuance (GPE)
REDMOND (GPE)
Wash. (GPE)
BURLINGTON (GPE)
Mass. (GPE)
April 12 (DATE)
2021 (DATE)
Microsoft Corp (ORG)
Nuance Communications, Inc. (ORG)
today (DATE)
Microsoft (ORG)
Nuance (GPE)
56.00 (MONEY)
23% (PERCENT)
Nuance (GPE)
Friday, April 9 (DATE)
$19.7 billion (MONEY)
Nuances (ORG)
AI (ORG)
decades (DATE)
AI (ORG)
Mark Benjamin (PERSON)
Nuance (GPE)
Scott Guthrie (PERSON)
Cloud & AI (ORG)
Microsoft (ORG)
this calendar year (DATE)
Microsoft (ORG)
t

In [18]:
# Get the named entities from the samples

# Print named entities
print("ai_forecast2 Named Entities:")
for ent in ai_forecast2_sample.ents:
    print(f"{ent.text} ({ent.label_})")
print("************************")

ai_forecast2 Named Entities:
$93.5 billion (MONEY)
2021 (DATE)
Grand View Research, Inc. (ORG)
annual (DATE)
38.1% (PERCENT)
2022 (DATE)
2030 (DATE)
AI (ORG)
AI (ORG)
around 66% (PERCENT)
2016 (DATE)
AI (ORG)
eCommerce (PRODUCT)
AI (ORG)
NVIDIA Corporation (ORG)
NVDA (ORG)
Intel Corporation (ORG)
Alphabet Inc. (ORG)
Amazon Web Services Inc. (ORG)
Artificial Intelligence Market by Component Analysis (ORG)
AI (ORG)
AI (ORG)
Artificial Intelligence Market by Technology  

 (ORG)
AI (ORG)
NLP (ORG)
AI (GPE)
NLP (ORG)
night (TIME)
Artificial Intelligence Market by End-User Industry  

 (ORG)
AI (ORG)
BFSI (ORG)
AI (ORG)
2020 (DATE)
CoverGirl (ORG)
************************


In [19]:
# You also let Spacy explain the meaning of the grammar and the part-of speech tags.
matcher = Matcher(nlp.vocab)

# Task 1: Look for "Artificial Intelligence" in the Key Market Insights text
pattern_ai = [{"LOWER": "artificial"}, {"LOWER": "intelligence"}]
matcher.add("AI_MATCH", [pattern_ai])

# Task 2: Find "AI" followed by a verb
pattern_ai_verb = [{"LOWER": "ai"}, {"POS": "VERB"}]
matcher.add("AI_VERB_MATCH", [pattern_ai_verb])

# Task 3: Find numbers followed by a %
pattern_percent = [{"LIKE_NUM": True}, {"TEXT": "%"}]
matcher.add("PERCENT_MATCH", [pattern_percent])

# Task 4: Find company names (using a simplified example)
pattern_company = [{"ENT_TYPE": "ORG"}]
matcher.add("COMPANY_MATCH", [pattern_company])

# Find matches in the document
moby_dick_matches = matcher(mobydick_sample)
ai_forecast_1_matches = matcher(ai_forecast1_sample)
ai_forecast_2_matches = matcher(ai_forecast2_sample)

In [20]:
# Display matches
for match_id, start, end in moby_dick_matches:
    match_text = mobydick_sample[start:end].text
    print(f"Matched '{match_text}' (pattern: {nlp.vocab.strings[match_id]})")

Matched 'Specksnyder' (pattern: COMPANY_MATCH)
Matched 'The' (pattern: COMPANY_MATCH)
Matched 'Cabin' (pattern: COMPANY_MATCH)
Matched '-' (pattern: COMPANY_MATCH)
Matched 'Table' (pattern: COMPANY_MATCH)
Matched 'The' (pattern: COMPANY_MATCH)
Matched 'Quarter' (pattern: COMPANY_MATCH)
Matched '-' (pattern: COMPANY_MATCH)
Matched 'Deck' (pattern: COMPANY_MATCH)
Matched 'First' (pattern: COMPANY_MATCH)
Matched 'Night' (pattern: COMPANY_MATCH)
Matched '-' (pattern: COMPANY_MATCH)
Matched 'Watch' (pattern: COMPANY_MATCH)
Matched 'The' (pattern: COMPANY_MATCH)
Matched 'Whiteness' (pattern: COMPANY_MATCH)
Matched 'of' (pattern: COMPANY_MATCH)
Matched 'the' (pattern: COMPANY_MATCH)
Matched 'Whale' (pattern: COMPANY_MATCH)
Matched 'Chart' (pattern: COMPANY_MATCH)
Matched 'The' (pattern: COMPANY_MATCH)
Matched 'Mat' (pattern: COMPANY_MATCH)
Matched '-' (pattern: COMPANY_MATCH)
Matched 'Maker' (pattern: COMPANY_MATCH)
Matched 'The' (pattern: COMPANY_MATCH)
Matched 'First' (pattern: COMPANY_MATC

In [21]:
# Display matches
for match_id, start, end in ai_forecast_1_matches:
    match_text = ai_forecast1_sample[start:end].text
    print(f"Matched '{match_text}' (pattern: {nlp.vocab.strings[match_id]})")

Matched 'GLOBE' (pattern: COMPANY_MATCH)
Matched 'NEWSWIRE' (pattern: COMPANY_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched '20.1%' (pattern: PERCENT_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'Fortune' (pattern: COMPANY_MATCH)
Matched 'Business' (pattern: COMPANY_MATCH)
Matched 'Insights' (pattern: COMPANY_MATCH)
Matched 'artificial intelligence' (pattern: AI_MATCH)
Matched 'BFSI' (pattern: COMPANY_MATCH)
Matched 'Microsoft' (pattern: COMPANY_MATCH)
Matched 'Microsoft' (pattern: COMPANY_MATCH)
Matched 'News' (pattern: COMPANY_MATCH)
Matched 'Center' (pattern: COMPANY_MATCH)
Matched '	

' (pattern: COMPANY_MATCH)
Matched 'Share' (pattern: COMPANY_MATCH)
Matched 'on' (pattern: COMPANY_MATCH)
Matched 'Facebook' (pattern: COMPANY_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'Microsoft' (pattern: COMPANY_MATCH)
Matched 'Microsoft' (pattern: COMPANY_MATCH)
Matched 'Corp' (pattern: COMPANY_MATCH)
Matched 'Nuance' (pattern: COMPANY_MATCH)
Matched 'Communications' (pa

In [22]:
# Display matches
for match_id, start, end in ai_forecast_2_matches:
    match_text = ai_forecast2_sample[start:end].text
    print(f"Matched '{match_text}' (pattern: {nlp.vocab.strings[match_id]})")

Matched 'artificial intelligence' (pattern: AI_MATCH)
Matched 'Grand' (pattern: COMPANY_MATCH)
Matched 'View' (pattern: COMPANY_MATCH)
Matched 'Research' (pattern: COMPANY_MATCH)
Matched ',' (pattern: COMPANY_MATCH)
Matched 'Inc.' (pattern: COMPANY_MATCH)
Matched '38.1%' (pattern: PERCENT_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'Artificial intelligence' (pattern: AI_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'artificial intelligence' (pattern: AI_MATCH)
Matched '66%' (pattern: PERCENT_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'AI' (pattern: COMPANY_MATCH)
Matched 'NVIDIA' (pattern: COMPANY_MATCH)
Matched 'Corporation' (pattern: COMPANY_MATCH)
Matched 'NVDA' (pattern: COMPANY_MATCH)
Matched 'Intel' (pattern: COMPANY_MATCH)
Matched 'Corporation' (pattern: COMPANY_MATCH)
Matched 'Alphabet' (pattern: COMPANY_MATCH)
Matched 'Inc.' (pattern: COMPANY_MATCH)
Matched 'Amazon' (pattern: COMPANY_MATCH)
Matched 'Web' (pattern: COMPANY_MATCH)
Matched 'Services' (pa