# SpaCy Setup and Text Processing

In [1]:
# run the next line only once if needed 
# This model includes word vectors and is useful for various natural language processing tasks.
!python -m spacy download en_core_web_lg 

# Importing the spaCy library
import spacy

# Loading the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# Define a text string containing a sentence.
text = "My friend vidya gayathri likes watching movies."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Iterate through each token in the processed document and print them with a delimiter "|".
for token in doc:
    print(token, end=' | ')

My | friend | vidya | gayathri | likes | watching | movies | . | 

# Tokenization and Information Extraction

In [4]:
# Import the pandas library with alias pd for data manipulation and analysis.
import pandas as pd

# Define a function display_nlp that takes a spaCy document (doc) and an optional flag (include_punct) 
# to decide whether to include punctuation tokens in the output DataFrame.
def display_nlp(doc, include_punct=False):
    """
    Generate a DataFrame for visualization of spaCy tokens.
    
    Parameters:
        doc (spacy.Doc): The processed spaCy document.
        include_punct (bool): Flag to include punctuation tokens. Default is False.

    Returns:
        pd.DataFrame: DataFrame containing token information.
    """
    # Initialize an empty list to store rows of token information.
    rows = []
    
    # Iterate through each token in the spaCy document.
    for i, t in enumerate(doc):
        # Check if the token is not punctuation or if include_punct flag is True.
        if not t.is_punct or include_punct:
            # Create a dictionary containing token information.
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            # Append the dictionary to the rows list.
            rows.append(row)
    
    # Create a DataFrame from the list of token information rows.
    df = pd.DataFrame(rows).set_index('token')
    
    # Set the index name to None for cleaner display.
    df.index.name = None
    
    # Return the DataFrame.
    return df

# Call the display_nlp function with the spaCy document (doc) to generate a DataFrame
# for visualization of spaCy tokens.
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,friend,friend,False,True,NOUN,compound,,O
2,vidya,vidya,False,True,PROPN,compound,PERSON,B
3,gayathri,gayathri,False,True,PROPN,nsubj,PERSON,I
4,likes,like,False,True,VERB,ROOT,,O
5,watching,watch,False,True,VERB,xcomp,,O
6,movies,movie,False,True,NOUN,dobj,,O


# REMOVING STOPWORS

In [6]:
# Define a text string containing a sentence.
text = "Dear Vidya, we need to go and eat some food outside. Regards, Srija"

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Create a list comprehension to filter out tokens that are neither stop words nor punctuation.
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]

# Print the list of non-stopword and non-punctuation tokens.
print(non_stop)

[Dear, Vidya, need, eat, food, outside, Regards, Srija]


# FINDING ALL NOUNS

In [7]:
# Define a text string containing a sentence.
text = "My friend vidya gayathri likes watching movies."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Create a list comprehension to filter out tokens that are nouns or proper nouns.
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]

# Print the list of tokens identified as nouns or proper nouns.
print(nouns)

[friend, vidya, gayathri, movies]


# Identifying Named Entities

In [8]:
# Iterate through each named entity in the processed document and print its text and label.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(vidya gayathri, PERSON) 

# TRYING ANOTHER ONE

In [11]:
# Define a text string containing a sentence.
text = "Vidya, my best friend, lives in New Haven, connecticut but loves to stay in New Jersey."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Iterate through each named entity in the processed document and print its text and label.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(New Haven, GPE) (connecticut, GPE) (New Jersey, GPE) 

# VISUALIZING NERS

In [12]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (doc) with entity visualization style and display it in Jupyter Notebook.
displacy.render(doc, style='ent', jupyter=True)

# TRYING FOR REAL DATASET

INSTALLING NECCESSARY LIBRARIES

In [13]:
!pip install html5lib
!pip install --upgrade pip setuptools
!pip install beautifulsoup4

Collecting setuptools
  Downloading setuptools-69.2.0-py3-none-any.whl.metadata (6.3 kB)
Downloading setuptools-69.2.0-py3-none-any.whl (821 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.5/821.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 68.0.0
    Uninstalling setuptools-68.0.0:
      Successfully uninstalled setuptools-68.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.75 requires requests_mock, which is not installed.
spyder 5.4.3 requires pyqt5<5.16, which is not installed.
spyder 5.4.3 requires pyqtwebengine<5.16, which is not installed.
conda-repo-cli 1.0.75 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.75 requir

# LOADING DATASET - ARTICLE 1

In [14]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import re

# Define a function to extract text content from a given URL
def url_to_string(url):
    # Send a GET request to the URL
    res = requests.get(url)
    # Extract the HTML content
    html = res.text
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html, 'html5lib')
    # Remove unwanted elements like scripts, styles, and asides
    for script in soup(["script", "style", 'aside']):
        script.extract()
    # Extract the text content and join it into a single string
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Use the defined function to fetch content from a given URL (in this case, the New York Times trending page)
ny_bb = url_to_string('https://www.emarketer.com/content/digital-retailers-want-use-ai-hyperpersonal-2024')

# Process the extracted text using the spaCy language model loaded previously
article = nlp(ny_bb)

# Calculate the number of named entities in the processed text
len(article.ents)

117

# NERS

In [15]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (article) with entity visualization style and display it in Jupyter Notebook.
displacy.render(article, style='ent', jupyter=True)

# NER TYPES

In [16]:
# Download the large English language model for spaCy if it hasn't been downloaded already.
!python -m spacy download en_core_web_lg 

# Import the spaCy library
import spacy

# Load the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Import Counter from collections module to count occurrences of each element
from collections import Counter

# Extract the labels of named entities from the processed text and count their occurrences
labels = [x.label_ for x in article.ents]
counter = Counter(labels)

# Print the count of each entity label
print(counter)

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Counter({'ORG': 42, 'DATE': 25, 'CARDINAL': 16, 'PRODUCT': 10, 'GPE': 6, 'PERSON': 4, 'PERCENT': 4, 'FAC': 3, 'NORP': 2, 'ORDINAL': 2, 'WORK_OF_ART': 2, 'MONEY': 1})


# POPULAR NER

In [17]:
# Extract the text of named entities from the processed text and count their occurrences
items = [x.text for x in article.ents]
counter_items = Counter(items)

# Get the five most common named entities and their counts
most_common_entities = counter_items.most_common(5)

# Print the result
print(most_common_entities)

[('AI', 9), ('Mar 26', 5), ('Mar 25', 4), ('2024', 3), ('MarketingEmail', 3)]



# PRINTING SENTENCES

In [18]:
# Import the spaCy library
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Extract sentences containing named entities from the processed text
sentences = [x for x in article.ents]

# Print the 21st sentence containing named entities
print(sentences[1:13])

[AI, 2024, ClientBecome, ClientGet, DemoPricingCalendarIndustriesProductsInsightsEventsPricingAboutIndustries, OverviewOur, five, Advertising & MarketingSocial, MarketingEmail, SalesSocial, AmericaWestern EuropeHealthValue, CareDigital]


# NER TAGS

In [19]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the 21st sentence containing named entities with entity visualization style
displacy.render(nlp(str(sentences[1:13])), jupyter=True, style='ent')

# Word Categories in the Sentence

In [20]:
# Extract non-stopword and non-punctuation tokens from the 21st sentence containing named entities
tokens_info = [(x.orth_, x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences[1:13])) if not y.is_stop and y.pos_ != 'PUNCT']]

# Print the tokens' orthographic form, part-of-speech, and lemma
print(tokens_info)

[('[', 'X', '['), ('AI', 'PROPN', 'AI'), ('2024', 'NUM', '2024'), ('ClientBecome', 'PROPN', 'ClientBecome'), ('ClientGet', 'PROPN', 'ClientGet'), ('DemoPricingCalendarIndustriesProductsInsightsEventsPricingAboutIndustries', 'NOUN', 'demopricingcalendarindustriesproductsinsightseventspricingaboutindustrie'), ('OverviewOur', 'PROPN', 'OverviewOur'), ('Advertising', 'PROPN', 'Advertising'), ('&', 'CCONJ', '&'), ('MarketingSocial', 'PROPN', 'MarketingSocial'), ('MarketingEmail', 'PROPN', 'MarketingEmail'), ('SalesSocial', 'PROPN', 'SalesSocial'), ('AmericaWestern', 'PROPN', 'AmericaWestern'), ('EuropeHealthValue', 'PROPN', 'EuropeHealthValue'), ('CareDigital', 'PROPN', 'CareDigital')]


# Dependency Tree for the Sentence

In [21]:
# Import the displacy module from spaCy for dependency visualization.
from spacy import displacy

# Render the dependency parse of the 21st sentence containing named entities and display it in Jupyter Notebook.
# Adjust the distance between tokens for better visualization.
displacy.render(nlp(str(sentences[1:5])), style='dep', jupyter=True, options={'distance': 150})

# LOADING DATASET - ARTICLE-2

In [22]:
#Import necessary libraries
from bs4 import BeautifulSoup
import requests
import re

# Define a function to extract text content from a given URL
def url_to_string(url):
    # Send a GET request to the URL
    res = requests.get(url)
    # Extract the HTML content
    html = res.text
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html, 'html5lib')
    # Remove unwanted elements like scripts, styles, and asides
    for script in soup(["script", "style", 'aside']):
        script.extract()
    # Extract the text content and join it into a single string
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Use the defined function to fetch content from a given URL (in this case, the New York Times trending page)
ny_bb = url_to_string('https://time.com/6960491/donald-trump-stock-truth-social-worth-billions/')

# Process the extracted text using the spaCy language model loaded previously
article = nlp(ny_bb)

# Calculate the number of named entities in the processed text
len(article.ents)

146

# NERS

In [23]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (article) with entity visualization style and display it in Jupyter Notebook.
displacy.render(article, style='ent', jupyter=True)

# NER TYPES

In [24]:
# Import the spaCy library
import spacy

# Load the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Import Counter from collections module to count occurrences of each element
from collections import Counter

# Extract the labels of named entities from the processed text and count their occurrences
labels = [x.label_ for x in article.ents]
counter = Counter(labels)

# Print the count of each entity label
print(counter)

Counter({'ORG': 55, 'PERSON': 40, 'DATE': 17, 'CARDINAL': 9, 'GPE': 7, 'MONEY': 5, 'PRODUCT': 4, 'WORK_OF_ART': 2, 'TIME': 2, 'NORP': 2, 'PERCENT': 2, 'ORDINAL': 1})


# POPULAR NER

In [25]:
# Extract the text of named entities from the processed text and count their occurrences
items = [x.text for x in article.ents]
counter_items = Counter(items)

# Get the five most common named entities and their counts
most_common_entities = counter_items.most_common(5)

# Print the result
print(most_common_entities)

[('Trump', 19), ('TIME', 7), ('Truth Social', 7), ('Truth Social Worth Billions', 3), ('Yass', 3)]


# PRINTING SENTENCES

In [26]:
# Import the spaCy library
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Extract sentences containing named entities from the processed text
sentences = [x for x in article.ents]

# Print the 21st sentence containing named entities
print(sentences[1:13])

[Truth Social Worth Billions, Our Ideas NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture of Work, TIMECouponsPersonal Finance, TIME, TIME, StampedJoin UsNewslettersSubscribeGive a, GiftShop, TIME, Cover StoreCustomer CareUS & CanadaGlobal Help CenterReach, RoomContact, PermissionsMoreAbout UsPrivacy PolicyYour, UseModern Slavery]


# NER TAGS

In [27]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the 21st sentence containing named entities with entity visualization style
displacy.render(nlp(str(sentences[1:13])), jupyter=True, style='ent')

# TYPES OF WORDS IN THE SENTENCE 

In [28]:
# Extract non-stopword and non-punctuation tokens from the 21st sentence containing named entities
tokens_info = [(x.orth_, x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences[1:13])) if not y.is_stop and y.pos_ != 'PUNCT']]

# Print the tokens' orthographic form, part-of-speech, and lemma
print(tokens_info)

[('Truth', 'PROPN', 'Truth'), ('Social', 'PROPN', 'Social'), ('Worth', 'PROPN', 'Worth'), ('Billions', 'PROPN', 'Billions'), ('Ideas', 'PROPN', 'Ideas'), ('NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture', 'PROPN', 'NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture'), ('Work', 'PROPN', 'Work'), ('TIMECouponsPersonal', 'PROPN', 'TIMECouponsPersonal'), ('Finance', 'PROPN', 'Finance'), ('TIME', 'PROPN', 'TIME'), ('TIME', 'PROPN', 'TIME'), ('StampedJoin', 'PROPN', 'StampedJoin'), ('UsNewslettersSubscribeGive', 'NOUN', 'usnewsletterssubscribegive'), ('GiftShop', 'PROPN', 'GiftShop'), ('TIME', 'PROPN', 'TIME'), ('Cover', 'PROPN', 'Cover'), ('StoreCustomer', 'PROPN', 'StoreCustomer'), ('CareUS', 'PROPN', 'CareUS'), ('&', 'CCONJ', '&'), ('CanadaGlobal', 'PROPN', 'CanadaGlobal'), ('Help', 'VERB', 'help'), ('CenterReach', 'PROPN', 'CenterReach'), ('RoomContact', 'PROPN', 'RoomContact'), ('PermissionsMoreAbout', 'PROPN', 'PermissionsMoreAbou

# SENTENCE DEPENDENCY TREE

In [29]:
# Import the displacy module from spaCy for dependency visualization.
from spacy import displacy

# Render the dependency parse of the 21st sentence containing named entities and display it in Jupyter Notebook.
# Adjust the distance between tokens for better visualization.
displacy.render(nlp(str(sentences[1:3])), style='dep', jupyter=True, options={'distance': 150})

# LOADING DATASET - ARTICLE-3 AND REPEATING SAME PROCESS 

In [30]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import re

# Define a function to extract text content from a given URL
def url_to_string(url):
    # Send a GET request to the URL
    res = requests.get(url)
    # Extract the HTML content
    html = res.text
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html, 'html5lib')
    # Remove unwanted elements like scripts, styles, and asides
    for script in soup(["script", "style", 'aside']):
        script.extract()
    # Extract the text content and join it into a single string
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Use the defined function to fetch content from a given URL (in this case, the New York Times trending page)
ny_bb = url_to_string('https://time.com/6899782/thailand-prime-minister-srettha-thavisin-business-hub/')

# Process the extracted text using the spaCy language model loaded previously
article = nlp(ny_bb)

# Calculate the number of named entities in the processed text
len(article.ents)

362

In [31]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (article) with entity visualization style and display it in Jupyter Notebook.
displacy.render(article, style='ent', jupyter=True)

In [32]:
# Import the spaCy library
import spacy

# Load the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Import Counter from collections module to count occurrences of each element
from collections import Counter

# Extract the labels of named entities from the processed text and count their occurrences
labels = [x.label_ for x in article.ents]
counter = Counter(labels)

# Print the count of each entity label
print(counter)

Counter({'PERSON': 74, 'ORG': 70, 'GPE': 63, 'DATE': 49, 'NORP': 36, 'CARDINAL': 24, 'LOC': 9, 'MONEY': 7, 'PRODUCT': 6, 'ORDINAL': 6, 'PERCENT': 6, 'TIME': 4, 'WORK_OF_ART': 3, 'EVENT': 2, 'QUANTITY': 2, 'FAC': 1})


In [33]:
# Extract the text of named entities from the processed text and count their occurrences
items = [x.text for x in article.ents]
counter_items = Counter(items)

# Get the five most common named entities and their counts
most_common_entities = counter_items.most_common(5)

# Print the result
print(most_common_entities)

[('Srettha', 36), ('Thailand', 29), ('Thai', 17), ('TIME', 9), ('Bangkok', 7)]


In [34]:
# Import the spaCy library
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Extract sentences containing named entities from the processed text
sentences = [x for x in article.ents]

# Print the 21st sentence containing named entities
print(sentences[1:13])

[TIMETIME, Our Ideas NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture of Work, TIMECouponsPersonal Finance, TIME, TIME, StampedJoin UsNewslettersSubscribeGive a, GiftShop, TIME, Cover StoreCustomer CareUS & CanadaGlobal Help CenterReach, RoomContact, PermissionsMoreAbout UsPrivacy PolicyYour, UseModern Slavery]


In [35]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the 21st sentence containing named entities with entity visualization style
displacy.render(nlp(str(sentences[1:13])), jupyter=True, style='ent')

In [36]:
# Extract non-stopword and non-punctuation tokens from the 21st sentence containing named entities
tokens_info = [(x.orth_, x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences[1:13])) if not y.is_stop and y.pos_ != 'PUNCT']]

# Print the tokens' orthographic form, part-of-speech, and lemma
print(tokens_info)

[('[', 'X', '['), ('TIMETIME', 'PROPN', 'TIMETIME'), ('Ideas', 'PROPN', 'Ideas'), ('NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture', 'PROPN', 'NewsletterSubscribeSubscribeSectionsHomeU.S.PoliticsWorldHealthClimateFuture'), ('Work', 'PROPN', 'Work'), ('TIMECouponsPersonal', 'PROPN', 'TIMECouponsPersonal'), ('Finance', 'PROPN', 'Finance'), ('TIME', 'PROPN', 'TIME'), ('TIME', 'PROPN', 'TIME'), ('StampedJoin', 'PROPN', 'StampedJoin'), ('UsNewslettersSubscribeGive', 'NOUN', 'usnewsletterssubscribegive'), ('GiftShop', 'PROPN', 'GiftShop'), ('TIME', 'PROPN', 'TIME'), ('Cover', 'PROPN', 'Cover'), ('StoreCustomer', 'PROPN', 'StoreCustomer'), ('CareUS', 'PROPN', 'CareUS'), ('&', 'CCONJ', '&'), ('CanadaGlobal', 'PROPN', 'CanadaGlobal'), ('Help', 'VERB', 'help'), ('CenterReach', 'PROPN', 'CenterReach'), ('RoomContact', 'PROPN', 'RoomContact'), ('PermissionsMoreAbout', 'PROPN', 'PermissionsMoreAbout'), ('UsPrivacy', 'PROPN', 'UsPrivacy'), ('PolicyYour', 'PROPN', 'Polic

In [37]:
# Import the displacy module from spaCy for dependency visualization.
from spacy import displacy

# Render the dependency parse of the 21st sentence containing named entities and display it in Jupyter Notebook.
# Adjust the distance between tokens for better visualization.
displacy.render(nlp(str(sentences[1:3])), style='dep', jupyter=True, options={'distance': 200})