#Article_Feature_Extraction.ipynb
---
## Objective
The goal of **Article Feature Extraction** is to process an Excel file containing a column named `URL` with a list of article links. This process will generate a new Excel file that includes:

- **News Source**: Extracted from the URL to identify the originating website.
- **Header**: The main headline or title of the article.
- **Text**: The cleaned and concatenated body text of the article.
- **Authors**: A cleaned list of the article's authors.

This cleaned and structured dataset will be prepa

In [50]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import re
import spacy

In [51]:
file_name = "Articles_With_Text.xlsx"
col = "full_text"

In [52]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

def extract_entities_unique_string(text, entity_labels=["PRODUCT", "ORG", "GPE", "PERSON"], delimiter=','):
    """
    Function to extract unique specified entities from a text string
    and return them as a single string.

    Parameters:
    text (str): Unfiltered/normal text.
    entity_labels (list of str): List of entity labels to filter by.
    delimiter (str): Delimiter to use for joining the entities.

    Returns:
    str: A single string containing the unique extracted entities from the text.
    """
    # Create a spaCy Doc object from the input text
    doc = nlp(text)
    
    # Extract unique entities based on the specified labels
    unique_entities = set()
    for ent in doc.ents:
        if ent.label_ in entity_labels:
            unique_entities.add(ent.text.strip())
    
    # Join the unique entities using the specified delimiter
    unique_entities_string = delimiter.join(unique_entities)
    return unique_entities_string

In [53]:
def remove_extra_characters(text):
    '''
    Function that processes the extracted entities by removing possessive forms, converting to lowercase,
    and removing any additional punctuation, including bullet points and spaces used for indentation or listing.
    Ensures uniqueness of entities.
    
    Parameters:
    text (str): Extracted entities as a comma-separated string.
    
    Returns:
    unique_entities_string str: Processed, cleaned, and unique entities joined by a comma.
    '''
    # Define the delimiter
    delimiter = ','

    # Split the input string into a list of items based on commas
    items = text.split(',')
    
    cleaned_items = []
    
    for item in items:
        # Remove possessive forms 's
        item = item.replace("'s", "").replace("’s", "")
        # Remove bullet points and following spaces
        item = re.sub(r'[\u2022•]\s*', '', item)
        # Convert to lowercase
        item = item.lower()
        # Strip leading/trailing whitespace
        item = item.strip()
        cleaned_items.append(item)
    
    # Use a set to remove duplicates and convert it back to a list
    unique_items = list(set(cleaned_items))
    unique_entities_string = delimiter.join(unique_items)
    
    return unique_entities_string # Creating a custom list of stopwords

In [54]:
def extract_text(file):
    article_df = pd.read_excel(file)
    return article_df
text_df = extract_text(file_name)
text_df

Unnamed: 0,URL,News_Source,header,authors,full_text
0,https://www.nbcnews.com/politics/politics-news...,nbcnews,Newsom says California wildfires will be one o...,"Jacob Soboroff, Alexandra Marquez",California Gov. Gavin Newsom told NBC News’ “M...
1,https://www.nbcnews.com/news/world/magnitude-6...,nbcnews,Header not found,Astha Rajvanshi,A 6.6-magnitude earthquake has rattled the isl...
2,https://www.nbcnews.com/news/world/north-korea...,nbcnews,Header not found,"Stella Kim, Janis Mackey Frayer, Jennifer Jett","SEOUL, South Korea — About 300 North Korean tr..."
3,https://www.nbcnews.com/news/world/taliban-not...,nbcnews,Header not found,Astha Rajvanshi,Nobel Peace Prize laureate Malala Yousafzai de...
4,https://www.cnn.com/2025/01/13/middleeast/isra...,cnn,US officials say Gaza ceasefire deal is in sig...,"Abeer Salman, Kareem Khadder, Mike Schwartz, L...",American officials believe a ceasefire and hos...
5,https://www.cnn.com/2025/01/13/politics/pete-h...,cnn,Pete Hegseth says US military bases should res...,Andrew Kaczynski,"Pete Hegseth, President-elect Donald Trump’s p..."
6,https://www.cnn.com/2025/01/11/middleeast/leba...,cnn,Watershed moment for the Middle East after Leb...,Tamara Qiblawi,It was a last-minute push by Saudi Arabia that...


In [55]:
def extract_entities(df):
    df['entities'] = np.nan
    for index, row in df.iterrows(): 
        text = remove_extra_characters(row['full_text'])
        entities = extract_entities_unique_string(text)
        df.at[index, 'entities'] = entities
    return df

entites_df = extract_entities(text_df)       

  df.at[index, 'entities'] = entities


In [56]:
entites_df.to_excel("Entities_Extracted_From_Articles.xlsx", index = False)