#Article_Feature_Extraction.ipynb
---
## Objective
The goal of **Article Feature Extraction** is to process an Excel file containing a column named `URL` with a list of article links. This process will generate a new Excel file that includes:

- **News Source**: Extracted from the URL to identify the originating website.
- **Header**: The main headline or title of the article.
- **Text**: The cleaned and concatenated body text of the article.
- **Authors**: A cleaned list of the article's authors.

This cleaned and structured dataset will be prepa

In [58]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import re

In [59]:
def read_excel_file_return_url_dataframe(file):
    """
    Reads an Excel file and returns a DataFrame containing a column of URLs.

    Args:
        file (str): Path to the Excel file.

    Returns:
        pd.DataFrame: A DataFrame containing the data from the Excel file. The file must have a column labeled 'URL'.

    Raises:
        ValueError: If the file does not contain a column labeled 'URL'.
        Exception: For other file-related errors (e.g., file not found, incorrect format).

    Note:
        Ensure the Excel file contains a column labeled 'URL' with valid URL values before using this function.
    """
    try:
        # Read the Excel file into a DataFrame
        df = pd.read_excel(file)
        
        # Check if the 'URL' column exists
        if 'URL' not in df.columns:
            raise ValueError("The Excel file must contain a column labeled 'URL'.")
        
        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error


In [60]:
article_df = read_excel_file_return_url_dataframe('URL_Links_Folder/URL_Links.xlsx')

In [61]:
def sort_news_sites(article_df):
    """
    Sorts through the list of URL links and extracts the news agency source.

    This is important as each website has different HTML structures for their articles,
    requiring unique functions to extract features.

    Args:
        article_df (pd.DataFrame): A DataFrame containing a column 'URL' with article links.

    Returns:
        pd.DataFrame: The input DataFrame with a new column 'News_Source' containing the extracted news source.
    """
    # Add a new column to store the news source
    article_df['News_Source'] = article_df['URL'].apply(
        lambda article: re.search(r"https://www\.(.*?)\.com/", article).group(1) 
        if re.search(r"https://www\.(.*?)\.com/", article) else None
    )
    
    return article_df


In [62]:
sort_news_sites(article_df)
article_df

Unnamed: 0,URL,News_Source
0,https://www.nbcnews.com/politics/politics-news...,nbcnews
1,https://www.nbcnews.com/news/world/magnitude-6...,nbcnews
2,https://www.nbcnews.com/news/world/north-korea...,nbcnews
3,https://www.nbcnews.com/news/world/taliban-not...,nbcnews
4,https://www.cnn.com/2025/01/13/middleeast/isra...,cnn
5,https://www.cnn.com/2025/01/13/politics/pete-h...,cnn
6,https://www.cnn.com/2025/01/11/middleeast/leba...,cnn


In [63]:
def extract_article_features_NBC(url_command):
    """
    Extract the header, article body text, and authors from a news article.  Note this function has been created with the specifications of the NBC news website in mind.  I will need to create specific versions of this for each website since the layout of each news outlet's site is different.  

    Args:
        url_command (str): The URL of the news article.

    Returns:
        tuple: (header_text, full_text, authors)
            - header_text (str): The article's headline.
            - full_text (str): The concatenated text of all paragraphs in the article body.
            - authors (list): A list of authors' names.
    """
    try:
        # Fetch the HTML content
        response = requests.get(url_command)
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return None, None, None

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract header
        header = soup.find("h1", class_="article-hero-headline__htag lh-none-print black-print article-hero-headline__htag--live-breaking")
        header_text = header.text.strip() if header else "Header not found"

        # Extract article body
        article_body = soup.find("div", class_="article-body__content")
        full_text = (
            " ".join(p.text.strip() for p in article_body.find_all("p") if p.text.strip())
            if article_body else "Article body not found"
        )

        # Extract authors
        author_spans = soup.find_all("span", class_="byline-name expanded-byline__name")
        authors = [span.text.strip() for span in author_spans] or ["Authors not found"]

        return header_text, full_text, authors

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None, None


In [64]:
def extract_article_features_CNN(url_command):
    """
    Extract the header, article body text, and authors from a news article.  Note this function has been created with the specifications of the NBC news website in mind.  I will need to create specific versions of this for each website since the layout of each news outlet's site is different.  

    Args:
        url_command (str): The URL of the news article.

    Returns:
        tuple: (header_text, full_text, authors)
            - header_text (str): The article's headline.
            - full_text (str): The concatenated text of all paragraphs in the article body.
            - authors (list): A list of authors' names.
    """
    try:
        # Fetch the HTML content
        response = requests.get(url_command)
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return None, None, None

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract header
        header = soup.find("h1", class_="headline__text inline-placeholder vossi-headline-text")
        header_text = header.text.strip() if header else "Header not found"
        
        
        if header_text == "Header not found":
            new_header = re.search("")

        # Extract article body
        article_body = soup.find("div", class_="article__content")
        full_text = (
            " ".join(p.text.strip() for p in article_body.find_all("p") if p.text.strip())
            if article_body else "Article body not found"
        )

        # Extract authors
        author_spans = soup.find_all("span", class_="byline__name")
        authors = [span.text.strip() for span in author_spans] or ["Authors not found"]

        return header_text, full_text, authors

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None, None

In [65]:
def feature_extraction(article_df):
    """
    Extracts article features (header, authors, full_text) based on the News_Source column.

    Args:
        article_df (pd.DataFrame): DataFrame containing columns 'URL' and 'News_Source'.

    Returns:
        pd.DataFrame: The updated DataFrame with extracted features added as new columns.
    """
    # Add new columns for the extracted features
    article_df['header'] = np.nan
    article_df['authors'] = np.nan
    article_df['full_text'] = np.nan

    for index, row in article_df.iterrows():
        if row['News_Source'] == 'nbcnews':
            try:
                # Call the NBC-specific extraction function
                header, full_text, authors = extract_article_features_NBC(row['URL'])
                article_df.at[index, 'header'] = header
                article_df.at[index, 'full_text'] = full_text
                article_df.at[index, 'authors'] = ", ".join(authors) if authors else None
            except Exception as e:
                print(f"Error processing URL at index {index}: {e}")
                continue

        # Add additional conditions for other news sources as needed
        elif row['News_Source'] == 'cnn':
            try:
                # Call the NBC-specific extraction function
                header, full_text, authors = extract_article_features_CNN(row['URL'])
                article_df.at[index, 'header'] = header
                article_df.at[index, 'full_text'] = full_text
                article_df.at[index, 'authors'] = ", ".join(authors) if authors else None
            except Exception as e:
                print(f"Error processing URL at index {index}: {e}")
                continue
            

        # Handle unknown news sources
        else:
            print(f"News source '{row['News_Source']}' not recognized. Skipping index {index}.")

    return article_df

In [66]:
feature_extraction(article_df)

  article_df.at[index, 'header'] = header
  article_df.at[index, 'full_text'] = full_text
  article_df.at[index, 'authors'] = ", ".join(authors) if authors else None


Unnamed: 0,URL,News_Source,header,authors,full_text
0,https://www.nbcnews.com/politics/politics-news...,nbcnews,Newsom says California wildfires will be one o...,"Jacob Soboroff, Alexandra Marquez",California Gov. Gavin Newsom told NBC News’ “M...
1,https://www.nbcnews.com/news/world/magnitude-6...,nbcnews,Header not found,Astha Rajvanshi,A 6.6-magnitude earthquake has rattled the isl...
2,https://www.nbcnews.com/news/world/north-korea...,nbcnews,Header not found,"Stella Kim, Janis Mackey Frayer, Jennifer Jett","SEOUL, South Korea — About 300 North Korean tr..."
3,https://www.nbcnews.com/news/world/taliban-not...,nbcnews,Header not found,Astha Rajvanshi,Nobel Peace Prize laureate Malala Yousafzai de...
4,https://www.cnn.com/2025/01/13/middleeast/isra...,cnn,US officials say Gaza ceasefire deal is in sig...,"Abeer Salman, Kareem Khadder, Mike Schwartz, L...",American officials believe a ceasefire and hos...
5,https://www.cnn.com/2025/01/13/politics/pete-h...,cnn,Pete Hegseth says US military bases should res...,Andrew Kaczynski,"Pete Hegseth, President-elect Donald Trump’s p..."
6,https://www.cnn.com/2025/01/11/middleeast/leba...,cnn,Watershed moment for the Middle East after Leb...,Tamara Qiblawi,It was a last-minute push by Saudi Arabia that...


In [67]:
article_df.to_excel("Articles_With_Text.xlsx", index = False)