# News Bias Indicator
This notebook applies the PyTorch classification model created in the News_Bias_Classification_Model notebook to classify the political bias of a provided article. After predicting the bias, it uses SerpAPI to search Google News for articles covering the same topic, scrapes them with Selenium, and presents them grouped by whether their predicted bias is similar to or different from that of the input article.

# 1: Setup Environment

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from serpapi import GoogleSearch
from urllib.parse import urlparse
from itables import init_notebook_mode
# Render every DataFrame output in this notebook as an interactive itables table
init_notebook_mode(all_interactive=True)

# Record whether a CUDA GPU is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Class id predicted by the model -> human-readable bias label
label_map = {
    0: "Left",
    1: "Center",
    2: "Right",
}

# Site whose domain google_search() excludes from the returned links
blocked_websites = "https://www.wsj.com/"

# Sites whose article text scrape_article() reads from
# <div data-testid="paragraph-..."> elements instead of <p> tags
diff_HTML = ["reuters.com"]

# Sites where scrape_article() first tries to dismiss a pop-up
popup_sites = ["cnn.com"]

In [None]:
import os

# Adjust Input URL to desired article
input_url = 'https://www.theguardian.com/business/2025/sep/05/us-jobs-report-august-tariffs'
# Adjust API Key: the SERPAPI_API_KEY environment variable is used when it is
# set (keeps the secret out of the notebook); otherwise paste the key below.
api_key = os.environ.get('SERPAPI_API_KEY') or 'Paste SERPAPI key here'

## Functions

In [None]:
def scrape_article(url, page_load_timeout=60, element_wait_timeout=30):
    """
    Open a URL in Chrome and pull out the headline and body text.

    Parameters:
    - url: str, the article URL to scrape
    - page_load_timeout: int, max seconds Selenium waits for the page to load
    - element_wait_timeout: int, max seconds to wait for each awaited element

    Returns a one-row DataFrame with columns 'url', 'title' and 'content'.
    'title' is None when no <h1> appears in time; both 'title' and 'content'
    are None when the scrape fails with an exception (the error is printed).
    The browser is always closed before returning.
    """
    def _url_matches(keys):
        # Case-insensitive substring match of any key against the URL
        return any(key.lower() in url.lower() for key in keys)

    browser = None
    try:
        uses_testid_paragraphs = _url_matches(diff_HTML)
        has_popup = _url_matches(popup_sites)

        # Start Chrome and load the page
        browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        browser.set_page_load_timeout(page_load_timeout)
        browser.get(url)

        if has_popup:
            # Try each known close button in turn; stop at the first one clicked.
            # If neither shows up, the pop-up is treated as absent.
            close_buttons = (
                (By.ID, "close-pc-btn-handler"),
                (By.CLASS_NAME, "onetrust-close-btn-handler"),
            )
            for locator in close_buttons:
                try:
                    button = WebDriverWait(browser, element_wait_timeout).until(
                        EC.element_to_be_clickable(locator)
                    )
                    button.click()
                    break
                except (TimeoutException, NoSuchElementException):
                    continue

        # Headline: text of the first <h1>, or None if it never appears
        try:
            heading = WebDriverWait(browser, element_wait_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "h1"))
            )
            title = heading.text
        except TimeoutException:
            title = None

        # Body text: listed sites use data-testid paragraph divs, the rest <p> tags
        if uses_testid_paragraphs:
            nodes = browser.find_elements(
                By.XPATH, "//div[starts-with(@data-testid,'paragraph-')]"
            )
        else:
            nodes = browser.find_elements(By.TAG_NAME, "p")
        content = " ".join(node.text for node in nodes)

        row = {"url": url, "title": title, "content": content}
        return pd.DataFrame([row])

    except (TimeoutException, WebDriverException) as e:
        print(f"Error scraping {url}: {e}")
        return pd.DataFrame([{"url": url, "title": None, "content": None}])

    except Exception as e:
        print(f"Unexpected error for {url}: {e}")
        return pd.DataFrame([{"url": url, "title": None, "content": None}])

    finally:
        # Close the browser even when scraping failed part-way through
        if browser is not None:
            browser.quit()

In [None]:
def predict_bias(texts, model, tokenizer, label_map=None):
    """
    Apply classification model to articles to determine bias

    Parameters:
    - texts: iterable of article texts; None entries are treated as ""
    - model: classification model; called with keyword tensors and expected
      to return an object exposing `.logits`
    - tokenizer: callable that turns the list of texts into a dict-like of
      PyTorch tensors
    - label_map: optional dict mapping predicted class ids to label names

    Returns a list with one prediction per text: label names when label_map
    is given, otherwise the integer class ids. An empty input gives [].
    """
    import inspect

    texts = ["" if t is None else str(t) for t in texts]
    if not texts:
        # Nothing to classify; avoid handing the tokenizer an empty batch
        return []

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    # Keep only the args that the model forward() actually supports.
    # inspect.signature follows functools.wraps-style decorators, whereas
    # forward.__code__.co_varnames would describe the wrapper (and also
    # lists local variables), which can filter out every input.
    accepted = inspect.signature(model.forward).parameters
    inputs = {k: v for k, v in inputs.items() if k in accepted}

    # Put the inputs on the model's device when the model reports one, so a
    # model that was moved to the GPU still works
    model_device = getattr(model, "device", None)
    if model_device is not None:
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_ids = torch.argmax(logits, dim=-1).tolist()

    if label_map:
        return [label_map[i] for i in pred_ids]
    return pred_ids

In [None]:
def google_search(search_query, api_key, df=None, num_results=250, max_unique=14):
    """
    Search Google News with SerpAPI for articles on the same event.
    Returns a list of links from unique publishers (domains), excluding blocked websites
    and the domain of the first URL in df.

    Domains are compared case-insensitively and without a leading "www.",
    so "www.example.com" and "example.com" count as one publisher.

    Parameters:
    - search_query: string, the query to search
    - api_key: string, SerpAPI key
    - df: optional pandas DataFrame with at least a 'url' column; the domain
      of its first row is excluded. None or an empty frame excludes nothing.
    - num_results: int, number of results to request
    - max_unique: int, maximum number of unique links to return

    Returns:
    - list of str, at most max_unique links, in the order SerpAPI returned them
    """
    def _publisher(link):
        # Accept bare domains ("wsj.com") as well as full URLs
        target = link if "//" in link else "//" + link
        host = urlparse(target).netloc.lower()
        return host[4:] if host.startswith("www.") else host

    params = {
        "q": search_query,
        "tbm": "nws",
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "google_domain": "google.com",
        "num": num_results,
        "api_key": api_key,
        "filter": 0
    }

    results = GoogleSearch(params).get_dict()

    # Surface API-side failures (e.g. an invalid key) instead of silently
    # returning no links
    if "error" in results:
        print(f"SerpAPI error: {results['error']}")

    seen_domains = set()

    # Exclude the publisher of the input article (first URL in df)
    if df is not None and len(df) > 0:
        seen_domains.add(_publisher(df['url'].iloc[0]))

    # Exclude blocked websites; the module-level setting may be a single URL
    # string or a collection of them
    if isinstance(blocked_websites, str):
        blocked = [blocked_websites]
    else:
        blocked = blocked_websites
    seen_domains.update(_publisher(site) for site in blocked)

    # Collect one link per previously unseen publisher
    links = []
    for res in results.get("news_results", []):
        # Check the limit before appending so max_unique=0 returns nothing
        if len(links) >= max_unique:
            break
        link = res.get("link")
        if not link:
            continue
        domain = _publisher(link)
        if domain in seen_domains:
            continue
        links.append(link)
        seen_domains.add(domain)

    return links

# 2: Webscrape Input Article

In [None]:
# Scrape the input article into a one-row DataFrame (url, title, content)
input_article = scrape_article(input_url)
# A failed or partial scrape leaves title and/or content as None. Replace
# those with empty strings so one missing field does not turn the whole text
# into NaN, which would then be classified as the string "nan".
input_article['text'] = (
    input_article['title'].fillna('') + " " + input_article['content'].fillna('')
)
X_articles = input_article['text'].astype(str).tolist()

In [None]:
# Load your saved model and tokenizer
save_path = "C:/Users/mouct/Downloads/best_bias_model"
model = AutoModelForSequenceClassification.from_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(save_path)

# Put the model in evaluation mode (disables dropout) before predicting
model.eval()

# Classify the input article
predictions = predict_bias(X_articles, model, tokenizer, label_map)
print(predictions)

In [None]:
# Attach the predicted bias label to the input article and show a summary
input_article = input_article.assign(bias=predictions)
summary_columns = ['url', 'title', 'bias']
print(input_article.loc[:, summary_columns])

# 3: Webscrape Alternative/Similar Articles

In [None]:
# Split the title into words, take the first 8, and join them back
# (a missing title, e.g. after a failed scrape, is treated as empty)
input_title = input_article['title'].iloc[0]
title_words = input_title.split() if isinstance(input_title, str) else []
first_8_words = " ".join(title_words[:8])
search_query = first_8_words + " coverage"
current_bias = input_article['bias'].iloc[0]
print(search_query)

In [None]:
# Pass the input article so google_search can exclude its publisher's domain
# (df is a required argument of google_search)
links = google_search(search_query, api_key, input_article)

print("Search results:")
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")

In [None]:
# Scrape every search result; each call returns a one-row DataFrame
dfs = [scrape_article(link) for link in links]
if dfs:
    alternative_articles = pd.concat(dfs, ignore_index=True)
else:
    # pd.concat raises on an empty list, so build an empty frame instead
    alternative_articles = pd.DataFrame(columns=['url', 'title', 'content'])

# Drop failed scrapes (missing title or content) before classifying, so the
# model is not run on articles that are discarded anyway
alternative_articles = alternative_articles.dropna().copy()

alternative_articles['text'] = (alternative_articles['title'].fillna('') + " " + alternative_articles['content'].fillna(''))
X_articles = alternative_articles['text'].astype(str).tolist()
# Skip the model call entirely when nothing is left to classify
predictions = predict_bias(X_articles, model, tokenizer, label_map) if X_articles else []
alternative_articles['bias'] = predictions

# Split into articles sharing the input article's bias and those that differ
same_bias = alternative_articles['bias'] == current_bias
similar_articles = alternative_articles[same_bias]
alternative_articles = alternative_articles[~same_bias]

# 4: Showcase All Articles

In [None]:
def _print_article(title, url, bias):
    # One article: title, URL, bias, then a blank separator line
    print("Title: ", title)
    print("URL: ", url)
    print("Bias: ", bias)
    print('')


def _print_section(header, frame):
    # Section header followed by every article row in the frame
    print(header)
    for title, url, bias in zip(frame['title'], frame['url'], frame['bias']):
        _print_article(title, url, bias)


# Input Article
print("**Input Article:")
_print_article(
    input_article['title'].iloc[0],
    input_article['url'].iloc[0],
    input_article['bias'].iloc[0],
)

# Blindspots: articles whose predicted bias differs from the input article's
_print_section("**Blindspot Article(s):", alternative_articles)

# Similar Bias: articles whose predicted bias matches the input article's
_print_section("**Similar Article(s):", similar_articles)