# Media Bias Detection

## Libraries Download

In [None]:
# pip install newsapi-python
# pip install readability-lxml
# pip install lxml-html-clean

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


## Data Retrival From News API

In [None]:
from newsapi import NewsApiClient
from useful_tools import UsefulTools  # Updated import

class ArticlesReceiver:
    """
    Retrieves news articles for given topics using NewsAPI,
    with optional caching to a local JSON file preserving full response structure.

    The structure of retrival:
    {
    "status": "ok",
    "totalResults": 624,
    "articles": [
        {
        "source": {
            "id": "wired",
            "name": "Wired"
        },
        "author": "Adrienne So",
        "title": "Garmin Vivoactive 6 Review: Reliable, Real Intelligence",
        "description": "New subscription service notwithstanding, Garmin's latest entry-level tracker is still reliable and attractive and works great.",
        "url": "https://www.wired.com/review/garmin-vivoactive-6/",
        "urlToImage": "https://media.wired.com/photos/6802d3bd68bf21be6e9c8d99/191:100/w_1280,c_limit/Garmin-Vivoactive_042025_Lede.jpg",
        "publishedAt": "2025-04-19T13:33:00Z",
        "content": "Garmin, the maker of our fav..."
        },
        {...}
    ]

    }

    """
    def __init__(self, api_key: str):
        self.newsapi = NewsApiClient(api_key=api_key)
        self.all_responses = []

    def retrieve(self, topics: list[str], cache_file: str = None, page_size: int = 100):
        """
        Retrieves full API responses for each topic. If cache_file is provided
        and valid, returns cached responses without querying NewsAPI.

        :param topics: List of query strings
        :param cache_file: Optional path to JSON cache file
        :param page_size: Number of articles per topic (max 100)
        :return: List of response dicts, each containing status, totalResults, and articles
        """
        if cache_file:
            cached = UsefulTools.JsonCache.load(cache_file, expected_type=list)
            if cached is not None:
                return cached

        self.all_responses = []
        for topic in topics:
            try:
                response = self.newsapi.get_everything(
                    q=topic,
                    language='en',
                    sort_by='relevancy',
                    page_size=page_size
                )
                print(f"[{topic}]: {len(response.get('articles', []))} articles retrieved.")
                self.all_responses.append(response)
            except Exception as e:
                print(f"Error fetching articles for '{topic}': {e}")

        if cache_file:
            UsefulTools.JsonCache.save(self.all_responses, cache_file)

        return self.all_responses

# Example usage:
api_key = 'e85eb7bdd52b4cf98fc954ce1c09c25a'
receiver = ArticlesReceiver(api_key)
topics = [
    "Climate Change",
    "Israel-Palestine Conflict",
    "U.S. Presidential Election 2024 Aftermath",
    "China-Taiwan Relations",
    "Immigration Policy",
    "AI in the Job Market",
    "Cryptocurrency Regulation",
    "Global Inflation Trends",
    "Tech Company Antitrust Investigations",
    "Green Energy Investment",
    "OpenAI and GPT-5 Developments",
    "AI in Healthcare",
    "SpaceX Mars Missions",
    "Quantum Computing Breakthroughs",
    "Cybersecurity Threats and Hacks",
     "NATO Expansion and Military Spending",
    "TikTok Ban in the United States",
    "Mental Health and Social Media",
    "Electric Vehicle Market Competition",
    "Nuclear Fusion Breakthroughs",
    "Remote Work and Hybrid Offices",
    "Digital Currency by Central Banks",
    "Gun Control Legislation in the US",
    "Amazon Labor Union Movements",
    "Rise of Right-Wing Populism in Europe",
    "China's Belt and Road Initiative",
    "India's Role in Global Tech",
    "Climate Protests and Youth Activism",
    "Global Water Scarcity",
    "Meta's Push into the Metaverse",
    "Biotech in Agriculture",
    "Drug Policy Reform and Legalization",
    "Western Sanctions on Russia",
    "AI Deepfakes and Disinformation",
    "Elections in African Democracies"
]
all_articles = receiver.retrieve(topics, cache_file='./data/train_data_35_5000_raw.json')
print(f"Total articles retrieved: {len(all_articles)}")

[Climate Change]: 100 articles retrieved.
[Israel-Palestine Conflict]: 95 articles retrieved.
[U.S. Presidential Election 2024 Aftermath]: 98 articles retrieved.
[China-Taiwan Relations]: 95 articles retrieved.
[Immigration Policy]: 100 articles retrieved.
[AI in the Job Market]: 99 articles retrieved.
[Cryptocurrency Regulation]: 98 articles retrieved.
[Global Inflation Trends]: 99 articles retrieved.
[Tech Company Antitrust Investigations]: 26 articles retrieved.
[Green Energy Investment]: 89 articles retrieved.
[OpenAI and GPT-5 Developments]: 12 articles retrieved.
[AI in Healthcare]: 99 articles retrieved.
[SpaceX Mars Missions]: 92 articles retrieved.
[Quantum Computing Breakthroughs]: 81 articles retrieved.
[Cybersecurity Threats and Hacks]: 35 articles retrieved.
[NATO Expansion and Military Spending]: 58 articles retrieved.
[TikTok Ban in the United States]: 96 articles retrieved.
[Mental Health and Social Media]: 99 articles retrieved.
[Electric Vehicle Market Competition]: 9

## Try: Use Redability and BeautifulSoup to Retrieve Complete Content

In [22]:
import requests
from readability import Document
from bs4 import BeautifulSoup

url = all_articles[0]['articles'][0]['url']

def fetch_full_text(url):
    html = requests.get(url, headers={'User-Agent':'bot'}).text
    doc = Document(html)
    # doc.summary() is the <div> of cleaned HTML
    content_html = doc.summary()
    # strip tags for plain text
    return BeautifulSoup(content_html, 'html.parser').get_text(separator="\n\n")

fetch_full_text(url)


'\n\n\nAlmost 40 years ago, deep in the Pacific, a single voice called out a song unlike any other. The sound reverberated through the depths at 52 Hertz, puzzling those listening to this solo ringing out from the oceanâ\x80\x99s symphony. The frequency was much higher than a blue whale or its cousin, the fin, leaving scientists to ponder the mystery of \n\nWhale 52\n\n.\n\n \n\nThe leviathan has been heard many times since, but never seen. Some suspect it might have some deformation that alters its voice. Others think it might simply exhibit a highly unusual vocalization â\x80\x94 a tenor among baritones. But Marine biologist John Calambokidis of Cascadia Research Collective suggests another possibility: â\x80\x9cThe loneliest whale,â\x80\x9d so named because there may be no one to respond to its unique call, may not be an anomaly, but a clue.\n\n \n\nCalambokidis, who has spent more than 50 years studying cetaceans, suspects Whale 52 may be a hybrid: Part blue whale, part fin whale.\

## Process Raw Data

In [None]:
import pandas as pd
import spacy
import requests
from readability import Document
from bs4 import BeautifulSoup
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

"""
We are going to save articles in a dataframe with the following columns:
[id, topic, source, title, description, sentence]

sentence: represents one sentence form the article
id: represents a unique number for each article

example: for article 1, we have 3 sentences, we will get:
1, topic1, source1, title1, description1, sentence1
1, topic1, source1, title1, description1, sentence2
1, topic1, source1, title1, description1, sentence3


ARTICLES_PER_SOURCE stabilizes the number of articles per source
"""

ARTICLES_PER_SOURCE = 30
N_SOURCE = 10
SENT_PER_SOURCE = 5000

class Df_Builder():
    def __init__(self):
        self.session = self.make_session()
        self.HEADERS = HEADERS = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/112.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
        }

        # 2) Same NLP setup
        self.nlp = spacy.load("en_core_web_sm")
        self.data = pd.DataFrame()
        self.sources = {}

    # 1) Configure a retry‑enabled session and realistic headers
    def make_session(self, retries=3, backoff=0.5):
        sess = requests.Session()
        retry = Retry(
            total=retries,
            backoff_factor=backoff,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        adapter = HTTPAdapter(max_retries=retry)
        sess.mount("http://", adapter)
        sess.mount("https://", adapter)
        return sess


    def get_full_article(self, url):
        try:
            resp = self.session.get(url, headers=self.HEADERS, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            # print(f"Failed to fetch {url}: {e}")
            return ""
        doc = Document(resp.text)
        content_html = doc.summary()
        return BeautifulSoup(content_html, 'html.parser') \
                .get_text(separator="\n\n")

    def sentences_tokenization(self, content):
        doc = self.nlp(content)
        return [sent.text.strip() for sent in doc.sents]

    def build(self, topics, all_articles, allowed_sources=[], cache_file: str = None):
        """
        Builds a DataFrame containing article sentence data.

        Parameters:
            topics (list): List of topic strings
            all_articles (list): List of response dicts from NewsAPI
            allowed_sources (list): Optional list of sources to include
            cache_file (str): Optional path to load/save the final DataFrame (as list of dicts)

        Returns:
            pd.DataFrame: DataFrame with columns [id, topic, source, title, description, sentence]
        """
        # Try loading from cache first
        if cache_file:
            cached_data = UsefulTools.JsonCache.load(cache_file, expected_type=list)
            if cached_data:
                print(f"Loaded processed data from cache: '{cache_file}'")
                self.data = pd.DataFrame(cached_data)
                return self.data

        rows = []
        article_id = 0
        n_source = 0

        for topic, articles in zip(topics, all_articles):
            visited_source = {}
            count_sent = 0
            count_articles = 0
            count_fails = 0

            for a in tqdm(articles['articles'], desc=f"[{topic}] Articles", unit="art", leave=False):
                source = a['source']['name']
                source_id = a['source']['id']
                author = a['author']

                if allowed_sources and source not in allowed_sources:
                    continue

                if visited_source.get(source, 0) > SENT_PER_SOURCE:
                    n_source += 1
                    continue
                
                if n_source >= N_SOURCE:
                    break

                full_content = self.get_full_article(a.get('url', ''))
                if not full_content:
                    count_fails += 1
                    continue
                
                source_count_sent = 0
                for sent in self.sentences_tokenization(full_content):
                    rows.append({
                        'id': article_id,
                        'topic': topic,
                        'source': source,
                        'title': a['title'],
                        'description': a['description'],
                        'sentence': sent,
                        'author': author
                    })
                    count_sent += 1
                    source_count_sent += 1
                count_articles += 1
                article_id += 1

                visited_source[source] = visited_source.get(source, 0) + source_count_sent
                
                if source not in self.sources:
                    self.sources[source] = source_id

            print(f"[{topic}]: {len(visited_source)} sources, {count_articles} articles/{len(articles['articles'])}, {count_sent} sentences, {count_fails} failed")

        self.data = pd.DataFrame(rows)

        print(f"\n Total: {len(topics)} topics, {article_id} articles, {len(self.data)} sentences")

        # Save to cache
        if cache_file:
            UsefulTools.JsonCache.save(self.data.to_dict(orient='records'), cache_file)

        return self.data



training_builder = Df_Builder()
training_data = training_builder.build(
    topics,
    all_articles,
    cache_file='./data/train_data_35_5000.json'
)


                                                                             

[Climate Change]: 25 sources, 98 articles/100, 3460 sentences, 2 failed


                                                                                      

[Israel-Palestine Conflict]: 49 sources, 91 articles/95, 3990 sentences, 4 failed


                                                                                                      

[U.S. Presidential Election 2024 Aftermath]: 11 sources, 98 articles/98, 3422 sentences, 0 failed


                                                                                   

[China-Taiwan Relations]: 45 sources, 89 articles/95, 3260 sentences, 6 failed


                                                                                 

[Immigration Policy]: 20 sources, 88 articles/100, 3600 sentences, 12 failed


                                                                                 

[AI in the Job Market]: 31 sources, 99 articles/99, 11726 sentences, 0 failed


                                                                                      

[Cryptocurrency Regulation]: 34 sources, 95 articles/98, 3328 sentences, 3 failed


                                                                                    

[Global Inflation Trends]: 27 sources, 99 articles/99, 5285 sentences, 0 failed


                                                                                                  

[Tech Company Antitrust Investigations]: 21 sources, 26 articles/26, 1563 sentences, 0 failed


                                                                                    

[Green Energy Investment]: 36 sources, 89 articles/89, 4249 sentences, 0 failed


                                                                                          

[OpenAI and GPT-5 Developments]: 8 sources, 11 articles/12, 8067 sentences, 1 failed


                                                                             

[AI in Healthcare]: 28 sources, 97 articles/99, 3592 sentences, 2 failed


                                                                                 

[SpaceX Mars Missions]: 42 sources, 88 articles/92, 5267 sentences, 4 failed


                                                                                            

[Quantum Computing Breakthroughs]: 48 sources, 79 articles/81, 5122 sentences, 2 failed


                                                                                            

[Cybersecurity Threats and Hacks]: 24 sources, 33 articles/35, 1823 sentences, 2 failed


                                                                                                 

[NATO Expansion and Military Spending]: 38 sources, 56 articles/58, 3512 sentences, 2 failed


                                                                                            

[TikTok Ban in the United States]: 64 sources, 96 articles/96, 4135 sentences, 0 failed


                                                                                           

[Mental Health and Social Media]: 36 sources, 97 articles/99, 6188 sentences, 2 failed


                                                                                                

[Electric Vehicle Market Competition]: 39 sources, 91 articles/94, 3472 sentences, 3 failed


                                                                                         

[Nuclear Fusion Breakthroughs]: 14 sources, 16 articles/17, 1785 sentences, 1 failed


                                                                                           

[Remote Work and Hybrid Offices]: 40 sources, 57 articles/57, 5722 sentences, 0 failed


                                                                                                

[Digital Currency by Central Banks]: 35 sources, 99 articles/100, 5114 sentences, 1 failed


                                                                                              

[Gun Control Legislation in the US]: 34 sources, 41 articles/42, 3796 sentences, 1 failed


                                                                                         

[Amazon Labor Union Movements]: 6 sources, 10 articles/10, 883 sentences, 0 failed


                                                                                                  

[Rise of Right-Wing Populism in Europe]: 16 sources, 17 articles/17, 1699 sentences, 0 failed


                                                                                             

[China's Belt and Road Initiative]: 32 sources, 93 articles/94, 3358 sentences, 1 failed


                                                                                        

[India's Role in Global Tech]: 18 sources, 73 articles/73, 3173 sentences, 0 failed


                                                                                              

[Climate Protests and Youth Activism]: 9 sources, 9 articles/9, 708 sentences, 0 failed


                                                                                  

[Global Water Scarcity]: 47 sources, 94 articles/95, 6523 sentences, 1 failed


                                                                                         

[Meta's Push into the Metaverse]: 3 sources, 3 articles/3, 91 sentences, 0 failed


                                                                                   

[Biotech in Agriculture]: 21 sources, 97 articles/98, 3659 sentences, 1 failed


                                                                                              

[Drug Policy Reform and Legalization]: 7 sources, 7 articles/8, 964 sentences, 1 failed


                                                                                        

[Western Sanctions on Russia]: 22 sources, 92 articles/95, 2670 sentences, 3 failed


                                                                                            

[AI Deepfakes and Disinformation]: 22 sources, 28 articles/30, 1794 sentences, 2 failed


                                                                                           

[Elections in African Democracies]: 3 sources, 5 articles/5, 631 sentences, 0 failed

 Total: 35 topics, 2261 articles, 127631 sentences
Saved data to cache file './data/train_data_35_5000.json'.


# Retrieve Data for Test

In [7]:
# Define test input
test_topics = [
    "AI in Criminal Justice",
    "Social Media Regulation",
    "Youth Unemployment in Europe",
    "U.S. Border Security Technology",
    "Ukraine-Russia War",
    "Nuclear Energy Debate",
    "Digital Privacy Laws",
    "Global Plastic Waste Trade",
    "Hollywood Streaming Wars",
    "Global Food Security",
    "5G Technology Expansion",
    "Deep Sea Mining Controversy",
    "Artificial Meat and Food Innovation",
    "Women's Rights in Iran",
    "Space Tourism Developments",
    "Child Labor in Global Supply Chains",
    "Facial Recognition and Surveillance",
    "Climate-Induced Migration",
    "Esports Industry Growth",
    "Education Inequality Post-Pandemic"
]

cache_path = './data/test_data_20_5000_raw.json'
# Retrieve articles
test_all_articles = receiver.retrieve(test_topics, cache_file=cache_path)

print(f"Total articles retrieved: {sum(len(resp['articles']) for resp in test_all_articles)}")


[AI in Criminal Justice]: 96 articles retrieved.
[Social Media Regulation]: 93 articles retrieved.
[Youth Unemployment in Europe]: 23 articles retrieved.
[U.S. Border Security Technology]: 99 articles retrieved.
[Ukraine-Russia War]: 100 articles retrieved.
[Nuclear Energy Debate]: 97 articles retrieved.
[Digital Privacy Laws]: 95 articles retrieved.
[Global Plastic Waste Trade]: 40 articles retrieved.
[Hollywood Streaming Wars]: 94 articles retrieved.
[Global Food Security]: 96 articles retrieved.
[5G Technology Expansion]: 100 articles retrieved.
[Deep Sea Mining Controversy]: 5 articles retrieved.
[Artificial Meat and Food Innovation]: 18 articles retrieved.
[Women's Rights in Iran]: 5 articles retrieved.
[Space Tourism Developments]: 43 articles retrieved.
[Child Labor in Global Supply Chains]: 34 articles retrieved.
[Facial Recognition and Surveillance]: 95 articles retrieved.
[Climate-Induced Migration]: 25 articles retrieved.
[Esports Industry Growth]: 50 articles retrieved.
[Ed

In [9]:
from useful_tools import UsefulTools
# Process test data
allowed_sources = training_builder.sources
print(f"Sources in training dataset include: {list(allowed_sources.keys())}")

ARTICLES_PER_SOURCE = 50
SENT_PER_SOURCE = 5000
test_builder = Df_Builder()
test_cache_file = './data/test_data_20_5000.json'

# Build and cache the processed test data
test_data = test_builder.build(
    test_topics,
    test_all_articles,
    allowed_sources=allowed_sources,
    cache_file=test_cache_file
)

# Final summary
print(f"Test data shape: {test_data.shape}")
test_data.head(3)  # preview first few rows


Sources in training dataset include: ['Gizmodo.com', 'BBC News', 'Slashdot.org', 'NPR', 'Wired', 'ABC News', 'Time', 'Yahoo Entertainment', 'The Verge', 'The New Republic', 'The Atlantic', 'Politicopro.com', 'Scientific American', 'Android Police', 'New Scientist', 'Business Insider', 'The Next Web', 'CNET', 'Android Central', 'Science Daily', 'Rolling Stone', 'Vox', 'Space.com', 'Phys.Org', 'Al Jazeera English', 'MetroWest Daily News', 'World Politics Review', '/FILM', 'Deadline', 'Just Jared', 'Kasurian.com', 'IndieWire', 'Raw Story', 'Practical Ecommerce', 'New York Post', 'Juancole.com', 'Naturalnews.com', 'Forbes', 'Nakedcapitalism.com', 'The Cut', 'RT', 'NBC News', 'Dazed', 'Inside Higher Ed', 'Knowyourmeme.com', 'CNA', 'The New York Review of Books', 'International Business Times', 'Democracy Now!', 'Mondediplo.com', 'The Intercept', 'Sputnikglobe.com', 'TheWrap', 'Israelnationalnews.com', 'Loudwire', 'CounterPunch', 'Cosmopolitan.com', 'USA Today', 'Thenation.com', 'Newsweek', 

                                                                                   

[AI in Criminal Justice]: 49 sources, 81 articles/96, 5110 sentences, 0 failed


                                                                                    

[Social Media Regulation]: 44 sources, 82 articles/93, 5620 sentences, 1 failed


                                                                                         

[Youth Unemployment in Europe]: 13 sources, 17 articles/23, 1881 sentences, 1 failed


                                                                                            

[U.S. Border Security Technology]: 37 sources, 88 articles/99, 5457 sentences, 0 failed


                                                                                 

[Ukraine-Russia War]: 12 sources, 96 articles/100, 3953 sentences, 3 failed


                                                                                  

[Nuclear Energy Debate]: 45 sources, 83 articles/97, 7065 sentences, 2 failed


                                                                                 

[Digital Privacy Laws]: 35 sources, 72 articles/95, 4432 sentences, 0 failed


                                                                                       

[Global Plastic Waste Trade]: 17 sources, 33 articles/40, 3949 sentences, 0 failed


                                                                                     

[Hollywood Streaming Wars]: 22 sources, 41 articles/94, 2664 sentences, 2 failed


                                                                                 

[Global Food Security]: 32 sources, 87 articles/96, 4954 sentences, 0 failed


                                                                                      

[5G Technology Expansion]: 20 sources, 86 articles/100, 8884 sentences, 0 failed


                                                                                      

[Deep Sea Mining Controversy]: 5 sources, 5 articles/5, 496 sentences, 0 failed


                                                                                                

[Artificial Meat and Food Innovation]: 12 sources, 17 articles/18, 2586 sentences, 0 failed


                                                                                 

[Women's Rights in Iran]: 4 sources, 5 articles/5, 304 sentences, 0 failed


                                                                                       

[Space Tourism Developments]: 19 sources, 35 articles/43, 2878 sentences, 0 failed


                                                                                                

[Child Labor in Global Supply Chains]: 18 sources, 23 articles/34, 3199 sentences, 1 failed


                                                                                                

[Facial Recognition and Surveillance]: 49 sources, 79 articles/95, 3954 sentences, 3 failed


                                                                                      

[Climate-Induced Migration]: 12 sources, 20 articles/25, 9374 sentences, 0 failed


                                                                                    

[Esports Industry Growth]: 12 sources, 42 articles/50, 2255 sentences, 0 failed


                                                                                               

[Education Inequality Post-Pandemic]: 8 sources, 12 articles/16, 3479 sentences, 0 failed

 Total: 20 topics, 1004 articles, 82494 sentences
Saved data to cache file './data/test_data_20_5000.json'.
Test data shape: (82494, 7)


Unnamed: 0,id,topic,source,title,description,sentence,author
0,0,AI in Criminal Justice,The Verge,This ICE-snitching app is actually promoting a...,Right-wing influencers are shilling an app tha...,Right-wing influencers are shilling an app tha...,Gaby Del Valle
1,0,AI in Criminal Justice,The Verge,This ICE-snitching app is actually promoting a...,Right-wing influencers are shilling an app tha...,"ICERAID, a “GovFi protocol that delegates inte...",Gaby Del Valle
2,0,AI in Criminal Justice,The Verge,This ICE-snitching app is actually promoting a...,Right-wing influencers are shilling an app tha...,"“It’s like a citizen’s arrest, but with Wi-Fi,...",Gaby Del Valle


In [29]:
# check the sources in test dataset are in training dataset
set(test_builder.sources.keys()).issubset(set(allowed_sources.keys()))

True