# Media Bias Detection

## Libraries Download

In [None]:
pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [None]:
pip install readability-lxml

Collecting readability-lxml
  Downloading readability_lxml-0.8.1-py3-none-any.whl.metadata (3.6 kB)
Collecting cssselect (from readability-lxml)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Downloading readability_lxml-0.8.1-py3-none-any.whl (20 kB)
Downloading cssselect-1.3.0-py3-none-any.whl (18 kB)
Installing collected packages: cssselect, readability-lxml
Successfully installed cssselect-1.3.0 readability-lxml-0.8.1


In [None]:
pip install lxml-html-clean



## Data Retrieval From News API

In [None]:
import os
import json
from newsapi import NewsApiClient
"""
https://newsapi.org/docs/endpoints/everything
In this link you find description of parameters of newsapi.get_everything
"""



class ArticlesReceiver:
    """
    Retrieves news articles for given topics using NewsAPI,
    with optional caching to a local JSON file preserving full response structure.

    The structure of one retrieved response (one per topic):
    {
    "status": "ok",
    "totalResults": 624,
    "articles": [
        {
        "source": {
            "id": "wired",
            "name": "Wired"
        },
        "author": "Adrienne So",
        "title": "Garmin Vivoactive 6 Review: Reliable, Real Intelligence",
        "description": "New subscription service notwithstanding, Garmin’s latest entry-level tracker is still reliable and attractive and works great.",
        "url": "https://www.wired.com/review/garmin-vivoactive-6/",
        "urlToImage": "https://media.wired.com/photos/6802d3bd68bf21be6e9c8d99/191:100/w_1280,c_limit/Garmin-Vivoactive_042025_Lede.jpg",
        "publishedAt": "2025-04-19T13:33:00Z",
        "content": "Garmin, the maker of our fav..."
        },
        {...}
    ]

    }

    Caching caveat: the cache file is not keyed by topic. Whatever list of
    responses the file contains is returned as-is, so use a separate cache
    file for every distinct topic list.
    """
    def __init__(self, api_key: str):
        self.newsapi = NewsApiClient(api_key=api_key)
        # Full responses (one per topic) from the most recent retrieve() call
        self.all_responses = []

    def _load_from_file(self, filename: str):
        """
        Load full response list from cache file if present and valid.

        :param filename: Path to the JSON cache file
        :return: List of responses, or None if the file is missing,
                 unreadable, not valid JSON, or does not contain a list
        """
        if not os.path.exists(filename):
            return None
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
            print(f"Failed to load cache file '{filename}': {e}")
            return None
        if not isinstance(data, list):
            print(f"Cache file '{filename}' is invalid: expected a list of response objects.")
            return None
        print(f"Loaded {len(data)} topic responses from cache file '{filename}'.")
        return data

    def _save_to_file(self, responses: list, filename: str):
        """
        Save the list of full response dicts to a JSON file.

        I/O errors are reported on stdout and otherwise ignored, so a failed
        save never discards responses that were already retrieved.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(responses, f, ensure_ascii=False, indent=2)
            print(f"Saved {len(responses)} topic responses to file '{filename}'.")
        except OSError as e:
            print(f"Failed to save responses to '{filename}': {e}")

    def retrieve(self, topics: list[str], cache_file: str = None, page_size: int = 100):
        """
        Retrieves full API responses for each topic. If cache_file is provided
        and holds a non-empty list, returns the cached responses without
        querying NewsAPI.

        Topics whose request fails are reported and skipped, so the returned
        list can be shorter than `topics`. In that case nothing is written to
        the cache: a partial (or empty) cache would otherwise be served on
        every later call and would no longer line up with `topics`.

        :param topics: List of query strings
        :param cache_file: Optional path to JSON cache file
        :param page_size: Number of articles per topic (max 100)
        :return: List of response dicts, each containing status, totalResults, and articles
        """
        if cache_file:
            cached = self._load_from_file(cache_file)
            # An empty list carries no data (e.g. written by a run in which
            # every request failed); treat it as a cache miss.
            if cached:
                self.all_responses = cached
                return cached

        self.all_responses = []
        failed_topics = []
        for topic in topics:
            try:
                response = self.newsapi.get_everything(
                    q=topic,
                    language='en',
                    sort_by='relevancy',
                    page_size=page_size
                )
                print(f"[{topic}]: {len(response.get('articles', []))} articles retrieved.")
                self.all_responses.append(response)
            except Exception as e:
                # Best effort: keep going with the remaining topics
                print(f"Error fetching articles for '{topic}': {e}")
                failed_topics.append(topic)

        if cache_file:
            if failed_topics:
                print(f"Not caching to '{cache_file}': retrieval failed for {failed_topics}.")
            elif self.all_responses:
                self._save_to_file(self.all_responses, cache_file)

        return self.all_responses

# Example usage:
# The API key is read from the NEWSAPI_KEY environment variable so that the
# secret is never stored in the notebook. A key that was ever committed in
# plain text should be revoked and regenerated.
api_key = os.environ.get('NEWSAPI_KEY', '')
if not api_key:
    print("NEWSAPI_KEY is not set: only cached responses can be used, live requests will fail.")
receiver = ArticlesReceiver(api_key)
topics = ['climate change', 'Artificial Intelligence', 'Donald Trump']
# One full API response per topic (not a flat list of articles)
all_articles = receiver.retrieve(topics, cache_file='training_articles.json')
print(f"Topic responses retrieved: {len(all_articles)}")
print(f"Total articles retrieved: {sum(len(r.get('articles', [])) for r in all_articles)}")


Loaded 3 topic responses from cache file 'training_articles.json'.
Total articles retrieved: 3


## Try: Use Readability and BeautifulSoup to Retrieve Complete Content

In [None]:
import requests
from readability import Document
from bs4 import BeautifulSoup

# URL of the first article in the first topic response
url = all_articles[0]['articles'][0]['url']

def fetch_full_text(url, timeout=10):
    """
    Download a page and return the plain text of its main content.

    :param url: Address of the article page
    :param timeout: Seconds to wait for the server before giving up;
                    without it a stalled server would block the cell forever
    :return: Text of the readability summary, blocks separated by blank lines
    :raises requests.exceptions.RequestException: on connection problems,
            timeouts, or a 4xx/5xx response
    """
    resp = requests.get(url, headers={'User-Agent':'bot'}, timeout=timeout)
    # Fail loudly instead of extracting "content" from an error page
    resp.raise_for_status()
    doc = Document(resp.text)
    # doc.summary() is the <div> of cleaned HTML
    content_html = doc.summary()
    # strip tags for plain text
    return BeautifulSoup(content_html, 'html.parser').get_text(separator="\n\n")

fetch_full_text(url)


'A growing body of evidence points to mounting health risks posed by climate change. Despite this, it seems the National Institutes of Health (NIH) will quit funding that kind of research. \n\nProPublica\n\n first reported the news\n\n on Monday, citing internal records that the investigative outlet viewed.\n\nNIH gave directions to staff members last week that likely puts a stop to any more funding for new academic programs or research into the impacts climate change has on health, according to \n\nProPublica\n\n. It remains to be seen whether the new directive will affect active grants. But it follows news last month \n\nfrom \n\nMother Jones\n\n that the Department of Health and Human Services halted funding for three existing climate and health programs at NIH. That includes the \n\nClimate Change and Health Initiative\n\n launched in 2021 that supported research into the health effects of wildfires, heat stress, and virus transmissions through mosquitoes, to name a few projects. \

## Process Raw Data

In [None]:
import pandas as pd
import spacy
import requests
from readability import Document
from bs4 import BeautifulSoup
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

"""
We are going to save articles in a dataframe with the following columns:
[id, topic, source, title, description, sentence]

sentence: represents one sentence from the article
id: represents a unique number for each article

example: for article 1, we have 3 sentences, we will get:
1, topic1, source1, title1, description1, sentence1
1, topic1, source1, title1, description1, sentence2
1, topic1, source1, title1, description1, sentence3


ARTICLES_PER_SOURCE limits the number of articles taken from each source (counted separately for every topic)
"""

ARTICLES_PER_SOURCE = 5

class Df_Builder():
    """
    Builds a sentence-level DataFrame from NewsAPI responses.

    For every accepted article the full page is downloaded, reduced to its
    main text, split into sentences, and stored as one row per sentence.

    Attributes:
        session: requests session with retry support, used for all downloads
        HEADERS: HTTP headers sent with every download
        nlp:     spaCy pipeline used for sentence splitting
        data:    DataFrame produced by the last build() call
        sources: {source name: source id} of every source that contributed
                 at least one article (the id may be None)
    """

    # Column order of the frame produced by build(); also applied when no
    # rows were collected, so that downstream code always finds its columns.
    COLUMNS = ['id', 'topic', 'source', 'title', 'description', 'sentence']

    def __init__(self):
        self.session = self.make_session()
        self.HEADERS = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/112.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
        }

        self.nlp = spacy.load("en_core_web_sm")
        self.data = pd.DataFrame()
        self.sources = {}

    def make_session(self, retries=3, backoff=0.5):
        """
        Create a requests session that retries GET requests.

        :param retries: Maximum number of retries per request
        :param backoff: Backoff factor between retries
        :return: Session with the retry adapter mounted for http and https
        """
        sess = requests.Session()
        retry = Retry(
            total=retries,
            backoff_factor=backoff,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        adapter = HTTPAdapter(max_retries=retry)
        sess.mount("http://", adapter)
        sess.mount("https://", adapter)
        return sess


    def get_full_article(self, url):
        """
        Download `url` and return the plain text of its main content.

        :return: Extracted text, or "" when the page cannot be fetched or parsed
        """
        try:
            resp = self.session.get(url, headers=self.HEADERS, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException:
            return ""
        try:
            content_html = Document(resp.text).summary()
        except Exception:
            # NOTE(review): readability can raise on pages it cannot parse
            # (e.g. an empty body). One bad page should count as a failed
            # article, not abort the whole build.
            return ""
        return BeautifulSoup(content_html, 'html.parser') \
                .get_text(separator="\n\n")

    def sentences_tokenization(self, content):
        """
        Split `content` into sentences with the spaCy pipeline.

        :return: List of stripped sentence strings; spans that are empty
                 after stripping are dropped
        """
        doc = self.nlp(content)
        stripped = (sent.text.strip() for sent in doc.sents)
        return [text for text in stripped if text]

    def build(self,topics,all_articles, allowed_sources = None):
        """
        Build the sentence-level DataFrame.

        :param topics: Topic names, in the same order as `all_articles`
        :param all_articles: One NewsAPI response dict per topic; only its
                             'articles' list is read
        :param allowed_sources: Optional container of source names (list, set
                                or dict keyed by name). When non-empty, articles
                                from any other source are skipped.
        :return: DataFrame with columns
                 [id, topic, source, title, description, sentence];
                 also stored in self.data
        """
        if len(topics) != len(all_articles):
            # zip() below stops at the shorter input without complaint
            print(f"Warning: {len(topics)} topics but {len(all_articles)} responses; "
                  "extra items are ignored and topic labels may be misaligned.")

        rows = []
        article_id = 0

        for topic, articles in zip(topics, all_articles):
            # Articles attempted per source for this topic (failed downloads
            # count as well, which bounds the requests sent to one source)
            visited_source = {}
            count_sent = 0
            count_articles = 0
            count_fails = 0
            for a in tqdm(articles['articles'],
                        desc=f"[{topic}] Articles",
                        unit="art",
                        leave=False):
                source = a['source']['name']
                source_id = a['source']['id']

                # For test data building only
                # TODO move this functionality into request (I encountered source ID = None, ex: Gizmodo.com'id is None)
                if allowed_sources and source not in allowed_sources:
                    continue

                # ">=" so that at most ARTICLES_PER_SOURCE articles are taken
                if visited_source.get(source, 0) >= ARTICLES_PER_SOURCE:
                    continue

                visited_source[source] = visited_source.get(source, 0) + 1

                full_content = self.get_full_article(a.get('url') or '')
                if not full_content:
                    count_fails += 1
                    continue  # skip if we couldn't fetch

                for sent in self.sentences_tokenization(full_content):
                    rows.append({
                        'id': article_id,
                        'topic': topic,
                        'source': source,
                        'title': a['title'],
                        'description': a['description'],
                        'sentence': sent
                    })
                    count_sent += 1
                count_articles += 1
                article_id += 1

                # Only sources that actually contributed an article are recorded
                if source not in self.sources:
                    self.sources[source] = source_id

            print(f"[{topic}]: {len(visited_source)} sources, {count_articles} articles (total: {len(articles['articles'])}), {count_sent} sentences,  {count_fails} failed")

        self.data = pd.DataFrame(rows, columns=self.COLUMNS)
        print(f"\n Total: {len(topics)} topics, {article_id} articles, {len(self.data)} sentences")
        return self.data


# Build the sentence-level training frame from the responses retrieved above.
# build() also fills training_builder.sources, which is reused later to
# restrict the test set to sources seen in training.
training_builder = Df_Builder()
training_data = training_builder.build(topics,all_articles)
# training_data




[climate change]: 27 sources, 75 articles (total: 100), 2696 sentences,  4 failed




[Artificial Intelligence]: 30 sources, 72 articles (total: 92), 2596 sentences,  6 failed


                                                                         

[Donald Trump]: 8 sources, 32 articles (total: 91), 1113 sentences,  1 failed

 Total: 3 topics, 179 articles, 6405 sentences




## Classification

### Retrieve Data for Test

In [None]:
# test data retrieval
test_topics = ["tax"]
# A separate cache file is required: the cache is not keyed by topic
test_all_articles = receiver.retrieve(test_topics, cache_file='test_articles.json')
# Report the test responses (not the training ones) and count articles, not responses
print(f"Total articles retrieved: {sum(len(r.get('articles', [])) for r in test_all_articles)}")

[tax]: 95 articles retrieved.
Saved 1 topic responses to file 'test_articles.json'.
Total articles retrieved: 3


In [None]:
# process raw data
# allowed_sources is the {source name: source id} dict collected while
# building the training frame; build() only tests membership of the name,
# so test articles from sources unseen in training are skipped.
allowed_sources = training_builder.sources
print(f'sources in training dataset include: {allowed_sources}')
test_builder = Df_Builder()
test_data = test_builder.build(test_topics,test_all_articles,allowed_sources)

sources in training dataset include: {'The Verge': 'the-verge', 'Gizmodo.com': None, 'BBC News': None, 'Slashdot.org': None, 'NPR': None, 'Time': 'time', 'Wired': 'wired', 'The Atlantic': None, 'Politicopro.com': None, 'Scientific American': None, 'Android Police': None, 'Business Insider': 'business-insider', 'CNET': None, 'Science Daily': None, 'Yanko Design': None, 'Yahoo Entertainment': None, 'ABC News': 'abc-news', 'Phys.Org': None, 'Al Jazeera English': 'al-jazeera-english', 'Nakedcapitalism.com': None, 'The Next Web': 'the-next-web', 'Fox News': 'fox-news', 'CNN': 'cnn', 'ArchDaily': None, 'Vox': None, 'Grist': None, 'Android Central': None, 'MacRumors': None, 'Digital Trends': None, 'The New Yorker': None, 'Harvard Business Review': None, '9to5Mac': None, 'Space.com': None, 'Internet': None, 'Ietf.org': None, 'Techdirt': None, 'Openculture.com': None, 'AppleInsider': None, 'Computerhistory.org': None, 'kottke.org': None}


                                                                

[tax]: 12 sources, 53 articles (total: 95), 1892 sentences,  2 failed

 Total: 1 topics, 53 articles, 1892 sentences




In [None]:
# check the sources in test dataset are in training dataset
set(test_builder.sources.keys()).issubset(set(allowed_sources.keys()))

## Try 1


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ─── 1. Config & Hyperparameters ───────────────────────────────────────────────

# Checkpoint name passed to both the tokenizer and the classification model
MODEL_NAME        = "bert-base-uncased"
# Token length every sentence is padded/truncated to by SentenceDataset
MAX_LEN           = 128
TRAIN_BATCH_SIZE  = 16
EVAL_BATCH_SIZE   = 32
NUM_EPOCHS        = 3
# Learning rate handed to AdamW
LEARNING_RATE     = 2e-5
# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE            = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ─── 2. Load & Encode Data ─────────────────────────────────────────────────────

# Work on copies so the frames built earlier stay untouched
train_df = training_data.copy()   # must have 'sentence' & 'source'
test_df  = test_data.copy()

# The classification target is the news source: source names are mapped to
# integer labels fitted on the training frame only.
le = LabelEncoder()
train_df["label"] = le.fit_transform(train_df["source"])
# NOTE(review): transform() is expected to fail on a source name that was not
# seen during fit — the test frame must contain training sources only.
test_df ["label"] = le.transform(test_df["source"])
# Number of distinct sources = size of the classifier head
NUM_LABELS = len(le.classes_)

# ─── 3. Custom Dataset ──────────────────────────────────────────────────────────

class SentenceDataset(Dataset):
    """
    Map-style dataset serving one tokenized sentence with its label per index.

    :param df: Frame providing a 'sentence' column (text) and a 'label'
               column (integer class id)
    :param tokenizer: Callable invoked as
                      tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")
    :param max_len: Length every sentence is padded/truncated to
    """

    def __init__(self, df, tokenizer, max_len):
        # Plain Python lists make per-index access cheap and framework-free
        self.sentences = df["sentence"].tolist()
        self.labels    = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Tokenize the sentence at `idx`.

        :return: Dict with 'input_ids' and 'attention_mask' (leading batch
                 dimension squeezed away) and 'labels' as a long tensor
        """
        sentence = self.sentences[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop that dimension
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

# ─── 4. Prepare Tokenizer, Model, DataLoaders ──────────────────────────────────

# Tokenizer and classifier come from the same checkpoint; the classification
# head is sized to the number of encoded source labels.
tokenizer   = AutoTokenizer.from_pretrained(MODEL_NAME)
model       = AutoModelForSequenceClassification.from_pretrained(
                  MODEL_NAME,
                  num_labels=NUM_LABELS
              ).to(DEVICE)

train_ds    = SentenceDataset(train_df, tokenizer, MAX_LEN)
test_ds     = SentenceDataset(test_df,  tokenizer, MAX_LEN)
# Only the training loader shuffles; evaluation keeps the frame's row order
train_loader = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
eval_loader  = DataLoader(test_ds,  batch_size=EVAL_BATCH_SIZE)
optimizer    = AdamW(model.parameters(), lr=LEARNING_RATE)

# ─── 5. Training Loop with tqdm ────────────────────────────────────────────────

# Put the model in training mode once; nothing below switches it back
model.train()
for epoch in range(1, NUM_EPOCHS + 1):
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS}", unit="batch")
    total_loss = 0.0
    for batch in loop:
        # Move the batch to the training device
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["labels"].to(DEVICE)

        # Forward pass; passing labels makes the model return the loss
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    print(f"→ Epoch {epoch} avg loss: {total_loss/len(train_loader):.4f}")

# ─── 6. Evaluation: Sentence‑Level ─────────────────────────────────────────────

# Inference mode: evaluation layers only, no gradient tracking
model.eval()
all_logits, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(eval_loader, desc="Evaluating", unit="batch"):
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["labels"].to(DEVICE)

        # No labels are passed here, only the raw class scores are needed
        logits = model(input_ids, attention_mask=attention_mask).logits

        # Collect per-batch results on the CPU as numpy arrays
        all_logits.append(logits.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack the per-batch arrays into one array per quantity
all_logits = np.concatenate(all_logits, axis=0)
all_labels = np.concatenate(all_labels)

# Predicted class = index of the highest logit in each row
sent_preds = all_logits.argmax(axis=1)
acc = accuracy_score(all_labels, sent_preds)
pr, rec, f1, _ = precision_recall_fscore_support(
    all_labels,
    sent_preds,
    average="weighted",
)

print(f"\nSentence‑level results → Accuracy: {acc:.4f}, F1: {f1:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3:   0%|          | 0/401 [00:00<?, ?batch/s]

KeyboardInterrupt: 