# MBFC Enrichment Notebook

This notebook enriches information sources with Media Bias Fact Check (MBFC) data by scraping bias and factual reporting ratings.

## Purpose
Automatically fetch MBFC bias and factual ratings for sources in the Google Sheet and update the sheet with this information.

## Requirements
- **Credentials**: `credentials.json` file in the root directory (Google service account)
- **Dependencies**: beautifulsoup4, requests, google-api-python-client, google-auth (provides `google.oauth2`)
- **Sheet Columns**: The sheet must have `mbfc_bias` and `mbfc_factual` columns, plus `name` and `url` columns; rows missing a name or URL are skipped

## How it works
1. Connects to Google Sheets and loads source data
2. For each source that is missing its MBFC bias or factual rating:
   - Looks for the source's page on mediabiasfactcheck.com by trying URL slugs derived from its name and its domain
   - Extracts bias rating and factual reporting rating
   - Updates the Google Sheet with the findings
3. Applies rate limiting to avoid overwhelming MBFC servers

In [None]:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import re
from typing import Optional, Tuple

In [None]:
# Configuration
# Path to the Google service-account key file used to build credentials.
SERVICE_ACCOUNT_FILE = "credentials.json"
# ID of the spreadsheet that is read from and written to.
SPREADSHEET_ID = "1NywRL9IBR69R0eSrOE9T6mVUbfJHwaALL0vp2K0TLbY"
# A1-notation range loaded by load_sheet_data (columns A-H of the "main" tab).
# The first row of this range is treated as the header row.
SHEET_RANGE = "main!A:H"
# OAuth scope requested for the service account; the same credentials are used
# for both loading and updating the sheet.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

# MBFC Configuration
# Base URL to which candidate page slugs are appended.
MBFC_BASE_URL = "https://mediabiasfactcheck.com/"
# Pause applied after each processed source to rate-limit requests.
DELAY_BETWEEN_REQUESTS = 2.0  # seconds

In [None]:
def extract_domain(url: str) -> str:
    """
    Extract the bare host name from a URL, without any ``www.`` prefix.

    Handles URLs written without a scheme (``example.com/news``) by parsing
    them as network locations, so the path is never returned as part of the
    domain. Port numbers and ``user:pass@`` credentials are dropped and the
    host is lower-cased.

    Args:
        url: Full URL string, with or without a scheme

    Returns:
        Host name without www prefix, or an empty string if none can be
        determined
    """
    try:
        candidate = url.strip()
        # Without a scheme, urlparse puts everything into ``path``. Prefixing
        # "//" makes it treat the leading part as the network location instead.
        if not re.match(r'[a-zA-Z][a-zA-Z0-9+.\-]*://', candidate):
            candidate = '//' + candidate.lstrip('/')
        # ``hostname`` is already lower-cased and excludes port and userinfo.
        host = urlparse(candidate).hostname or ""
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed URL (e.g. a broken IPv6 literal).
        return ""

    # Remove www prefix
    if host.startswith('www.'):
        host = host[4:]
    return host


def search_mbfc(source_name: str, source_url: str) -> Optional[str]:
    """
    Locate a source's MBFC page by probing candidate URL slugs.

    Two slugs are tried in order: one derived from the source name
    (lower-cased, punctuation removed, whitespace turned into hyphens) and
    one derived from the source's domain with dots replaced by hyphens. A
    candidate is accepted when the page returns HTTP 200 and its body
    contains the text ``Bias Rating:``.

    Args:
        source_name: Name of the source
        source_url: URL of the source

    Returns:
        MBFC page URL if found, None otherwise
    """
    # Convert source name to slug format. Runs of whitespace/hyphens collapse
    # to a single hyphen, and stray hyphens at either end are trimmed so a
    # name like "News -" does not produce the slug "news-".
    name_slug = source_name.lower().strip()
    name_slug = re.sub(r'[^a-z0-9\s-]', '', name_slug)
    name_slug = re.sub(r'[\s-]+', '-', name_slug).strip('-')

    # Extract domain from URL
    domain = extract_domain(source_url)
    domain_slug = domain.replace('.', '-').lower() if domain else ""

    # Try the name-based slug first, then the domain-based one if it differs.
    patterns_to_try = []
    if name_slug:
        patterns_to_try.append(name_slug)
    if domain_slug and domain_slug != name_slug:
        patterns_to_try.append(domain_slug)

    for pattern in patterns_to_try:
        mbfc_url = f"{MBFC_BASE_URL}{pattern}/"
        try:
            response = requests.get(mbfc_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        except requests.RequestException:
            # Network-level failure for this candidate; move on to the next.
            continue
        if response.status_code == 200 and 'Bias Rating:' in response.text:
            return mbfc_url

    return None


def _find_rating(label: str, page_text: str) -> Optional[str]:
    """Return the rating that follows ``<label>:`` in page_text, or None."""
    # Any whitespace (including a line break left behind by a removed tag) may
    # separate the label from its value, but the value itself is confined to a
    # single line so it cannot run on into the next label.
    match = re.search(
        rf'{re.escape(label)}:\s*([A-Z][A-Z \t-]*)', page_text, re.IGNORECASE
    )
    return match.group(1).strip() if match else None


def extract_mbfc_data(mbfc_url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Fetch an MBFC page and extract its bias and factual reporting ratings.

    The HTML is reduced to plain text before matching, so a rating wrapped
    in markup (for example ``Bias Rating: <strong>LEFT</strong>``) is still
    found, and each rating is read from a single line of text only.

    Args:
        mbfc_url: URL of the MBFC page

    Returns:
        Tuple of (bias_rating, factual_rating); either element is None when
        that rating is not found, and both are None when the page cannot be
        fetched
    """
    try:
        response = requests.get(mbfc_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
    except requests.RequestException:
        return None, None

    if response.status_code != 200:
        return None, None

    # Separate text nodes with newlines so tag boundaries become line breaks.
    page_text = BeautifulSoup(response.text, 'html.parser').get_text('\n')

    bias_rating = _find_rating('Bias Rating', page_text)
    factual_rating = _find_rating('Factual Reporting', page_text)
    return bias_rating, factual_rating


def get_mbfc_ratings(source_name: str, source_url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Look up a source on MBFC and return its ratings.

    First resolves the source's MBFC page via search_mbfc, then hands that
    page to extract_mbfc_data.

    Args:
        source_name: Name of the source
        source_url: URL of the source

    Returns:
        Tuple of (bias_rating, factual_rating); (None, None) when no MBFC
        page was found
    """
    page_url = search_mbfc(source_name, source_url)
    if not page_url:
        # No page located, so there is nothing to extract.
        return None, None
    return extract_mbfc_data(page_url)

In [None]:
def load_sheet_data():
    """
    Connect to Google Sheets and read the configured range.

    The first row of the range is used as the header row. Every following
    row becomes a dict keyed by header, padded with empty strings for
    missing trailing cells, plus a ``_row_index`` entry holding the row's
    1-indexed position in the sheet.

    Returns:
        Tuple of (sheets_service, headers, data_rows)

    Raises:
        ValueError: If the range contains no values at all
    """
    print("üîó Connecting to Google Sheets...")
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE,
        scopes=SCOPES
    )
    sheets_service = build("sheets", "v4", credentials=credentials)
    print("‚úÖ Connected to Google Sheets")

    print("üìÇ Loading data from Google Sheet...")
    request = sheets_service.spreadsheets().values().get(
        spreadsheetId=SPREADSHEET_ID,
        range=SHEET_RANGE
    )
    values = request.execute().get("values", [])

    if not values:
        raise ValueError("‚ùå No data found in sheet")

    # Parse headers and data
    headers, *body_rows = values
    data_rows = []
    # Sheet rows are 1-indexed and row 1 is the header, so data starts at 2.
    for sheet_row_number, cells in enumerate(body_rows, start=2):
        # Pad short rows so every header gets a value.
        padded = cells + [''] * (len(headers) - len(cells))
        record = dict(zip(headers, padded))
        record['_row_index'] = sheet_row_number
        data_rows.append(record)

    print(f"‚úÖ Loaded {len(data_rows)} sources")
    return sheets_service, headers, data_rows


def update_sheet_row(sheets_service, row_index: int, headers: list, row_data: dict):
    """
    Write a row's MBFC values back to the sheet.

    Each of the ``mbfc_bias`` and ``mbfc_factual`` cells is written with its
    own update request, and only when row_data holds a non-empty value for it.

    Args:
        sheets_service: Google Sheets service instance
        row_index: Row number in the sheet (1-indexed)
        headers: List of column headers
        row_data: Dictionary with column data including mbfc_bias and mbfc_factual

    Raises:
        ValueError: If either MBFC column is absent from headers
    """
    mbfc_fields = ('mbfc_bias', 'mbfc_factual')

    if any(field not in headers for field in mbfc_fields):
        raise ValueError("‚ùå Required columns 'mbfc_bias' and 'mbfc_factual' not found in sheet")

    # Convert column index to letter (0->A, 1->B, ..., 26->AA, etc.)
    def col_to_letter(col_idx):
        letters = ""
        while col_idx >= 0:
            col_idx, remainder = divmod(col_idx, 26)
            letters = chr(ord('A') + remainder) + letters
            col_idx -= 1
        return letters

    # Resolve both target columns up front, bias first, then factual.
    targets = [(field, col_to_letter(headers.index(field))) for field in mbfc_fields]

    for field, column in targets:
        value = row_data.get(field)
        if not value:
            # Nothing to write for this field; leave the cell untouched.
            continue
        sheets_service.spreadsheets().values().update(
            spreadsheetId=SPREADSHEET_ID,
            range=f"main!{column}{row_index}",
            valueInputOption='RAW',
            body={'values': [[value]]}
        ).execute()

In [None]:
def process_mbfc_enrichment():
    """
    Run the full enrichment workflow over every source in the sheet.

    Loads the sheet, skips rows that lack a name or URL or that already have
    both MBFC fields, fetches ratings for the rest, writes any findings back
    to the sheet, and prints progress plus a final summary. Any exception is
    caught and printed with its traceback rather than raised.
    """
    try:
        # Load sheet data
        sheets_service, headers, data_rows = load_sheet_data()

        # Verify required columns exist
        has_mbfc_columns = 'mbfc_bias' in headers and 'mbfc_factual' in headers
        if not has_mbfc_columns:
            print("‚ùå Error: Required columns 'mbfc_bias' and 'mbfc_factual' not found in sheet")
            print(f"üìã Available columns: {', '.join(headers)}")
            return

        # Count existing vs needed enrichment
        total = len(data_rows)
        already_filled = len([
            entry for entry in data_rows
            if entry.get('mbfc_bias', '').strip() and entry.get('mbfc_factual', '').strip()
        ])
        needs_enrichment = total - already_filled

        print(f"üìä Status: {already_filled} already have MBFC data, {needs_enrichment} need enrichment")
        print(f"üöÄ Starting MBFC enrichment...\n")

        # Process each row
        started_at = time.time()
        updated_count = skipped_count = not_found_count = 0

        for position, row in enumerate(data_rows, start=1):
            name = row.get('name', '').strip()
            url = row.get('url', '').strip()
            existing_bias = row.get('mbfc_bias', '').strip()
            existing_factual = row.get('mbfc_factual', '').strip()
            row_index = row.get('_row_index')

            # Rows without both a name and a URL cannot be looked up.
            if not (name and url):
                print(f"‚è≠Ô∏è  [{position}/{total}] Skipping row {row_index}: missing name or URL")
                continue

            # Rows that already carry both MBFC fields need no work.
            if existing_bias and existing_factual:
                print(f"‚è≠Ô∏è  [{position}/{total}] Skipping {name}: already has MBFC data")
                skipped_count += 1
                continue

            print(f"üîç [{position}/{total}] Processing: {name}")
            print(f"   URL: {url}")

            # Fetch MBFC ratings
            bias_rating, factual_rating = get_mbfc_ratings(name, url)

            if not (bias_rating or factual_rating):
                not_found_count += 1
                print(f"   ‚ùå Not found on MBFC\n")
            else:
                # Record the findings on the row, then push them to the sheet.
                row['mbfc_bias'] = bias_rating or ""
                row['mbfc_factual'] = factual_rating or ""

                try:
                    update_sheet_row(sheets_service, row_index, headers, row)
                    updated_count += 1
                    print(f"   ‚úÖ Found: Bias={bias_rating}, Factual={factual_rating}")
                    print(f"   üìù Updated sheet\n")
                except Exception as update_error:
                    print(f"   ‚ùå Error updating sheet: {update_error!s}\n")

            # Apply rate limiting after every lookup, successful or not.
            time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        elapsed = time.time() - started_at
        divider = '=' * 60
        print(f"\n{divider}")
        print(f"üìä Summary")
        print(f"{divider}")
        print(f"‚úÖ Sources updated: {updated_count}")
        print(f"‚è≠Ô∏è  Sources skipped (already had data): {skipped_count}")
        print(f"‚ùå Sources not found on MBFC: {not_found_count}")
        print(f"‚è±Ô∏è  Total time elapsed: {elapsed/60:.1f} minutes")
        print(f"{divider}\n")

    except Exception as error:
        print(f"‚ùå Error: {error!s}")
        import traceback
        traceback.print_exc()

In [None]:
# Execute the MBFC enrichment process.
# Running this cell loads the configured Google Sheet, looks up each
# unenriched source on MBFC, and writes any ratings found back to the sheet.
# Errors are printed (with a traceback) by the function rather than raised.
process_mbfc_enrichment()