<a href="https://colab.research.google.com/github/nikwif/update_user_agents/blob/main/update_user_agents_dataprox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import json
from collections import Counter

# API endpoint that post_to_api() POSTs each user-agent record to
# (one JSON request per record).
API_URL = "https://api.thedataproxy.com/api/v1/user-agents/"

# Web page that scrape_user_agents() downloads and parses for <table> rows.
SOURCE_URL = "https://www.useragents.me/"

# Browser-like default headers for outgoing requests.
# NOTE(review): no function visible in this cell reads HEADERS;
# scrape_user_agents() passes its own inline User-Agent header instead.
# Confirm whether this constant is still needed.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def get_final_user_agent_list(user_agents):
    """Print a duplicate report for the scraped records and return the distinct strings.

    Args:
        user_agents: Sized iterable of dicts, each with a "user_agent" key.

    Returns:
        The distinct "user_agent" strings, in order of first appearance.
    """
    occurrences = Counter(item["user_agent"] for item in user_agents)
    distinct = list(occurrences)

    # Only strings seen more than once are reported as duplicates.
    repeated = {}
    for agent, seen in occurrences.items():
        if seen > 1:
            repeated[agent] = seen

    print("\n📌 **Final User Agent Summary**")
    print(f"➡️ Total Entries: {len(user_agents)}")
    print(f"✅ Unique User Agents: {len(distinct)}")
    print(f"⚠️ Duplicates Found: {len(repeated)}\n")

    if repeated:
        print("🔄 **Duplicate User Agents (Highlighted)**:")
        for agent, seen in repeated.items():
            print(f"  - {agent} (x{seen})")

    return distinct

def detect_device(user_agent: str) -> str:
    """Classify a user agent string as "mobile", "tablet" or "desktop".

    Tablet markers are tested before the mobile ones: iPad user agents also
    carry a "Mobile/<build>" token, and a user agent can contain both
    "Tablet" and "Android", so testing the mobile markers first would make
    the tablet branch unreachable for them.

    Args:
        user_agent: Raw User-Agent header value.

    Returns:
        "tablet" if the string contains "iPad" or "Tablet"; otherwise
        "mobile" if it contains "Mobile", "Android" or "iPhone"; otherwise
        "desktop" (including for an empty string).
    """
    if "iPad" in user_agent or "Tablet" in user_agent:
        return "tablet"
    if any(marker in user_agent for marker in ("Mobile", "Android", "iPhone")):
        return "mobile"
    return "desktop"

def extract_browser_os(user_agent: str) -> tuple:
    """Extract a coarse browser name and OS name from a user agent string.

    Tokens are tested most-specific first, because user agent strings embed
    the tokens of the products they imitate:

    * Edge ("Edg...") and Opera ("OPR/...") also contain "Chrome/" and
      "Safari/", and Chrome also contains "Safari/".
    * iPhone/iPad strings contain "like Mac OS X" and never the literal
      "iOS"; Android strings contain "Linux;".

    Args:
        user_agent: Raw User-Agent header value.

    Returns:
        A ``(browser, os)`` tuple. ``browser`` is one of "Chrome", "Firefox",
        "Safari", "Edge", "Opera" or "Unknown"; ``os`` is one of "Windows",
        "Mac OS X", "Linux", "Android", "iOS" or "Unknown".
    """
    # (substring to look for, label to report) -- order matters, see docstring.
    browser_tokens = (
        ("Edg", "Edge"),        # matches Edg/, EdgA/, EdgiOS/ and legacy Edge/
        ("OPR", "Opera"),
        ("Opera", "Opera"),
        ("Firefox", "Firefox"),
        ("FxiOS", "Firefox"),   # Firefox on iOS
        ("Chrome", "Chrome"),
        ("CriOS", "Chrome"),    # Chrome on iOS
        ("Safari", "Safari"),
    )
    os_tokens = (
        ("Windows", "Windows"),
        ("Android", "Android"),   # before "Linux": Android UAs contain both
        ("iPhone", "iOS"),        # before "Mac OS X": iOS UAs say "like Mac OS X"
        ("iPad", "iOS"),
        ("iOS", "iOS"),
        ("Mac OS X", "Mac OS X"),
        ("Linux", "Linux"),
    )

    browser = next(
        (label for token, label in browser_tokens if token in user_agent), "Unknown"
    )
    os_name = next(
        (label for token, label in os_tokens if token in user_agent), "Unknown"
    )
    return browser, os_name

def scrape_user_agents():
    """Download SOURCE_URL and build one record per usable table row.

    Every ``<table>`` on the page is walked. For each row after the first
    (treated as the header), the first cell is read as a percentage and the
    second as the user agent; the cell's ``data-full-ua`` attribute is
    preferred over its visible text when present.

    Returns:
        A list of dicts with keys "user_agent", "device", "browser", "os" and
        "percentage". An empty list is returned when the page cannot be
        fetched (network error, timeout or non-200 status).
    """
    try:
        # A timeout keeps a stalled server from hanging the run forever.
        response = requests.get(
            SOURCE_URL, headers={"User-Agent": "Mozilla/5.0"}, timeout=30
        )
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to fetch the webpage: {e}")
        return []

    if response.status_code != 200:
        print("❌ Failed to fetch the webpage")
        return []

    soup = BeautifulSoup(response.text, "lxml")
    user_agents = []
    tables = soup.find_all("table")
    print(f"📌 Found {len(tables)} tables")

    # Leading integer or decimal number, e.g. "12.5" out of "12.5%".
    percentage_pattern = re.compile(r"^\d+(\.\d+)?")

    # NOTE(review): all tables are scraped, not only those listing raw user
    # agent strings; the second column of some tables appears to hold labels
    # such as "Generic Smartphone" rather than real user agents -- confirm
    # against the page and consider restricting which tables are read.
    for table in tables:
        rows = table.find_all("tr")[1:]  # Skip header row
        print(f"📝 Processing {len(rows)} rows")
        for row in rows:
            columns = row.find_all("td")
            if len(columns) < 2:
                continue

            percentage_text = columns[0].text.strip()
            user_agent_element = columns[1]

            # The visible cell text may be truncated; use the full value if given.
            if user_agent_element.has_attr("data-full-ua"):
                user_agent_str = user_agent_element["data-full-ua"]
            else:
                user_agent_str = user_agent_element.text.strip()

            # Drop truncated placeholders ("... (more info...)") and fragments.
            if "more info" in user_agent_str or len(user_agent_str) < 10:
                print(f"⚠️ Skipping Invalid UA: {user_agent_str}")
                continue

            percentage_match = percentage_pattern.search(percentage_text)
            percentage = float(percentage_match.group()) if percentage_match else 0.0

            browser, os_name = extract_browser_os(user_agent_str)

            user_agents.append({
                "user_agent": user_agent_str,
                "device": detect_device(user_agent_str),
                "browser": browser,
                "os": os_name,
                "percentage": percentage,
            })

    print(f"✅ Scraped {len(user_agents)} user agents")
    return user_agents

def get_unique_user_agents(user_agents):
    """Deduplicate records by their "user_agent" string.

    The first record seen for each string wins; later records with the same
    string are dropped. Order of first appearance is preserved.
    """
    first_seen = {}
    for record in user_agents:
        # setdefault only stores the record when the key is not present yet.
        first_seen.setdefault(record["user_agent"], record)
    return [*first_seen.values()]

def post_to_api(user_agents, timeout=30):
    """POST each user-agent record to API_URL, reporting the outcome per record.

    Entries that are not dicts, or that lack any of the required keys, are
    reported and skipped without a request being made. Request failures are
    printed and do not stop the remaining entries from being sent.

    Args:
        user_agents: Iterable of records (dicts with keys "user_agent",
            "device", "browser", "os" and "percentage").
        timeout: Seconds to wait for each request before giving up, so one
            unresponsive call cannot hang the whole upload.

    Returns:
        None. Results are reported on stdout only.
    """
    required_keys = ("user_agent", "device", "browser", "os", "percentage")

    for entry in user_agents:
        if not isinstance(entry, dict) or not all(k in entry for k in required_keys):
            print(f"❌ Skipping invalid entry: {entry}")
            continue

        try:
            response = requests.post(
                API_URL,
                json=entry,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )
        except requests.exceptions.RequestException as e:
            # Includes timeouts and connection errors; move on to the next entry.
            print(f"❌ Request failed: {e}")
            continue

        if response.status_code in (200, 201):  # Treat both 200 and 201 as success
            print(f"✅ Added: {entry['user_agent']}")
        elif response.status_code == 409:
            print(f"⚠️ Duplicate Entry: {entry['user_agent']}")
        elif response.status_code == 422:
            # Echo the payload so the rejected field can be identified.
            print(f"❌ Validation Error 422: {response.text}")
            print(f"🔍 Sent Data: {json.dumps(entry, indent=2)}")
        else:
            print(f"❌ Unexpected Error {response.status_code}: {response.text}")

if __name__ == "__main__":
    # Pipeline: scrape -> print duplicate summary -> deduplicate -> upload.
    ua_list = scrape_user_agents()
    if not ua_list:
        print("❌ No user agents found.")
    else:
        # The summary is printed for information; its return value is not needed.
        get_final_user_agent_list(ua_list)
        # Upload only the first record seen for each user agent string.
        unique_entries = get_unique_user_agents(ua_list)
        post_to_api(unique_entries)

📌 Found 10 tables
📝 Processing 12 rows
📝 Processing 17 rows
⚠️ Skipping Invalid UA: K (more info...)
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: K (more info...)
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: K (more info...)
⚠️ Skipping Invalid UA: K (more info...)
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: iPhone
⚠️ Skipping Invalid UA: K (more info...)
⚠️ Skipping Invalid UA: K (more info...)
📝 Processing 7 rows
📝 Processing 6 rows
📝 Processing 11 rows
📝 Processing 4 rows
📝 Processing 3 rows
📝 Processing 2 rows
📝 Processing 11 rows
📝 Processing 1 rows
✅ Scraped 59 user agents

📌 **Final User Agent Summary**
➡️ Total Entries: 59
✅ Unique User Agents: 46
⚠️ Duplicates Found: 8

🔄 **Duplicate User Agents (Highlighted)**:
  - Chrome 134.0.0, Windows (x2)
  - Generic Smartphone (x2)
  - Chrome Mobile iOS 134.0.6998, iOS 17.7 (x2)
  - F