In [None]:
import pandas as pd
from newsapi import NewsApiClient
from sentence_transformers import SentenceTransformer, util
from datetime import timedelta
import time

# Load model for semantic comparison
embedder = SentenceTransformer("all-MiniLM-L6-v2")
API_KEY = "lorem ipsum"
# Init API
newsapi = NewsApiClient(api_key=API_KEY)  # ← use your real key

# Load and clean article data
df = pd.read_csv("articles.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={"Content:": "content", "Topic #:": "topic", "Pro-Anti:": "stance", "Title:": "title", "Date:": "date"})
df["stance"] = df["stance"].str.strip()
df = df[df["stance"] == "Pro"]
df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")

# List of known anti-trans leaning sources (adjust as needed)
suspected_sources = [
    "fox-news", "the-washington-times", "daily-wire", "national-review",
    "new-york-post", "breitbart-news", "the-american-conservative"
]

# Deduplication set
seen_urls = set()

# Store final matches
matches = []

# Process Pro articles
for i, row in df.iterrows():
    title = row["title"]
    topic = row["topic"]
    date = row["date"]
    if pd.isnull(date):
        continue

    from_date = (date - timedelta(days=7)).strftime("%Y-%m-%d")
    to_date   = (date + timedelta(days=7)).strftime("%Y-%m-%d")

    print(f"\n🔎 Searching for: {title[:60]} | Topic: {topic}")

    try:
        response = newsapi.get_everything(
            q=title,
            sources=",".join(suspected_sources),
            from_param=from_date,
            to=to_date,
            sort_by="relevancy",
            language="en",
            page_size=3  # Limit to 3 results per article
        )
        original_emb = embedder.encode(title, convert_to_tensor=True)

        for article in response["articles"]:
            url = article["url"]
            if url in seen_urls:
                continue
            seen_urls.add(url)

            candidate_title = article["title"]
            candidate_emb = embedder.encode(candidate_title, convert_to_tensor=True)
            similarity = util.cos_sim(original_emb, candidate_emb).item()

            if similarity > 0.6:  # Tune this threshold as needed
                matches.append({
                    "topic": topic,
                    "original_title": title,
                    "search_date": date.strftime("%Y-%m-%d"),
                    "matched_title": candidate_title,
                    "similarity": round(similarity, 3),
                    "source": article["source"]["name"],
                    "publishedAt": article["publishedAt"],
                    "url": url
                })
    except Exception as e:
        print("⚠️ NewsAPI error:", e)

    time.sleep(1.5)  # Be nice to the rate limiter

# Save final matches
matched_df = pd.DataFrame(matches)
matched_df.to_csv("semantic_matches.csv", index=False)
print(f"\n✅ Done! {len(matched_df)} high-similarity articles saved to semantic_matches.csv")

  df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")



🔎 Searching for: Mapping Attacks on LGBTQ Rights in U.S. State Legislatures i | Topic: 1

🔎 Searching for: The Human Toll of Trump's Anti-Trans Crusade | Topic: 2

🔎 Searching for: Perkins et al. v. State (HB 121) | Topic: 3

🔎 Searching for: Library Patrons Sue Greenville County Over Widespread Remova | Topic: 4
⚠️ NewsAPI error: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-19, but you have requested 2025-03-19. You may need to upgrade to a paid plan.'}

🔎 Searching for: State District Court of Appeals Blocks Ohio’s Ban on Gender- | Topic: 2
⚠️ NewsAPI error: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-19, but you have requested 2025-03-11. You may need to upgrade to a paid plan.'}

🔎 Searching for: The Supreme Court 

    new attempt

In [None]:
# pip install newsapi-python sentence-transformers pandas python-dateutil requests

import pandas as pd
from newsapi import NewsApiClient
from sentence_transformers import SentenceTransformer, util
from datetime import timedelta
import requests
import time

# ---------------------------------------
# SETUP
# ---------------------------------------


newsapi = NewsApiClient(api_key=API_KEY)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ---------------------------------------
# FETCH SOURCES & FILTER
# ---------------------------------------

def fetch_sources():
    url = "https://newsapi.org/v2/top-headlines/sources"
    params = {"apiKey": API_KEY}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get("sources", [])
    else:
        print("Error fetching sources:", response.status_code)
        return []

def get_anti_trans_sources():
    all_sources = fetch_sources()
    keywords = ["conservative", "right", "republican", "libertarian", "nationalist"]
    filtered = []

    for src in all_sources:
        desc = (src.get("description") or "").lower()
        if any(word in desc for word in keywords):
            filtered.append(src["id"])
    
    return filtered

suspected_sources = get_anti_trans_sources()
print(f"✅ Loaded {len(suspected_sources)} filtered sources.")

# ---------------------------------------
# LOAD ARTICLES
# ---------------------------------------

df = pd.read_csv("articles.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={"Content:": "content", "Topic #:": "topic", "Pro-Anti:": "stance", "Title:": "title", "Date:": "date"})
df["stance"] = df["stance"].str.strip()
df = df[df["stance"] == "Pro"]
df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")

# ---------------------------------------
# MATCHING
# ---------------------------------------

matches = []
seen_urls = set()

for i, row in df.iterrows():
    title = row["title"]
    topic = row["topic"]
    content = row["content"]
    date = row["date"]

    if pd.isnull(date) or not title or not topic or not content:
        continue

    from_date = (date - timedelta(days=7)).strftime("%Y-%m-%d")
    to_date   = (date + timedelta(days=7)).strftime("%Y-%m-%d")

    print(f"\n🔎 Searching: {title[:60]} | Topic: {topic}")

    try:
        response = newsapi.get_everything(
            q=title,
            sources=",".join(suspected_sources),
            from_param=from_date,
            to=to_date,
            sort_by="relevancy",
            language="en",
            page_size=3
        )

        original_emb = embedder.encode(content, convert_to_tensor=True)

        candidates = []
        for article in response["articles"]:
            url = article["url"]
            if url in seen_urls:
                continue
            seen_urls.add(url)

            candidate_title = article["title"]
            candidate_emb = embedder.encode(candidate_title, convert_to_tensor=True)
            similarity = util.cos_sim(original_emb, candidate_emb).item()

            candidates.append({
                "topic": topic,
                "original_title": title,
                "search_date": date.strftime("%Y-%m-%d"),
                "matched_title": candidate_title,
                "similarity": round(similarity, 3),
                "source": article["source"]["name"],
                "publishedAt": article["publishedAt"],
                "url": url
            })

        # Keep those above threshold, or fallback to top 1 match
        filtered = [c for c in candidates if c["similarity"] > 0.5]
        if not filtered and candidates:
            filtered.append(max(candidates, key=lambda x: x["similarity"]))
        matches.extend(filtered)

    except Exception as e:
        print("⚠️ NewsAPI error:", e)

    time.sleep(1.5)

# ---------------------------------------
# EXPORT
# ---------------------------------------

matched_df = pd.DataFrame(matches)
matched_df.to_csv("semantic_matches.csv", index=False)
print(f"\n✅ Done! {len(matched_df)} articles saved to semantic_matches.csv")

✅ Loaded 2 filtered sources.

🔎 Searching: Mapping Attacks on LGBTQ Rights in U.S. State Legislatures i | Topic: 1


  df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")



🔎 Searching: The Human Toll of Trump's Anti-Trans Crusade | Topic: 2

🔎 Searching: Perkins et al. v. State (HB 121) | Topic: 3

🔎 Searching: Library Patrons Sue Greenville County Over Widespread Remova | Topic: 4
⚠️ NewsAPI error: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-19, but you have requested 2025-03-19. You may need to upgrade to a paid plan.'}

🔎 Searching: State District Court of Appeals Blocks Ohio’s Ban on Gender- | Topic: 2
⚠️ NewsAPI error: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-19, but you have requested 2025-03-11. You may need to upgrade to a paid plan.'}

🔎 Searching: The Supreme Court Case on Trans Health Care, Explained. | Topic: 2 or 5
⚠️ NewsAPI error: {'status': 'error', 'code': 'parameter

## RATE LIMITED
    50/12 hrs

### trying newsdata.io

In [None]:
NEWSDATA_API_KEY = "lorem ipsum"

In [8]:
# pip install newsdataapi newspaper3k sentence-transformers pandas

import pandas as pd
from newsdataapi import NewsDataApiClient
from newspaper import Article
from sentence_transformers import SentenceTransformer, util
from datetime import datetime, timedelta
import time

# -----------------------------
# Setup
# -----------------------------

API_KEY = NEWSDATA_API_KEY
newsdata = NewsDataApiClient(apikey=API_KEY)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# -----------------------------
# Scrape full article content
# -----------------------------

def scrape_article_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text.strip()
    except:
        return ""

# -----------------------------
# Search candidate articles (no from_date/to_date)
# -----------------------------

def search_newsdata_articles(query, language="en", country="us", limit=3):
    try:
        results = newsdata.news_api(
            q=query,
            language=language,
            country=country,
            page=0
        )
        return results.get("results", [])[:limit]
    except Exception as e:
        print("⚠️ NewsData API error:", e)
        return []

# -----------------------------
# Load your labeled dataset
# -----------------------------

df = pd.read_csv("articles.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={"Content:": "content", "Topic #:": "topic", "Pro-Anti:": "stance", "Title:": "title", "Date:": "date"})
df["stance"] = df["stance"].str.strip()
df = df[df["stance"] == "Pro"]
df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")

# -----------------------------
# Match & Score
# -----------------------------

matches = []

for _, row in df.iterrows():
    pro_title = row["title"]
    pro_content = row["content"]
    topic = row["topic"]
    date = row["date"]

    if pd.isnull(date) or not pro_content or not pro_title:
        continue

    print(f"\n🔍 Searching for: {pro_title[:60]}")

    candidates = search_newsdata_articles(query=pro_title)
    pro_emb = embedder.encode(pro_content, convert_to_tensor=True)

    for c in candidates:
        url = c.get("link")
        if not url:
            continue

        # Optional: filter by pubDate
        pub_date_str = c.get("pubDate", "")
        try:
            pub_date = pd.to_datetime(pub_date_str)
            if abs((pub_date - date).days) > 14:
                continue
        except:
            continue

        body = scrape_article_text(url)
        if len(body) < 300:
            continue

        candidate_emb = embedder.encode(body, convert_to_tensor=True)
        similarity = util.cos_sim(pro_emb, candidate_emb).item()

        matches.append({
            "topic": topic,
            "original_title": pro_title,
            "matched_title": c.get("title", ""),
            "url": url,
            "published": pub_date_str,
            "similarity": round(similarity, 3)
        })

        print(f"📝 Match (sim={similarity:.3f}) → {url}")
        time.sleep(1.0)

# -----------------------------
# Export results
# -----------------------------

if matches:
    out_df = pd.DataFrame(matches).sort_values(by="similarity", ascending=False)
    out_df.to_csv("newsdata_matched_articles.csv", index=False)
    print(f"\n✅ Done! {len(out_df)} articles saved to newsdata_matched_articles.csv")
else:
    print("⚠️ No matches found or fetched.")

  df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")
  results = newsdata.news_api(



🔍 Searching for: Mapping Attacks on LGBTQ Rights in U.S. State Legislatures i

🔍 Searching for: The Human Toll of Trump's Anti-Trans Crusade

🔍 Searching for: Perkins et al. v. State (HB 121)

🔍 Searching for: Library Patrons Sue Greenville County Over Widespread Remova

🔍 Searching for: State District Court of Appeals Blocks Ohio’s Ban on Gender-

🔍 Searching for: The Supreme Court Case on Trans Health Care, Explained.

🔍 Searching for: Montana Court Blocks State From Refusing to Correct Sex Mark

🔍 Searching for: Trump's Executive Orders Promoting Sex Discrimination, Expla

🔍 Searching for: Marquez v. State of Montana

🔍 Searching for: Fact Sheet: Mahmoud v. Taylor, the U.S. Supreme Court Case A

🔍 Searching for: 150+ Anti-LGBTQ Incidents Targeted Religious Communities in 

🔍 Searching for: Research Explores How Trans Technologies are Removing Barrie

🔍 Searching for: Coachella 2025: Troye Sivan, Billie Eilish, Trixie Mattel, V

🔍 Searching for: Star-Studded 2025 Latine Honors Lifts

KeyboardInterrupt: 

In [11]:
import pandas as pd
import requests
from newspaper import Article
from sentence_transformers import SentenceTransformer, util
from datetime import datetime, timedelta
import threading
import time
from tqdm import tqdm

# -----------------------------
# Setup
# -----------------------------

API_KEY = NEWSDATA_API_KEY
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# -----------------------------
# Timeout-safe scraping
# -----------------------------

def scrape_article_text(url, timeout=10):
    result = {}

    def fetch():
        try:
            article = Article(url)
            article.download()
            article.parse()
            result["text"] = article.text.strip()
        except:
            result["text"] = ""

    thread = threading.Thread(target=fetch)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print(f"⚠️ Timeout while scraping: {url}")
        return ""
    return result.get("text", "")

# -----------------------------
# Search NewsData.io (raw API)
# -----------------------------

def search_newsdata_articles(query, language="en", country="us", limit=3):
    url = "https://newsdata.io/api/1/news"
    params = {
        "apikey": API_KEY,
        "q": query,
        "language": language,
        "country": country,
        "page": 1
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json().get("results", [])[:limit]
    except Exception as e:
        print("⚠️ NewsData API request failed:", e)
        return []

# -----------------------------
# Load your Pro articles
# -----------------------------

df = pd.read_csv("articles.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={"Content:": "content", "Topic #:": "topic", "Pro-Anti:": "stance", "Title:": "title", "Date:": "date"})
df["stance"] = df["stance"].str.strip()
df = df[df["stance"] == "Pro"]
df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")

# -----------------------------
# Matching loop
# -----------------------------

matches = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    pro_title = row["title"]
    pro_content = row["content"]
    topic = row["topic"]
    date = row["date"]

    if pd.isnull(date) or not pro_content or not pro_title:
        continue

    print(f"\n🔍 Searching for: {pro_title[:60]}")
    candidates = search_newsdata_articles(query=pro_title)
    pro_emb = embedder.encode(pro_content, convert_to_tensor=True)

    for c in candidates:
        url = c.get("link")
        if not url:
            continue

        pub_date_str = c.get("pubDate", "")
        try:
            pub_date = pd.to_datetime(pub_date_str)
            if abs((pub_date - date).days) > 14:
                continue
        except:
            continue

        print(f"⏳ Scraping: {url}")
        body = scrape_article_text(url)
        if len(body) < 300:
            continue

        candidate_emb = embedder.encode(body, convert_to_tensor=True)
        similarity = util.cos_sim(pro_emb, candidate_emb).item()

        matches.append({
            "topic": topic,
            "original_title": pro_title,
            "matched_title": c.get("title", ""),
            "url": url,
            "published": pub_date_str,
            "similarity": round(similarity, 3)
        })

        print(f"📝 Match (sim={similarity:.3f}) → {url}")
        time.sleep(1.0)

# -----------------------------
# Save results
# -----------------------------

if matches:
    out_df = pd.DataFrame(matches).sort_values(by="similarity", ascending=False)
    out_df.to_csv("newsdata_matched_articles.csv", index=False)
    print(f"\n✅ Done! {len(out_df)} articles saved to newsdata_matched_articles.csv")
else:
    print("⚠️ No matches found or fetched.")

  df["date"] = pd.to_datetime(df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0], errors="coerce")
  0%|          | 0/63 [00:00<?, ?it/s]


🔍 Searching for: Mapping Attacks on LGBTQ Rights in U.S. State Legislatures i


  2%|▏         | 1/63 [00:00<00:20,  3.10it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Mapping+Attacks+on+LGBTQ+Rights+in+U.S.+State+Legislatures+in+2025&language=en&country=us&page=1

🔍 Searching for: The Human Toll of Trump's Anti-Trans Crusade


  3%|▎         | 2/63 [00:00<00:19,  3.15it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=The+Human+Toll+of+Trump%27s+Anti-Trans+Crusade&language=en&country=us&page=1

🔍 Searching for: Perkins et al. v. State (HB 121)


  5%|▍         | 3/63 [00:00<00:18,  3.18it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Perkins+et+al.+v.+State+%28HB+121%29&language=en&country=us&page=1

🔍 Searching for: Library Patrons Sue Greenville County Over Widespread Remova


  6%|▋         | 4/63 [00:01<00:21,  2.71it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Library+Patrons+Sue+Greenville+County+Over+Widespread+Removals+and+Restrictions+of+LGBTQ+Books&language=en&country=us&page=1

🔍 Searching for: State District Court of Appeals Blocks Ohio’s Ban on Gender-


  8%|▊         | 5/63 [00:01<00:20,  2.84it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=State+District+Court+of+Appeals+Blocks+Ohio%E2%80%99s+Ban+on+Gender-Affirming+Care+for+Trans+Minors%2C+in+an+Historic+Win+for+Families+and+Bodily+Autonomy&language=en&country=us&page=1

🔍 Searching for: The Supreme Court Case on Trans Health Care, Explained.


 10%|▉         | 6/63 [00:02<00:19,  2.98it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=The+Supreme+Court+Case+on+Trans+Health+Care%2C+Explained.&language=en&country=us&page=1

🔍 Searching for: Montana Court Blocks State From Refusing to Correct Sex Mark


 11%|█         | 7/63 [00:02<00:18,  3.10it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Montana+Court+Blocks+State+From+Refusing+to+Correct+Sex+Markers+on+Transgender+People%27s+Birth+Certificates+and+Driver%27s+Licenses&language=en&country=us&page=1

🔍 Searching for: Trump's Executive Orders Promoting Sex Discrimination, Expla


 13%|█▎        | 8/63 [00:02<00:17,  3.16it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Trump%27s+Executive+Orders+Promoting+Sex+Discrimination%2C+Explained&language=en&country=us&page=1

🔍 Searching for: Marquez v. State of Montana


 14%|█▍        | 9/63 [00:02<00:17,  3.16it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Marquez+v.+State+of+Montana&language=en&country=us&page=1

🔍 Searching for: Fact Sheet: Mahmoud v. Taylor, the U.S. Supreme Court Case A


 16%|█▌        | 10/63 [00:03<00:16,  3.15it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Fact+Sheet%3A+Mahmoud+v.+Taylor%2C+the+U.S.+Supreme+Court+Case+About+Banning+LGBTQ-inclusive+Books&language=en&country=us&page=1

🔍 Searching for: 150+ Anti-LGBTQ Incidents Targeted Religious Communities in 


 17%|█▋        | 11/63 [00:03<00:16,  3.15it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=150%2B+Anti-LGBTQ+Incidents+Targeted+Religious+Communities+in+the+US%2C+According+to+Newly+Released+Data+from+GLAAD%E2%80%99s+ALERT+Desk&language=en&country=us&page=1

🔍 Searching for: Research Explores How Trans Technologies are Removing Barrie


 19%|█▉        | 12/63 [00:04<00:18,  2.79it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Research+Explores+How+Trans+Technologies+are+Removing+Barriers+to+Access+and+Visibility&language=en&country=us&page=1

🔍 Searching for: Coachella 2025: Troye Sivan, Billie Eilish, Trixie Mattel, V


 21%|██        | 13/63 [00:04<00:20,  2.49it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Coachella+2025%3A+Troye+Sivan%2C+Billie+Eilish%2C+Trixie+Mattel%2C+VINCINT%2C+Anitta%2C+Japanese+Breakfast%2C+Clairo%2C+and+More+LGBTQ+Artists+Take+the+Desert+Stage&language=en&country=us&page=1

🔍 Searching for: Star-Studded 2025 Latine Honors Lifts Up Queer Latine Repres


 22%|██▏       | 14/63 [00:05<00:21,  2.26it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Star-Studded+2025+Latine+Honors+Lifts+Up+Queer+Latine+Representation+Across+Entertainment+Ahead+of+36th+GLAAD+Media+Awards&language=en&country=us&page=1

🔍 Searching for: Election 2025: Wisconsin’s Supreme Court Candidates on the R


 24%|██▍       | 15/63 [00:05<00:19,  2.47it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Election+2025%3A+Wisconsin%E2%80%99s+Supreme+Court+Candidates+on+the+Record&language=en&country=us&page=1

🔍 Searching for: Trump Accountability Tracker
255 attacks ON LGBTQ PEOPLE


 25%|██▌       | 16/63 [00:05<00:18,  2.57it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Trump+Accountability+Tracker%0A255+attacks+ON+LGBTQ+PEOPLE&language=en&country=us&page=1

🔍 Searching for: LGBTQ Alabamians and Our History Will Not Be Erased


 27%|██▋       | 17/63 [00:06<00:18,  2.42it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=LGBTQ+Alabamians+and+Our+History+Will+Not+Be+Erased&language=en&country=us&page=1

🔍 Searching for: New York Times Removes Mention of Transgender Man from Headl


 29%|██▊       | 18/63 [00:06<00:17,  2.60it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=New+York+Times+Removes+Mention+of+Transgender+Man+from+Headline&language=en&country=us&page=1

🔍 Searching for: Missouri AG’s Lawsuit Aims to Undo Protections for LGBTQ You


 30%|███       | 19/63 [00:06<00:16,  2.73it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Missouri+AG%E2%80%99s+Lawsuit+Aims+to+Undo+Protections+for+LGBTQ+Youth%2C+Defends+Harmful+Practice+of+Conversion+Therapy%2C+as+Local+LGBTQ+Advocates+Respond&language=en&country=us&page=1

🔍 Searching for: Transitioning teen cannot change name, Mississippi Supreme C


 32%|███▏      | 20/63 [00:07<00:14,  2.91it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Transitioning+teen+cannot+change+name%2C+Mississippi+Supreme+Court+rules&language=en&country=us&page=1

🔍 Searching for: Transgender women in Britain fear ruling could place toilets


 33%|███▎      | 21/63 [00:07<00:14,  2.97it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Transgender+women+in+Britain+fear+ruling+could+place+toilets%2C+sports+and+hospitals+off+limits&language=en&country=us&page=1

🔍 Searching for: Trump administration sues Maine over participation of transg


 35%|███▍      | 22/63 [00:07<00:13,  3.05it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Trump+administration+sues+Maine+over+participation+of+transgender+athletes+in+girls+sports&language=en&country=us&page=1

🔍 Searching for: A look at recent global actions limiting legal recognitions 


 37%|███▋      | 23/63 [00:08<00:13,  3.07it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=A+look+at+recent+global+actions+limiting+legal+recognitions+for+transgender+people&language=en&country=us&page=1

🔍 Searching for: Hungary passes constitutional amendment to ban LGBTQ+ public


 38%|███▊      | 24/63 [00:08<00:12,  3.03it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Hungary+passes+constitutional+amendment+to+ban+LGBTQ%2B+public+events%2C+seen+as+a+major+blow+to+rights&language=en&country=us&page=1

🔍 Searching for: Montana’s anti-transgender bathroom restrictions are on hold


 40%|███▉      | 25/63 [00:08<00:14,  2.64it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Montana%E2%80%99s+anti-transgender+bathroom+restrictions+are+on+hold+under+a+judge%E2%80%99s+order&language=en&country=us&page=1

🔍 Searching for: California lawmakers reject bills to ban trans athletes’ par


 41%|████▏     | 26/63 [00:09<00:13,  2.65it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=California+lawmakers+reject+bills+to+ban+trans+athletes%E2%80%99+participation+in+girls+sports&language=en&country=us&page=1

🔍 Searching for: Trump administration cancels at least 68 grants focused on L


 43%|████▎     | 27/63 [00:09<00:12,  2.83it/s]

⚠️ NewsData API request failed: 422 Client Error: UNPROCESSABLE ENTITY for url: https://newsdata.io/api/1/news?apikey=pub_81904f0d64c334af6b5d15cc83c78a3356865&q=Trump+administration+cancels+at+least+68+grants+focused+on+LGBTQ+health+questions&language=en&country=us&page=1

🔍 Searching for: Denmark advises transgender people to contact US Embassy bef


 43%|████▎     | 27/63 [00:09<00:13,  2.75it/s]


KeyboardInterrupt: 

In [17]:
import re
import pandas as pd
import requests
from newspaper import Article
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import threading
import time
from tqdm import tqdm

# -----------------------------
# Configuration
# -----------------------------
API_KEY = NEWSDATA_API_KEY # 🔐 Replace with your real key
BASE_URL = "https://newsdata.io/api/1/news"
MAX_WORDS = 6
MIN_ARTICLE_LENGTH = 300
SIMILARITY_THRESHOLD = 0.5
OUTPUT_FILE = "newsdata_matched_articles.csv"
FAILED_QUERY_LOG = "failed_queries.csv"

# -----------------------------
# Setup
# -----------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def clean_query(text, max_words=MAX_WORDS):
    """Removes symbols and truncates to max_words"""
    cleaned = re.sub(r"[^\w\s]", "", text)
    words = cleaned.strip().split()
    return " ".join(words[:max_words])

def scrape_article_text(url, timeout=10):
    """Scrapes article using newspaper3k with timeout"""
    result = {}
    def fetch():
        try:
            article = Article(url)
            article.download()
            article.parse()
            result["text"] = article.text.strip()
        except:
            result["text"] = ""
    thread = threading.Thread(target=fetch)
    thread.start()
    thread.join(timeout)
    return result.get("text", "")

def search_newsdata_articles(query, language="en", country="us", limit=3):
    url = BASE_URL
    params = {
        "apikey": API_KEY,
        "q": clean_query(query),
        "language": language,
        "country": country,
        "page": 1
    }
    try:
        response = requests.get(url, params=params)
        time.sleep(35)
        print(" ⏱ Pause to respect rate limi ")
        if response.status_code == 422:
            return []
        response.raise_for_status()
        return response.json().get("results", [])[:limit]
    except Exception as e:
        print(f"⚠️ NewsData API request failed: {e}")
        return []
# -----------------------------
# Load and filter dataset
# -----------------------------
df = pd.read_csv("articles.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={
    "Content:": "content",
    "Topic #:": "topic",
    "Pro-Anti:": "stance",
    "Title:": "title",
    "Date:": "date"
})
df["stance"] = df["stance"].str.strip()
df = df[df["stance"] == "Pro"]
df["date"] = pd.to_datetime(
    df["date"].str.extract(r'(\d{1,2}/\d{1,2}/\d{2,4})')[0],
    errors="coerce"
)

# -----------------------------
# Loop over articles
# -----------------------------
matches = []
failed_queries = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    title = row["title"]
    content = row["content"]
    topic = row["topic"]
    date = row["date"]

    if pd.isnull(date) or not title or not content:
        continue

    print(f"\n🔍 Searching for: {title[:60]}")
    time.sleep(2)
    candidates = search_newsdata_articles(title)
    if not candidates:
        print(f"⚠️ Retrying with topic: {topic}")
        candidates = search_newsdata_articles(topic)
        if not candidates:
            failed_queries.append(title)
            continue

    query_emb = embedder.encode(content, convert_to_tensor=True)

    for c in candidates:
        url = c.get("link", "")
        pub_date = c.get("pubDate", "")
        try:
            if abs((pd.to_datetime(pub_date) - date).days) > 14:
                continue
        except:
            continue

        print(f"⏳ Scraping: {url}")
        text = scrape_article_text(url)
        if len(text) < MIN_ARTICLE_LENGTH:
            continue

        cand_emb = embedder.encode(text, convert_to_tensor=True)
        sim = util.cos_sim(query_emb, cand_emb).item()

        if sim >= SIMILARITY_THRESHOLD:
            matches.append({
                "topic": topic,
                "original_title": title,
                "matched_title": c.get("title", ""),
                "url": url,
                "published": pub_date,
                "similarity": round(sim, 3)
            })
            print(f"📝 Match (sim={sim:.3f}) → {url}")
        time.sleep(1)

# -----------------------------
# Save results
# -----------------------------
if matches:
    pd.DataFrame(matches).sort_values(by="similarity", ascending=False).to_csv(OUTPUT_FILE, index=False)
    print(f"\n✅ Done! {len(matches)} matches saved to {OUTPUT_FILE}")

if failed_queries:
    pd.Series(failed_queries).to_csv(FAILED_QUERY_LOG, index=False)
    print(f"⚠️ {len(failed_queries)} queries failed. Logged to {FAILED_QUERY_LOG}")

  df["date"] = pd.to_datetime(
  0%|          | 0/63 [00:00<?, ?it/s]


🔍 Searching for: Mapping Attacks on LGBTQ Rights in U.S. State Legislatures i
 ⏱ Pause to respect rate limi 
⚠️ Retrying with topic: 1


  0%|          | 0/63 [00:42<?, ?it/s]


KeyboardInterrupt: 