## Scraping each topic's human content

Global Imports

In [None]:
import pandas as pd
import time
from gdeltdoc import GdeltDoc, Filters
from datetime import datetime, timedelta
import trafilatura
import re
import tiktoken
from openai import OpenAI
import pandas as pd
import time
from tqdm import tqdm

#### Economics

In [None]:
# Economic keywords to search for
topics = [
    "economic policy", "economy", "inflation",
    "interest rates", "recession", "employment",
    "unemployment", "federal reserve", "stock market",
]

# News domains whose articles are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com",
    "techcrunch.com", "theverge.com", "foxnews.com", "npr.org",
    "apnews.com", "aljazeera.com", "politico.com", "axios.com",
    "cbsnews.com", "abcnews.go.com",
]

## --- DATE RANGE CONFIGURATION ---

# Window scraped by this run; edit both dates by hand before the next run.
# Date range for this topic was 10/24 through 2/25
start_date = datetime.fromisoformat("2025-02-17")
end_date = datetime.fromisoformat("2025-03-01")
# Seconds to wait between requests to avoid rate limiting
sleep_time = 8

# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()
    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
econ_df  = final_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
econ_df = pd.concat([econ_df, final_df], ignore_index=True)

econ_df = econ_df.drop_duplicates(subset=["title"])

econ_df

Examining the data

In [None]:
econ_df["topic"].value_counts()

In [None]:
econ_df

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_econ an
# independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_econ = econ_df[econ_df.apply(is_suitable, axis=1)].copy()

Examining the data

In [None]:
df_econ

In [None]:
df_econ["title"].values

Saving the human econ data to a csv

In [None]:
df_econ.to_csv("human_econ.csv", index=False)

Examining summary stats for token usage on the article data

In [None]:
def token_encode(df):
    """Add a per-article token count to ``df`` and summarise the frame.

    Args:
        df: DataFrame with a "content" column. It is modified in place: a
            "token_count" column is added (or overwritten).

    Returns:
        The result of ``df.describe()`` after the column has been added.
    """
    gpt4_encoding = tiktoken.encoding_for_model("gpt-4")

    def _count_tokens(text):
        # Content is converted to str before it is encoded
        return len(gpt4_encoding.encode(str(text)))

    df["token_count"] = df["content"].apply(_count_tokens)
    summary = df.describe()
    return summary

In [None]:
token_encode(df_econ)

#### Political

In [None]:
# Political keywords to search for
topics = [
    "presidential election", "voter turnout", "election fraud", "ballot access",
    "voting rights", "political campaigns", "political debates", "swing states",
    "presidential inauguration", "Congress", "policy agenda", "executive orders",
]

# News domains whose articles are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com",
    "techcrunch.com", "theverge.com", "foxnews.com", "npr.org",
    "apnews.com", "aljazeera.com", "politico.com", "axios.com",
    "cbsnews.com", "abcnews.go.com",
]

# Window scraped by this run; edit both dates by hand before the next run.
# Date range for this topic was 10/24 through 2/25
start_date = datetime.fromisoformat("2025-02-26")
end_date = datetime.fromisoformat("2025-03-01")
# Seconds to wait between requests to avoid rate limiting
sleep_time = 8

# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()
    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_pol_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_pol_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
pol_df = final_pol_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
pol_df = pd.concat([pol_df, final_pol_df], ignore_index=True)

pol_df = pol_df.drop_duplicates(subset=["title"])

pol_df

Examining the data 

In [None]:
pol_df

In [None]:
pol_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_pol an
# independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_pol = pol_df[pol_df.apply(is_suitable, axis=1)].copy()

Examining the data

In [None]:
df_pol

In [None]:
df_pol["title"].values

Saving the pol df to a csv

In [None]:
df_pol.to_csv("human_pol.csv", index=False)

Examining summary stats for token usage on the article data

In [None]:
def token_encode(df):
    """Add a per-article token count to ``df`` and summarise the frame.

    Args:
        df: DataFrame with a "content" column. It is modified in place: a
            "token_count" column is added (or overwritten).

    Returns:
        The result of ``df.describe()`` after the column has been added.
    """
    gpt4_encoding = tiktoken.encoding_for_model("gpt-4")

    def _count_tokens(text):
        # Content is converted to str before it is encoded
        return len(gpt4_encoding.encode(str(text)))

    df["token_count"] = df["content"].apply(_count_tokens)
    summary = df.describe()
    return summary

In [None]:
token_encode(df_pol)

#### Global Conflicts

In [None]:
# Conflict keywords to search for
topics = [
    "global conflict", "Ukraine war", "Israel Hamas", "military escalation",
    "foreign policy", "diplomatic tensions", "international sanctions",
    "tariffs", "trade war", "United Nations",
]

# News domains whose articles are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com",
    "techcrunch.com", "theverge.com", "foxnews.com", "npr.org",
    "apnews.com", "aljazeera.com", "politico.com", "axios.com",
    "cbsnews.com", "abcnews.go.com",
]

# Window scraped by this run; edit both dates by hand before the next run.
# Date range for this topic was 10/24 through 2/25
start_date = datetime.fromisoformat("2025-02-15")
end_date = datetime.fromisoformat("2025-03-01")
# Seconds to wait between requests to avoid rate limiting
sleep_time = 8

# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()
    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_con_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_con_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
con_df = final_con_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
con_df = pd.concat([con_df, final_con_df], ignore_index=True)

con_df = con_df.drop_duplicates(subset=["title"])

con_df

Examining the topics

In [None]:
con_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_con an
# independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_con = con_df[con_df.apply(is_suitable, axis=1)].copy()

Examining the data

In [None]:
df_con

In [None]:
df_con["title"].values

Saving the conflict df to a csv

In [None]:
df_con.to_csv("human_con.csv", index=False)

Examining summary stats for token usage on the article data

In [None]:
def token_encode(df):
    """Add a per-article token count to ``df`` and summarise the frame.

    Args:
        df: DataFrame with a "content" column. It is modified in place: a
            "token_count" column is added (or overwritten).

    Returns:
        The result of ``df.describe()`` after the column has been added.
    """
    gpt4_encoding = tiktoken.encoding_for_model("gpt-4")

    def _count_tokens(text):
        # Content is converted to str before it is encoded
        return len(gpt4_encoding.encode(str(text)))

    df["token_count"] = df["content"].apply(_count_tokens)
    summary = df.describe()
    return summary

In [None]:
token_encode(df_con)

#### Artificial Intelligence

In [None]:
# AI keywords to search for
topics = [
    "artificial intelligence", "AI regulation", "machine learning", "AI ethics",
    "deepfakes", "data privacy", "social media moderation", "algorithmic bias",
    "Large Language Models", "Chat-GPT", "Deepseek",
]

# News domains whose articles are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com",
    "techcrunch.com", "theverge.com", "foxnews.com", "npr.org",
    "apnews.com", "aljazeera.com", "politico.com", "axios.com",
    "cbsnews.com", "abcnews.go.com",
]

# Window scraped by this run; edit both dates by hand before the next run.
# Date range for this topic was 10/24 through 3/25
start_date = datetime.fromisoformat("2024-10-15")
end_date = datetime.fromisoformat("2024-11-01")
# Seconds to wait between requests to avoid rate limiting
sleep_time = 8


# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()

    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_ai_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_ai_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
ai_df = final_ai_df

Examining the data

In [None]:
ai_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
ai_df = pd.concat([ai_df, final_ai_df], ignore_index=True)

ai_df = ai_df.drop_duplicates(subset=["title"])

ai_df

Examining the resulting topic counts

In [None]:
ai_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_ai an
# independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_ai = ai_df[ai_df.apply(is_suitable, axis=1)].copy()

Examines the resulting data

In [None]:
df_ai

In [None]:
df_ai["title"].values

Saving the AI df to a csv

In [None]:
df_ai.to_csv("human_ai.csv", index = False)

Applying the previous tokenizing summary statistics function

In [None]:
token_encode(df_ai)

#### Climate

In [None]:
# Climate keywords to search for
topics = [
    "climate change", "climate policy", "global warming",
    "carbon emissions", "green energy", "climate crisis",
    "extreme weather", "renewable energy", "natural disaster",
]


# News domains whose articles are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com",
    "techcrunch.com", "theverge.com", "foxnews.com", "npr.org",
    "apnews.com", "aljazeera.com", "politico.com", "axios.com",
    "cbsnews.com", "abcnews.go.com",
]


# Window scraped by this run; edit both dates by hand before the next run.
# Date range for this topic was 10/24 through 2/25
start_date = datetime.fromisoformat("2025-02-15")
end_date = datetime.fromisoformat("2025-03-01")
# Seconds to wait between requests to avoid rate limiting
sleep_time = 8


# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()

    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_climate_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_climate_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
climate_df = final_climate_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
climate_df = pd.concat([climate_df, final_climate_df], ignore_index=True)

climate_df = climate_df.drop_duplicates(subset=["title"])

climate_df

Examining the resulting topic counts

In [None]:
climate_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_climate
# an independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_climate = climate_df[climate_df.apply(is_suitable, axis=1)].copy()

Examining the data

In [None]:
df_climate

In [None]:
df_climate["title"].values

Saving the climate df to a csv

In [None]:
df_climate.to_csv("human_climate.csv", index = False)

Applying the previous tokenizing summary statistics function

In [None]:
token_encode(df_climate)

#### Health

In [None]:
# List of Health topics
# NOTE: the last keyword was previously misspelled "opiod epidemic", so the
# search was run on the typo; rows gathered by earlier runs carry that label.
topics = [
     "Cancer research", "medical breakthroughs", "Obesity", "Vaccine development", "Infectious diseases", 
     "Mental health crisis", "Health disparities", "opioid epidemic"
]

# Trusted news domains: only articles from these domains are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com", "techcrunch.com",
    "theverge.com", "foxnews.com", "npr.org", "apnews.com", "aljazeera.com",
    "politico.com", "axios.com", "cbsnews.com", "abcnews.go.com"
]

# Set date range and manually change after each run
# Date range for this topic was 9/24 through 3/25
start_date = datetime.strptime("2024-09-01", "%Y-%m-%d") 
end_date = datetime.strptime("2024-09-10", "%Y-%m-%d") 
# Seconds between each request to avoid rate limiting
sleep_time = 8

# Generate daily date ranges from start_date to end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` to ``end`` into consecutive one-day windows.

    Args:
        start: datetime at which the first window begins.
        end: datetime bound; a window is produced as long as its start is
            strictly before this value.

    Returns:
        A list of ``(window_start, window_end)`` tuples formatted as
        ``YYYY-MM-DD`` strings; empty when ``start`` is not before ``end``.
    """
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)

    def _windows():
        lower = start
        while lower < end:
            upper = lower + one_day
            yield lower.strftime(day_format), upper.strftime(day_format)
            lower = upper

    return list(_windows())

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the article page.

    Returns:
        Whatever ``trafilatura.extract`` returns for the downloaded page, or
        ``None`` when nothing was downloaded or an error was raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that fails is skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scrape can still be interrupted.
        return None
    # Nothing was downloaded
    return None

# --- MAIN SCRAPER ---

# Set up the GDELT client, the list that collects result frames, and the
# one-day windows that will be queried
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)

for topic in topics:
    for start, end in date_ranges:
        # Each (keyword, day) query gets at most three attempts
        for attempt in range(1, 4):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                time.sleep(sleep_time)

                # Tag non-empty results with the keyword that produced them
                if not articles.empty:
                    articles["topic"] = topic
                    all_articles.append(articles)
                break

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice as long after a failure before trying again
                time.sleep(sleep_time * 2)

# --- FILTER AND SAVE RESULTS ---

if all_articles:
    # Combine every per-query result into a single df
    combined = pd.concat(all_articles, ignore_index=True)
    # Keep only English articles from the trusted domains whose title is
    # longer than 30 characters. The same URL can be returned for several
    # keywords and day windows, so keep its first occurrence only and
    # download each page once.
    filtered = combined[
        (combined["language"] == "English") &
        (combined["domain"].isin(trusted_sources)) &
        (combined["title"].str.len() > 30)
    ].drop_duplicates(subset=["url"]).copy()

    # Fetch the full article text for every remaining URL
    filtered["content"] = filtered["url"].apply(get_article_text)
    # Drop the rows for which no text came back
    final_health_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]
else:
    # No search returned anything: keep the expected columns, with no rows
    final_health_df = pd.DataFrame(columns=["title", "url", "content", "topic"])

Run this cell only once, after the first run of the scraping code above; it initialises the DataFrame that later runs are concatenated onto.

In [None]:
health_df = final_health_df

Concats the articles from the different date ranges together. Block needs to be run each time the article generation code is run 

In [None]:
health_df = pd.concat([health_df, final_health_df], ignore_index=True)

health_df = health_df.drop_duplicates(subset=["title"])

health_df

Examining the resulting topic counts

In [None]:
health_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether an article row passes the title and content checks.

    Args:
        row: Mapping-like row with 'title' and 'content' entries; both are
            converted to stripped strings before being checked.

    Returns:
        False when any rejection rule below matches, True otherwise.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Title openings that indicate the article content isn't journalistic
    blocked_prefixes = ('opinion', 'analysis', 'explainer', 'live', 'editorial', 'fact check', 'why ', 'how ', 'what ', 'who ')
    # Non-journalistic phrases anywhere in the title
    blocked_phrases = ['this week', 'meet the press', 'face the nation', 'moderated by', 'featuring', "everything to know", "what you need to know", "pros and cons",
    "how to make the most of", "top trends in", "step-by-step guide", "ways to", "reasons to", "review", "word of the year", "transcript", "photo collection", "briefing"]

    rejected = (
        len(headline.split()) < 5                        # title has too few words
        or '?' in headline                               # title is a question
        or headline_lower.startswith(blocked_prefixes)
        or len(body.split()) < 100                       # content is too short
        or body.count('.') < 3                           # too few sentences
        or re.match(r'^\d+\s', headline) is not None     # title starts with a number
        or any(phrase in headline_lower for phrase in blocked_phrases)
    )
    return not rejected
# Keep only the rows that pass is_suitable. The explicit copy makes df_health
# an independent frame, so adding a column to it later (token_encode does this)
# does not trigger pandas' SettingWithCopyWarning.
df_health = health_df[health_df.apply(is_suitable, axis=1)].copy()

Examining the data

In [None]:
df_health

In [None]:
df_health["title"].values

Saving the Health df to a csv

In [None]:
df_health.to_csv("human_health.csv", index = False)

Applying the previous tokenizing summary statistics function

In [None]:
token_encode(df_health)

#### Law

In [None]:
# List of Law topics
# NOTE: the last keyword was previously misspelled "Constitutional Ammendment",
# so the search was run on the typo; rows gathered by earlier runs carry that
# label.
topics = [
  "Supreme Court ruling", "abortion ban", "student debt decision", "judicial review", "constitutional law", "civil rights cases", "Roe V. Wade", "Constitutional Amendment",
]

# Trusted news domains: only articles from these domains are kept
trusted_sources = [
    "bbc.com", "reuters.com", "cnn.com", "theguardian.com", "techcrunch.com",
    "theverge.com", "foxnews.com", "npr.org", "apnews.com", "aljazeera.com",
    "politico.com", "axios.com", "cbsnews.com", "abcnews.go.com"
]

# Set date range and manually change after each run
# Date range for this topic was 10/24 through 3/25
start_date = datetime.strptime("2025-02-01", "%Y-%m-%d")
end_date = datetime.strptime("2025-02-15", "%Y-%m-%d") 
# Seconds between each request to avoid rate limiting
sleep_time = 8 


# Build the one-day (start, end) windows between start_date and end_date
def generate_daily_ranges(start, end):
    """Split the span from ``start`` up to ``end`` into consecutive one-day windows.

    Args:
        start: First day of the span (a ``datetime``).
        end: Exclusive upper bound; a window is produced only for days that
            are strictly earlier than this value.

    Returns:
        A list of ``(day, next_day)`` tuples, both formatted as
        ``YYYY-MM-DD`` strings. Empty when ``start`` is not before ``end``.
    """
    one_day = timedelta(days=1)

    def _each_day():
        # Walk forward one day at a time until the upper bound is reached
        day = start
        while day < end:
            yield day
            day += one_day

    return [
        (day.strftime("%Y-%m-%d"), (day + one_day).strftime("%Y-%m-%d"))
        for day in _each_day()
    ]

# Extract the full article from the url using trafilatura
def get_article_text(url):
    """Download ``url`` and return the article text extracted from it.

    Args:
        url: Address of the article page.

    Returns:
        The result of ``trafilatura.extract`` for the downloaded page, or
        ``None`` when nothing was downloaded or the download/extraction raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return None
        return trafilatura.extract(downloaded)
    except Exception:
        # Best effort: a page that cannot be fetched or parsed is skipped.
        # Exception (not a bare except) is caught so that Ctrl-C and
        # SystemExit can still stop a long scraping run.
        return None

# --- MAIN SCRAPER ---

# Initialize the GDELT wrapper
gd = GdeltDoc()
all_articles = []
date_ranges = generate_daily_ranges(start_date, end_date)
# Number of attempts made for one (topic, day) query before giving up on it
max_retries = 3

for topic in topics:
    for start, end in date_ranges:
        for attempt in range(1, max_retries + 1):
            try:
                filters = Filters(keyword=topic, start_date=start, end_date=end)
                articles = gd.article_search(filters)
                # Pause after every request to stay under the rate limit
                time.sleep(sleep_time)

                if not articles.empty:
                    # Tag each result with the keyword that produced it
                    articles["topic"] = topic
                    all_articles.append(articles)

            except Exception as e:
                print(f"Retry {attempt}: Failed {topic} from {start} to {end} - {e}")
                # Wait twice the usual pause before the next attempt
                time.sleep(sleep_time * 2)
            else:
                # Query succeeded (with or without results): move to the next day
                break
        else:
            # Every attempt failed; report it so the gap in the data is not silent
            print(f"Giving up on {topic} from {start} to {end} after {max_retries} attempts")

# --- FILTER AND SAVE RESULTS ---

if not all_articles:
    # Nothing was scraped: fall back to an empty frame with the expected columns
    final_law_df = pd.DataFrame(columns=["title", "url", "content", "topic"])
else:
    # Stack every per-query result into one DataFrame
    combined = pd.concat(all_articles, ignore_index=True)

    # Keep English articles from the trusted domains whose title is longer
    # than 30 characters
    is_english = combined["language"] == "English"
    is_trusted = combined["domain"].isin(trusted_sources)
    has_long_title = combined["title"].str.len() > 30
    filtered = combined[is_english & is_trusted & has_long_title].copy()

    # Fetch the full text for each remaining URL, then drop rows without content
    filtered["content"] = filtered["url"].apply(get_article_text)
    final_law_df = filtered.dropna(subset=["content"])[["title", "url", "content", "topic"]]

Run this cell only once, after the first run of the code above, so that `law_df` exists and the concatenation below is possible

In [None]:
law_df = final_law_df

Concatenates the articles from the different date ranges. This block needs to be run each time the scraping code above is run.

In [None]:
# Append the newest batch to the running total, then keep one row per title
law_df = pd.concat([law_df, final_law_df], ignore_index=True).drop_duplicates(subset=["title"])

law_df

Examines the resulting topic value counts

In [None]:
law_df["topic"].value_counts()

This code filters out unsuitable articles based on specific characteristics

In [None]:
def is_suitable(row):
    """Decide whether a scraped row looks like a regular news article.

    Args:
        row: Mapping-like record with ``'title'`` and ``'content'`` entries;
            both are converted with ``str`` and stripped before checking.

    Returns:
        ``False`` when any rejection rule below matches, otherwise ``True``.
    """
    headline = str(row['title']).strip()
    body = str(row['content']).strip()
    headline_lower = headline.lower()

    # Leading words that mark the piece as something other than a news report
    rejected_prefixes = (
        'opinion', 'analysis', 'explainer', 'live', 'editorial',
        'fact check', 'why ', 'how ', 'what ', 'who ',
    )
    # Phrases that signal non-journalistic content anywhere in the title.
    # NOTE(review): this is plain substring matching, so 'review' also rejects
    # titles containing "judicial review" (one of the Law search topics) -
    # confirm that this is intended.
    rejected_phrases = [
        'this week', 'meet the press', 'face the nation', 'moderated by',
        'featuring', "everything to know", "what you need to know",
        "pros and cons", "how to make the most of", "top trends in",
        "step-by-step guide", "ways to", "reasons to", "review",
        "word of the year", "transcript", "photo collection", "briefing",
    ]

    title_problems = (
        len(headline.split()) < 5,                    # too few words
        '?' in headline,                              # phrased as a question
        headline_lower.startswith(rejected_prefixes),  # non-news label up front
        re.match(r'^\d+\s', headline) is not None,    # starts with a number
        any(phrase in headline_lower for phrase in rejected_phrases),
    )
    content_problems = (
        len(body.split()) < 100,   # fewer than 100 words
        body.count('.') < 3,       # fewer than 3 periods, i.e. too few sentences
    )

    return not (any(title_problems) or any(content_problems))
# Keep only the rows that pass is_suitable. The .copy() makes df_law an
# independent frame, so columns added to it later are not written to a slice
# of law_df (which triggers pandas' SettingWithCopyWarning).
df_law = law_df[law_df.apply(is_suitable, axis=1)].copy()

Examines the data

In [None]:
df_law

In [None]:
df_law["content"].values

Saves the Law df to a csv

In [None]:
df_law.to_csv("human_law.csv", index = False)

Applying the previous tokenizing summary statistics function

In [None]:
token_encode(df_law)

## Generating the AI text based on human title content

#### Economic AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 850,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure gpt_text column exists
if "gpt_text" not in df_econ.columns:
    df_econ["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_econ.iterrows(), total=len(df_econ)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_econ.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_econ.to_csv("econ_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_econ.to_csv("econ_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'econ_gpt_text_final.csv'.")


Examining the first and second articles that were generated

In [None]:
# Print the character count of the first non-missing generated article
first_article = df_econ["gpt_text"].dropna().iloc[0]
print(len(first_article))

In [None]:
# Despite the variable name, iloc[1] picks the second non-missing generated article
first_article = df_econ["gpt_text"].dropna().iloc[1]
# print(first_article)
# Print its character count
print(len(first_article))

Recreating the token summarization function but for the gpt generated text

In [None]:
def token_encode_gpt(df):
    """Add a ``token_count_ai`` column for the generated text and summarize ``df``.

    Args:
        df: DataFrame with a ``gpt_text`` column; it is modified in place.

    Returns:
        The result of ``df.describe()`` after the new column has been added.
    """
    # NOTE(review): the articles are generated with "gpt-4o" elsewhere in this
    # file while the tokenizer is looked up for "gpt-4" - confirm this is intended.
    encoding = tiktoken.encoding_for_model("gpt-4")

    def _count_tokens(text):
        # Rows whose generation failed hold None/NaN. Leave them as NaN instead
        # of tokenizing the literal string "None", which would be counted as a
        # real (tiny) article and skew the summary statistics.
        if pd.isna(text):
            return float("nan")
        return len(encoding.encode(str(text)))

    df["token_count_ai"] = df["gpt_text"].apply(_count_tokens)

    return df.describe()

token_encode_gpt(df_econ)

#### Political AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_pol.columns:
    df_pol["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_pol.iterrows(), total=len(df_pol)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_pol.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_pol.to_csv("pol_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_pol.to_csv("pol_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'pol_gpt_text_final.csv'.")

Examine first article

In [None]:
first_article = df_pol["gpt_text"].dropna().iloc[0]
print(first_article)

Examine the resulting df

In [None]:
df_pol

Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_pol)

#### Global Conflicts AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 950,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_con.columns:
    df_con["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_con.iterrows(), total=len(df_con)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_con.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_con.to_csv("conflicts_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_con.to_csv("conflicts_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'conflicts_gpt_text_final.csv'.")


Examining the first articles generated

In [None]:
first_article = df_con["gpt_text"].dropna().iloc[0]
print(first_article)

Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_con)

In [None]:
# df_ai = pd.read_csv("human_ai.csv")

#### Artificial Intelligence AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_ai.columns:
    df_ai["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_ai.iterrows(), total=len(df_ai)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_ai.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_ai.to_csv("ai_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_ai.to_csv("ai_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'ai_gpt_text_final.csv'.")

Examine the first article generated

In [None]:
first_article = df_ai["gpt_text"].dropna().iloc[0]
print(first_article)

Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_ai)

#### Climate AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1050,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_climate.columns:
    df_climate["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_climate.iterrows(), total=len(df_climate)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_climate.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_climate.to_csv("climate_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_climate.to_csv("climate_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'climate_gpt_text_final.csv'.")

Examining the first article generated

In [None]:
first_article = df_climate["gpt_text"].dropna().iloc[0]
print(first_article)

Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_climate)

#### Health AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1050,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_health.columns:
    df_health["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_health.iterrows(), total=len(df_health)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_health.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_health.to_csv("health_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_health.to_csv("health_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'health_gpt_text_final.csv'.")


Examining the first article generated

In [None]:
first_article = df_health["gpt_text"].dropna().iloc[0]
print(first_article)

Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_health)

#### Law AI Articles

In [None]:
# Initialize OpenAI client
# The key is read from the OPENAI_API_KEY environment variable so that a real
# key never has to be typed into the notebook; the placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your personal api key"))

# Define function that'll prompt ChatGPT-4o to write the article
def generate_gpt_article(headline):
    """Ask gpt-4o to write a news article for ``headline``.

    Args:
        headline: Headline text that is inserted into the prompt.

    Returns:
        The stripped text of the first completion choice, or ``None`` when
        the request or the response handling raises (the error is printed).
    """
    instructions = (
        "You are a journalist writing a full news article based only on the headline below. "
        "The event occurred after April 2024, and you have no access to real-world information about what actually happened. "
        "Do not include a byline or dateline — just write the full article text. "
    )
    prompt = f'{instructions}Headline: "{headline}"\n\nArticle:'

    # Request settings: one user message, capped output length, temperature 0.7
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1100,
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        article = response.choices[0].message.content
        return article.strip()
    except Exception as e:
        # Report the failure with a shortened headline and carry on
        snippet, reason = headline[:60], str(e)
        print(f" Error on headline: {snippet}... — {reason}")
        return None

# Make sure 'gpt_text' column exists
if "gpt_text" not in df_law.columns:
    df_law["gpt_text"] = None

# Loop to generate GPT articles
# tqdm provides a status bar
generated = 0
for idx, row in tqdm(df_law.iterrows(), total=len(df_law)):
    # Only rows that still lack generated text and have a title are sent
    if pd.isna(row["gpt_text"]) and pd.notna(row["title"]):
        gpt_article = generate_gpt_article(row["title"])
        df_law.at[idx, "gpt_text"] = gpt_article
        generated += 1

        # Save progress every 50 generated articles. The requests are counted
        # instead of testing the index label, because index labels need not be
        # consecutive integers and would make the checkpoints irregular.
        if generated % 50 == 0:
            df_law.to_csv("law_with_gpt_text_partial.csv", index=False)

        # Pause between API calls
        time.sleep(1.1)

# Final save to a csv
df_law.to_csv("law_gpt_text_final.csv", index=False)
print("All GPT articles generated and saved under 'law_gpt_text_final.csv'.")


Applying the gpt tokenization function

In [None]:
token_encode_gpt(df_law)