In [10]:
from atproto import Client, models
import os
import sys
import csv
import datetime
import time
from tqdm.notebook import tqdm, trange

In [11]:
# --- PARAMETERS ---
# Upper bound on the number of posts kept for each individual keyword.
# Passed as `max_posts` to download_posts_by_keywords() in the execution cell.
MAX_POSTS_PER_KEYWORD = 1000

In [12]:
# Directory that receives one CSV file per keyword group. It is created here,
# before any download starts, so it already exists when the CSVs are written.
OUTPUT_DIR = "./downloaded_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [13]:
# Date Range Filter (YYYY-MM-DD)
# Only posts whose creation timestamp falls within this range are saved.
# Both values are handed to download_posts_by_keywords() as start_date / end_date.
DATE_FILTER_START = "2019-01-01"
DATE_FILTER_END = "2026-01-01" # Set to a future date to include everything up to now

In [14]:
# Authenticated Bluesky Client Credentials
# Read from the BLUESKY_HANDLE / BLUESKY_PASSWORD environment variables.
# The fallback strings are placeholders only: the execution cell looks for
# them and prints a warning instead of starting the download.
HANDLE = os.getenv("BLUESKY_HANDLE", "yourhandle.bsky.social")
PASSWORD = os.getenv("BLUESKY_PASSWORD", "your-app-password")

In [None]:
# Keyword Groups by "Country"/Language
# Each group creates a separate dataset (one CSV named after the group key).
# "lang" is the language code sent with every search for that group and
# "keywords" are the search queries, issued one at a time. Keep the keywords
# free of leading/trailing whitespace: each string is used verbatim as the
# query and is also stored in the "keyword" column of the output.
KEYWORD_GROUPS = {
    "Global": { # English
        "lang": "en",
        "keywords": [
            "green transition", "greenhouse effect", "loss of biodiversity", "extreme weather events",
            "co2", "emissions", "global warming", "melting glaciers", "renewable energy",
            "misinformation", "catastrophe", "anthropogenic", "sustainable biofuels",
            "desertification", "deforestation", "clean energy",
            "sea level rise", "extinction"
        ]
    },
    "Spain": { # Spanish
        "lang": "es",
        "keywords": [
            "transición verde", "efecto invernadero", "pérdida de biodiversidad", "eventos climáticos extremos",
            "co2", "emisiones", "calentamiento global", "derretimiento de glaciares", "energía renovable",
            "desinformación", "catástrofe", "antropogénico", "biocombustibles sostenibles",
            "desertificación", "deforestación", "energía limpia",
            "aumento del nivel del mar", "extinción"
        ]
    },
    "Portugal": { # Portuguese
        "lang": "pt",
        "keywords": [
            "transição verde", "efeito estufa", "perda de biodiversidade", "eventos climáticos extremos",
            "co2", "emissões", "aquecimento global", "derretimento das geleiras", "energia renovável",
            "desinformação", "catástrofe", "antropogênico", "biocombustíveis sustentáveis",
            "desertificação", "desmatamento", "energia limpa",
            "aumento do nível do mar", "extinção"
        ]
    },
    "Italy": { # Italian
        "lang": "it",
        "keywords": [
            "transizione verde", "effetto serra", "perdita di biodiversità", "eventi meteorologici estremi",
            "co2", "emissioni", "riscaldamento globale", "scioglimento dei ghiacciai", "energia rinnovabile",
            "disinformazione", "catastrofe", "antropogenico", "biocarburanti sostenibili",
            "desertificazione", "deforestazione", "energia pulita",
            "innalzamento del livello del mare", "estinzione"
        ]
    },
    "Germany": { # German
        "lang": "de",
        "keywords": [
            "grüne wende", "treibhauseffekt", "biodiversitätsverlust", "extreme wetterereignisse",
            "co2", "emissionen", "erderwärmung", "gletscherschmelze", "erneuerbare energien",
            "fehlinformationen", "katastrophe", "anthropogen", "nachhaltige biokraftstoffe",
            "wüstenbildung", "abholzung", "saubere energie",
            "meeresspiegelanstieg", "aussterben"
        ]
    }
}

In [16]:
def _created_within_range(created_at, start_date, end_date):
    """Return True when the calendar day of `created_at` lies in [start_date, end_date]."""
    # Compare only the leading YYYY-MM-DD of the timestamp. Comparing the whole
    # timestamp string against a date-only bound would make every post written
    # on `end_date` sort after the bound and be dropped.
    # NOTE(review): the day is taken as written in the timestamp; no timezone
    # normalisation is attempted.
    day = (created_at or "")[:10]
    if start_date and day < start_date[:10]:
        return False
    if end_date and day > end_date[:10]:
        return False
    return True


def _post_to_row(post, lang, keyword):
    """Flatten one search result into the dictionary that becomes a CSV row."""
    # The last path segment of the post URI is reused as the final segment of
    # the bsky.app web address.
    rkey = post.uri.split('/')[-1]
    return {
        'url': f"https://bsky.app/profile/{post.author.handle}/post/{rkey}",
        'author_handle': post.author.handle,
        'text': post.record.text,
        'created_at': post.record.created_at,
        'like_count': post.like_count,
        'repost_count': post.repost_count,
        'reply_count': post.reply_count,
        # `lang` is the language that was requested, not one read from the post.
        'lang': lang,
        'keyword': keyword
    }


def download_posts_by_keywords(keywords: list[str], handle: str, password: str, max_posts: int = 100, lang: str = None, start_date: str = None, end_date: str = None, max_requests: int = 20):
    """
    Downloads posts from Bluesky based on a list of keywords.

    Logs in once, then searches each keyword in turn, paging through results
    until `max_posts` rows have been kept for that keyword, the results run
    out, or `max_requests` pages have been fetched.

    Args:
        keywords: Search queries, each issued separately.
        handle: Bluesky handle used for login.
        password: Password (app password) used for login.
        max_posts: Maximum number of rows kept per keyword.
        lang: Language code forwarded to the search; also written to each row.
        start_date: Optional inclusive lower bound, "YYYY-MM-DD".
        end_date: Optional inclusive upper bound, "YYYY-MM-DD".
        max_requests: Safety cap on search requests per keyword, so a keyword
            whose results are mostly outside the date range cannot page forever.

    Returns:
        A list of row dictionaries (url, author_handle, text, created_at,
        like_count, repost_count, reply_count, lang, keyword). A post matching
        several keywords appears once per keyword. Returns an empty list when
        the login fails.

    Errors raised while searching a keyword are printed and end that keyword
    only; rows collected so far are kept.
    """
    client = Client()
    try:
        client.login(handle, password)
    except Exception as e:
        print(f"Failed to login: {e}")
        return []

    all_found_posts = []
    # Always request full pages: date filtering happens client-side, so a
    # smaller page would only mean more round trips.
    # NOTE(review): 100 is presumably the API's page-size cap — confirm.
    batch_limit = 100

    # Outer progress bar: one tick per keyword.
    for keyword in tqdm(keywords, desc=f"Keywords ({lang})", leave=False):
        posts_for_keyword = []
        cursor = None
        request_count = 0

        # Inner progress bar: one tick per row kept for this keyword.
        with tqdm(total=max_posts, desc=f"Downloading '{keyword}'", leave=False) as post_pbar:

            while len(posts_for_keyword) < max_posts and request_count < max_requests:
                request_count += 1
                try:
                    params = models.AppBskyFeedSearchPosts.Params(
                        q=keyword,
                        limit=batch_limit,
                        cursor=cursor,
                        lang=lang
                    )
                    response = client.app.bsky.feed.search_posts(params)

                    if not (response and response.posts):
                        break  # No posts found

                    for post in response.posts:
                        if len(posts_for_keyword) >= max_posts:
                            break
                        if not _created_within_range(post.record.created_at, start_date, end_date):
                            continue
                        posts_for_keyword.append(_post_to_row(post, lang, keyword))
                        post_pbar.update(1)

                    # Continue only while there is a next page and room for more rows.
                    cursor = getattr(response, 'cursor', None)
                    if not cursor or len(posts_for_keyword) >= max_posts:
                        break

                    # Short pause between consecutive page requests.
                    time.sleep(0.5)

                except Exception as e:
                    print(f"Error searching for '{keyword}': {e}")
                    break

        all_found_posts.extend(posts_for_keyword)

    return all_found_posts

In [17]:
def save_posts_to_csv(posts, filename):
    """
    Write a list of post dictionaries to `filename` as a UTF-8 CSV file.

    The header row is taken from the keys of the first post. Before writing,
    every string value has its double quotes swapped for single quotes;
    non-string values are written unchanged. When `posts` is empty nothing is
    written and a notice is printed instead. An IOError raised while writing
    is reported with print() rather than propagated.
    """
    if not posts:
        print(f"No posts to save for {filename}.")
        return

    def _dequote(cell):
        # Only str cells are touched; counts and None pass through as they are.
        return cell.replace('"', "'") if isinstance(cell, str) else cell

    header = list(posts[0].keys())
    rows = [
        {column: _dequote(cell) for column, cell in record.items()}
        for record in posts
    ]

    try:
        with open(filename, mode='w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=header)
            out.writeheader()
            out.writerows(rows)
        print(f"Successfully saved {len(posts)} posts to '{filename}'")
    except IOError as e:
        print(f"Error saving to CSV: {e}")

In [18]:
# Execution
# Downloads every keyword group in KEYWORD_GROUPS and writes one CSV per group
# into OUTPUT_DIR. Nothing is downloaded while the credentials still contain
# the placeholder values.
if __name__ == '__main__':
    if "yourhandle" in HANDLE or "your-app-password" in PASSWORD:
        print("WARNING: It looks like you are using placeholder credentials.")
        print("Please update the 'parameters' cell above with your actual Handle and Password.")
    else:
        print("Starting multi-language download loop...")
        print(f"Max posts per keyword: {MAX_POSTS_PER_KEYWORD}")
        print(f"Date Range: {DATE_FILTER_START} to {DATE_FILTER_END}")

        # Top-level progress bar: one tick per dataset/language.
        for group_name, config in tqdm(KEYWORD_GROUPS.items(), desc="Processing Datasets"):
            posts = download_posts_by_keywords(
                config['keywords'],
                HANDLE,
                PASSWORD,
                max_posts=MAX_POSTS_PER_KEYWORD,
                lang=config['lang'],
                start_date=DATE_FILTER_START,
                end_date=DATE_FILTER_END
            )

            # os.path.join keeps the path valid whether or not OUTPUT_DIR ends
            # with a separator.
            output_filename = os.path.join(OUTPUT_DIR, f"{group_name}_bluesky_posts.csv")
            save_posts_to_csv(posts, output_filename)

Starting multi-language download loop...
Max posts per keyword: 1000
Date Range: 2019-01-01 to 2026-01-01


Processing Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Keywords (en):   0%|          | 0/18 [00:00<?, ?it/s]

Downloading 'green transition':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'greenhouse effect':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'loss of biodiversity':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'extreme weather events':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'co2':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'emissions':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'global warming':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'melting glaciers':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'renewable energy':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'misinformation':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'catastrophe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'anthropogenic ':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'sustainable biofuels':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desertification':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'deforestation':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'clean energy':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'sea level rise':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'extinction':   0%|          | 0/1000 [00:00<?, ?it/s]

Successfully saved 17285 posts to './downloaded_data/Global_bluesky_posts.csv'


Keywords (es):   0%|          | 0/18 [00:00<?, ?it/s]

Downloading 'transición verde':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'efecto invernadero':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'pérdida de biodiversidad':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'eventos climáticos extremos':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'co2':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'emisiones':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'calentamiento global':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'derretimiento de glaciares':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energía renovable':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desinformación':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'catástrofe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'antropogénico':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'biocombustibles sostenibles':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desertificación':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'deforestación':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energía limpia':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'aumento del nivel del mar':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'extinción':   0%|          | 0/1000 [00:00<?, ?it/s]

Successfully saved 12233 posts to './downloaded_data/Spain_bluesky_posts.csv'


Keywords (pt):   0%|          | 0/18 [00:00<?, ?it/s]

Downloading 'transição verde':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'efeito estufa':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'perda de biodiversidade':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'eventos climáticos extremos':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'co2':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'emissões':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'aquecimento global':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'derretimento das geleiras':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energia renovável':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desinformação':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'catástrofe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'antropogênico':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'biocombustíveis sustentáveis':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desertificação':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desmatamento':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energia limpa':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'aumento do nível do mar':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'extinção':   0%|          | 0/1000 [00:00<?, ?it/s]

Successfully saved 12409 posts to './downloaded_data/Portugal_bluesky_posts.csv'


Keywords (it):   0%|          | 0/18 [00:00<?, ?it/s]

Downloading 'transizione verde':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'effetto serra':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'perdita di biodiversità':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'eventi meteorologici estremi':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'co2':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'emissioni':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'riscaldamento globale':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'scioglimento dei ghiacciai':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energia rinnovabile':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'disinformazione':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'catastrofe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'antropogenico':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'biocarburanti sostenibili':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'desertificazione':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'deforestazione':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'energia pulita':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'innalzamento del livello del mare':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'estinzione':   0%|          | 0/1000 [00:00<?, ?it/s]

Successfully saved 6039 posts to './downloaded_data/Italy_bluesky_posts.csv'


Keywords (de):   0%|          | 0/18 [00:00<?, ?it/s]

Downloading 'grüne Wende':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Treibhauseffekt':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Biodiversitätsverlust':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'extreme Wetterereignisse':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'co2':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Emissionen':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Erderwärmung':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Gletscherschmelze':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'erneuerbare Energien':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Fehlinformationen':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Katastrophe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'anthropogen':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'nachhaltige Biokraftstoffe':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Wüstenbildung':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Abholzung':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'saubere Energie':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Meeresspiegelanstieg':   0%|          | 0/1000 [00:00<?, ?it/s]

Downloading 'Aussterben':   0%|          | 0/1000 [00:00<?, ?it/s]

Successfully saved 10766 posts to './downloaded_data/Germany_bluesky_posts.csv'
