In [0]:
import requests

# Stack Overflow API key, read from the Databricks secret scope 'so_api_scope'
# (the secret was uploaded beforehand with databricks-cli, run from its docker image).
api_key = dbutils.secrets.get('so_api_scope', 'so_api_key')

def fetch_page(page, timeout=30):
    """Fetch one page of the most recently created Stack Overflow questions.

    Calls the Stack Exchange ``/2.3/questions`` endpoint with 100 questions
    per page, newest first (``sort=creation``, ``order=desc``).

    Args:
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            forever.

    Returns:
        The list stored under ``"items"`` in the JSON response, or an empty
        list when the response has no ``"items"`` key.

    Raises:
        RuntimeError: If the API answers with a non-success HTTP status
            (for example when the daily quota is exhausted). Previously such
            responses were silently turned into an empty list, which made
            failed pages indistinguishable from genuinely empty ones.
        requests.RequestException: On connection problems or timeouts.
    """
    url = "https://api.stackexchange.com/2.3/questions"
    params = {
        "page": page,
        "pagesize": 100,
        "order": "desc",
        "sort": "creation",
        "site": "stackoverflow",
        "key": api_key
    }
    response = requests.get(url, params=params, timeout=timeout)
    if not response.ok:
        # Deliberately not raise_for_status(): its message embeds the full
        # request URL, which contains the API key as a query parameter.
        raise RuntimeError(
            f"Stack Exchange API request for page {page} failed with "
            f"HTTP {response.status_code}: {response.text[:500]}"
        )
    return response.json().get("items", [])

In [0]:
# example: fetch the first page (up to 100 of the most recently created questions)
fetch_page(1)

{'items': [{'tags': ['css', 'dom', 'google-chrome-extension', 'gmail', 'inboxsdk'], 'owner': {'account_id': 3016866, 'reputation': 863, 'user_id': 2558764, 'user_type': 'registered', 'accept_rate': 72, 'profile_image': 'https://www.gravatar.com/avatar/81db839a7ab31ef6790518a9851782af?s=256&d=identicon&r=PG', 'display_name': 'kecman', 'link': 'https://stackoverflow.com/users/2558764/kecman'}, 'is_answered': False, 'view_count': 1, 'answer_count': 0, 'score': 0, 'last_activity_date': 1759375271, 'creation_date': 1759375271, 'question_id': 79780605, 'content_license': 'CC BY-SA 4.0', 'link': 'https://stackoverflow.com/questions/79780605/adding-custom-div-to-compose-window-with-inboxsdk-makes-issues-with-height-when', 'title': 'Adding custom div to compose window with InboxSDK makes issues with height when going to fullscreen mode of compose window'}, {'tags': ['python', 'web-scraping', 'youtube', 'youtube-data-api'], 'owner': {'account_id': 44168359, 'reputation': 1, 'user_id': 31612690, 

[{'tags': ['css', 'dom', 'google-chrome-extension', 'gmail', 'inboxsdk'],
  'owner': {'account_id': 3016866,
   'reputation': 863,
   'user_id': 2558764,
   'user_type': 'registered',
   'accept_rate': 72,
   'profile_image': 'https://www.gravatar.com/avatar/81db839a7ab31ef6790518a9851782af?s=256&d=identicon&r=PG',
   'display_name': 'kecman',
   'link': 'https://stackoverflow.com/users/2558764/kecman'},
  'is_answered': False,
  'view_count': 1,
  'answer_count': 0,
  'score': 0,
  'last_activity_date': 1759375271,
  'creation_date': 1759375271,
  'question_id': 79780605,
  'content_license': 'CC BY-SA 4.0',
  'link': 'https://stackoverflow.com/questions/79780605/adding-custom-div-to-compose-window-with-inboxsdk-makes-issues-with-height-when',
  'title': 'Adding custom div to compose window with InboxSDK makes issues with height when going to fullscreen mode of compose window'},
 {'tags': ['python', 'web-scraping', 'youtube', 'youtube-data-api'],
  'owner': {'account_id': 44168359,
  

In [0]:
from pyspark.sql import functions as F
import json

def process_page(page):
    """Fetch one page of questions and turn it into a Spark DataFrame.

    Args:
        page: Page number passed straight to ``fetch_page``.

    Returns:
        A DataFrame with ``question_id``, ``creation_date``,
        ``creation_period`` and ``last_activity_date`` as the first four
        columns (all cast to long), followed by every other inferred column,
        or ``None`` when ``fetch_page`` returns no items.
    """
    items = fetch_page(page)
    if not items:
        return None

    # Nested fields that are serialised to JSON strings before the DataFrame
    # is built, so Spark sees them as plain string columns.
    # Defined once here instead of being rebuilt for every item.
    complex_fields = ('migrated_to', 'migrated_from', 'posted_by_collectives', 'owner', 'closed_details')
    for item in items:
        for field in complex_fields:
            if item.get(field) is not None:
                item[field] = json.dumps(item[field])

    leading_columns = ["question_id", "creation_date", "creation_period", "last_activity_date"]

    # Dates are unix time (seconds since 1970-01-01). Dividing by 1e6 puts
    # questions into buckets of 1,000,000 seconds (about 11.6 days); this
    # creation_period column is what the table is partitioned by later.
    df = (
        spark.createDataFrame(items)
        .withColumn("question_id", F.col("question_id").cast("long"))
        .withColumn("creation_date", F.col("creation_date").cast("long"))
        .withColumn("creation_period", F.floor(F.col("creation_date") / 1e6).cast("long"))
        .withColumn("last_activity_date", F.col("last_activity_date").cast("long"))
    )

    # Put the key columns first and keep the remaining ones in inferred order.
    remaining_columns = [c for c in df.columns if c not in leading_columns]
    return df.select(*leading_columns, *remaining_columns)

In [0]:
# Preview page 1 as a DataFrame. process_page returns None for an empty page,
# in which case this call would fail with an AttributeError.
process_page(1).display()

question_id,creation_date,creation_period,last_activity_date,answer_count,content_license,is_answered,link,owner,score,tags,title,view_count,last_edit_date,closed_date,closed_reason,accepted_answer_id
79780634,1759381831,1759,1759381831,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780634/apt-update-fails-with-conflicting-values-set-for-option-signed-by-after-adding,"Map(reputation -> 1, display_name -> Mahavir vikrambhai Dodiya, user_id -> 31613051, account_id -> 43916646, user_type -> registered, link -> https://stackoverflow.com/users/31613051/mahavir-vikrambhai-dodiya, profile_image -> https://www.gravatar.com/avatar/20b395c743bde7b37ea5ac7dde88968d?s=256&d=identicon&r=PG&f=y&so-version=2)",0,"List(linux, ubuntu)","apt update fails with ""Conflicting values set for option Signed-By"" after adding ROS 2 Jazzy repository on Ubuntu 24.04",2,,,,
79780632,1759381729,1759,1759381729,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780632/what-is-the-use-case-for-tanstack-start-useserverfn-hook,"Map(reputation -> 41028, accept_rate -> 75, display_name -> Michał Turczyn, user_id -> 7132550, account_id -> 9606741, user_type -> registered, link -> https://stackoverflow.com/users/7132550/micha%c5%82-turczyn, profile_image -> https://graph.facebook.com/1063981417044495/picture?type=large)",0,"List(reactjs, tanstackreact-query, tanstack, tanstack-start)",What is the use case for TanStack Start useServerFn hook,3,,,,
79780631,1759381584,1759,1759381584,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780631/how-to-remove-the-arrow-in-the-shadcn-tooltip-component-without-changing-the-mai,"Map(reputation -> 29, display_name -> m112120, user_id -> 30132998, account_id -> 41126044, user_type -> registered, link -> https://stackoverflow.com/users/30132998/m112120, profile_image -> https://www.gravatar.com/avatar/75964ea76c37bcc48af3b05b178320e7?s=256&d=identicon&r=PG&f=y&so-version=2)",0,List(shadcnui),How to remove the arrow in the shadcn tooltip component without changing the main imported code?,3,,,,
79780627,1759381033,1759,1759381054,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780627/get-monthly-descending-date,"Map(reputation -> 1, display_name -> San Pedro Rhenz Idol II, user_id -> 28289568, account_id -> 37523449, user_type -> registered, link -> https://stackoverflow.com/users/28289568/san-pedro-rhenz-idol-ii, profile_image -> https://www.gravatar.com/avatar/e0c0f08c8e181357b467d0e28f9f13a6?s=256&d=identicon&r=PG&f=y&so-version=2)",0,"List(google-sheets, excel-formula, google-sheets-formula, formula)",Get Monthly Descending Date,5,1759381054.0,,,
79780625,1759380637,1759,1759380637,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780625/vs-code-remote-ssh-2fa-issue,"Map(reputation -> 13, display_name -> Ab_Ad , user_id -> 19476808, account_id -> 25719653, user_type -> registered, link -> https://stackoverflow.com/users/19476808/ab-ad, profile_image -> https://lh3.googleusercontent.com/a/AATXAJxBAEmG3gCdTq7kGLpSSnqxTULQNnUaN2ldfTMdKw=k-s256)",0,"List(linux, visual-studio-code, vscode-remote-ssh)",VS code Remote SSH: 2FA issue,8,,,,
79780620,1759379061,1759,1759380101,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780620/access-to-external-ip-from-docker-container-via-linux-vm-via-host-windows-11-usi,"Map(reputation -> 1, display_name -> Сергей Новиков, user_id -> 12337598, account_id -> 17052216, user_type -> registered, link -> https://stackoverflow.com/users/12337598/%d0%a1%d0%b5%d1%80%d0%b3%d0%b5%d0%b9-%d0%9d%d0%be%d0%b2%d0%b8%d0%ba%d0%be%d0%b2, profile_image -> https://lh3.googleusercontent.com/-tcdCfbnNAnM/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rf9HcyXOqF8YIvpv4WSt7BKmfFNpQ/s256-rj/photo.jpg)",0,"List(linux, windows, docker, vagrant, virtualbox)",Access to external IP from docker container via Linux VM via host Windows 11 using Vagrant,20,1759380101.0,,,
79780616,1759378232,1759,1759379100,1,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780616/vba-in-excel-to-auto-send-emails-skip-over-recipients-without-attachments,"Map(reputation -> 11, display_name -> user31611822, user_id -> 31611822, account_id -> 44166461, user_type -> registered, link -> https://stackoverflow.com/users/31611822/user31611822, profile_image -> https://www.gravatar.com/avatar/d20e0ee407ea121f39817575aeee8f1a?s=256&d=identicon&r=PG&f=y&so-version=2)",1,"List(excel, vba, email, automation, outlook)",VBA in Excel to auto send emails skip over recipients without attachments,16,,,,
79780615,1759377953,1759,1759377953,0,,False,https://stackoverflow.com/questions/79780615/self-hosting-map-server-for-real-time-tracker-web-app,"Map(reputation -> 1, display_name -> nzfaaaaaa, user_id -> 31319665, account_id -> 43596168, user_type -> registered, link -> https://stackoverflow.com/users/31319665/nzfaaaaaa, profile_image -> https://www.gravatar.com/avatar/4aec6b2099a7dcc4875d9ff68a773fab?s=256&d=identicon&r=PG&f=y&so-version=2)",-2,"List(server, self-hosting, mapserver)",Self-Hosting Map Server for Real-Time Tracker Web App,22,,1759379474.0,Not suitable for this site,
79780613,1759377024,1759,1759377024,0,CC BY-SA 4.0,False,https://stackoverflow.com/questions/79780613/why-xero-prompts-for-company-selection-even-when-using-only-openid-email,"Map(reputation -> 94, display_name -> Thomas Lo, user_id -> 5388148, account_id -> 7033131, user_type -> registered, link -> https://stackoverflow.com/users/5388148/thomas-lo, profile_image -> https://graph.facebook.com/10153283560269037/picture?type=large)",0,List(xero-api),"Why Xero Prompts for Company Selection Even When Using Only 'openid', 'email', 'profile' Scopes for Login",21,,,,
79780612,1759376828,1759,1759379151,1,CC BY-SA 4.0,True,https://stackoverflow.com/questions/79780612/how-do-i-remove-duplicate-column-in-same-row-excel,"Map(reputation -> 1, display_name -> lRosettaStoned, user_id -> 31574244, account_id -> 44094443, user_type -> registered, link -> https://stackoverflow.com/users/31574244/lrosettastoned, profile_image -> https://www.gravatar.com/avatar/3140e2b1f7862a9bd1b27d244fedbc7b?s=256&d=identicon&r=PG&f=y&so-version=2)",0,"List(excel, excel-formula, multiple-columns)",How do i remove duplicate column in same row Excel?,18,1759376889.0,,,


In [0]:
%sql
-- Create the bronze database if it is missing, then register bronze.questions
-- as a Delta table stored at /mnt/bronze/questions (no column list is given).
-- NOTE(review): the traceback saved below this cell quotes a different version
-- of the statement (it also creates silver/gold and uses PARTITIONED BY), so the
-- recorded output looks stale relative to this code - re-run to confirm.
CREATE DATABASE IF NOT EXISTS bronze;
CREATE TABLE IF NOT EXISTS bronze.questions
USING DELTA
LOCATION '/mnt/bronze/questions';



[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4817661498311894>, line 1[0m
[0;32m----> 1[0m get_ipython()[38;5;241m.[39mrun_cell_magic([38;5;124m'[39m[38;5;124msql[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m'[39m, [38;5;124m"[39m[38;5;124mCREATE DATABASE IF NOT EXISTS bronze;[39m[38;5;130;01m\n[39;00m[38;5;124mCREATE DATABASE IF NOT EXISTS silver;[39m[38;5;130;01m\n[39;00m[38;5;124mCREATE DATABASE IF NOT EXISTS gold;[39m[38;5;130;01m\n[39;00m[38;5;130;01m\n[39;00m[38;5;124mCREATE TABLE IF NOT EXISTS bronze.questions[39m[38;5;130;01m\n[39;00m[38;5;124mUSING DELTA[39m[38;5;130;01m\n[39;00m[38;5;124mPARTITIONED BY (creation_period)[39m[38;5;130;01m\n[39;00m[38;5;124mLOCATION [39m[38;5;124m'[39m[38;5;124m/mnt/bronze/questions[39m[38;5;124m'[39m[38;5;124m;[39m[38;5;130;01m\n[39;00m[38;5;

With a given (API key, IP) pair, we have 1000 API calls per day.

Each call gives us 100 entries (questions).

To stay under the limit (we have already made some calls today), let's budget 900 calls.

We can spread those calls across 10 workers running in parallel...

In [0]:
from concurrent.futures import ThreadPoolExecutor
from typing import List
from functools import reduce

def fetch_and_write_parallel(
    pages: List[int],
    table_name: str = "bronze.questions",
    max_workers: int = 10
):
    """Fetch several pages concurrently and append them to a Delta table.

    Each page is handled by ``process_page`` on a thread pool; the resulting
    DataFrames are merged by column name and appended in a single write,
    partitioned by ``creation_period``.

    Args:
        pages: Page numbers to fetch. Only iterated once, so any iterable of
            ints (for example a ``range``) works.
        table_name: Table to append to. This argument used to be ignored in
            favour of a hard-coded ``"bronze.questions"``; it is now honoured,
            and the default keeps the previous target.
        max_workers: Number of threads fetching pages at the same time.

    Returns:
        None. Nothing is written when no page produced any rows.

    Note:
        The write uses append mode, so running this again for the same pages
        appends the same questions a second time.
    """
    # fetch in parallel; pages for which process_page returned None are dropped
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        dfs = [df for df in executor.map(process_page, pages) if df is not None]

    if not dfs:
        # reduce() without an initial value raises TypeError on an empty
        # list, so bail out early when there is nothing to write.
        return

    # combine using unionByName (handles different columns)
    combined_df = reduce(
        lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True),
        dfs
    )

    # write to delta lake
    combined_df.write \
        .format("delta") \
        .mode("append") \
        .partitionBy("creation_period") \
        .saveAsTable(table_name)

# Pages 1..900 at 100 questions per page: 900 API calls, up to 90,000 questions.
fetch_and_write_parallel(
    pages=range(1,901),
    table_name="bronze.questions",
    max_workers=10
)

✅ Written 90000 rows to /mnt/bronze/questions


In [0]:
%sql
-- Inspect the table's metadata (location, partition columns, file count, size).
DESCRIBE DETAIL bronze.questions;

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,a263128f-fee4-4665-847a-6c321fb79d64,,,dbfs:/mnt/bronze/questions,2025-10-02T05:27:33.096Z,2025-10-02T05:42:09Z,List(creation_period),List(),3620,52815757,Map(delta.enableDeletionVectors -> true),3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
%sql
-- Compact the table's files and Z-order the data by question_id
-- (the recorded DESCRIBE DETAIL outputs show numFiles going from 3620 to 21).
OPTIMIZE bronze.questions 
ZORDER BY (question_id)

path,metrics
dbfs:/mnt/bronze/questions,"List(21, 3620, List(344470, 1756914, 965396.5238095238, 21, 20273327), List(7341, 24111, 14589.988121546961, 3620, 52815757), 21, List(minCubeSize(107374182400), List(0, 0), List(3620, 52815757), 0, List(3620, 52815757), 21, null), null, 0, 1, 3620, 0, false, 0, 0, 1759390099002, 1759390109957, 4, 21, null, List(0, 0), null, 24, 24, 4565, 0, null)"


In [0]:
%sql
-- Re-check the table's metadata (file count, size) after running OPTIMIZE.
DESCRIBE DETAIL bronze.questions

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,a263128f-fee4-4665-847a-6c321fb79d64,spark_catalog.bronze.questions,,dbfs:/mnt/bronze/questions,2025-10-02T05:27:33.096Z,2025-10-02T07:28:29Z,List(creation_period),List(),21,20273327,Map(delta.enableDeletionVectors -> true),3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
%sql SELECT * FROM silver.questions LIMIT 20

question_id,creation_date,creation_period,last_activity_date,tags,answer_count,is_answered,view_count
79484450,2025-03-04T17:32:49Z,1741,2025-03-06T13:40:40Z,"List(reactjs, three.js, react-three-fiber)",1,False,63
79507704,2025-03-13T21:48:47Z,1741,2025-03-13T21:48:47Z,"List(python-unittest, python-unittest.mock, odoo-17)",0,False,21
79502686,2025-03-12T05:10:11Z,1741,2025-03-12T07:00:15Z,"List(c#, wpf)",0,False,70
79490443,2025-03-06T19:12:41Z,1741,2025-04-15T14:45:05Z,"List(apache, http, devops, artifactory)",0,False,114
79496652,2025-03-09T21:50:33Z,1741,2025-03-10T17:02:46Z,"List(flutter, dart, debugging, intellij-idea, dart-pub)",2,True,108
79496633,2025-03-09T21:34:26Z,1741,2025-03-10T11:42:07Z,"List(javascript, konvajs)",1,True,86
79496615,2025-03-09T21:12:29Z,1741,2025-03-09T22:47:09Z,"List(html, css, alignment, text-align)",2,True,73
79501928,2025-03-11T19:35:39Z,1741,2025-06-30T08:08:35Z,"List(c#, multithreading, asynchronous, concurrency, task-parallel-library)",2,True,119
79501845,2025-03-11T18:58:39Z,1741,2025-03-14T12:39:19Z,"List(python, hdf5, h5py)",1,False,51
79507917,2025-03-14T00:19:56Z,1741,2025-03-14T00:19:56Z,"List(stm32, stm32cubeide, stm32f1, stm32cubemx)",0,False,110
