In [0]:
import requests, json
import time

# Download Stack Overflow questions page by page (newest first) and store each
# raw API response as one JSON file in the bronze folder.
base_url = "https://api.stackexchange.com/2.3/questions"

# The API key is read from the Databricks secret scope instead of being embedded
# in the notebook source.
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# notebook history and should be rotated.
api_key = dbutils.secrets.get("so_api_scope", "so_api_key")

page = 1
while True:
    params = {
        "order": "desc",
        "sort": "creation",
        "site": "stackoverflow",
        "pagesize": 100,
        "page": page,
        "key": api_key,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(base_url, params=params, timeout=30).json()

    # Error responses carry no "items"; surface them instead of stopping silently.
    if "error_id" in resp:
        print(f"Stack Exchange API error {resp.get('error_id')}: {resp.get('error_message')}")
        break
    if not resp.get("items"):
        break

    path = f"/mnt/so/bronze/questions_page={page}.json"
    dbutils.fs.put(path, json.dumps(resp), overwrite=True)

    # Stop as soon as EITHER there are no further pages OR the request quota is
    # used up (the previous condition only stopped when both were true, which
    # cost an extra request that could only come back empty or as an error).
    if not resp.get("has_more", False) or resp.get("quota_remaining", 0) <= 0:
        break

    # The API may ask the client to wait this many seconds before calling again.
    backoff_seconds = resp.get("backoff")
    if backoff_seconds:
        time.sleep(backoff_seconds)

    page += 1


In [0]:
from pyspark.sql.functions import col, explode

# Bronze -> silver: read every raw page file, turn the "items" array into one
# row per question and keep a minimal set of columns.
df = spark.read.json("/mnt/so/bronze/*.json")

# Items array → rows
df_items = df.select(explode("items").alias("q"))

# Flatten minimal schema
# The bronze files are fetched page by page from a newest-first listing, so a
# question can appear on two consecutive pages when new questions arrive during
# the crawl. Keep a single row per question id.
df_flat = df_items.select(
    col("q.question_id").alias("id"),
    col("q.title"),
    col("q.view_count"),
    col("q.creation_date"),
    col("q.tags")
).dropDuplicates(["id"])

# Overwrite so that re-running the cell rebuilds the silver table from bronze.
df_flat.write.format("delta").mode("overwrite").save("/mnt/so/silver/questions")


In [0]:
%sql
-- Expose the Delta data stored at the silver path as a table that can be queried by name.
-- IF NOT EXISTS makes the cell safe to re-run.
CREATE TABLE IF NOT EXISTS stackoverflow_questions
USING DELTA
LOCATION '/mnt/so/silver/questions';


In [0]:
%sql
-- Sanity check: the ten rows with the highest question id.
SELECT * FROM stackoverflow_questions ORDER BY id DESC LIMIT 10;

In [0]:
%sql
-- Sanity check: the ten rows with the lowest question id.
SELECT * FROM stackoverflow_questions ORDER BY id LIMIT 10;

In [0]:
%sql
-- Total number of rows in the silver table.
SELECT COUNT(*) FROM stackoverflow_questions;

In [0]:
# Add up the sizes reported for every entry directly under the bronze folder
# and print the total converted from bytes to GiB (1024**3 bytes).
files = dbutils.fs.ls("/mnt/so/bronze/")
total_bytes = sum(entry.size for entry in files)
print(total_bytes/(1024**3), 'gb')

In [0]:
%sql
-- Number of questions per "dayish" bucket, where a bucket is creation_date
-- divided by 100000 and truncated to an integer.
SELECT dayish, COUNT(dayish) c
FROM (
  SELECT INT(creation_date/100000) dayish
  FROM stackoverflow_questions
) x
GROUP BY dayish;

In [0]:
from pyspark.sql.functions import avg

# Average of column 'c' over `_sqldf`.
# NOTE(review): `_sqldf` is presumably the DataFrame Databricks exposes for the
# result of the preceding %sql cell (the per-dayish counts) — this cell only
# works when run right after it; confirm.
avg_row = _sqldf.agg(avg('c')).collect()[0]
avg_row[0]
# This gives the average number of questions per 'dayish' (10^5 seconds = 1.16 days)

In [0]:
# Peek at the beginning of the first raw page file stored in bronze.
path = "/mnt/so/bronze/questions_page=1.json"
dbutils.fs.head(path)

The idea now is to store the data grouped by dayish.

In [0]:
%fs ls /mnt/so/bronze/

In [0]:
import json

# Load the first raw page through the local /dbfs mount, then show the whole
# payload followed by the number of questions it contains.
with open("/dbfs/mnt/so/bronze/questions_page=1.json") as f:
    j = json.load(f)

print(j)
page_items = j.get('items')
print(len(page_items))

What I'll do next is change how and where I store the data.

Before, I was storing the data indexed by page, but that is not a good approach if I want to do incremental updates later: the pages will shift and I would have to re-run everything.

Instead, I will store questions that share the same creation_date/10**5 (that is, questions created more or less on the same day — let's call this creation_dayish) together in a file \<creation_dayish\>.json.

So for each question_id, I compute the creation_dayish and know where to find the record (if it exists). If it doesn't exist, I know where to save it. If it does, I have to compare the 'last_edit_date' field and update only if the values differ.

One more thing to consider: the recommendation is to keep these bronze-layer files between 32 and 512 MB in size.

With 100 items, a page file is about 78,000 bytes (~78 KB). Storing per 'dayish' I might have about 10x that, i.e. roughly 0.8 MB per file — well below the recommended 32–512 MB range rather than above it, so these files will be on the small side.

In [0]:
import os

# Show the creation_date of the first two questions of page 1 to see what the
# raw value looks like.
with open("/dbfs/mnt/so/bronze/questions_page=1.json") as f:
    items = json.load(f).get('items')

for item in items[:2]:
    print(item.get('creation_date'))


In [0]:
%sh mkdir -p /dbfs/mnt/so/bronze/days  # -p: do not fail when the directory already exists, so the cell can be re-run

In [0]:
%fs ls /mnt/so/bronze/days

In [0]:
%fs ls /mnt/so/bronze


In [0]:
import os, json
from collections import defaultdict

# Re-file the questions of page 1 into one JSON file per "dayish" bucket
# (creation_date // 100000), stored as <output_dir>/<dayish>.json. Each file
# holds a JSON list of question objects.
input_dir = "/dbfs/mnt/so/bronze"
output_dir = "/dbfs/mnt/so/bronze/days"
os.makedirs(output_dir, exist_ok=True)

# Width of one "dayish" bucket, in the same unit as creation_date.
DAYISH_SPAN = 100000

with open(os.path.join(input_dir, "questions_page=1.json")) as f:
    items = json.load(f).get('items') or []

# Group in memory first so every day file is read and written once, instead of
# once per question.
items_by_dayish = defaultdict(list)
for item in items:
    items_by_dayish[item['creation_date'] // DAYISH_SPAN].append(item)

for dayish, new_items in items_by_dayish.items():
    filepath = os.path.join(output_dir, f"{dayish}.json")

    if os.path.exists(filepath):
        with open(filepath, 'r') as dailysh_f:
            existing_items = json.load(dailysh_f)
    else:
        existing_items = []

    # Index by question_id so re-running the cell replaces a question instead of
    # storing it twice. Non-dict entries are dropped: the previous version used
    # list.extend(item), which added the question's key names (plain strings)
    # to the file instead of the question itself.
    merged = {q['question_id']: q for q in existing_items if isinstance(q, dict)}
    for q in new_items:
        merged[q['question_id']] = q

    with open(filepath, 'w') as dailysh_f:
        json.dump(list(merged.values()), dailysh_f)


In [0]:
%fs ls /mnt/so/bronze/days

In [0]:
%fs head /mnt/so/bronze/days/17590.json

In [0]:
from pyspark.sql import functions as functions

# Read every raw page file into one DataFrame.
# NOTE(review): the download cell in this notebook writes its files directly
# under /mnt/so/bronze/, not under /mnt/so/bronze/pages/ — presumably the files
# were moved by hand; confirm this path exists before a full re-run.
pages_glob = "/mnt/so/bronze/pages/*.json"
df = spark.read.json(pages_glob)

In [0]:
df.select('quota_remaining').show()

In [0]:
from pyspark.sql import functions as F

# One output row per element of each page's "items" array, kept as a single
# column named "item".
exploded_item = F.explode(F.col("items")).alias("item")
items_df = df.select(exploded_item)

In [0]:
items_df.display()

In [0]:
# Bucket key for each question: creation_date divided by 100000, cast to long.
creation_dayish_col = (F.col("item.creation_date") / 100000).cast("long")
itemswithdayish_df = items_df.withColumn("creation_dayish", creation_dayish_col)

In [0]:
# Store the questions as a Delta dataset partitioned by creation_dayish.
# NOTE(review): mode("append") adds these rows again on every run, so executing
# the cell twice stores each question twice — confirm whether an upsert or an
# overwrite is intended.
(
    itemswithdayish_df.write
    .format("delta")
    .partitionBy("creation_dayish")
    .mode("append")
    .save("/mnt/so/bronze/days/questions")
)

In [0]:
%sql OPTIMIZE delta.`/mnt/so/bronze/days/questions`

In [0]:
%sql VACUUM delta.`/mnt/so/bronze/days/questions` RETAIN 168 HOURS

In [0]:
%sql DESCRIBE DETAIL delta.`/mnt/so/bronze/days/questions`

In [0]:
%sql SELECT * FROM delta.`/mnt/so/bronze/days/questions`

In [0]:
_sqldf.display()

In [0]:
# NOTE(review): this expression only configures a writer; there is no
# .save()/.saveAsTable() call, so nothing is written. Looks like a leftover
# from experimenting — confirm whether the cell can be removed.
itemswithdayish_df.write.partitionBy("creation_dayish")

In [0]:
df = spark.read.format("delta").load("/mnt/so/bronze/days/questions")

In [0]:
a = df.head()

In [0]:
a.asDict()['item']

In [0]:
dbutils.secrets.listScopes()

In [0]:
apikey = dbutils.secrets.get('so_api_scope', 'so_api_key')

In [0]:
len(apikey)