In [0]:
import os
import requests
import json
import yfinance as yf
import pandas as pd
from datetime import date

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, explode, col, from_unixtime, lit, map_from_arrays

In [0]:
%fs ls dbfs:/FileStore/bronze/yahoo

In [0]:
# Each storage location is spelled twice because the notebook reaches it through two APIs:
#   * "/dbfs/..." paths are handed to local-file style calls (os.listdir, pandas to_json/read_json, open).
#   * "dbfs:/..." paths are handed to Spark readers and the %fs magic.
# NOTE(review): both spellings are presumed to point at the same directory — confirm for this workspace.
bronze_root_path = "/dbfs/FileStore/bronze/yahoo"
silver_root_path = "/dbfs/FileStore/silver/yahoo"
spark_bronze_root_path = "dbfs:/FileStore/bronze/yahoo"
spark_silver_root_path = "dbfs:/FileStore/silver/yahoo"
# Ticker symbols whose statements are downloaded into the bronze directory.
my_portfolio = ('GOOG', 'SOPH', 'PYPL', 'NOV', 'KRN', 'AMZN', 'NVDA', 'SQN', 'TGT')

In [0]:
def get_raw_yahoo_financial_statements(src_root_path, ticker):
    """Download a ticker's financial statements via yfinance and store them as JSON.

    Three files are written into ``src_root_path``, in this order:
    ``{ticker}_balance_sheet_{date}.json``, ``{ticker}_income_stmt_sheet_{date}.json``
    and ``{ticker}_cashflow_{date}.json``, where ``{date}`` is today's date in
    ISO format (``YYYY-MM-DD``).

    Args:
        src_root_path: Directory (local-file style path) that receives the JSON files.
        ticker: Symbol passed to ``yf.Ticker``.

    Returns:
        None. Failures are printed and not re-raised, so a caller looping over
        several tickers keeps going; files written before the failure remain.
    """
    # Resolve the date once so all three files of a run carry the same stamp,
    # even if the downloads straddle midnight.
    today = date.today()
    try:
        current_ticker = yf.Ticker(ticker)
        # (file-name suffix, bound fetch method) — fetched lazily, one at a time.
        statements = (
            ('balance_sheet', current_ticker.get_balance_sheet),
            ('income_stmt_sheet', current_ticker.get_income_stmt),
            ('cashflow', current_ticker.get_cashflow),
        )
        for suffix, fetch in statements:
            fetch().to_json(f'{src_root_path}/{ticker}_{suffix}_{today}.json')

    except (requests.ConnectionError, requests.Timeout) as e:
        print(f"Connection error for {ticker}: {e}")
    except Exception as e:
        # Deliberate best-effort: report and continue instead of aborting the batch.
        print(f"Unexpected error for {ticker}: {e}")


In [0]:
# Fetch and store the raw statements for every ticker in the portfolio.
# The helper prints (rather than raises) on failure, so one bad ticker does not stop the loop.
for stock in my_portfolio:
    print(f'processing stock {stock}')
    get_raw_yahoo_financial_statements(bronze_root_path, stock)

In [0]:
def unpivot_income_statement_pyspark():
    """Load each bronze ``.json`` file into a Spark DataFrame, one file at a time.

    File names are listed through the local ``bronze_root_path`` and every match
    is read back through the ``spark_bronze_root_path`` URI with the ``multiline``
    option enabled. The DataFrame is only loaded here: nothing is transformed,
    written or returned yet.
    """
    json_uris = []
    for entry in os.listdir(bronze_root_path):
        if not entry.endswith('.json'):
            continue
        json_uris.append(f'{spark_bronze_root_path}/{entry}')

    for file_name in json_uris:
        print(f'processing file {file_name} from bronze -> silver')
        reader = spark.read.option('multiline', 'true')
        loaded = reader.json(file_name)

In [0]:
# Run the Spark-based pass over every .json file in the bronze directory.
unpivot_income_statement_pyspark()

In [0]:
# Load one stored balance sheet for exploration. The multiline option makes Spark
# parse the file as a single JSON document instead of one record per line.
raw_df = spark.read.option("multiline", "true").json('dbfs:/FileStore/bronze/yahoo/AMZN_balance_sheet_2025-10-15.json')

In [0]:
# Turn the wide frame (one column per reporting date) into one row per date.
# NOTE(review): assumes every column name is an epoch-millisecond string and every
# column holds a struct of the same type (array() needs uniform element types) — confirm.

# 1) Pack all columns into a single map column: column name -> column value
cols = raw_df.columns
m = raw_df.select(map_from_arrays(
    array(*[lit(c) for c in cols]),
    array(*[col(c) for c in cols])
  ).alias("m")
)
# 2) Explode the map into one (unix_ms, data) row per original column
kv = m.selectExpr("explode(m) as (unix_ms, data)")

# 3) Convert ms -> timestamp and flatten inner struct
#    (from_unixtime takes seconds, hence /1000, and yields a formatted string)
df = kv.select(
    from_unixtime((col("unix_ms").cast("bigint")/1000)).alias("date_time"),
    col("data.*")
)

# Preview the first rows without truncating wide values.
df.show(5, truncate=False)

In [0]:
# Read the same file with the default JSON reader (no multiline option) and display it;
# presumably kept to compare against the multiline read — confirm or remove.
spark.read.json('dbfs:/FileStore/bronze/yahoo/AMZN_balance_sheet_2025-10-15.json').show()

In [0]:
# Inspect the stored file with the standard library, bypassing Spark and pandas:
# print the entry under one epoch-millisecond key (prints None when the key is absent).
# NOTE(review): assumes the top-level JSON value is an object, so .get() is available.
with open('/dbfs/FileStore/bronze/yahoo/AMZN_balance_sheet_2025-10-15.json', 'r') as file:
    data = json.load(file)
    print(data.get('1735603200000'))

In [0]:
# pandas-based unpivot of the bronze files.
# TODO: store the result in Spark tables (silver layer) once the target format is decided.
def unpivot_income_statement(bronze_path):
    """Unpivot every JSON statement found in ``bronze_path`` into long format.

    Each ``*.json`` file is loaded with pandas; its index becomes a ``metric``
    column and the remaining columns are melted into ``date``/``value`` pairs.

    Args:
        bronze_path: Directory (local-file style path) containing the JSON files.

    Returns:
        dict mapping each processed file name to its unpivoted DataFrame with
        the columns ``metric``, ``date`` and ``value``. Empty when the directory
        holds no ``.json`` files.
    """
    unpivoted_by_file = {}
    # os.listdir returns entries in arbitrary order; sort for a stable processing order.
    json_files = sorted(f for f in os.listdir(bronze_path) if f.endswith('.json'))
    for file in json_files:
        print(f'processing file {file} from bronze -> silver')
        # Read from the directory that was passed in, not from a notebook-level global.
        df = pd.read_json(f'{bronze_path}/{file}')
        unpivoted_by_file[file] = (
            df
            .reset_index()
            .melt(id_vars='index', var_name='date', value_name='value')
            .rename(columns={'index': 'metric'})
        )
    return unpivoted_by_file

In [0]:
# Run the pandas-based unpivot over the bronze directory.
unpivot_income_statement(bronze_root_path)

In [0]:
# Long-format view of a single statement: index -> `metric`, remaining columns -> `date`/`value`.
# NOTE(review): this assignment rebinds `unpivot_income_statement`, which an earlier cell
# defines as a function; once this cell has run, the function can no longer be called by that name.
# NOTE(review): `income_statement` is not defined in any cell visible here — confirm where it
# comes from, otherwise this cell fails on a fresh kernel.
unpivot_income_statement = income_statement.\
    reset_index().\
    melt(id_vars='index', var_name='date', value_name='value').\
    rename(columns={'index':'metric'})

In [0]:
%fs ls "dbfs:/FileStore/bronze/yahoo"