In [1]:
import os
import requests
import json
import yfinance as yf
import pandas as pd
from datetime import date

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, explode, col, from_unixtime, lit, map_from_arrays

In [2]:
import os
import sys

# Point PySpark (driver and workers) at the interpreter running this kernel.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [3]:
# Local Spark session (local[*] master) whose driver and worker Python are
# both set to the interpreter running this kernel.
spark = (
    SparkSession.builder
    .appName("LocalPySpark")
    .master("local[*]")
    .config("spark.pyspark.driver.python", sys.executable)
    .config("spark.pyspark.python", sys.executable)
    .getOrCreate()
)

In [4]:
# Root folder of the local copy of the pipeline data. Set the
# YAHOO_PIPELINE_ROOT environment variable to run the notebook on a machine
# where the default folder below does not exist.
local_root = os.environ.get(
    "YAHOO_PIPELINE_ROOT",
    r"C:\Users\alexa\OneDrive\Bureau\Data Engineering\Yahoo DataBricks Pipeline",
)
# Prefix for reaching DBFS through regular Python file APIs.
db_root_python = "/dbfs"
# Prefix for reaching DBFS through Spark APIs.
db_root_spark = "dbfs:"

In [7]:
# Layer folders, relative to a storage root (local_root or one of the DBFS roots).
bronze_path = "/FileStore/bronze/yahoo"
silver_path = "/FileStore/silver/yahoo"

# Absolute bronze locations used by the bronze -> silver cells further down.
# NOTE(review): reconstructed from db_root_python / db_root_spark and the
# literal "dbfs:/FileStore/bronze/yahoo" that appears later in this notebook;
# confirm these are the intended values.
bronze_root_path = f"{db_root_python}{bronze_path}"        # for os.listdir / pandas
spark_bronze_root_path = f"{db_root_spark}{bronze_path}"   # for spark.read

# Tickers whose financial statements are downloaded.
my_portfolio = (
    'GOOG', 'SOPH', 'PYPL', 'NOV', 'KRN',
    'AMZN', 'NVDA', 'SQN', 'TGT',
)

In [8]:
def unpivot_pandas_statement(ticker, statement):
    """Reshape a wide financial statement into long (tidy) records.

    Args:
        ticker: Value stored in the ``ticker`` column of every output row.
        statement: pandas DataFrame whose row index holds the metric names;
            every column label ends up in the ``date`` column.

    Returns:
        A new DataFrame with columns ``metric``, ``date``, ``value`` and
        ``ticker``, one row per (metric, column) pair. ``statement`` itself
        is not modified.
    """
    long_format = (
        statement
        # Name the index explicitly: a bare reset_index() only produces a
        # column called "index" when the index is unnamed, so a statement
        # with a named index would make the melt below raise KeyError.
        .rename_axis(index='metric')
        .reset_index()
        .melt(id_vars='metric', var_name='date', value_name='value')
    )
    long_format['ticker'] = ticker
    return long_format

In [24]:
def get_raw_yahoo_financial_statements(src_root_path, ticker):
    """Download a ticker's financial statements and land them as JSON files.

    The balance sheet, income statement and cash-flow statement are fetched
    through ``yfinance``, reshaped with ``unpivot_pandas_statement`` and
    written as JSON records to
    ``<src_root_path>/<ticker>_<label>_<YYYY-MM-DD>.json``.

    Args:
        src_root_path: Directory receiving the files; created if missing.
        ticker: Yahoo Finance symbol to download.

    Returns:
        None. Errors are printed rather than raised so that a loop over many
        tickers keeps going; a failure skips the remaining statements of the
        current ticker.
    """
    # (label used in the file name, name of the yfinance.Ticker accessor).
    # The labels are part of the existing bronze file names, so they are kept.
    statement_getters = (
        ('balance_sheet', 'get_balance_sheet'),
        ('income_stmt_sheet', 'get_income_stmt'),
        ('cashflow', 'get_cashflow'),
    )
    try:
        # Without the directory every to_json() call below would fail.
        os.makedirs(src_root_path, exist_ok=True)
        # Capture the date once so the three files of one run always share
        # the same stamp, even when the run crosses midnight.
        run_date = date.today()
        current_ticker = yf.Ticker(ticker)
        for label, getter_name in statement_getters:
            statement = getattr(current_ticker, getter_name)()
            unpivot_pandas_statement(ticker, statement).to_json(
                f'{src_root_path}/{ticker}_{label}_{run_date}.json',
                orient='records',
            )
    except (requests.ConnectionError, requests.Timeout) as e:
        print(f"Connection error for {ticker}: {e}")
    except Exception as e:
        # Deliberate best-effort: report and move on to the next ticker.
        print(f"Unexpected error for {ticker}: {e}")


In [25]:
# Download and land the statements of every portfolio ticker in the local
# bronze folder, printing one progress line per ticker.
for stock in my_portfolio:
    print(f'processing stock {stock}')
    get_raw_yahoo_financial_statements(
        src_root_path=f'{local_root}{bronze_path}',
        ticker=stock,
    )

processing stock GOOG
processing stock SOPH
processing stock PYPL
processing stock NOV
processing stock KRN
processing stock AMZN
processing stock NVDA
processing stock SQN
processing stock TGT


In [26]:
# Load one landed bronze file with Spark (multiline JSON mode).
raw_df = (
    spark.read
    .option("multiline", "true")
    .json(local_root + bronze_path + '/AMZN_balance_sheet_2025-10-17.json')
)

In [29]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
raw_df.show(n=20, truncate=True)

+-------------+--------------------+------+----------+
|         date|              metric|ticker|     value|
+-------------+--------------------+------+----------+
|1735603200000|TreasurySharesNumber|  AMZN|    5.15E8|
|1735603200000|OrdinarySharesNumber|  AMZN| 1.0593E10|
|1735603200000|         ShareIssued|  AMZN| 1.1108E10|
|1735603200000|             NetDebt|  AMZN|      NULL|
|1735603200000|           TotalDebt|  AMZN|  1.309E11|
|1735603200000|   TangibleBookValue|  AMZN|2.54294E11|
|1735603200000|     InvestedCapital|  AMZN|3.38593E11|
|1735603200000|      WorkingCapital|  AMZN| 1.1436E10|
|1735603200000|   NetTangibleAssets|  AMZN|2.54294E11|
|1735603200000|CapitalLeaseOblig...|  AMZN| 7.8277E10|
|1735603200000|   CommonStockEquity|  AMZN| 2.8597E11|
|1735603200000| TotalCapitalization|  AMZN|3.38593E11|
|1735603200000|TotalEquityGrossM...|  AMZN| 2.8597E11|
|1735603200000|  StockholdersEquity|  AMZN| 2.8597E11|
|1735603200000|GainsLossesNotAff...|  AMZN|    -3.4E7|
|173560320

In [28]:
# Read the same bronze file with pandas.
pd.read_json(local_root + bronze_path + '/AMZN_balance_sheet_2025-10-17.json')

Unnamed: 0,metric,date,value,ticker
0,TreasurySharesNumber,2024-12-31,5.150000e+08,AMZN
1,OrdinarySharesNumber,2024-12-31,1.059300e+10,AMZN
2,ShareIssued,2024-12-31,1.110800e+10,AMZN
3,NetDebt,2024-12-31,,AMZN
4,TotalDebt,2024-12-31,1.309000e+11,AMZN
...,...,...,...,...
300,AllowanceForDoubtfulAccountsReceivable,2020-12-31,,AMZN
301,GrossAccountsReceivable,2020-12-31,,AMZN
302,CashCashEquivalentsAndShortTermInvestments,2020-12-31,,AMZN
303,OtherShortTermInvestments,2020-12-31,,AMZN


In [0]:
def unpivot_income_statement_pyspark():
    """Load every bronze JSON file into a Spark DataFrame, one at a time.

    Reads the notebook-level names ``bronze_root_path`` (directory listed with
    ``os.listdir``), ``spark_bronze_root_path`` (prefix of the path handed to
    Spark) and ``spark``; all three must be defined before this is called.

    NOTE(review): despite the name, nothing is unpivoted or written yet; each
    DataFrame is loaded and then discarded. Returns None.
    """
    json_names = (name for name in os.listdir(bronze_root_path) if name.endswith('.json'))
    spark_paths = [f'{spark_bronze_root_path}/{name}' for name in json_names]
    for file_name in spark_paths:
        print(f'processing file {file_name} from bronze -> silver')
        reader = spark.read.option('multiline', 'true')
        spark_df = reader.json(file_name)

In [0]:
# Load every bronze JSON file with Spark (prints one progress line per file).
unpivot_income_statement_pyspark()

In [0]:
# Preview one bronze file stored on DBFS (first 20 rows, long values truncated).
(
    spark.read
    .json('dbfs:/FileStore/bronze/yahoo/AMZN_balance_sheet_2025-10-15.json')
    .show(n=20, truncate=True)
)

In [0]:
# TODO: use Spark tables to store the results
def unpivot_income_statement(bronze_path):
    """Unpivot every statement JSON file found in ``bronze_path``.

    Args:
        bronze_path: Directory containing the bronze ``.json`` files; files
            with any other extension are ignored.

    Returns:
        dict mapping each processed file name to its long-format DataFrame
        with columns ``metric`` (former row index), ``date`` (former column
        label) and ``value``.

    NOTE(review): writing to the silver layer is not implemented because the
    target location and format are not defined in this notebook. The frames
    are returned so the caller can persist them.
    NOTE(review): the melt assumes wide input (metrics as row labels, one
    column per period); files written by get_raw_yahoo_financial_statements
    are already long-format - confirm which layout this should consume.
    """
    unpivoted_by_file = {}
    # sorted(): os.listdir() order is arbitrary, keep the processing order stable.
    for file in sorted(f for f in os.listdir(bronze_path) if f.endswith('.json')):
        print(f'processing file {file} from bronze -> silver')
        # Read from the directory that was listed (the argument) instead of a
        # notebook-level global that may not exist or may point elsewhere.
        df = pd.read_json(os.path.join(bronze_path, file))
        unpivoted_by_file[file] = (
            df
            .reset_index()
            .melt(id_vars='index', var_name='date', value_name='value')
            .rename(columns={'index': 'metric'})
        )
    return unpivoted_by_file

In [0]:
# Unpivot every JSON file found under bronze_root_path.
unpivot_income_statement(bronze_root_path)

In [0]:
%fs ls "dbfs:/FileStore/bronze/yahoo"