In [1]:
import yfinance as yf

# choose ticker
ticker = "AAPL"

# pull 1 year of daily OHLCV data
df_api = yf.download(ticker, period="1y", interval="1d")

print(df_api.head())
print(df_api.dtypes)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price            Close        High         Low        Open    Volume
Ticker            AAPL        AAPL        AAPL        AAPL      AAPL
Date                                                                
2024-08-19  224.843582  224.943125  222.006778  224.674371  40687800
2024-08-20  225.460693  226.117640  224.405606  224.724131  30299000
2024-08-21  225.351212  226.923894  224.007474  225.470666  34765500
2024-08-22  223.489868  227.282216  222.862782  226.734761  43695300
2024-08-23  225.789154  227.162766  223.290787  224.614628  38677300
Price   Ticker
Close   AAPL      float64
High    AAPL      float64
Low     AAPL      float64
Open    AAPL      float64
Volume  AAPL        int64
dtype: object





In [5]:
import pandas as pd

# If yfinance returned MultiIndex columns like ('High','AAPL'), collapse them
if isinstance(df_api.columns, pd.MultiIndex):
    # If it's a single ticker, just take the first level names: High, Low, ...
    if df_api.columns.nlevels == 2 and len(df_api.columns.levels[1]) == 1:
        df_api.columns = df_api.columns.get_level_values(0)
    else:
        # otherwise flatten by joining levels with '_'
        df_api.columns = ['_'.join([str(x) for x in tup if x and str(x) != 'nan'])
                          for tup in df_api.columns]

# Make sure Date is a column (yfinance index is DatetimeIndex)
df_api = df_api.reset_index()

# Normalize column names: lower + underscores
df_api.columns = [str(c).lower().replace(' ', '_') for c in df_api.columns]

# Coerce numeric columns; cast volume to Int64 (nullable int)
num_cols = ["open", "high", "low", "close", "adj_close", "volume"]
for c in num_cols:
    if c in df_api.columns:
        df_api[c] = pd.to_numeric(df_api[c], errors="coerce")

if "volume" in df_api.columns:
    df_api["volume"] = df_api["volume"].astype("Int64")

print(df_api.dtypes.head(10))
print(df_api.head())


index                   int64
date           datetime64[ns]
close_aapl            float64
high_aapl             float64
low_aapl              float64
open_aapl             float64
volume_aapl             int64
dtype: object
   index       date  close_aapl   high_aapl    low_aapl   open_aapl  \
0      0 2024-08-19  224.843582  224.943125  222.006778  224.674371   
1      1 2024-08-20  225.460693  226.117640  224.405606  224.724131   
2      2 2024-08-21  225.351212  226.923894  224.007474  225.470666   
3      3 2024-08-22  223.489868  227.282216  222.862782  226.734761   
4      4 2024-08-23  225.789154  227.162766  223.290787  224.614628   

   volume_aapl  
0     40687800  
1     30299000  
2     34765500  
3     43695300  
4     38677300  


In [9]:
import pandas as pd, os

# 1) If MultiIndex, flatten (safe to re-run)
if isinstance(df_api.columns, pd.MultiIndex):
    if df_api.columns.nlevels == 2 and len(df_api.columns.levels[1]) == 1:
        df_api.columns = df_api.columns.get_level_values(0)
    else:
        df_api.columns = ['_'.join([str(x) for x in tup if pd.notna(x)]) for tup in df_api.columns]

# 2) Make Date a column and normalize names
df_api = df_api.reset_index()
df_api.columns = [str(c).lower().replace(' ', '_') for c in df_api.columns]

# 3) Strip the ticker suffix like "_aapl" if present
ticker_suffixes = ("_aapl", "_msft", "_spy")  # include others if you change tickers later
def strip_suffix(c):
    for s in ticker_suffixes:
        if c.endswith(s):
            return c[: -len(s)]
    return c

df_api.columns = [strip_suffix(c) for c in df_api.columns]

# 4) Coerce numerics; cast volume to Int64
num_cols = ["open", "high", "low", "close", "adj_close", "volume"]
for c in num_cols:
    if c in df_api.columns:
        df_api[c] = pd.to_numeric(df_api[c], errors="coerce")
if "volume" in df_api.columns:
    df_api["volume"] = df_api["volume"].astype("Int64")

# 5) Validate required columns & NA counts (don’t index before confirming presence)
req = ["date", "open", "high", "low", "close", "adj_close", "volume"]
missing = [c for c in req if c not in df_api.columns]
print("Missing:", missing)
if not missing:
    print("NA counts:\n", df_api[req].isna().sum())
else:
    print("Skip NA count until columns fixed.")

# 6) Save
os.makedirs("data/raw", exist_ok=True)
out_path = "data/raw/AAPL.csv"
df_api.to_csv(out_path, index=False)
print("Saved:", out_path)

# (optional) quick peek
df_api.head()


Missing: ['adj_close']
Skip NA count until columns fixed.
Saved: data/raw/AAPL.csv


Unnamed: 0,level_0,index,date,close,high,low,open,volume
0,0,0,2024-08-19,224.843582,224.943125,222.006778,224.674371,40687800
1,1,1,2024-08-20,225.460693,226.11764,224.405606,224.724131,30299000
2,2,2,2024-08-21,225.351212,226.923894,224.007474,225.470666,34765500
3,3,3,2024-08-22,223.489868,227.282216,222.862782,226.734761,43695300
4,4,4,2024-08-23,225.789154,227.162766,223.290787,224.614628,38677300
