In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import os

print("Libraries imported successfully.")

# --- Load the processed data from Notebook 1 ---
# Both input CSVs live under DATA_DIR; build their paths once here.
DATA_DIR = 'data'
PRICES_FILE = os.path.join(DATA_DIR, 'monthly_prices.csv')
RETURNS_FILE = os.path.join(DATA_DIR, 'monthly_excess_returns.csv')

# Read both files the same way: 'Date' becomes the index and is parsed as dates.
monthly_prices, monthly_excess_returns = (
    pd.read_csv(csv_path, index_col='Date', parse_dates=True)
    for csv_path in (PRICES_FILE, RETURNS_FILE)
)

# The columns of the price table are the tickers used in the rest of the notebook.
tickers = list(monthly_prices.columns)

print("Data from Notebook 1 loaded successfully.")


Libraries imported successfully.
Data from Notebook 1 loaded successfully.


In [2]:
# --- Download Necessary Data for Factor Calculation ---

# The factors need Market Cap (for Size) and Book Value (for Value).
# For this illustrative project, both are read from the 'info' attribute
# that yfinance exposes for each ticker.

# Maps ticker symbol -> the 'info' payload fetched for that symbol
ticker_info = {}

print("Fetching financial data for each ticker from yfinance...")
for ticker in tickers:
    # Build the Ticker object and store its .info directly under the symbol
    ticker_info[ticker] = yf.Ticker(ticker).info
    print(f"  ...fetched data for {ticker}")

print("Financial data fetched successfully.")


Fetching financial data for each ticker from yfinance...
  ...fetched data for AAPL
  ...fetched data for AMZN
  ...fetched data for GOOGL
  ...fetched data for JNJ
  ...fetched data for JPM
  ...fetched data for MSFT
  ...fetched data for PG
  ...fetched data for TSLA
  ...fetched data for UNH
  ...fetched data for XOM
Financial data fetched successfully.


In [12]:
# Inspect the raw 'info' payload fetched for one ticker to see which keys are available.
ticker_info['AAPL']

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple Vision Pro, Apple TV, Apple Watch, Beats products, and HomePod, as well as Apple branded and third-party accessories. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download app

In [19]:
# --- Create a DataFrame of Raw Factor Descriptors ---

# Pull only the yfinance 'info' fields the descriptors need.
# Note: yfinance keys can sometimes change. These are the common ones as of late 2023.
# Every field is looked up by ticker label (not by dict position), so rows always
# line up with `tickers`. Missing keys, None values and non-numeric values are
# coerced to NaN so the arithmetic below cannot fail with a TypeError.
info_fields = pd.DataFrame(
    {
        field: [(ticker_info.get(ticker) or {}).get(field) for ticker in tickers]
        for field in ('marketCap', 'bookValue', 'sharesOutstanding')
    },
    index=tickers,
).apply(pd.to_numeric, errors='coerce')

descriptors = pd.DataFrame(index=tickers)
descriptors['market_cap'] = info_fields['marketCap']

# 'bookValue' is treated as a per-share figure here, so it is scaled by the
# share count to get total book value.
descriptors['book_value'] = info_fields['bookValue'] * info_fields['sharesOutstanding']

# A non-positive market cap would make the ratio infinite or meaningless; use NaN instead.
positive_market_cap = descriptors['market_cap'].where(descriptors['market_cap'] > 0)
descriptors['book_to_market'] = descriptors['book_value'] / positive_market_cap

# For Momentum, take the 11-month price change and shift it by one month, so the
# value at each date covers the past 12 months excluding the most recent month.
momentum_period = monthly_prices.pct_change(periods=11).shift(1)

# Only the most recent momentum value is used in this static example.
# A full-blown model would calculate this for every month in the history.
descriptors['momentum_12m_1m'] = momentum_period.iloc[-1]

print("Raw Descriptor DataFrame:")
descriptors

Raw Descriptor DataFrame:


Unnamed: 0,market_cap,book_value,book_to_market,momentum_12m_1m
AAPL,4056108761088,73748780000.0,0.018182,0.470113
AMZN,2667836342272,369742500000.0,0.138593,0.739167
GOOGL,3499663032320,186368000000.0,0.053253,0.502097
JNJ,463162900480,79379050000.0,0.171385,-0.097916
JPM,862276550656,340165700000.0,0.394497,0.198602
MSFT,3774681710592,362997300000.0,0.096166,0.593986
PG,345357516800,52499390000.0,0.152015,0.038647
TSLA,1449957392384,80012560000.0,0.055183,0.949018
UNH,294171082752,95769790000.0,0.325558,0.055088
XOM,507262926848,260551600000.0,0.513642,-0.036738


In [20]:
# --- Define the Standardization Function ---

def standardize_cap_weighted(series, weights):
    """
    Performs capitalization-weighted standardization (creates Z-scores).

    The weights are renormalized inside the function over the observations that
    have both a value and a weight, so raw market caps and already-normalized
    weights give the same result, and a missing value does not bias the mean
    or the standard deviation of the remaining stocks.

    Args:
        series (pd.Series): A series of raw factor values (e.g., book-to-market ratios).
        weights (pd.Series): A series of non-negative market capitalization weights
            for the same stocks. They do not need to sum to 1.

    Returns:
        pd.Series: The cap-weighted standardized Z-scores, indexed like `weights`.
            Stocks with a missing value or weight get NaN. If every usable stock
            has the same value the Z-scores are 0. If no stock is usable (or the
            usable weights sum to zero) every entry is NaN.
    """
    # Align the raw values to the weights' index; labels absent from `series` become NaN
    series = series.reindex(weights.index).astype(float)
    weights = weights.astype(float)

    # Only observations with both a value and a weight may contribute
    usable = series.notna() & weights.notna()
    effective_weights = weights.where(usable, 0.0)
    total_weight = effective_weights.sum()
    if not total_weight > 0:
        # Nothing to standardize against
        return pd.Series(np.nan, index=weights.index, name=series.name)
    effective_weights = effective_weights / total_weight

    # Calculate the cap-weighted mean (NaN products are skipped by sum)
    mean = (series * effective_weights).sum()

    # De-mean the series
    de_meaned_series = series - mean

    # Calculate the cap-weighted standard deviation
    squared_devs = (de_meaned_series ** 2) * effective_weights
    std_dev = np.sqrt(squared_devs.sum())

    # A (numerically) constant series has no dispersion: return zeros rather than 0/0 = NaN
    if std_dev <= 8 * np.finfo(float).eps * abs(mean):
        return de_meaned_series * 0.0

    # Create the Z-scores
    z_scores = de_meaned_series / std_dev

    return z_scores

print("Standardization function defined.")


Standardization function defined.


In [None]:
# --- Build the Final Factor Exposure Matrix (X) ---

# Each stock's weight is its share of the summed market cap of the universe
total_market_cap = descriptors['market_cap'].sum()
cap_weights = descriptors['market_cap'].div(total_market_cap)

# Raw inputs for each style factor, listed in the column order of the final matrix.
# Size uses the log of market cap rather than the raw market cap.
raw_factor_inputs = {
    'Size': np.log(descriptors['market_cap']),
    'Value': descriptors['book_to_market'],
    'Momentum': descriptors['momentum_12m_1m'],
}

# Standardize every factor with the same cap weights and assemble the exposure matrix
X = pd.DataFrame(
    {
        factor_name: standardize_cap_weighted(raw_values, cap_weights)
        for factor_name, raw_values in raw_factor_inputs.items()
    },
    index=tickers,
)

print("Final Factor Exposure Matrix (X) for the most recent date:")
X

Final Factor Exposure Matrix (X) for the most recent date:


Unnamed: 0,Size,Value,Momentum
AAPL,0.673549,-0.791782,-0.229023
AMZN,0.072566,0.292714,0.913754
GOOGL,0.461882,-0.475909,-0.093175
JNJ,-2.439123,0.588061,-2.641663
JPM,-1.5476,2.597559,-1.382237
MSFT,0.570399,-0.089405,0.297117
PG,-2.86014,0.413602,-2.061628
TSLA,-0.802083,-0.45853,1.805074
UNH,-3.090257,1.976648,-1.991795
XOM,-2.308657,3.670656,-2.381819


In [23]:
# --- Sanity Check: Verify Benchmark Neutrality ---
# The cap-weighted average exposure of the market to our style factors should be zero.

# Multiply row-wise by ticker label (axis=0) instead of by row position, so a
# differently ordered X or cap_weights cannot silently mis-weight the stocks.
market_exposures = X.mul(cap_weights, axis=0).sum()

print("Market's Exposure to each factor:")
print(market_exposures)

# Flag a violation explicitly instead of relying on eyeballing the printed numbers.
# The tolerance only has to absorb floating-point rounding.
NEUTRALITY_TOLERANCE = 1e-8
if not np.allclose(market_exposures, 0.0, atol=NEUTRALITY_TOLERANCE):
    print(f"WARNING: market exposure exceeds {NEUTRALITY_TOLERANCE} for at least one factor.")


Market's Exposure to each factor:
Size        1.096345e-15
Value       1.387779e-17
Momentum    6.938894e-17
dtype: float64


In [24]:
# --- Save the Factor Exposure Matrix ---
# The actual model would have one X matrix per month; this toy project
# writes only the single, most recent X matrix.
X_FILE = os.path.join(DATA_DIR, 'factor_exposures.csv')
X.to_csv(X_FILE)

# Report the output path and completion in one call, one message per line
print(f"\nFactor exposure matrix saved to {X_FILE}", "Notebook 2 is complete.", sep="\n")



Factor exposure matrix saved to data\factor_exposures.csv
Notebook 2 is complete.
