In [2]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import yfinance as yf
from fpylll import IntegerMatrix, LLL
from scipy.optimize import linprog

In [None]:
def fetch_stock_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date)
    data = data.dropna(axis=1, how='all')  # Drop columns with all NaN values
    data = data.dropna(axis=0, how='all')  # Drop rows with all NaN values
    data = data.ffill()  # Fill NaN values with the previous valid observation
    return data

In [125]:
def standardise_risk_matrix(raw_risk_matrix):
    risk_matrix = raw_risk_matrix.copy()
    # # Min-max scale Volatility and Amihud Illiquidity to [0, 1]
    # for col in ['Volatility', 'Amihud Illiquidity']:
    #     if col in risk_matrix.columns:
    #         min_val = risk_matrix[col].min()
    #         max_val = risk_matrix[col].max()
    #         risk_matrix[col] = (risk_matrix[col] - min_val) / (max_val - min_val)
    return risk_matrix

In [267]:
def calculate_risk_matrix(data):
    close_data = data['Close'].dropna(axis=1, how='any')
    volume_data = data['Volume'].dropna(axis=1, how='any')
    returns = close_data.pct_change().dropna()  # Calculate daily returns
    sp500_returns = returns['^GSPC']  # S&P 500 returns
    betas = returns.corrwith(sp500_returns) # Beta of each stock
    volatilities = returns.std()  # Volatility of each stock
    amihud_illiquidity = ((1e6)*(np.abs(returns)/volume_data).mean())  # Amihud Illiquidity Measure
    left_threshold = returns['^GSPC'].quantile(0.05)
    right_threshold = returns['^GSPC'].quantile(0.95)
    tail_data = returns[(returns['^GSPC'] >= right_threshold) | (returns['^GSPC'] <= left_threshold)]
    tail_corr = tail_data.corr()
    tail_risk = tail_corr['^GSPC']
    raw_risk_matrix = pd.DataFrame({
        'Beta': betas,
        'Volatility': volatilities,
        'Amihud Illiquidity': amihud_illiquidity,
        'Tail Risk': tail_risk
    })
    # standardize the risk matrix using min-max scaling
    risk_matrix = standardise_risk_matrix(raw_risk_matrix)
    # drop GSPC row
    risk_matrix = risk_matrix.drop(index='^GSPC', errors='ignore')

    return risk_matrix

In [185]:
def find_short_vector_and_coeffs_rows(risk_matrix):
    # Scale to integers if needed (LLL requires integer matrix)
    scale = 1e6
    basis = np.round(risk_matrix.values * scale).astype(int)
    # n_rows, n_cols = basis.shape
    # pad = np.zeros((n_rows, n_rows - n_cols), dtype=int)
    # basis = np.hstack([basis, pad])
    R = IntegerMatrix.from_matrix(basis)
    U = IntegerMatrix.identity(R.nrows)
    LLL.reduction(R, U)
    short_vec = np.array(R[0])
    coeffs = np.array(U[0])
    return basis, short_vec, coeffs

In [189]:
# Scrape S&P 500 tickers from Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
table = pd.read_html(url)
sp500 = table[0]
tickers = sp500['Symbol'].tolist()

# convert tickers to yfinance format
tickers = [ticker.replace('.', '-') for ticker in tickers]
tickers = ['^GSPC'] + tickers  # Add S&P 500 index ticker

In [None]:
# Fetch stock data for the S&P 500 companies
# start_date = '2020-01-01'
# end_date = '2023-01-01'
# data = fetch_stock_data(tickers, start_date, end_date)
data = pd.read_pickle('sp500_data.pkl')

[*********************100%***********************]  504 of 504 completed

5 Failed downloads:
['SW', 'VLTO', 'KVUE', 'GEV', 'SOLV']: YFPricesMissingError('possibly delisted; no price data found  (1d 2020-01-01 -> 2023-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1577854800, endDate = 1672549200")')


In [268]:
risk_matrix = calculate_risk_matrix(data)

In [269]:
risk_matrix

Unnamed: 0_level_0,Beta,Volatility,Amihud Illiquidity,Tail Risk
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.771437,0.019964,0.008674,0.920378
AAPL,0.824773,0.023266,0.000147,0.948031
ABBV,0.567051,0.016753,0.001478,0.835938
ABT,0.695043,0.018916,0.002236,0.915447
ACGL,0.697761,0.024607,0.008623,0.861654
...,...,...,...,...
XYL,0.773899,0.022514,0.015031,0.934390
YUM,0.677809,0.019459,0.006612,0.766150
ZBH,0.635505,0.023103,0.011088,0.769680
ZBRA,0.759216,0.026795,0.052239,0.902079


In [270]:
find_short_vector_and_coeffs_rows(risk_matrix)

  basis = np.round(risk_matrix.values * scale).astype(int)


(array([[771437,  19964,   8674, 920378],
        [824773,  23266,    147, 948031],
        [567051,  16753,   1478, 835938],
        ...,
        [635505,  23103,  11088, 769680],
        [759216,  26795,  52239, 902079],
        [754256,  19793,   6951, 922412]], shape=(489, 4)),
 array([0, 0, 0, 0]),
 array([ -770697101381341776,  -727588094205795225, -1719911892093255555,
         2751599102729485109,   368947731652639855,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,                    0,                    0,
                           0,

In [271]:
basis, short_vec, coeffs = find_short_vector_and_coeffs_rows(risk_matrix)
# multiply basis by coeffs to get the short vector
np.dot(coeffs, basis)

  basis = np.round(risk_matrix.values * scale).astype(int)


array([0, 0, 0, 0])

In [275]:
len(coeffs)

489