In [144]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import yfinance as yf
from fpylll import IntegerMatrix, LLL
from scipy.optimize import linprog

In [145]:
def fetch_stock_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date)
    data = data.dropna(axis=1, how='all')  # Drop columns with all NaN values
    data = data.dropna(axis=0, how='all')  # Drop rows with all NaN values
    data = data.ffill()  # Fill NaN values with the previous valid observation
    return data

In [146]:
def standardise_risk_matrix(raw_risk_matrix):
    risk_matrix = raw_risk_matrix.copy()
    # Min-max scale Volatility and Amihud Illiquidity to [0, 1]
    for col in ['Volatility', 'Amihud Illiquidity']:
        if col in risk_matrix.columns:
            min_val = risk_matrix[col].min()
            max_val = risk_matrix[col].max()
            risk_matrix[col] = (risk_matrix[col] - min_val) / (max_val - min_val)

    return risk_matrix

In [147]:
def calculate_risk_matrix(data):
    close_data = data['Close'].dropna(axis=1, how='any')
    volume_data = data['Volume'].dropna(axis=1, how='any')
    volume_data = volume_data.replace(0, np.nan).ffill()
    returns = close_data.pct_change().dropna()  # Calculate daily returns
    sp500_returns = returns['^GSPC']  # S&P 500 returns
    betas = returns.corrwith(sp500_returns) # Beta of each stock
    volatilities = returns.std()  # Volatility of each stock
    amihud_illiquidity = ((1e6)*(np.abs(returns)/volume_data).mean())  # Amihud Illiquidity Measure
    left_threshold = returns['^GSPC'].quantile(0.05)
    right_threshold = returns['^GSPC'].quantile(0.95)
    tail_data = returns[(returns['^GSPC'] >= right_threshold) | (returns['^GSPC'] <= left_threshold)]
    tail_corr = tail_data.corr()
    tail_risk = tail_corr['^GSPC']
    raw_risk_matrix = pd.DataFrame({
        'Beta': betas,
        'Volatility': volatilities,
        'Amihud Illiquidity': amihud_illiquidity,
        'Tail Risk': tail_risk
    })
    # standardize the risk matrix using min-max scaling
    risk_matrix = standardise_risk_matrix(raw_risk_matrix)
    # drop GSPC row
    risk_matrix = risk_matrix.drop(index='^GSPC', errors='ignore')
    # shuffle order of rows
    risk_matrix = risk_matrix.sample(frac=1, random_state=42)

    return risk_matrix

In [148]:
def find_stock_combinations(risk_matrix):
    # Scale to integers if needed (LLL requires integer matrix)
    scale = 1e6
    basis = np.round(risk_matrix.values * scale).astype(int)
    n_rows, n_cols = basis.shape
    np.random.seed(42)  # For reproducibility
    pad = np.random.randint(0, 2, size=(n_rows, n_rows - n_cols))
    basis = np.hstack([basis, pad])
    R = IntegerMatrix.from_matrix(basis)
    U = IntegerMatrix.identity(R.nrows)
    LLL.reduction(R, U)
    # short_vec = np.array(R[0])
    # coeffs = np.array(U[0])
    return U

In [149]:
# # Scrape S&P 500 tickers from Wikipedia
# url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# table = pd.read_html(url)
# sp500 = table[0]
# tickers = sp500['Symbol'].tolist()

# # convert tickers to yfinance format
# tickers = [ticker.replace('.', '-') for ticker in tickers]
# tickers = ['^GSPC'] + tickers  # Add S&P 500 index ticker

# # Fetch stock data for the S&P 500 companies
# start_date = '2020-01-01'
# end_date = '2023-01-01'
# data = fetch_stock_data(tickers, start_date, end_date)

data = pd.read_pickle('sp500_data.pkl')

In [150]:
risk_matrix = calculate_risk_matrix(data)

In [171]:
U = find_stock_combinations(risk_matrix)
z = np.zeros((risk_matrix.shape[0], risk_matrix.shape[0]), dtype=int)
_ = U.to_matrix(z)
# create a dictionary to map each row index to the corresponding stock combinations and the quantity
stock_combinations = {
    i:  {
        risk_matrix.index[j]: z[i, j] for j in range(len(z[i])) if z[i, j] != 0
    }
    for i in range(z.shape[0])
}

In [172]:
stock_combinations

{0: {'URI': np.int64(-1),
  'TRV': np.int64(2),
  'WFC': np.int64(1),
  'TMUS': np.int64(1),
  'MS': np.int64(1),
  'IFF': np.int64(1),
  'ADP': np.int64(-1),
  'TER': np.int64(-2),
  'GRMN': np.int64(-1),
  'PH': np.int64(-1),
  'TPR': np.int64(-2),
  'TRGP': np.int64(1),
  'PM': np.int64(-2),
  'FICO': np.int64(1),
  'BEN': np.int64(2),
  'CBOE': np.int64(1),
  'EMR': np.int64(1),
  'KR': np.int64(-1),
  'LII': np.int64(-1)},
 1: {'URI': np.int64(-1),
  'TRV': np.int64(1),
  'WFC': np.int64(1),
  'HRL': np.int64(1),
  'ADP': np.int64(-1),
  'DE': np.int64(1),
  'TER': np.int64(-3),
  'BXP': np.int64(-1),
  'TPR': np.int64(1),
  'PM': np.int64(-1),
  'CCI': np.int64(2),
  'FICO': np.int64(1),
  'BEN': np.int64(1),
  'EMR': np.int64(1),
  'LII': np.int64(1),
  'REGN': np.int64(1),
  'DELL': np.int64(1),
  'MSFT': np.int64(1),
  'NDSN': np.int64(-1),
  'AON': np.int64(-1),
  'CAH': np.int64(-1),
  'A': np.int64(-1),
  'SBAC': np.int64(-1),
  'STLD': np.int64(-1)},
 2: {'CHD': np.int64(2

In [None]:
def optimize_portfolio(risk_matrix, stock_combinations, stock_prices, combo_idx=0, budget=1000000, max_stocks=15):
    # Get the combination to use as starting point
    initial_combo = stock_combinations[combo_idx]
    
    # Convert stocks and quantities to arrays
    stocks = list(initial_combo.keys())
    quantities = np.array([initial_combo[stock] for stock in stocks])
    prices = np.array([stock_prices[stock] for stock in stocks])
    
    # Calculate risk scores (weighted sum of risk metrics)
    risk_weights = {
        'Beta': 0.25,
        'Volatility': 0.35,
        'Amihud Illiquidity': 0.25,
        'Tail Risk': 0.15
    }
    
    risk_scores = np.array([
        sum(risk_weights[metric] * risk_matrix.loc[stock, metric] 
            for metric in risk_weights if metric in risk_matrix.columns)
        for stock in stocks
    ])
    
    # Setup ILP problem (using linprog which handles continuous variables)
    # We'll handle integer constraints by rounding at the end
    
    # For each stock, we have a long and short position variable
    n_stocks = len(stocks)
    c = np.zeros(2 * n_stocks)  # Objective coefficients
    
    # Objective: minimize risk
    for i in range(n_stocks):
        # Long positions
        c[i] = risk_scores[i]
        # Short positions (same risk coefficient)
        c[i + n_stocks] = risk_scores[i]
    
    # Constraints
    A_ub = np.zeros((2, 2 * n_stocks))
    b_ub = np.zeros(2)
    
    # Budget constraint
    A_ub[0, :n_stocks] = prices  # Long positions cost
    A_ub[0, n_stocks:] = prices  # Short positions cost
    b_ub[0] = budget
    
    # Max stocks constraint - approximated with total positions
    A_ub[1, :] = 1  # Count all positions
    b_ub[1] = max_stocks
    
    # Bounds - use the LLL solution as guidance
    bounds = []
    for i in range(n_stocks):
        qty = quantities[i]
        if qty > 0:
            # Long position - between half and double the LLL quantity
            bounds.append((max(1, qty//2), qty*2))
            # No short position
            bounds.append((0, 0))
        elif qty < 0:
            # No long position
            bounds.append((0, 0))
            # Short position - between half and double the LLL quantity
            bounds.append((max(1, abs(qty)//2), abs(qty)*2))
        else:
            # No position suggested by LLL
            bounds.append((0, 0))
            bounds.append((0, 0))
    
    # Solve the optimization problem
    result = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds, method='highs')
    
    if result.success:
        # Extract the optimal solution
        long_pos = np.round(result.x[:n_stocks]).astype(int)
        short_pos = np.round(result.x[n_stocks:]).astype(int)
        
        # Calculate net positions
        net_positions = {}
        for i, stock in enumerate(stocks):
            if long_pos[i] > 0:
                net_positions[stock] = long_pos[i]
            elif short_pos[i] > 0:
                net_positions[stock] = -short_pos[i]
        
        # Calculate total cost
        total_cost = sum(abs(qty) * stock_prices[stock] for stock, qty in net_positions.items())
        
        return {
            'portfolio': net_positions,
            'total_cost': total_cost,
            'remaining_budget': budget - total_cost,
            'num_stocks': len(net_positions)
        }
    else:
        print(f"Optimization failed: {result.message}")
        return None

In [None]:
# Example usage:
combo_idx = 0  # Use the first (shortest) vector from LLL
result = optimize_portfolio(risk_matrix, stock_combinations, combo_idx=combo_idx, budget=1000000)
print("Optimized Portfolio:")
for stock, qty in result['portfolio'].items():
    print(f"{stock}: {qty}")
print(f"Total cost: ${result['total_cost']:,.2f}")
print(f"Remaining budget: ${result['remaining_budget']:,.2f}")
print(f"Number of stocks: {result['num_stocks']}")