In [10]:
#These are the libraries you can use.  You may add any libraries directy related to threading if this is a direction
#you wish to go (this is not from the course, so it's entirely on you if you wish to use threading).  Any
#further libraries you wish to use you must email me, james@uwaterloo.ca, for permission.

from IPython.display import display, Math, Latex

import pandas as pd
import numpy as np
import numpy_financial as npf
import yfinance as yf
import matplotlib.pyplot as plt
import random
from datetime import datetime
import scipy as sp 
from scipy.optimize import minimize

## Group Assignment
### Team Number: 15
### Team Member Names: Neil Zhang, Rahim Rehan, Krish Patel
### Team Strategy Chosen: Risk-Free 

In [11]:

import pandas as pd
import yfinance as yf
from concurrent.futures import ThreadPoolExecutor, as_completed

START_DATE = "2024-10-01"
END_DATE   = "2025-10-01"

def format_tickers(csv_file_path, ticker_column_name="Ticker"):
   data_table = pd.read_csv(csv_file_path)

   # clean ticker list
   if ticker_column_name in data_table.columns:
       raw_col = data_table[ticker_column_name]
   else:
       raw_col = data_table.iloc[:, 0]

   ticker_list = []
   for cell in raw_col:
       if pd.isna(cell) == False:
           t = str(cell).strip()
           if t != "" and (t not in ticker_list):
               ticker_list.append(t)

   if len(ticker_list) == 0:
       return pd.DataFrame(columns=["Ticker","Sector","Currency","MarketCap"])

   # -------------------------
   # 1) BULK DOWNLOAD HISTORY
   # -------------------------
   hist = yf.download(
       tickers=ticker_list,
       start=START_DATE,
       end=END_DATE,
       group_by="ticker",
       auto_adjust=False,
       threads=True
   )

   # -------------------------
   # 2) FILTER BY AVG VOLUME
   # -------------------------
   tickers_after_vol = []

   if isinstance(hist.columns, pd.MultiIndex):
       # multi-ticker case
       for sym in ticker_list:
           if sym not in hist.columns.get_level_values(0):
               continue

           df = hist[sym]
           if "Volume" not in df.columns or len(df) == 0:
               continue

           if df["Volume"].mean() >= 5000:
               tickers_after_vol.append(sym)
   else:
       # single ticker fallback
       if "Volume" in hist.columns and len(hist) > 0:
           if hist["Volume"].mean() >= 5000:
               tickers_after_vol.append(ticker_list[0])

   # -------------------------------------------------------
   # 3) THREADING FUNCTION: fetch fast_info / info per ticker
   # -------------------------------------------------------
   def process_ticker(sym):
       """Fetch currency, sector, market cap for one ticker (threaded)."""
       try:
           obj = yf.Ticker(sym)

           # fast_info
           try:
               fi = obj.fast_info
           except:
               fi = None

           # full info
           try:
               info = obj.info
           except:
               info = {}

           # currency
           currency = None
           if fi is not None and hasattr(fi, "currency"):
               currency = fi.currency

           if currency is None and "currency" in info:
               currency = info["currency"]
           elif currency is None and "financialCurrency" in info:
               currency = info["financialCurrency"]

           # must be USD or CAD
           if currency not in ["USD", "CAD"]:
               return None

           # sector
           sector = info.get("sector", None)

           # market cap
           mc = None
           if fi is not None and hasattr(fi, "market_cap"):
               mc = fi.market_cap
           if mc is None:
               mc = info.get("marketCap", None)

           return (sym, sector, currency, mc)

       except:
           return None

   # -------------------------------------
   # 4) MULTITHREAD the info-fetching part
   # -------------------------------------
   results = []
   with ThreadPoolExecutor(max_workers=15) as executor:
       futures = {executor.submit(process_ticker, sym): sym for sym in tickers_after_vol}

       for fut in as_completed(futures):
           res = fut.result()
           if res is not None:
               results.append(res)

   # -------------------------
   # 5) Build DataFrame output
   # -------------------------
   if len(results) == 0:
       return pd.DataFrame(columns=["Ticker","Sector","Currency","MarketCap"])

   df = pd.DataFrame(results, columns=["Ticker","Sector","Currency","MarketCap"])
   return df.reset_index(drop=True)

In [12]:
# Krish, Info Extraction 

returns_start = "2024-11-14"
returns_end = "2025-11-14"

# Function to return a list of all tickers (first column elements)
def get_ticker_list (tickers_df):
     return tickers_df.iloc[:, 0].tolist()

# Gets weekly closes of all the stocks in a list of tickers
def get_weekly_closes (ticker_lst, start_date, end_date):
    #Define a dataframe to hold weekly close prices (checks every friday)
    weekly_closes = pd.DataFrame()
    #Extract the weekly close prices and store them in the dataframe
    for i in ticker_lst:
        ticker = yf.Ticker(i)
        data = ticker.history(start=start_date, end=end_date)
        data.index = pd.to_datetime(data.index) # ensure datetime index
        #last() takes the last trading price of the week
        prices = data['Close'].resample('W-FRI').last()
        weekly_closes[f'Close {i}'] = prices
    #Strip time
    weekly_closes.index = weekly_closes.index.strftime('%Y-%m-%d')
    return weekly_closes

# Creates a df with the (weekly) %change for each column
def get_percent_change (closes, start_date, end_date):
    percent_change = pd.DataFrame()
    for i in closes:
        col_name = i[6:]
        #fill_method=None to deal with delisted stocks
        percent_change[f'% Change {col_name}'] = closes[i].pct_change(fill_method=None) * 100
    return percent_change

# Calculate covariance, correlation, variance, standard deviation
def get_calculations(ticker_list, start_date, end_date):
    weekly_closes = get_weekly_closes(ticker_list, start_date, end_date)
    weekly_percent_change = get_percent_change(weekly_closes, start_date, end_date)
    covariance_matrix = {
        'Covariance': weekly_percent_change.cov(),
        'Correlation': weekly_percent_change.corr(),
        'Variance': weekly_percent_change.var(),
        'Std_Dev': weekly_percent_change.std()}
    return covariance_matrix

info_df = format_tickers("Extended_Tickers_Example.csv")
ticker_list = (get_ticker_list(info_df)) # List of all tickers
primary_calculations = get_calculations(ticker_list, returns_start, returns_end)
"""
# Access each piece like:
display(covariance_matrix['Covariance'])
display(covariance_matrix['Std_Dev'])
"""

[*********************100%***********************]  140 of 140 completed

9 Failed downloads:
['CELG', 'GIB.A.TO', 'DFS', 'ATVI', 'AGN', 'MON', 'BRK.B', 'RTN', 'PTR']: YFTzMissingError('possibly delisted; no timezone found')
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = prices
  weekly_closes[f'Close {i}'] = pr

"\n# Access each piece like:\ndisplay(covariance_matrix['Covariance'])\ndisplay(covariance_matrix['Std_Dev'])\n"

In [13]:
# Neil opimization models 
# This is the function we want to minimize, aka the minimum variance function
def port_variance(weights, cov_matrix):
    """     
    port_variance is the function that calculates the variance of a portfolio. It performs 
    dot product/matrix multiplication on the weights and covariance matrixes. 

    :param weights: an array that represents the weight of each asset
    :param cov_matrix: a 2D matrix that represents the covariance between each asset 
    :return: variance of the portfolio
    """
    weights_col = weights.reshape(-1, 1) # Turns into column vector
    port_var = np.dot(weights_col.transpose(), (np.dot(cov_matrix, weights_col))) # Doing dot product 
    return port_var[0][0]

In [14]:
# This code is what runs the primary function

# Primary minimization, there is bounds in this 
def primary_minimization(cov_matrix):
    """     
    primary_minimization is the function that finds the weightings that result in the mimimum variance. 
    It performs this using scipy optimization. This perimary version does not consider bounds. 

    :param cov_matrix: a 2D matrix that represents the covariance between each asset 
    :return: returns the minimum variance and the weightings associated with that
    """
    num_assets = cov_matrix.shape[0]
    initial_weight = [1/num_assets] * num_assets # The initial guess of the weights

    constraint = {
        'type':'eq', # Constraint type is equality
        'fun': lambda w: sum(w) - 1 # The function's weight's must sum to 1
        }
    
    weight_bounds = [(0, 1)] * num_assets # Does not allow short selling
    
    # Finds the resilt of the minimization of the port_variance function, using the initial guess, keeping the cov_matrix constant using the SLSQP method, and with the above listed constraint
    result = minimize(fun=port_variance, x0=initial_weight, args=(cov_matrix,), method='SLSQP', bounds=weight_bounds, constraints=constraint)
    return result.fun, result.x

pd_cov_matrix = primary_calculations['Covariance']
numpy_cov_matrix = pd_cov_matrix.to_numpy() # Matrix of covariances of assets
primary_var, primary_weights = primary_minimization(numpy_cov_matrix)

Secondary Optimization logic: Let n be the size of the tickers, we now have an optimized weighting for those n stocks
We want to find the most optimal set up of 10-25 stocks out of those n. To do so we will try every combination, however if we wanted to brute force from those n stocks it'd take an absurd amount of compute. Instead we will take into the fact that we have the weightings of the (unconstrained) optimization. The higher the weighting of an asset, the more important it is to the minimizing the  variance, thus we will sort the n-optimized assets from highest to lowest weighting. Starting from the top we will then build a 10-25 asset size portfolio and calculate the variance of each portfolio, finding the one with the least variance. 

We must also consider the fact that each portfolio has restraints, aka the min/max weighting of one stock, the max amount of sectors, and the mkt caps
In regards to the weighting rules, those can be implemented via scipy's minimization constraints, making all weightings are in a certain range
In regards to the max amount of sectors, when building the portfolios we will keep count of the sectors, if any sector exceeds a certain amount such that are over represented, we skip an asset and move on
In regards to the small and large mkt cap, we can check after building the portfolio, if one of them is missing we can delete the lowest weighted (least important) asset and then add in a new asset from the list that satisfies the missing mkt cap

In [15]:
# The secondary optimization that includes the bounds 
def secondary_minimization(cov_matrix):
    """     
    secondary_minimization is the function that finds the weightings that result in the mimimum variance while considering the bounds.
    That is, it ensures the weightings 

    :param cov_matrix: a 2D matrix that represents the covariance between each asset 
    :return: returns the minimum variance and the weightings associated with that
    """
    num_assets = len(cov_matrix[0]) 
    initial_weight = [1/num_assets] * num_assets # The initial guess of the weights

    min_weight = (100/2*num_assets)/100 # Do not need to include portfolio value, because 1 is the portfolio value (and sum of weights)
    max_weight = 0.15 # Same as above

    constraint = {
        'type':'eq', # Constraint type is equality
        'fun': lambda w: sum(w) - 1 # The function's weight's must sum to 1
        }
    
    weight_bounds = [(min_weight, max_weight)] * num_assets
    
    # Finds the resilt of the minimization of the port_variance function, using the initial guess, keeping the cov_matrix constant using the SLSQP method, and with the above listed constraint
    result = minimize(fun=port_variance, x0=initial_weight, args=(cov_matrix,), method='SLSQP', bounds = weight_bounds, constraints=constraint)
    return result.fun, result.x

In [16]:
info_df['weight'] = primary_weights #Assume info_df holds the tickers, sector, market cap, etc and not the dates etc. We now add the weights.
ordered_info_df = info_df.copy().sort_values('weight', ascending = False).reset_index(drop=True)

ordered_info_df

Unnamed: 0,Ticker,Sector,Currency,MarketCap,weight
0,BNS.TO,Financial Services,CAD,1.171217e+11,1.536951e-01
1,CME,Financial Services,USD,9.825983e+10,1.150587e-01
2,EXC,Utilities,USD,4.601502e+10,9.665991e-02
3,AEP,Utilities,USD,6.514718e+10,8.973517e-02
4,DG,Consumer Defensive,USD,2.193357e+10,8.773502e-02
...,...,...,...,...,...
125,TRP.TO,Energy,CAD,7.998003e+10,1.553390e-16
126,CMCSA,Communication Services,USD,9.920959e+10,1.474089e-16
127,RIO,Basic Materials,USD,1.135107e+11,1.452383e-16
128,UNP,Industrials,USD,1.311894e+11,9.801320e-17


In [19]:
# Code that creates a portfolio that matches the requirements

# Determines all the indexes in a portfolio that are large cap stocks
def is_lg_cap(portfolio):
    lg_cap = []
    for i in range(len(portfolio)):
        if portfolio[i]['MarketCap'] > 10_000_000_000:
            lg_cap.append(i)
    return lg_cap

# Determines all the indexes in a portfolio that are small cap stocks
def is_sm_cap(portfolio):
    sm_cap = []
    for i in range(len(portfolio)):
        if portfolio[i]['MarketCap'] < 2_000_000_000:
            sm_cap.append(i)
    return sm_cap

# Determines if an individual stock is a large market cap
def is_lg(row):
    return row['MarketCap'] > 10_000_000_000

# Determines if an individual stock is a small market cap
def is_sm(row):
    return row['MarketCap'] < 2_000_000_000

# Determines the index of the least important stock. Least important in this case is the one with the lowest weighting in the ordered list, but
# if the least important is either the ONLY SMALL or LARGE cap stock then the second least important stock is now designated the least important
def find_least_imp(portfolio, lg_indxs, sm_indxs):
    """ 
    :param portfolio: A list of stock data in Series format
    :param lg_indxs: A list of all indexes that hold large cap stocks
    :param sm_indxs: A list of all indexes that hold small cap stocks
    :return: integer that represents the index that is desginated least important
    """

    only_large_idx = None
    only_small_idx = None

    # Determines the protected indexes
    if len(lg_indxs) == 1:
        only_large_idx = lg_indxs[0]
    if len(sm_indxs) == 1:
        only_small_idx = sm_indxs[0]

    for i in range(len(portfolio) - 1, -1, -1):
        if i != only_large_idx and i != only_small_idx:
            return i

# Creates a portfolio that is valid 
def create_portfolio(size):
    """ 
    :param size: the size of the portfolio
    :return: a list of tickers that are in the portfolio, or none if it cannot create a portfolio that satisfies all requirements
    """

    portfolio = []
    port_sectors = []
    i = 0
    max_sector_num = int(size * 0.4)
    ticker_only = []
    
    # Creates preliminary portfolio 
    while len(portfolio) < size:
        if i >= len(ordered_info_df):
            return None
        cur = ordered_info_df.iloc[i]
        cur_sector = cur['Sector']

        # Ensures that no sector is above max weight
        if (port_sectors.count(cur_sector) < max_sector_num):
            portfolio.append(cur)
            port_sectors.append(cur_sector)
        i += 1

    lg_idxs = is_lg_cap(portfolio)
    sm_idxs = is_sm_cap(portfolio)

    # Fixes the no large market cap issue
    while len(lg_idxs) == 0: 
        replaced = find_least_imp(portfolio, lg_idxs, sm_idxs)
        if i >= len(ordered_info_df):
            return None
        cur = ordered_info_df.iloc[i]
        cur_sector = cur['Sector']

        temp_sectors = port_sectors.copy()
        temp_sectors.pop(replaced)
        
        if (is_lg(cur)) and (temp_sectors.count(cur_sector) < max_sector_num):
            portfolio[replaced] = cur
            port_sectors[replaced] = cur_sector

            lg_idxs = is_lg_cap(portfolio)
            sm_idxs = is_sm_cap(portfolio)
        i += 1

    # Fixes the no small market cap issues
    while len(sm_idxs) == 0: 
        replaced = find_least_imp(portfolio, lg_idxs, sm_idxs)
        if i >= len(ordered_info_df):
            return None
        cur = ordered_info_df.iloc[i]
        cur_sector = cur['Sector']

        temp_sectors = port_sectors.copy()
        temp_sectors.pop(replaced)
        
        if (is_sm(cur)) and (temp_sectors.count(cur_sector) < max_sector_num):
            portfolio[replaced] = cur
            port_sectors[replaced] = cur_sector

            lg_idxs = is_lg_cap(portfolio)
            sm_idxs = is_sm_cap(portfolio)
        i += 1
    
    for stock in portfolio:
        ticker_name = stock["Ticker"]
        ticker_only.append(ticker_name)
    
    return ticker_only

In [20]:
# Code that creates the portfolio of 10-25, and then sees which one is the most optimal 

all_ports = []
all_variance = []
all_weights = []
count = 0
while (count + 10) < 26:
    port = create_portfolio(count+10)
    if port is None:
        print(f"Failed to build portfolio of size {count+10}, skipping.")
        count += 1
        continue

    all_ports.append(port)

    temp_cov = get_calculations(all_ports[count], returns_start, returns_end)
    cov_np = temp_cov['Covariance'].to_numpy()
    temp_var, temp_weights = secondary_minimization(cov_np)

    all_variance.append(temp_var)
    all_weights.append(temp_weights)
    count += 1

if not all_variance:
    print("No valid portfolios were generated for some reason. Please check ticker csv.")
else:
    smallest_var = min(all_variance)
    index = all_variance.index(smallest_var)
    target = [smallest_var, all_ports[index], all_weights[index]]
    print(f"The smallest variance found is {target[0]} which is determined from the following portfolio: {target[1]}, at the following weights {target[2]}.")

Failed to build portfolio of size 10, skipping.
Failed to build portfolio of size 11, skipping.
Failed to build portfolio of size 12, skipping.
Failed to build portfolio of size 13, skipping.
Failed to build portfolio of size 14, skipping.
Failed to build portfolio of size 15, skipping.
Failed to build portfolio of size 16, skipping.
Failed to build portfolio of size 17, skipping.
Failed to build portfolio of size 18, skipping.
Failed to build portfolio of size 19, skipping.
Failed to build portfolio of size 20, skipping.
Failed to build portfolio of size 21, skipping.
Failed to build portfolio of size 22, skipping.
Failed to build portfolio of size 23, skipping.
Failed to build portfolio of size 24, skipping.
Failed to build portfolio of size 25, skipping.
No valid portfolios were generated for some reason. Please check ticker csv.


In [21]:
temp_df = pd.DataFrame({
    "Ticker": target[1],
    "Weight": target[2]
})

def get_close_prices_and_rate(tickers, target_date, end_date):
    """
    :param tickers: list of tickers
    :param target_data: the day of price we want, normally most recent business day
    :param end_date: the day after, as yfinance is not inclusive
    :return: a Series that contains the target days close price
    :return: the USD to CAD exchange rate
    """
    data = yf.download(tickers, start=target_date, end=end_date)["Close"]
    close_prices = data.iloc[0]

    exchange_rate = yf.download("CAD=X", start=target_date, end=end_date)["Close"]
    exchange_rate = exchange_rate.iloc[0]
    return close_prices, exchange_rate.item()

def purchase_flat_fee(df, close_prices, budget, exchange_rate):
    temp_df["Price"] = close_prices
    df["Shares Bought Flat Fee"] = (df["Weight"] * (budget - (2.5*exchange_rate))) / df["Price"]
    df["Flat Fee Worth"] = df["Shares Bought Flat Fee"] * df["Price"]

    return df 

def purchase_variable_fee(df, budget, exchange_rate):
    df["Shares w/o Fee"] = (df["Weight"] * budget) / df["Price"]
    total_shares = df["Shares w/o Fee"].sum()
    
    variable_fee_usd = total_shares * 0.001
    variable_fee_cad = variable_fee_usd * exchange_rate

    adjusted_budget = budget - variable_fee_cad
    df["Shares Bought Variable Fee"] = (df["Weight"] * adjusted_budget) / df["Price"]
    df["Variable Fee Worth"] = df["Shares Bought Variable Fee"] * df["Price"]

    return df

def ideal_shares(df):
    sum_flat_fee = df["Flat Fee Worth"].sum()
    sum_variable_fee = df["Variable Fee Worth"].sum()
    if (sum_flat_fee < sum_variable_fee):
        df.drop(["Shares Bought Flat Fee", "Flat Fee Worth"])
        df.rename(column={"Shares Bought Variable Fee":"Shares", "Variable Fee Worth":"Value"})
    else:
        df.drop(["Shares Bought Variable Fee", "Variable Fee Worth"])
        df.rename(column={"Shares Bought Flat Fee":"Shares", "Flat Fee Worth":"Value"})
    
    return df 

def add_currency(df_small, df_large):
    df_with_currency = df_small.merge(df_large[["Ticker", "Currency"]], on="Ticker", how="left")
    return df_with_currency

NameError: name 'target' is not defined

In [None]:
target_date = "2025-18-11" #Example
end_date = "2025-19-11" #Example
closing, usd_cad_rate = get_close_prices_and_rate(target[1], target_date, end_date)
temp_df = purchase_flat_fee(temp_df, closing, 1_000_000, usd_cad_rate)
temp_df = purchase_variable_fee(temp_df, 1_000_000, usd_cad_rate)
temp2_df = ideal_shares(temp_df)
Portfolio_Final = add_currency(temp2_df,ordered_info_df)

Stocks_Final = Portfolio_Final.copy 
Stocks_Final = Stocks_Final.drop(["Currency", "Weight", "Price"])
Stocks_Final.to_csv("Stocks_Group_15.csv", index=False)

In [None]:
#--- Third Optimization ---#
ordered_ticker_list = get_ticker_list (ordered_info_df)
# Take the top 25 most important stocks
ordered_ticker_list = ordered_ticker_list [0:30]
# Create dict to hold every possible portfolio
portfolio_combinations = list(itr.combinations(ordered_ticker_list, 25))

## Contribution Declaration

The following team members made a meaningful contribution to this assignment:

Insert Names Here. 