# Portfolio starts from 2023-01-01 since that's when portfolio became fully healthcare

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import yfinance as yf
import os


# Set pandas display options to show all columns
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # Adjust width to avoid truncation

# Set plot style for better aesthetics
sns.set(style="whitegrid")

start_date = "2023-01-01"

def load_portfolio_data(file_path,start_date):
    """Load and preprocess portfolio data."""
    df = pd.read_excel(file_path)
    df.columns = df.columns.str.strip()
    df.rename(columns={"Trade Dt": "Date", "Group": "Ticker"}, inplace=True)
    df['Date'] = pd.to_datetime(df['Date']).ffill()
    df["Returns_Percent"] = df["P&L (%)"]
    
    # Filter to start from 2023-01-01
    df = df[df['Date'] >= start_date]
    
    return df

def load_ff5_data(file_path,start_date):
    """Load and preprocess F-F 5 Factors data."""
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')
    
    # Filter to start from 2023-01-01
    df = df[df['Date'] >= start_date]
    
    return df

def fetch_biotech_index_data(tickers,start_date):
    """Fetch historical adjusted close data for specified tickers."""
    data = yf.download(tickers,start=start_date)['Adj Close']
    daily_returns = data.pct_change().dropna()
    daily_returns.columns = ['XBI_Daily_Returns', 'NBI_Daily_Returns', 'IWM_Daily_Returns']
    daily_returns.reset_index(inplace=True)
    daily_returns['Date'] = pd.to_datetime(daily_returns['Date'], utc=True).dt.tz_localize(None)
    
    # Filter to start from 2023-01-01
    daily_returns = daily_returns[daily_returns['Date'] >= start_date]
    
    return daily_returns

def merge_data(portfolio, ff5, biotech_returns,start_date):
    # Ensure all Date columns are naive
    portfolio['Date'] = portfolio['Date'].dt.tz_localize(None)
    ff5['Date'] = ff5['Date'].dt.tz_localize(None)
    biotech_returns['Date'] = pd.to_datetime(biotech_returns['Date']).dt.tz_localize(None)

    # Subset F-F 5 Factors and Biotech Index to match the portfolio date range

    ff5_subset = ff5[ff5['Date'] >= start_date]
    biotech_subset = biotech_returns[biotech_returns['Date'] >= start_date]

    # Merge the dataframes
    merged = pd.merge(portfolio, ff5_subset, on='Date', how='left')
    final = pd.merge(merged, biotech_subset, on='Date', how='left')

    return final

# Main execution
portfolio_file_path = r"C:\Users\MukeshwaranBaskaran\Downloads\Project_KISKI\Data\ADAR1_Daily_Ticker_Data.xlsx"
ff5_file_path = r"C:\Users\MukeshwaranBaskaran\Downloads\Project_KISKI\Data\F-F_Research_Data_5_Factors_2x3_daily.CSV"
index_tickers = ['XBI', '^NBI', 'IWM']

# Load data
df_daily_portfolio = load_portfolio_data(portfolio_file_path,start_date)
df_ff5_daily = load_ff5_data(ff5_file_path,start_date)
biotech_index_daily_returns = fetch_biotech_index_data(index_tickers,start_date)

# Execute the merging function
final_df = merge_data(df_daily_portfolio, df_ff5_daily, biotech_index_daily_returns,start_date)

# Display the final dataframe
df = final_df.dropna(subset=['Ticker'])

# Replace ' JP' at the end with '.T'
df['Ticker'] = df['Ticker'].apply(lambda x: x[:-3] + '.T' if x.endswith(' JP') else x)

# Replace '2746505Z DC' with 'ASND'
df['Ticker'] = df['Ticker'].replace('2746505Z DC', 'ASND')
df.head()

[*********************100%***********************]  3 of 3 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ticker'] = df['Ticker'].apply(lambda x: x[:-3] + '.T' if x.endswith(' JP') else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ticker'] = df['Ticker'].replace('2746505Z DC', 'ASND')


Unnamed: 0,Date,Ticker,AUM BOD,AUM EOD,P&L,P&L (%),Net,Net (%),Gross,Gross(%),Market Value,Market Value (%),Returns_Percent,Mkt-RF,SMB,HML,RMW,CMA,RF,XBI_Daily_Returns,NBI_Daily_Returns,IWM_Daily_Returns
1,2023-01-31,4502.T,72681150.0,75619330.0,4156.44,5.7e-05,723114.7,0.009563,723114.7,0.009563,723114.7,0.009563,5.7e-05,1.58,0.94,-0.07,-0.06,-0.51,0.017,0.024286,0.020666,0.011836
2,2023-01-31,4523.T,72681150.0,75619330.0,30633.2,0.000421,92382.19,0.001222,92382.19,0.001222,92382.19,0.001222,0.000421,1.58,0.94,-0.07,-0.06,-0.51,0.017,0.024286,0.020666,0.011836
3,2023-01-31,1893407D,72681150.0,75619330.0,,,120000.0,0.001587,120000.0,0.001587,120000.0,0.001587,,1.58,0.94,-0.07,-0.06,-0.51,0.017,0.024286,0.020666,0.011836
4,2023-01-31,ASND,72681150.0,75619330.0,-495.87,-7e-06,,,,,,,-7e-06,1.58,0.94,-0.07,-0.06,-0.51,0.017,0.024286,0.020666,0.011836
5,2023-01-31,AADI,72681150.0,75619330.0,-974.67,-1.3e-05,50196.076239,0.000664,50196.076239,0.000664,32649.0,0.000432,-1.3e-05,1.58,0.94,-0.07,-0.06,-0.51,0.017,0.024286,0.020666,0.011836


In [2]:
# Create a subfolder for storing data
data_folder = 'yfinance_data'
os.makedirs(data_folder, exist_ok=True)

# Initialize lists to hold tickers and results
successful_tickers = []
failed_tickers = []

# Define the time period
start_date = '2018-01-01'
end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

# Loop through each ticker in the DataFrame
for ticker in df['Ticker'].unique():
    file_path = os.path.join(data_folder, f"{ticker}_data.csv")
    
    try:
        # Check if the file already exists
        if os.path.exists(file_path):
            # Load existing data
            existing_data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
            latest_date = existing_data.index[-1]

            # If the latest data is from today, skip fetching
            if latest_date.date() == pd.Timestamp.now().date():
                successful_tickers.append(ticker)
                continue
            
            # Adjust the start date for fetching new data
            start_date = latest_date.strftime('%Y-%m-%d')
        
        # Fetch historical data from Yahoo Finance
        data = yf.download(ticker, start=start_date, end=end_date)
        
        if not data.empty:
            # Append new data if any
            if os.path.exists(file_path):
                data = pd.concat([existing_data, data])
                data = data[~data.index.duplicated(keep='last')]  # Remove duplicates if any
            
            # Save the updated data to a CSV file
            data.to_csv(file_path)
            successful_tickers.append(ticker)
        else:
            failed_tickers.append(ticker)
    
    except Exception as e:
        print(f"Failed to fetch data for {ticker}: {e}")
        failed_tickers.append(ticker)

# Save the list of tickers with no data to a separate CSV
failed_df = pd.DataFrame(failed_tickers, columns=['Failed Tickers'])
failed_df.to_csv('failed_tickers.csv', index=False)

# Save the successful tickers to a CSV
successful_df = pd.DataFrame(successful_tickers, columns=['Successful Tickers'])
successful_df.to_csv('successful_tickers.csv', index=False)

print("Data fetching complete.")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['1893407D']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ACACW']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2018-01-01 -> 2024-10-11)')
[**

Failed to fetch data for ANEB Warrants 09/28/27 $4.215: Cannot save file into a non-existent directory: 'yfinance_data\ANEB Warrants 09\28'


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ASTLW']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2018-01-01 -> 2024-10-11)')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%**********

Failed to fetch data for BCTX US 01/19/24 C5: Cannot save file into a non-existent directory: 'yfinance_data\BCTX US 01\19'


[*********************100%***********************]  4 of 4 completed

2 Failed downloads:
['01/19/24', 'P7.5']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  2 of 2 completed

1 Failed download:
['CURNCY']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


Failed to fetch data for BCTX US 01/19/24 P7.5: Cannot save file into a non-existent directory: 'yfinance_data\BCTX US 01\19'


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['CANOQ']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  2 of 2 completed

1 Failed download:
['CURNCY']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  2 of 2 completed

1 Failed download:
['CURNCY']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  2 of 2 completed

1 Failed download:
['CURNCY']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  3 of 3 completed

2 Failed downloads:
['EXCHANGE', 'LOSS']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed

1 Fail

Failed to fetch data for BCTX1 US 01/19/24 C5: Cannot save file into a non-existent directory: 'yfinance_data\BCTX1 US 01\19'
Failed to fetch data for BCTX1 US 01/19/24 P7.5: Cannot save file into a non-existent directory: 'yfinance_data\BCTX1 US 01\19'



[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TRADINGEXPENSES']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AYX']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Data fetching complete.





In [22]:
# Create a dictionary to hold the adjusted close data for all tickers
adjusted_close_data = {}

# Loop through each unique ticker in the DataFrame
for ticker in df['Ticker'].unique():
    file_path = os.path.join(data_folder, f"{ticker}_data.csv")
    
    # Debug output for file paths
    print(f"Processing file: {file_path}")
    
    if os.path.exists(file_path):
        # Read CSV and clean column names
        temp_data = pd.read_csv(file_path)
        temp_data.columns = temp_data.columns.str.strip()  # Strip whitespace from column names
        
        # Debug output for column names
        print(f"Columns found in {file_path}: {temp_data.columns.tolist()}")
        
        if 'Date' in temp_data.columns:
            # Read adjusted close data while parsing dates
            existing_data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
            existing_data.reset_index(inplace=True)
            
            # Add the ticker column
            existing_data['Ticker'] = ticker
            
            # Store the adjusted close data
            adjusted_close_data[ticker] = existing_data[['Date', 'Ticker', 'Adj Close']].rename(columns={'Adj Close': 'Adj_Close'})
        else:
            print(f"Column 'Date' not found in {file_path}")
    else:
        print(f"File not found: {file_path}")

# Combine all adjusted close data into a single DataFrame
if adjusted_close_data:  # Check if there's any data to concatenate
    adjusted_close_df = pd.concat(adjusted_close_data.values(), ignore_index=True)
else:
    print("No adjusted close data found for any tickers.")
    adjusted_close_df = pd.DataFrame()  # Create an empty DataFrame if no data

# Final DataFrame output
print("Final adjusted close DataFrame:")
print(adjusted_close_df.head())  # Display first few rows for verification


Processing file: yfinance_data\4502.T_data.csv
Columns found in yfinance_data\4502.T_data.csv: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Processing file: yfinance_data\4523.T_data.csv
Columns found in yfinance_data\4523.T_data.csv: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Processing file: yfinance_data\1893407D_data.csv
File not found: yfinance_data\1893407D_data.csv
Processing file: yfinance_data\ASND_data.csv
Columns found in yfinance_data\ASND_data.csv: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Processing file: yfinance_data\AADI_data.csv
Columns found in yfinance_data\AADI_data.csv: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Processing file: yfinance_data\AAGR_data.csv
Columns found in yfinance_data\AAGR_data.csv: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Processing file: yfinance_data\ABBV_data.csv
Columns found in yfinance_data\ABBV_data.csv: ['Date', 'Open', 'High', 'Low

In [23]:
# Assuming df and adjusted_close_df are already defined
# Merging df with adjusted_close_df on 'Ticker' and 'Date' using a left join
merged_df = pd.merge(df, adjusted_close_df, on=['Ticker', 'Date'], how='left')

# Drop all rows with any NaN values
cleaned_df = merged_df.dropna()

# Display the first few rows of the cleaned DataFrame for verification
print("Cleaned DataFrame (with no NaN values):")
print(cleaned_df.head())


Cleaned DataFrame (with no NaN values):
        Date  Ticker       AUM BOD       AUM EOD       P&L   P&L (%)  \
0 2023-01-31  4502.T  7.268115e+07  7.561933e+07   4156.44  0.000057   
1 2023-01-31  4523.T  7.268115e+07  7.561933e+07  30633.20  0.000421   
4 2023-01-31    AADI  7.268115e+07  7.561933e+07   -974.67 -0.000013   
6 2023-01-31    ABBV  7.268115e+07  7.561933e+07 -13906.36 -0.000191   
7 2023-01-31    ABCL  7.268115e+07  7.561933e+07    655.10  0.000009   

             Net   Net (%)          Gross  Gross(%)  Market Value  \
0  723114.700000  0.009563  723114.700000  0.009563     723114.70   
1   92382.190000  0.001222   92382.190000  0.001222      92382.19   
4   50196.076239  0.000664   50196.076239  0.000664      32649.00   
6  886500.000000  0.011723  886500.000000  0.011723     886500.00   
7   -2365.001923 -0.000031   17446.862015  0.000231      -5280.00   

   Market Value (%)  Returns_Percent  Mkt-RF   SMB   HML   RMW   CMA     RF  \
0          0.009563         0.000

In [26]:
def calculate_factor_exposure(cleaned_df, date):
    # Filter the DataFrame for the specific date
    filtered_df = cleaned_df[cleaned_df['Date'] == date]
    
    # Calculate the weighted exposure for each factor
    weights = filtered_df['Market Value (%)']
    print(weights)
    factor_columns = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
    exposure = (filtered_df[factor_columns].T * weights).T.sum()
    
    return exposure

# Example usage
date = '2023-01-31'
portfolio_exposure = calculate_factor_exposure(cleaned_df, date)
print(portfolio_exposure)

0      0.009563
1      0.001222
4      0.000432
6      0.011723
7     -0.000070
         ...   
325   -0.000195
332    0.000130
333   -0.007111
334   -0.000081
335   -0.000073
Name: Market Value (%), Length: 164, dtype: float64
Mkt-RF    0.268427
SMB       0.159697
HML      -0.011892
RMW      -0.010193
CMA      -0.086644
dtype: float64


### Function Purpose
The function `calculate_factor_exposure` helps to figure out how much different financial factors (like market risk, size, and value) affect a group of investments on a specific date.

### How It Works

1. **Filtering Data**:
   - First, it looks at the investment data for a specific date. Imagine you have a big notebook with daily records, and you just want to open the page for January 31, 2023.

2. **Getting Weights**:
   - Next, it takes the "Market Value (%)" for each investment on that date. Think of this as how much each investment is worth compared to the total. For example, if one investment is worth 10% of the total, it will have a weight of 0.10.

3. **Defining Factors**:
   - It lists out some key financial factors that affect returns, such as:
     - **Mkt-RF**: How much the market earns beyond the risk-free rate (like the return on a safe investment).
     - **SMB**: Smaller companies compared to larger ones.
     - **HML**: Companies that are considered "cheap" versus those that are "expensive."
     - **RMW**: Companies that are more profitable versus less profitable.
     - **CMA**: Companies that invest conservatively versus aggressively.

4. **Calculating Exposure**:
   - For each factor, it multiplies the factor's value by the investment's weight. This tells you how much each investment contributes to each factor, based on its size.
   - Then, it adds all those contributions together for each factor to see the total exposure to that factor.

### Final Result
- The function returns a summary showing how much exposure the investments have to each of those financial factors on that specific date. This helps investors understand where their risks and opportunities lie in relation to market movements and investment styles. 

In essence, it’s like saying, "For this day, how much do our investments lean toward these different factors?"