In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import yfinance as yf#     

# --- Ticker universe, grouped by sector / asset class ------------------------
tickers_it = ['AAPL', 'NVDA', 'ORCL', 'MSFT']
tickers_consumer_staples = ['WMT', 'COST', 'PM']
tickers_financials = ['JPM', 'AXP']
tickers_etfs = ['VTI', 'IVV', 'VOO']
tickers_mutual_funds = ['FDFIX', 'VFTAX', 'VLACX']
sp500_ticker = '^GSPC'  # S&P 500 Index

# Sector label -> member tickers. Insertion order here defines the column
# order of `sector_data` below.
sectors = {
    'IT': tickers_it,
    'Consumer Staples': tickers_consumer_staples,
    'Financials': tickers_financials,
    'ETFs': tickers_etfs,
    'Mutual Funds': tickers_mutual_funds,
    'SP500': [sp500_ticker]
}

# Flat list of every ticker, derived from `sectors` so the two cannot drift apart.
all_tickers = [ticker for members in sectors.values() for ticker in members]

# Date range passed to the download call
start_date = '2019-01-01'
end_date = '2024-01-01'

# Download MONTHLY bars (interval='1mo') from Yahoo Finance and keep the
# adjusted close. auto_adjust is pinned to False because the 'Adj Close'
# column is only returned when prices are not auto-adjusted, and recent
# yfinance releases switched the default to auto_adjust=True.
data = yf.download(
    all_tickers,
    start=start_date,
    end=end_date,
    interval='1mo',
    auto_adjust=False,
)['Adj Close']

# Re-order the columns sector by sector with a single selection instead of
# growing a frame with pd.concat inside a loop. Assigning a plain list to
# `.columns` drops any axis name the download attached, leaving flat,
# unnamed ticker columns.
sector_data = data[all_tickers].copy()
sector_data.columns = list(all_tickers)

# Display the first few rows
sector_data.head()

[*********************100%***********************]  16 of 16 completed


Unnamed: 0_level_0,AAPL,NVDA,ORCL,MSFT,WMT,COST,PM,JPM,AXP,VTI,IVV,VOO,FDFIX,VFTAX,VLACX,^GSPC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-01-01 00:00:00+00:00,39.743019,3.565071,45.720047,98.430374,29.111803,195.332901,55.851952,86.860832,94.490417,126.368614,246.739883,225.563721,10.490228,,46.062733,2704.100098
2019-02-01 00:00:00+00:00,41.345261,3.825723,47.637314,105.593758,30.071772,199.07338,63.292099,88.293846,99.533234,130.865829,254.70871,232.894287,10.827443,,47.560474,2784.48999
2019-03-01 00:00:00+00:00,45.551338,4.457765,49.08115,111.638962,29.628244,220.964737,64.347687,85.645714,100.974403,132.006088,258.56131,236.050262,11.037067,24.30061,48.176125,2834.399902
2019-04-01 00:00:00+00:00,48.122078,4.493515,50.561531,123.622612,31.409441,224.058304,63.813404,98.184174,108.300362,137.929184,269.969879,246.954971,11.438085,25.433977,50.402054,2945.830078
2019-05-01 00:00:00+00:00,41.983006,3.362936,46.446255,117.072311,30.981853,218.628586,56.861435,90.333488,106.347466,129.032837,252.965576,231.278534,10.755268,23.812319,47.205147,2752.060059


In [8]:
# Forward-fill gaps: every NaN takes the most recent earlier non-NaN value in
# the same column. NaNs at the very start of a column have nothing earlier to
# copy, so they remain NaN after this step.
sector_data = sector_data.ffill()


In [None]:
def get_fff_returns(
    file_path="/Users/anhpham/Downloads/FIN2/MIDTERM/F-F_Research_Data_Factors.CSV",
    start_date='2019-01-01',
    end_date='2024-01-01',
):
    """
    Load the Fama-French Research Factor Monthly Dataset.

    Only rows whose first field is a 6-digit YYYYMM value are kept, so the
    text preamble, repeated header rows and the 4-digit annual section of the
    file are discarded.

    Parameters
    ----------
    file_path : str or path-like
        Location of the factor CSV. Defaults to the path this notebook was
        originally written against.
    start_date, end_date : str, datetime-like or None
        Inclusive bounds applied to the month-start timestamps. Pass None to
        leave that side unbounded.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' (timezone-naive month-start timestamps) with float
        columns 'Mkt-RF', 'SMB', 'HML', 'RF'. Values are decimal fractions
        (the file's percent figures divided by 100). Cells that cannot be
        parsed, or that hold the missing-data codes -99.99 / -999, are NaN.
    """
    factor_cols = ['Mkt-RF', 'SMB', 'HML', 'RF']
    # Missing-data codes; checked after numeric conversion because the raw
    # fields are space-padded text that read_csv's na_values would not match.
    missing_codes = [-99.99, -999.0]

    # Read everything as text: the file mixes prose lines, header rows and
    # numeric rows, so type inference is left until the rows are filtered.
    raw = pd.read_csv(
        file_path,
        header=None,
        names=['Date'] + factor_cols,
        skiprows=1,
        sep=',',
        dtype=str,
        skipinitialspace=True,
    )

    # Keep only monthly rows (YYYYMM). Work on an explicit copy so the later
    # column assignment does not write into a slice of `raw`.
    date_text = raw['Date'].astype(str).str.strip()
    is_monthly = date_text.str.match(r'^\d{6}$', na=False)
    monthly = raw.loc[is_monthly].copy()

    # YYYYMM -> month-start Timestamp; impossible dates (e.g. month 13) become
    # NaT and are dropped.
    monthly['Date'] = pd.to_datetime(date_text[is_monthly], format="%Y%m", errors='coerce')
    monthly = monthly.dropna(subset=['Date']).set_index('Date')

    # Inclusive date window
    if start_date is not None:
        monthly = monthly.loc[monthly.index >= pd.to_datetime(start_date)]
    if end_date is not None:
        monthly = monthly.loc[monthly.index <= pd.to_datetime(end_date)]

    # Text -> float, unparseable cells -> NaN, then blank out the sentinels.
    factors = monthly[factor_cols].apply(pd.to_numeric, errors='coerce')
    factors = factors.mask(factors.isin(missing_codes))

    # The file quotes returns in percent; convert to decimal fractions.
    return factors / 100


In [11]:
# Load the Fama-French factor table with get_fff_returns() (called with its
# defaults) and print the first rows as a quick sanity check.
fff = get_fff_returns()
print(fff.head())


            Mkt-RF     SMB     HML      RF
Date                                      
2019-01-01  0.0840  0.0288 -0.0045  0.0021
2019-02-01  0.0340  0.0206 -0.0271  0.0018
2019-03-01  0.0110 -0.0305 -0.0412  0.0019
2019-04-01  0.0397 -0.0172  0.0216  0.0021
2019-05-01 -0.0694 -0.0131 -0.0237  0.0021


In [12]:
def _to_utc_index(index):
    """Return `index` labelled as UTC, keeping its wall-clock timestamps.

    A timezone-naive index is localized to UTC. A timezone-aware index first
    has its timezone stripped (wall-clock time preserved) and is then
    re-labelled UTC, so calling this on an already-UTC index is a no-op
    rather than the TypeError a bare tz_localize('UTC') would raise.
    """
    if index.tz is not None:
        index = index.tz_localize(None)
    return index.tz_localize('UTC')


# Make the factor index UTC-aware. Going through the helper keeps this cell
# safe to re-run: a second execution leaves the index unchanged instead of
# failing with "Already tz-aware".
fff.index = _to_utc_index(fff.index)

# Align the price index the same way on a copy, so `sector_data` itself is
# not modified and the join works whether or not the downloaded index
# carries a timezone.
prices_utc = sector_data.copy()
prices_utc.index = _to_utc_index(prices_utc.index)

# Inner join on the timestamp index: keep only months present in both tables.
merged_data = prices_utc.join(fff, how='inner')

# An inner join silently returns zero rows when the two indexes do not line
# up; fail loudly instead of carrying an empty table forward.
if merged_data.empty:
    raise ValueError("Join of sector_data and fff produced no rows; check that both indexes hold matching timestamps.")

# Check the first few rows of the merged data
print(merged_data.head())


                                AAPL      NVDA       ORCL        MSFT  \
Date                                                                    
2019-01-01 00:00:00+00:00  39.743019  3.565071  45.720047   98.430374   
2019-02-01 00:00:00+00:00  41.345261  3.825723  47.637314  105.593758   
2019-03-01 00:00:00+00:00  45.551338  4.457765  49.081150  111.638962   
2019-04-01 00:00:00+00:00  48.122078  4.493515  50.561531  123.622612   
2019-05-01 00:00:00+00:00  41.983006  3.362936  46.446255  117.072311   

                                 WMT        COST         PM        JPM  \
Date                                                                     
2019-01-01 00:00:00+00:00  29.111803  195.332901  55.851952  86.860832   
2019-02-01 00:00:00+00:00  30.071772  199.073380  63.292099  88.293846   
2019-03-01 00:00:00+00:00  29.628244  220.964737  64.347687  85.645714   
2019-04-01 00:00:00+00:00  31.409441  224.058304  63.813404  98.184174   
2019-05-01 00:00:00+00:00  30.981853  218.62

In [13]:
# Forward-fill any NaNs left in the merged price/factor table: each NaN takes
# the last earlier non-NaN value in its column. NaNs at the start of a column
# are not affected.
merged_data = merged_data.ffill()

In [None]:
# Specify the file path for saving the CSV
output_file_path = '/Users/anhpham/Downloads/FIN2/MIDTERM/merged_data.csv'

# Move the date index into a regular 'Date' column, but only while the frame
# is still date-indexed. Without this guard, re-running the cell would reset
# the fresh integer index as well and add a stray 'index' column to both
# `merged_data` and the CSV.
if isinstance(merged_data.index, pd.DatetimeIndex):
    merged_data = merged_data.reset_index()

# Save the DataFrame to a CSV file; the integer row index is not written.
merged_data.to_csv(output_file_path, index=False)

# Confirm where the file is saved
print(f'Merged data saved to: {output_file_path}')

Merged data saved to: /Users/anhpham/Downloads/FIN2/merged_data.csv
