In [1]:
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import yfinance as yf

%matplotlib inline

# Tickers grouped by sector / instrument type
tickers_it = ['AAPL', 'NVDA', 'ORCL', 'MSFT']
tickers_consumer_staples = ['WMT', 'COST', 'PM']
tickers_financials = ['JPM', 'AXP']
tickers_etfs = ['VTI', 'IVV', 'VOO']
# NOTE(review): SCHX is an ETF (Schwab U.S. Large-Cap ETF), not a mutual fund —
# confirm whether it is intentionally listed in this group.
tickers_mutual_funds = ['FDFIX', 'VFTAX', 'VLACX', 'SCHX']
sp500_ticker = '^GSPC'  # S&P 500 Index

# Sector label -> tickers. Insertion order defines the column order of sector_data.
sectors = {
    'IT': tickers_it,
    'Consumer Staples': tickers_consumer_staples,
    'Financials': tickers_financials,
    'ETFs': tickers_etfs,
    'Mutual Funds': tickers_mutual_funds,
    'SP500': [sp500_ticker]
}

# Flatten the sector groups into one list so it can never drift out of sync with `sectors`
all_tickers = [ticker for group in sectors.values() for ticker in group]

# Date range (yfinance treats `end` as exclusive)
start_date = '2019-01-01'
end_date = '2024-01-01'

# Download MONTHLY adjusted close prices from Yahoo Finance.
# auto_adjust=False is passed explicitly: recent yfinance releases default to
# auto_adjust=True, in which case the 'Adj Close' column is not returned at all.
data = yf.download(
    all_tickers,
    start=start_date,
    end=end_date,
    interval='1mo',
    auto_adjust=False,
)['Adj Close']

# Reorder the downloaded columns sector by sector in a single selection
# (instead of growing a frame with pd.concat inside a loop). The columns-axis
# name is dropped so the frame carries plain ticker labels.
sector_data = data[all_tickers].rename_axis(columns=None)

# Display the first few rows
sector_data.head()

[*********************100%***********************]  17 of 17 completed


Unnamed: 0_level_0,AAPL,NVDA,ORCL,MSFT,WMT,COST,PM,JPM,AXP,VTI,IVV,VOO,FDFIX,VFTAX,VLACX,SCHX,^GSPC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-01-01 00:00:00+00:00,39.743023,3.565071,45.720039,98.430382,29.111807,195.332855,55.851959,86.860825,94.49041,126.368622,246.739883,225.563705,10.490227,,46.06274,8.853344,2704.100098
2019-02-01 00:00:00+00:00,41.345268,3.825723,47.637314,105.59375,30.071772,199.07338,63.292107,88.293839,99.533241,130.865799,254.708618,232.894287,10.827447,,47.560493,9.155225,2784.48999
2019-03-01 00:00:00+00:00,45.551338,4.457764,49.081146,111.638977,29.628242,220.964752,64.347687,85.645721,100.974403,132.006134,258.561249,236.050232,11.037067,24.300608,48.176132,9.277349,2834.399902
2019-04-01 00:00:00+00:00,48.122066,4.493515,50.561535,123.622597,31.409445,224.058258,63.81337,98.184181,108.300377,137.929169,269.969818,246.954956,11.438086,25.433983,50.402054,9.775023,2945.830078
2019-05-01 00:00:00+00:00,41.983009,3.362937,46.446259,117.072289,30.981857,218.628616,56.861454,90.333488,106.347443,129.032883,252.965561,231.278503,10.75527,23.812317,47.205135,9.156524,2752.060059


In [2]:
# Forward-fill gaps in the price table: a missing value takes the last
# available value above it in the same column. Values that are missing at the
# very start of a column have nothing to copy from and remain NaN.
sector_data = sector_data.ffill()


In [None]:
def get_fff_returns(file_path="/F-F_Research_Data_Factors.CSV",
                    start_date='2019-01-01', end_date='2024-01-01'):
    """
    Load the Fama-French Research Factor Monthly Dataset.

    Only rows whose date field is exactly six digits (YYYYMM) are kept, so the
    descriptive header lines, the repeated column-header rows, and the annual
    (YYYY) section of the file are discarded.

    Parameters
    ----------
    file_path : str or path-like, optional
        Location of the factors CSV. The default is the path the notebook
        originally hard-coded.
    start_date, end_date : str or datetime-like, optional
        Inclusive bounds of the month-start dates to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by a DatetimeIndex named 'Date' (first day of each month) with
        float columns 'Mkt-RF', 'SMB', 'HML', 'RF'. Values are divided by 100,
        i.e. converted from percent to decimal fractions. Missing-value
        sentinels (-99.99 and below) and unparseable cells are NaN.
    """
    factor_cols = ['Mkt-RF', 'SMB', 'HML', 'RF']

    # Read everything as text: the file mixes numeric rows with free-text rows,
    # and letting pandas infer dtypes can turn YYYYMM into floats ("201901.0"),
    # which would then fail the six-digit check below.
    raw = pd.read_csv(file_path, header=None, names=['Date'] + factor_cols,
                      skiprows=1, sep=',', dtype=str)

    # Strip padding around the date field, then keep only YYYYMM rows
    date_text = raw['Date'].str.strip()
    is_monthly = date_text.str.match(r'^\d{6}$', na=False)

    # .copy() so the assignments below operate on an independent frame rather
    # than on a slice of `raw`
    rets = raw.loc[is_monthly, factor_cols].copy()
    rets['Date'] = pd.to_datetime(date_text[is_monthly], format="%Y%m", errors='coerce')

    # Drop rows whose date failed to parse (NaT) and index by date
    rets = rets.dropna(subset=['Date']).set_index('Date')

    # Keep only the requested window (both ends inclusive)
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    rets = rets.loc[(rets.index >= window_start) & (rets.index <= window_end)].copy()

    # Convert the factor columns to numeric, forcing unparseable cells to NaN
    for col in factor_cols:
        rets[col] = pd.to_numeric(rets[col].str.strip(), errors='coerce')

    # -99.99 (or -999) marks missing observations in this data library;
    # treat those sentinels as NaN instead of as real returns
    rets = rets.mask(rets <= -99.99)

    # Convert from percent to decimal fractions
    return rets / 100


In [4]:
# Load the Fama-French monthly factors (decimal returns indexed by month start)
fff = get_fff_returns()

# Preview as the cell's last expression so the notebook renders a rich table
# instead of the plain-text dump produced by print()
fff.head()


            Mkt-RF     SMB     HML      RF
Date                                      
2019-01-01  0.0840  0.0288 -0.0045  0.0021
2019-02-01  0.0340  0.0206 -0.0271  0.0018
2019-03-01  0.0110 -0.0305 -0.0412  0.0019
2019-04-01  0.0397 -0.0172  0.0216  0.0021
2019-05-01 -0.0694 -0.0131 -0.0237  0.0021


In [5]:
# Align the timezone of the factor index with the price index before joining.
# A tz-aware and a tz-naive DatetimeIndex do not match each other, so both
# sides must agree. The check is written so the cell can be re-run safely:
# calling tz_localize on an index that is already tz-aware raises TypeError.
price_tz = getattr(sector_data.index, 'tz', None)
if fff.index.tz is None:
    if price_tz is not None:
        # Naive factor dates -> stamp them with the price index's timezone
        fff.index = fff.index.tz_localize(price_tz)
else:
    # Already tz-aware (e.g. the cell was run before): convert instead of
    # localizing. tz_convert(None) yields naive UTC timestamps.
    fff.index = fff.index.tz_convert(price_tz)

# Keep only the months present in both the price table and the factor table
merged_data = sector_data.join(fff, how='inner')

# Preview the first few rows of the merged data (rich display keeps the wide
# table readable, unlike print(), which wraps the columns)
merged_data.head()


                                AAPL      NVDA       ORCL        MSFT  \
Date                                                                    
2019-01-01 00:00:00+00:00  39.743023  3.565071  45.720039   98.430382   
2019-02-01 00:00:00+00:00  41.345268  3.825723  47.637314  105.593750   
2019-03-01 00:00:00+00:00  45.551338  4.457764  49.081146  111.638977   
2019-04-01 00:00:00+00:00  48.122066  4.493515  50.561535  123.622597   
2019-05-01 00:00:00+00:00  41.983009  3.362937  46.446259  117.072289   

                                 WMT        COST         PM        JPM  \
Date                                                                     
2019-01-01 00:00:00+00:00  29.111807  195.332855  55.851959  86.860825   
2019-02-01 00:00:00+00:00  30.071772  199.073380  63.292107  88.293839   
2019-03-01 00:00:00+00:00  29.628242  220.964752  64.347687  85.645721   
2019-04-01 00:00:00+00:00  31.409445  224.058258  63.813370  98.184181   
2019-05-01 00:00:00+00:00  30.981857  218.62

In [6]:
# Forward-fill any remaining gaps after the join: each missing value takes the
# last available value above it in the same column. Leading gaps stay NaN.
merged_data = merged_data.ffill()

In [None]:
# Specify the file path for saving the CSV
output_file_path = 'FIN2/MIDTERM/merged_data.csv'

# Make sure the target directory exists; to_csv does not create missing folders
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Ensure that the 'Date' index is included as a regular column (not index).
# Guarded so that re-running the cell does not reset an already-reset index,
# which would add a spurious 'index' column to the frame and to the CSV.
if not isinstance(merged_data.index, pd.RangeIndex):
    merged_data = merged_data.reset_index()

# Save the DataFrame to a CSV file with the proper header and data
merged_data.to_csv(output_file_path, index=False)

# Confirm where the file is saved
print(f'Merged data saved to: {output_file_path}')