In [None]:
import pandas as pd
import datetime as dt

from pathlib import Path
import json

print("Importing Complete")

Mounted at /content/drive
Importing Complete


In [None]:
#Let's take a look at the past sp500 tickers
def get_sp500_constitutents_records(filepath):
    '''Load the SP500 constituents CSV at `filepath` into a DataFrame indexed by its 'date' column.

    Returns the DataFrame when the file exists. When it does not, prints a
    message and returns None, so callers should check the result before using it.
    '''
    sp500_constituents_file = Path(filepath)
    if not sp500_constituents_file.is_file():
        print('Could not find SP500 Constituents Records')
        return None
    #Hand the loaded frame back to the caller (previously it was read and then discarded)
    return pd.read_csv(sp500_constituents_file, index_col='date')

#Location of the historical constituents CSV, relative to the notebook's working directory
filepath = 'p1inputs/S&P 500 Historical Components & Changes.csv'
sp500_constituents_records = get_sp500_constitutents_records(filepath=filepath)
display(sp500_constituents_records.head(n=5)) #Preview the first rows of the loaded records

Unnamed: 0_level_0,tickers
date,Unnamed: 1_level_1
1996-01-02,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1996-01-03,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1996-01-04,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1996-01-10,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
1996-01-11,"AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."


In [None]:
#Because the file was imported from a CSV, we need to convert the tickers column to lists and the date index to datetimes
#This will allow us to operate on the index and tickers using python operations

def format_sp500_constituents_records(df):
  '''Return a formatted copy of the records: sorted ticker lists and a datetime index.

  df: DataFrame with a 'tickers' column (comma-separated strings, or lists from a
      previous run) and an index of 'YYYY-MM-DD' date strings.
  The input frame is left untouched, so re-running the cell gives the same result
  instead of failing on rows that were already converted.
  '''
  def _to_sorted_list(tickers):
    #Rows straight from the CSV are comma-separated strings; already-converted rows are simply re-sorted
    if isinstance(tickers, str):
      tickers = tickers.split(',')
    return sorted(tickers)

  formatted_df = df.copy() #Work on a copy so the caller's frame is not mutated
  formatted_df['tickers'] = formatted_df['tickers'].apply(_to_sorted_list) #Change each ticker row to lists
  formatted_df.index = pd.to_datetime(formatted_df.index, format = '%Y-%m-%d') #Change Date (str) Index to a Datetime Index
  return formatted_df

#Swap the raw CSV strings for ticker lists and a datetime index
sp500_constituents_records = format_sp500_constituents_records(df=sp500_constituents_records)

In [None]:
#We only need the past 15 years of data, so let's remove the rows that we don't need from the dataframe
def slice_sp500_constituents_records(df,
                                     start_date,
                                     end_date):
  '''Return the rows of df whose index lies between start_date and end_date.

  start_date / end_date: 'YYYY-MM-DD' strings, parsed to datetimes so they can
  be used as label bounds for the .loc slice on df's index.
  '''
  date_format = '%Y-%m-%d'
  window_start, window_end = (dt.datetime.strptime(bound, date_format)
                              for bound in (start_date, end_date))
  return df.loc[window_start:window_end]

#Change your dates as needed for your strategy
start_date, end_date = '2007-01-01', '2022-01-16'

sp500_changes = slice_sp500_constituents_records(
    df=sp500_constituents_records,
    start_date=start_date,
    end_date=end_date,
)

#Double check that the start and end dates were sliced correctly
display(sp500_changes.head(5), sp500_changes.tail(5))

Unnamed: 0_level_0,tickers
date,Unnamed: 1_level_1
2007-01-03,"[A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB..."
2007-01-04,"[A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB..."
2007-01-05,"[A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB..."
2007-01-10,"[A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB..."
2007-01-11,"[A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB..."


Unnamed: 0_level_0,tickers
date,Unnamed: 1_level_1
2021-08-30,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
2021-09-20,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
2021-10-04,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
2021-12-14,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
2021-12-20,"[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."


In [None]:
#Finally, let's collect all the tickers that were in the SP500 during our date range
def collect_all_sp500_constituents(df):
  '''Return every distinct ticker found in df['tickers'], as one alphabetically sorted list.'''
  #Flatten all rows into a single set so each ticker is counted once
  unique_tickers = {ticker for row_tickers in df['tickers'] for ticker in row_tickers}
  return sorted(unique_tickers)

all_sp500_constituents = collect_all_sp500_constituents(df=sp500_changes)

#Report how many distinct tickers appeared in the index over the chosen window
summary_template = "There were {} total sp500 constituents between {} to {}"
print(summary_template.format(len(all_sp500_constituents), start_date, end_date))

There were 844 total sp500 constituents between 2007-01-01 to 2022-01-16


In [None]:
#We will save both all_sp500_constituents and sp500_changes as JSON files
#all_sp500_constituents is the full list of tickers that need to be downloaded from Yahoo Finance to form our 15yr sp500 database
#sp500_changes (the date-ranged constituents) will be used to control which tickers the backtester sees as it trades through the years

#Tag both output filenames with the date range actually used, so changing start_date/end_date cannot leave a stale label
date_range_tag = '{}-{}'.format(start_date.replace('-', ''), end_date.replace('-', '')) #e.g. '20070101-20220116'

all_sp500_constituents_filepath = 'S&P500 Consitutents {}.json'.format(date_range_tag) #Adjust the folder/prefix for your setup
with open(all_sp500_constituents_filepath, 'w', encoding = 'utf-8') as f: #json.dump for list
  json.dump(all_sp500_constituents, f, ensure_ascii=False, indent=4)

sp500_changes_filepath = 'S&P500 Changes {}.json'.format(date_range_tag) #Adjust the folder/prefix for your setup
sp500_changes.to_json(sp500_changes_filepath) #to_json for pandas df