In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import datetime
import os

In [2]:
#This is the general approach to getting data for our model
#(1) randomly pick a year from years_considered (1998-2017 as defined below)
#(2) From this year, select at random an S&P 500 stock from that year
#(3) With this stock, measure the expected return from the year to year+1 and also
#    measure the actual return of year+1 to year+2 (this is our binary classification data)
#    get the relevant earnings/revenue data from that year
#(4) From this year, measure the percent change in CPI and Money Supply
#(5) Repeat this for n number of samples
#(6) Create the data matrix

In [45]:
# Calendar years a sample window may start in: 1998 through 2017 inclusive.
years_considered = list(range(1998, 2018))

In [4]:
#cell that gets one sample
#year = np.random.choice(years_considered)
#day = np.random.choice([i for i in range(1,27)])
#month = np.random.choice([i for i in range(1, 12)])
#date = str(year) + str(month) + str(day)
#datestring = str(year) + '-' + str(month) + '-' + str(day)
#print(date)

def get_table(filename):
    """Load a CSV file into a DataFrame indexed by its 'date' column.

    Returns the DataFrame when `filename` names an existing file. When the
    path is not a file, returns None instead of raising.
    """
    if not os.path.isfile(filename):
        return None
    return pd.read_csv(filename, index_col='date')

#takes in a date string and returns a stock ticker
def get_company(date, filename='./Data/S&P 500 Historical Components & Changes(03-14-2022).csv'):
    """Pick a random ticker from the index membership list in force on `date`.

    Parameters
    ----------
    date : str or datetime-like
        The as-of date. Anything `pd.to_datetime` accepts works, including
        strings without zero padding such as '1998-5-3'.
    filename : str, optional
        CSV of historical components, loaded through `get_table`. It must have
        a 'date' index column and a 'tickers' column holding a comma-separated
        ticker string per row.

    Returns
    -------
    A single ticker drawn uniformly with `np.random.choice` from the sorted
    member list of the last row dated on or before `date`.

    Raises
    ------
    FileNotFoundError
        If `get_table` could not find `filename`.
    IndexError
        If no row is dated on or before `date`.
    """
    df = get_table(filename)
    if df is None:
        raise FileNotFoundError(
            "S&P 500 components file not found: {}".format(filename))

    # Compare real timestamps rather than strings: lexicographically
    # '1998-5-3' sorts after '1998-12-31', which would select the wrong row.
    snapshot_dates = pd.to_datetime(df.index)
    on_or_before = snapshot_dates <= pd.to_datetime(date)
    if not on_or_before.any():
        raise IndexError(
            "no membership snapshot dated on or before {}".format(date))

    # Last qualifying row in file order.
    # NOTE(review): assumes the file's rows are in chronological order - confirm.
    tickers = df.loc[on_or_before, 'tickers'].iloc[-1]

    # Only the selected row is split; sorting keeps the draw independent of
    # the order tickers happen to be listed in the file.
    member_list = sorted(tickers.split(','))
    return np.random.choice(member_list)

#ticker = get_company(datestring)
#print("Ticker: {}".format(ticker))

def get_return(symbol, start, end, periods_per_year=365):
    """Annualized expected return of `symbol` between `start` and `end`.

    Downloads price history with `yf.download`, takes the mean of the
    period-over-period percent change of the 'Adj Close' column, and compounds
    that mean over `periods_per_year` periods:

        annual = (mean_return + 1) ** periods_per_year - 1

    Parameters
    ----------
    symbol : str
        Ticker passed to yfinance.
    start, end : datetime-like
        Download window, forwarded unchanged to `yf.download`.
    periods_per_year : int, optional
        Compounding periods per year. Defaults to 365, the value this
        function has always used.
        NOTE(review): if the downloaded bars are trading days, ~252 may be
        the intended value - confirm before changing the default.

    Returns
    -------
    The annualized return, or None when the download is empty, the download
    or column lookup raises, or the mean return is NaN (for example when
    fewer than two prices are available).
    """
    try:
        stock = yf.download(symbol, start=start, end=end, progress=False)
        if len(stock) == 0:
            return None
        mean_return = stock['Adj Close'].pct_change().mean()
        # A NaN mean would otherwise be stored as a feature value.
        if pd.isna(mean_return):
            return None
    except Exception:
        # Best effort: any failure means "no sample". Unlike a bare `except:`,
        # this still lets KeyboardInterrupt and SystemExit through, so a long
        # sampling run can be stopped.
        return None
    # convert to annual expected returns
    # annual = (daily_return + 1)^periods_per_year - 1
    return ((mean_return + 1) ** periods_per_year) - 1

#binary classifier
def get_class(symbol, start, end, threshold=.07):
    """Binary label: did `symbol` return more than `threshold` over the window?

    Downloads price history with `yf.download` and computes the simple return
    from the first to the last 'Adj Close' value in the result.

    Parameters
    ----------
    symbol : str
        Ticker passed to yfinance.
    start, end : datetime-like
        Download window, forwarded unchanged to `yf.download`.
    threshold : float, optional
        Return the stock must exceed to be labelled 1 (default 0.07).

    Returns
    -------
    1 if the return is strictly greater than `threshold`, 0 if it is not, and
    None when the download raises, comes back empty, or the return is NaN.
    """
    try:
        stock = yf.download(symbol, start=start, end=end, progress=False)
    except Exception:
        # Treat a failed download as "no sample", as get_return does, instead
        # of aborting the whole sampling run.
        return None
    if len(stock) == 0:
        return None

    prices = stock['Adj Close']
    # .iloc selects by position explicitly; plain [0] / [-1] on a Series with
    # a non-integer index relies on a deprecated positional fallback.
    first_price = prices.iloc[0]
    last_price = prices.iloc[-1]
    ret = (last_price - first_price) / first_price

    # A NaN return compares False against the threshold and would silently be
    # labelled 0; report it as missing instead.
    if pd.isna(ret):
        return None
    return 1 if ret > threshold else 0
    

#exp_ret = get_return(ticker, datetime.datetime(year,month,day), datetime.datetime(year+1,month,day))
#don't append if None
#if exp_ret != None:
    #print(exp_ret)
    #clas = get_class(ticker, datetime.datetime(year+1,month,day), datetime.datetime(year+2,month,day))
    #print(clas)

In [5]:
#get the yearly change in earnings
def get_earnings(start, end, filename='./Data/spearn.csv'):
    """Earnings growth from `start` to `end`, plus the payout ratio at `start`.

    Parameters
    ----------
    start, end : int
        Values matched against the 'Year' column of the CSV.
    filename : str, optional
        CSV with 'Year', 'Earnings' and 'Payout Ratio' columns.

    Returns
    -------
    (growth, payout) where
        growth = (earnings[end] - earnings[start]) / earnings[start]
        payout = the start year's 'Payout Ratio' divided by 100.
    The payout value may be numeric or text with a trailing '%' sign.

    Raises
    ------
    IndexError
        If `start` or `end` does not appear in the 'Year' column.
    """
    df = pd.read_csv(filename)

    def value_for(year, column):
        # First value of `column` among the rows whose Year equals `year`.
        values = df.loc[df['Year'] == year, column].to_numpy()
        if len(values) == 0:
            raise IndexError(
                "year {} not found in {}".format(year, filename))
        return values[0]

    se = value_for(start, 'Earnings')
    ee = value_for(end, 'Earnings')

    # Strip an optional percent sign rather than unconditionally dropping the
    # last character, which would eat a digit from values without a '%'.
    payout_text = str(value_for(start, 'Payout Ratio')).strip().rstrip('%')
    return (ee - se) / se, float(payout_text) / 100

#will contain dividend info
#ear, pay = get_earnings(year, year +1)
#print(pay)
#print(ear)

In [42]:
def get_cpi(start, end, filename='./Data/US CPI.csv'):
    """Relative change in CPI between two 'Yearmon' labels.

    Reads `filename`, compares the 'Yearmon' column as strings against
    `start` and `end`, and returns (cpi_end - cpi_start) / cpi_start using
    the first matching row for each label. Raises IndexError when a label
    has no matching row.
    """
    table = pd.read_csv(filename)
    month_labels = table['Yearmon'].astype(str)

    def cpi_at(label):
        # CPI of the first row whose Yearmon text equals `label`.
        return table.loc[month_labels == label, 'CPI'].to_numpy()[0]

    first = cpi_at(start)
    last = cpi_at(end)
    return (last - first) / first

#str_day = '01'
#str_month = str(month)
#if month < 10: str_month = '0' + str(month)
#cpi = get_cpi(str_day + '-' + str_month + '-' + str(year),str_day + '-' + str_month + '-' + str(year+1))
#print(cpi)

#DATE format: year-month-day with zero padding; start/end are matched as 'YYYY-MM' substrings
def get_M2(start, end, filename='./Data/M2.csv'):
    """Relative change in the M2 series between two dates.

    Reads `filename` and, for each of `start` and `end`, takes the first row
    whose DATE text contains that string (via `str.contains`) and reads the
    value in the second column. Returns (m2_end - m2_start) / m2_start.
    Raises IndexError when nothing matches.
    """
    m2_table = pd.read_csv(filename)

    def level_at(fragment):
        matching_rows = m2_table[m2_table.DATE.str.contains(fragment)]
        # First matching row, value at column position 1.
        return matching_rows.to_numpy()[0][1]

    opening = level_at(start)
    closing = level_at(end)
    return (closing - opening) / opening

#m2 = get_M2(str(year) + '-' + str_month,  str(year +1) + '-' + str_month)
#print(m2)

In [48]:
#function to generate samples; returns (X, y): a list of feature rows and a list of class labels
def generate_samples(years=years_considered, num_samples=1000):
    """Draw random (date, ticker) samples and build feature rows and labels.

    On each of `num_samples` attempts:
      1. pick a random year from `years`, a day in 1..26 and a month in 1..12;
      2. pick a random ticker for that date with `get_company`;
      3. compute the expected return over [date, date + 1 year] with
         `get_return`, and the class label over [date + 1 year, date + 2
         years] with `get_class`;
      4. look up earnings growth and payout ratio (`get_earnings`), CPI change
         (`get_cpi`) and M2 change (`get_M2`) for year -> year + 1.

    Attempts where `get_return` or `get_class` returns None are skipped, so
    the result may hold fewer than `num_samples` rows.

    Parameters
    ----------
    years : sequence of int, optional
        Candidate start years. Defaults to `years_considered`.
    num_samples : int, optional
        Number of sampling attempts (default 1000).

    Returns
    -------
    (X, y) : X is a list of rows [exp_ret, ear, cpi, m2, pay]; y is the list
    of matching integer class labels.
    """
    X = []
    y = []
    for _ in range(num_samples):
        # Draw order (year, day, month) matches the original implementation.
        # `years` is honoured here; previously the global list was always used.
        year = int(np.random.choice(years))
        # Days stop at 26 so the date is valid in every month.
        day = int(np.random.choice(np.arange(1, 27)))
        # 1..12 inclusive: range(1, 12) could never produce December.
        month = int(np.random.choice(np.arange(1, 13)))

        # Zero-padded ISO date, so a string comparison against ISO-formatted
        # snapshot dates orders correctly.
        datestring = '{:04d}-{:02d}-{:02d}'.format(year, month, day)
        window_start = datetime.datetime(year, month, day)
        window_mid = datetime.datetime(year + 1, month, day)
        window_end = datetime.datetime(year + 2, month, day)

        ticker = get_company(datestring)

        exp_ret = get_return(ticker, window_start, window_mid)
        if exp_ret is None:
            continue

        clas = get_class(ticker, window_mid, window_end)
        if clas is None:
            continue

        ear, pay = get_earnings(year, year + 1)

        str_month = '{:02d}'.format(month)
        # CPI rows are keyed 'DD-MM-YYYY' on the first of the month.
        cpi = get_cpi('01-' + str_month + '-' + str(year),
                      '01-' + str_month + '-' + str(year + 1))
        # M2 rows are matched on a 'YYYY-MM' fragment.
        m2 = get_M2(str(year) + '-' + str_month,
                    str(year + 1) + '-' + str_month)

        X.append([exp_ret, ear, cpi, m2, pay])
        y.append(int(clas))
    return X, y
    

In [49]:
# Run the sampler with its defaults, then write the feature rows and the class
# labels to separate comma-delimited text files in the working directory.
X, y = generate_samples()
np.savetxt(fname='feature_data.csv', X=X, delimiter=',')
np.savetxt(fname='class_data.csv', X=y, delimiter=',')


1 Failed download:
- STJ: No data found for this date range, symbol may be delisted

1 Failed download:
- LB: No data found, symbol may be delisted

1 Failed download:
- MEE: Data doesn't exist for startDate = 902728800, endDate = 934264800

1 Failed download:
- PCS: No data found for this date range, symbol may be delisted

1 Failed download:
- SAPE: No data found for this date range, symbol may be delisted

1 Failed download:
- GTE: Data doesn't exist for startDate = 916210800, endDate = 947746800

1 Failed download:
- CCE: Data doesn't exist for startDate = 1057384800, endDate = 1089007200

1 Failed download:
- GENZ: No data found for this date range, symbol may be delisted

1 Failed download:
- LXK: No data found for this date range, symbol may be delisted

1 Failed download:
- TNB: Data doesn't exist for startDate = 1047193200, endDate = 1078815600

1 Failed download:
- FRX: No data found, symbol may be delisted

1 Failed download:
- LLTC: No data found for this date range, symbo


1 Failed download:
- TSG: No data found, symbol may be delisted

1 Failed download:
- AABA: No data found, symbol may be delisted

1 Failed download:
- PVN: Data doesn't exist for startDate = 1098165600, endDate = 1129701600

1 Failed download:
- STI: No data found, symbol may be delisted

1 Failed download:
- STJ: No data found for this date range, symbol may be delisted

1 Failed download:
- AYE: No data found for this date range, symbol may be delisted

1 Failed download:
- CBS: No data found, symbol may be delisted

1 Failed download:
- BR: Data doesn't exist for startDate = 1126936800, endDate = 1158472800

1 Failed download:
- AS: Data doesn't exist for startDate = 910767600, endDate = 942303600

1 Failed download:
- EC: Data doesn't exist for startDate = 1118901600, endDate = 1150437600

1 Failed download:
- CYM: Data doesn't exist for startDate = 900741600, endDate = 932277600

1 Failed download:
- NSM: Data doesn't exist for startDate = 1047538800, endDate = 1079161200

1 Fai


1 Failed download:
- PGN: No data found for this date range, symbol may be delisted

1 Failed download:
- HSH: No data found for this date range, symbol may be delisted

1 Failed download:
- PX: Data doesn't exist for startDate = 988696800, endDate = 1020232800

1 Failed download:
- BBT: No data found, symbol may be delisted

1 Failed download:
- AGN: No data found, symbol may be delisted

1 Failed download:
- FTR: No data found, symbol may be delisted

1 Failed download:
- WYE: No data found for this date range, symbol may be delisted

1 Failed download:
- BLS: No data found for this date range, symbol may be delisted

1 Failed download:
- LLTC: No data found for this date range, symbol may be delisted

1 Failed download:
- KWP: No data found for this date range, symbol may be delisted

1 Failed download:
- BCR: No data found for this date range, symbol may be delisted

1 Failed download:
- DELL: Data doesn't exist for startDate = 915433200, endDate = 946969200

1 Failed download:
- 


1 Failed download:
- SUN: Data doesn't exist for startDate = 1161842400, endDate = 1193378400

1 Failed download:
- HNZ: No data found, symbol may be delisted

1 Failed download:
- CBE: No data found for this date range, symbol may be delisted

1 Failed download:
- VSTNQ: No data found, symbol may be delisted

1 Failed download:
- PTV: Data doesn't exist for startDate = 1231570800, endDate = 1263106800

1 Failed download:
- CVH: Data doesn't exist for startDate = 1132210800, endDate = 1163746800

1 Failed download:
- BEAM: Data doesn't exist for startDate = 957160800, endDate = 988696800

1 Failed download:
- RX: No data found for this date range, symbol may be delisted

1 Failed download:
- DOW: Data doesn't exist for startDate = 1360393200, endDate = 1391929200

1 Failed download:
- ACS: No data found for this date range, symbol may be delisted

1 Failed download:
- NOVL: No data found for this date range, symbol may be delisted

1 Failed download:
- UPC: Data doesn't exist for star