In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import datetime
import os

In [2]:
#This is the general approach to getting data for our model
#(1) Randomly pick a year from `years_considered` (1998-2017)
#(2) For this year, select at random a stock that was in the S&P 500 at that time
#(3) With this stock, measure the expected return from the year to year+1 and also
#    measure the actual return from year+1 to year+2 (this is our binary classification data);
#    get the relevant earnings/revenue data for that year
#(4) For this year, measure the percent change in CPI and Money Supply
#(5) Repeat this for n samples
#(6) Create the data matrix

In [2]:
# Candidate starting years for sampling: 1998 through 2017 inclusive.
years_considered = list(range(1998, 2018))

In [3]:
#cell that gets one sample
#year = np.random.choice(years_considered)
#day = np.random.choice([i for i in range(1,27)])
#month = np.random.choice([i for i in range(1, 12)])
#date = str(year) + str(month) + str(day)
#datestring = str(year) + '-' + str(month) + '-' + str(day)
#print(date)

def get_table(filename):
    """Read a CSV file into a DataFrame indexed by its 'date' column.

    Returns None (instead of raising) when `filename` is not an existing
    regular file, so callers need to check the result before using it.
    """
    if not os.path.isfile(filename):
        return None
    return pd.read_csv(filename, index_col='date')

#takes in a date (string or datetime) and returns a stock ticker
def get_company(date,
                filename='./Data/S&P 500 Historical Components & Changes(03-14-2022).csv'):
    """Return a randomly chosen S&P 500 ticker from the membership list in force on `date`.

    Parameters
    ----------
    date : str or datetime-like
        The as-of date. Anything `pd.to_datetime` accepts works, including
        non-zero-padded strings such as '1998-3-5'.
    filename : str
        CSV with a 'date' column and a comma-separated 'tickers' column.
        Rows are assumed to be in ascending date order (the last row on or
        before `date` is used).

    Returns
    -------
    A single ticker drawn uniformly with `np.random.choice` from the sorted
    member list.

    Raises
    ------
    FileNotFoundError
        If the components file does not exist.
    IndexError
        If no membership snapshot is dated on or before `date`.
    """
    #load in the historical list of companies
    df = get_table(filename)
    if df is None:
        # get_table returns None for a missing file; fail with a clear message
        # instead of an opaque "'NoneType' object is not subscriptable".
        raise FileNotFoundError(
            'S&P 500 components file not found: {}'.format(filename))

    # Compare timestamps, not strings: as text, '1998-3-5' sorts AFTER
    # '1998-10-01', which would select a snapshot from the future.
    snapshot_dates = pd.to_datetime(df.index)
    as_of = pd.to_datetime(date)
    eligible = df.loc[snapshot_dates <= as_of, 'tickers']
    if eligible.empty:
        raise IndexError(
            'no S&P 500 membership snapshot on or before {}'.format(date))

    # Only the selected row needs parsing; sorting keeps the draw reproducible
    # for a given random state regardless of the order stored in the file.
    member_list = sorted(eligible.iloc[-1].split(','))
    return np.random.choice(member_list)

#ticker = get_company(datestring)
#print("Ticker: {}".format(ticker))

def get_return(symbol, start, end, periods_per_year=252):
    """Annualised expected return of `symbol` over [start, end).

    Downloads daily prices with yfinance, takes the mean of the daily
    percentage changes of 'Adj Close' and compounds it:
    annual = (mean_daily_return + 1) ** periods_per_year - 1.

    Parameters
    ----------
    symbol : str
        Ticker to download.
    start, end : datetime-like
        Date range passed straight to `yf.download`.
    periods_per_year : int
        Number of compounding periods. Prices exist only for trading days, so
        the default is 252 rather than the previously hard-coded 365, which
        overstated the annual figure. Pass 365 to reproduce the old values.

    Returns
    -------
    float, or None when the download is empty, when fewer than two prices are
    available (the mean daily return is undefined), or when anything in the
    download/computation raises. Callers treat None as "skip this sample".
    """
    try:
        # 'Adj Close' is required below, so request unadjusted columns
        # explicitly rather than relying on the library default.
        stock = yf.download(symbol, start=start, end=end, progress=False,
                            auto_adjust=False)
        if len(stock) == 0:
            return None
        prices = stock['Adj Close']
        # NOTE(review): some yfinance releases return per-ticker column levels
        # even for one symbol, making this a one-column frame - confirm against
        # the installed version.
        if isinstance(prices, pd.DataFrame):
            prices = prices.iloc[:, 0]
        m = prices.pct_change().mean()
        if pd.isna(m):
            # e.g. a single row: there is no daily change to average
            return None
        # convert to annual expected returns
        # annual = (daily_return + 1)^periods_per_year - 1
        annual = float(((m + 1) ** periods_per_year) - 1)
    except Exception:
        # Best effort by design: a bad ticker must not abort a long sampling
        # run. Unlike a bare `except:`, this lets KeyboardInterrupt through.
        return None
    return annual

#binary classifier
def get_class(symbol, start, end, threshold=.07):
    """Label `symbol` by its realised return over [start, end).

    The return is (last 'Adj Close' - first 'Adj Close') / first 'Adj Close'
    of the downloaded price series.

    Parameters
    ----------
    symbol : str
        Ticker to download.
    start, end : datetime-like
        Date range passed straight to `yf.download`.
    threshold : float
        Return that must be strictly exceeded for the positive class.

    Returns
    -------
    1 if the return is greater than `threshold`, 0 otherwise, or None when no
    prices were downloaded or the return cannot be computed (NaN endpoint).
    """
    # 'Adj Close' is required below, so request unadjusted columns explicitly
    # rather than relying on the library default.
    stock = yf.download(symbol, start=start, end=end, progress=False,
                        auto_adjust=False)
    if len(stock) == 0:
        return None
    prices = stock['Adj Close']
    # NOTE(review): some yfinance releases return per-ticker column levels even
    # for one symbol, making this a one-column frame - confirm against the
    # installed version.
    if isinstance(prices, pd.DataFrame):
        prices = prices.iloc[:, 0]
    # Explicit positional access: integer keys with [] on a date-indexed
    # Series are deprecated in pandas.
    first = prices.iloc[0]
    last = prices.iloc[-1]
    ret = (last - first) / first
    if pd.isna(ret):
        # A NaN return would otherwise fail `ret > threshold` and be silently
        # labelled 0; report "no label" instead.
        return None
    #print(ret)
    return 1 if ret > threshold else 0
    

#exp_ret = get_return(ticker, datetime.datetime(year,month,day), datetime.datetime(year+1,month,day))
#don't append if None
#if exp_ret != None:
    #print(exp_ret)
    #clas = get_class(ticker, datetime.datetime(year+1,month,day), datetime.datetime(year+2,month,day))
    #print(clas)

In [4]:
#get the yearly change in earnings
def get_earnings(start, end, filename='./Data/spearn.csv'):
    """Return (relative change in earnings, payout ratio of the start year).

    Parameters
    ----------
    start, end : int
        Years matched against the 'Year' column of the CSV.
    filename : str
        CSV with 'Year', 'Earnings' and 'Payout Ratio' columns.

    Returns
    -------
    tuple
        ((earnings[end] - earnings[start]) / earnings[start],
         payout ratio of `start` as a fraction, e.g. '40%' -> 0.4).

    Raises
    ------
    IndexError
        If either year has no row in the file.
    """
    df = pd.read_csv(filename)
    start_rows = df[df.Year == start]
    end_rows = df[df.Year == end]
    if start_rows.empty or end_rows.empty:
        raise IndexError(
            'no earnings row for year {} and/or {} in {}'.format(start, end, filename))

    se = start_rows['Earnings'].to_numpy()[0]
    ee = end_rows['Earnings'].to_numpy()[0]
    sp = start_rows['Payout Ratio'].to_numpy()[0]
    # Strip an optional trailing '%' instead of blindly dropping the last
    # character, which would cut off a digit when the sign is absent.
    payout = float(str(sp).strip().rstrip('%')) / 100
    return (ee - se) / se, payout

#will contain dividend info
#ear, pay = get_earnings(year, year +1)
#print(pay)
#print(ear)

In [5]:
def get_cpi(start, end, filename='./Data/US CPI.csv'):
    """Relative change in CPI between two 'Yearmon' labels.

    `start` and `end` are compared for exact equality against the 'Yearmon'
    column (cast to str); the first matching row's 'CPI' value is used for
    each. Returns (cpi_end - cpi_start) / cpi_start. An IndexError is raised
    if either label has no matching row.
    """
    table = pd.read_csv(filename)
    table['Yearmon'] = table['Yearmon'].astype(str)

    def _cpi_at(label):
        # first CPI reading whose Yearmon equals the requested label
        return table.loc[table['Yearmon'] == label, 'CPI'].to_numpy()[0]

    first = _cpi_at(start)
    last = _cpi_at(end)
    return (last - first) / first

#str_day = '01'
#str_month = str(month)
#if month < 10: str_month = '0' + str(month)
#cpi = get_cpi(str_day + '-' + str_month + '-' + str(year),str_day + '-' + str_month + '-' + str(year+1))
#print(cpi)

#expects 'year-month' strings with a zero-padded month
def get_M2(start, end, filename='./Data/M2.csv'):
    """Relative change in the money-supply series between two periods.

    For each of `start` and `end`, the first row whose 'DATE' string contains
    the given text is selected and the value in the second column of that row
    is read. Returns (value_end - value_start) / value_start. An IndexError is
    raised when nothing matches.
    """
    table = pd.read_csv(filename)

    def _level(fragment):
        # second column of the first row whose DATE contains the fragment
        matches = table[table['DATE'].str.contains(fragment)]
        return matches.to_numpy()[0][1]

    first = _level(start)
    last = _level(end)
    return (last - first) / first

#m2 = get_M2(str(year) + '-' + str_month,  str(year +1) + '-' + str_month)
#print(m2)

In [9]:
#function to generate samples; returns the feature rows and labels as two lists
def generate_samples(years=years_considered, num_samples=2000):
    """Draw random (date, S&P 500 member) pairs and build the model's dataset.

    For each of `num_samples` attempts a random date and a random index member
    for that date are drawn. The attempt is kept only if both the expected
    return (year -> year+1) and the class label (year+1 -> year+2) could be
    computed, so fewer than `num_samples` rows may come back.

    Parameters
    ----------
    years : sequence of int
        Years to sample the starting date from.
    num_samples : int
        Number of attempts (an upper bound on the number of rows returned).

    Returns
    -------
    (X, y) : tuple of lists
        X holds rows [exp_ret, ear, cpi, m2, pay]; y holds the matching 0/1
        labels.
    """
    X = []
    y = []
    # Honour the `years` argument (it used to be ignored in favour of the
    # module-level list).
    candidate_years = list(years)
    for i in range(num_samples):
        print(i)
        #gen a date; days 1-26 exist in every month, so the year+1 and year+2
        #dates built below are always valid
        year = int(np.random.choice(candidate_years))
        day = int(np.random.choice(np.arange(1, 27)))
        # months 1..12 inclusive - range(1, 12) never produced December
        month = int(np.random.choice(np.arange(1, 13)))
        str_month = '{:02d}'.format(month)
        # zero-padded so the string orders correctly against ISO dates
        datestring = '{}-{}-{:02d}'.format(year, str_month, day)

        #get a ticker
        ticker = str(get_company(datestring))
        exp_ret = get_return(ticker, datetime.datetime(year, month, day),
                             datetime.datetime(year + 1, month, day))
        #don't append if None
        if exp_ret is None:
            continue
        clas = get_class(ticker, datetime.datetime(year + 1, month, day),
                         datetime.datetime(year + 2, month, day))
        if clas is None:
            continue

        ear, pay = get_earnings(year, year + 1)
        # the CPI table is keyed as '01-MM-YYYY', the M2 table as 'YYYY-MM'
        cpi = get_cpi('01-' + str_month + '-' + str(year),
                      '01-' + str_month + '-' + str(year + 1))
        m2 = get_M2(str(year) + '-' + str_month, str(year + 1) + '-' + str_month)

        X.append([exp_ret, ear, cpi, m2, pay])
        y.append(int(clas))
    return X, y
    

In [8]:
# Build the dataset with the default arguments and write it next to the notebook.
# generate_samples() downloads price history for every draw, so this cell is slow;
# no random seed is set in the visible cells, so the result differs between runs.
X, y = generate_samples()
# One comma-separated row per kept sample: [exp_ret, ear, cpi, m2, pay].
np.savetxt('feature_data.csv', X, delimiter=',')
# The matching 0/1 labels, one per line, in the same order as the feature rows.
np.savetxt('class_data.csv', y, delimiter=',')


1 Failed download:
- SEBL: No data found for this date range, symbol may be delisted

1 Failed download:
- CTL: No data found, symbol may be delisted

1 Failed download:
- ESV: No data found, symbol may be delisted

1 Failed download:
- DELL: Data doesn't exist for startDate = 1237010400, endDate = 1268550000

1 Failed download:
- FRX: No data found, symbol may be delisted

1 Failed download:
- WYE: No data found for this date range, symbol may be delisted

1 Failed download:
- APOL: No data found for this date range, symbol may be delisted

1 Failed download:
- MTLQQ: No data found for this date range, symbol may be delisted

1 Failed download:
- BMET: Data doesn't exist for startDate = 887785200, endDate = 919321200

1 Failed download:
- SMI: Data doesn't exist for startDate = 942303600, endDate = 973926000

1 Failed download:
- HCBK: No data found for this date range, symbol may be delisted

1 Failed download:
- MON: Data doesn't exist for startDate = 1394604000, endDate = 14261400


1 Failed download:
- JNS: No data found for this date range, symbol may be delisted

1 Failed download:
- FDO: No data found for this date range, symbol may be delisted

1 Failed download:
- DELL: Data doesn't exist for startDate = 1151128800, endDate = 1182664800

1 Failed download:
- NCC: Data doesn't exist for startDate = 1114927200, endDate = 1146463200

1 Failed download:
- MWV: No data found for this date range, symbol may be delisted

1 Failed download:
- TMK: No data found, symbol may be delisted

1 Failed download:
- HCBK: No data found for this date range, symbol may be delisted

1 Failed download:
- PLL: Data doesn't exist for startDate = 1013065200, endDate = 1044601200

1 Failed download:
- LEHMQ: No data found for this date range, symbol may be delisted

1 Failed download:
- TEK: No data found for this date range, symbol may be delisted

1 Failed download:
- CPQ: Data doesn't exist for startDate = 888822000, endDate = 920358000

1 Failed download:
- FRX: No data found, s


1 Failed download:
- CPWR: Data doesn't exist for startDate = 1224828000, endDate = 1256364000

1 Failed download:
- HOT: No data found for this date range, symbol may be delisted

1 Failed download:
- FDO: No data found for this date range, symbol may be delisted

1 Failed download:
- SDS: Data doesn't exist for startDate = 1095660000, endDate = 1127196000

1 Failed download:
- TE: No data found for this date range, symbol may be delisted

1 Failed download:
- LB: No data found, symbol may be delisted

1 Failed download:
- BBT: No data found, symbol may be delisted

1 Failed download:
- MHS: No data found for this date range, symbol may be delisted

1 Failed download:
- PETM: Data doesn't exist for startDate = 1362981600, endDate = 1394517600

1 Failed download:
- TNB: Data doesn't exist for startDate = 1035093600, endDate = 1066629600

1 Failed download:
- UTX: No data found, symbol may be delisted

1 Failed download:
- DGN: Data doesn't exist for startDate = 906012000, endDate = 93


1 Failed download:
- CITGQ: No data found, symbol may be delisted

1 Failed download:
- HM: Data doesn't exist for startDate = 948006000, endDate = 979628400

1 Failed download:
- KSE: Data doesn't exist for startDate = 952066800, endDate = 983602800

1 Failed download:
- NFB: No data found for this date range, symbol may be delisted

1 Failed download:
- FRX: No data found, symbol may be delisted

1 Failed download:
- PCL: No data found for this date range, symbol may be delisted

1 Failed download:
- MDP: No data found for this date range, symbol may be delisted

1 Failed download:
- TWC: No data found for this date range, symbol may be delisted

1 Failed download:
- ABI: No data found for this date range, symbol may be delisted

1 Failed download:
- MDP: No data found for this date range, symbol may be delisted

1 Failed download:
- EDS: No data found for this date range, symbol may be delisted

1 Failed download:
- AM: Data doesn't exist for startDate = 1027663200, endDate = 10591


1 Failed download:
- UCL: Data doesn't exist for startDate = 894434400, endDate = 925970400

1 Failed download:
- AGN: No data found, symbol may be delisted

1 Failed download:
- LIFE: Data doesn't exist for startDate = 1369461600, endDate = 1400997600

1 Failed download:
- FBF: No data found for this date range, symbol may be delisted

1 Failed download:
- MMI: Data doesn't exist for startDate = 1310709600, endDate = 1342332000

1 Failed download:
- DG: Data doesn't exist for startDate = 1139295600, endDate = 1170831600

1 Failed download:
- AGN: No data found, symbol may be delisted

1 Failed download:
- NSM: Data doesn't exist for startDate = 940140000, endDate = 971762400

1 Failed download:
- COG: No data found, symbol may be delisted

1 Failed download:
- HOT: Data doesn't exist for startDate = 963986400, endDate = 995522400

1 Failed download:
- CCE: Data doesn't exist for startDate = 927439200, endDate = 959061600

1 Failed download:
- RDS.A: No data found, symbol may be delis


1 Failed download:
- LLTC: No data found for this date range, symbol may be delisted

1 Failed download:
- PCL: No data found for this date range, symbol may be delisted

1 Failed download:
- FPC: Data doesn't exist for startDate = 974444400, endDate = 1005980400

1 Failed download:
- UAWGQ: No data found, symbol may be delisted

1 Failed download:
- DYN: Data doesn't exist for startDate = 1010041200, endDate = 1041577200

1 Failed download:
- ASND: Data doesn't exist for startDate = 894866400, endDate = 926402400

1 Failed download:
- BR: Data doesn't exist for startDate = 1122271200, endDate = 1153807200

1 Failed download:
- FTR: No data found, symbol may be delisted

1 Failed download:
- BMET: Data doesn't exist for startDate = 989820000, endDate = 1021356000

1 Failed download:
- CTB: No data found, symbol may be delisted

1 Failed download:
- GENZ: No data found for this date range, symbol may be delisted

1 Failed download:
- AL: Data doesn't exist for startDate = 965973600, en


1 Failed download:
- AGC: No data found, symbol may be delisted

1 Failed download:
- WIN: No data found, symbol may be delisted

1 Failed download:
- Q: No data found for this date range, symbol may be delisted

1 Failed download:
- JHF: No data found for this date range, symbol may be delisted

1 Failed download:
- FII: No data found, symbol may be delisted

1 Failed download:
- XLNX: No data found, symbol may be delisted

1 Failed download:
- HET: No data found for this date range, symbol may be delisted

1 Failed download:
- TEK: Data doesn't exist for startDate = 953794800, endDate = 985330800

1 Failed download:
- LM: No data found, symbol may be delisted

1 Failed download:
- KMI: Data doesn't exist for startDate = 1016521200, endDate = 1048057200
