In [1]:
%%time
import yfinance as yf
import pandas as pd
from datetime import datetime
import os
from time import sleep, time
from dateutil.relativedelta import relativedelta
import sklearn as sk
from sklearn.preprocessing import StandardScaler

import numpy as np

CPU times: user 359 ms, sys: 11 ms, total: 370 ms
Wall time: 529 ms


In [2]:
# Functions to maintain and call the S&P 500 from a current date
# Author Michael Djaballah
# Time last edited: 5:56 PM June 1, 2020
# Last edited by: Michael Djaballah

# Takes no required input; the optional data_path sets the save location
# (default 'data/', and it must end with a path separator)
# Output is two newly saved CSVs containing the current makeup of the S&P 500
# and its historical additions and removals
def get_snp_store(data_path='data/'):
    """Download the S&P 500 tables from Wikipedia and save them as CSVs.

    The first table on the page is written unchanged to 'snp_current.csv'.
    From the second table only the date column and the 'Ticker' sub-columns
    of 'Added' and 'Removed' are kept and written to 'snp_hist.csv'.

    data_path is prepended verbatim to the file names, so it has to end with
    a path separator. The directory is created when it does not exist.
    Returns None.
    """
    wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    constituents, changes = wiki_tables[0], wiki_tables[1]

    # Flatten the two-level header of the changes table into three plain columns.
    change_log = pd.DataFrame(changes['Date'])
    change_log['Added'] = changes['Added', 'Ticker']
    change_log['Removed'] = changes['Removed', 'Ticker']

    os.makedirs(data_path, exist_ok=True)

    outputs = (
        (constituents, 'snp_current.csv'),
        (change_log, 'snp_hist.csv'),
    )
    for frame, filename in outputs:
        frame.to_csv(data_path + filename, index=False)
    return None


# Input: a date in string form with its corresponding format:
# Ex: 'January 1, 2020', '%B %d, %Y'
# Output: a list containing the S&P 500 at the input date
def build_snp(date, date_format, data_path='data/'):
    """Reconstruct the S&P 500 membership as of `date`.

    Starts from the saved current constituents ('snp_current.csv') and walks
    the saved change log ('snp_hist.csv') from its first row downwards,
    undoing each change: a ticker that was added is taken out again and a
    ticker that was removed is put back. The walk stops at the first row
    dated before `date`, so the log is expected to be ordered newest first.

    date        -- the as-of date, as a string
    date_format -- strptime format used for `date` AND for the log's 'Date'
                   column
    data_path   -- directory prefix of the two CSVs (must end with a separator)

    Returns the membership as a list of tickers in no particular order.
    """
    current = pd.read_csv(data_path + 'snp_current.csv')
    changes = pd.read_csv(data_path + 'snp_hist.csv')

    cutoff = datetime.strptime(date, date_format)
    members = set(current['Symbol'])

    for row_number in range(len(changes)):
        change = changes.iloc[row_number]

        if datetime.strptime(change['Date'], date_format) < cutoff:
            break

        # Undo the change: what was added goes away, what was removed comes back.
        joined_later = change['Added']
        left_later = change['Removed']

        members.discard(joined_later)
        # An empty cell is read back as a float NaN; only real tickers are restored.
        if type(left_later) != float:
            members.add(left_later)

    return list(members)

# Included to build returns for a SPY comparison
# Input is a portfolio (ticker:df dictionary), the tickers desired for returns, and the date of returns
# Output is a list of floats that are returns
def build_returns(portfolio, tickers, date):
    """Compute the open-to-close return of each ticker for the bar at `date`.

    portfolio -- dict mapping ticker -> DataFrame with 'Date', 'Open', 'Close'
    tickers   -- iterable of tickers to evaluate, in output order
    date      -- value to look up in each frame's 'Date' column

    Returns a list with (Close - Open) / Open for every ticker.
    """
    def open_to_close(symbol):
        bar = portfolio[symbol].set_index('Date').loc[date]
        return (bar['Close'] - bar['Open']) / bar['Open']

    return [open_to_close(symbol) for symbol in tickers]

In [5]:
class Portfolio:
    """A universe of tickers with cached price history, plus train/backtest helpers.

    On construction the price history of every ticker is downloaded (unless a
    CSV for it is already cached under data_path + prefix) and tickers whose
    history is too short are dropped.

    Attributes set by __init__:
        portfolio   -- dict ticker -> DataFrame read back from the cached CSV
        tickers     -- tickers that survived the history-length check
        features    -- column names used as model inputs
        target      -- column whose value in the final window row becomes 'Target'
        hist_depth  -- number of months of look-back per feature vector
        train_depth -- number of past months used to build training rows
        columns     -- expected feature column labels ('<feature> <k>')
        blacklist   -- tickers excluded from further processing
        results     -- per-month {ticker: return} dicts from the last backtest
    """

    def __init__(self, tickers, hist_depth=None, train_depth=None, features=None,
                 data_path='data/', prefix='monthly/', interval='1mo',
                 data_start='2001-01-01', target='Close'):
        """Store the configuration, fetch missing data and load the portfolio.

        hist_depth and train_depth default to None but are used in integer
        arithmetic during construction, so both must be supplied as ints.
        data_path and prefix are concatenated verbatim and must each end with
        a path separator.
        """
        self.portfolio = {}
        self.tickers = tickers
        # Copy so that neither a shared default nor the caller's list is aliased.
        self.features = list(features) if features is not None else []
        self.target = target

        self.hist_depth = hist_depth
        self.train_depth = train_depth

        self.interval = interval
        self.data_start = data_start

        self.data_path = data_path
        self.prefix = prefix

        self.results = []

        self.blacklist = set()

        self.portfolio = self.build_portfolio()
        self.tickers = list(self.portfolio.keys())
        self.columns = self.build_columns()

    def get_data(self, return_bad_tickers=False):
        """Download and cache history for every ticker that has no CSV yet.

        Each missing ticker is fetched through yfinance from self.data_start at
        self.interval, rows containing NaN are dropped and the result is saved
        as '<ticker>.csv' under data_path + prefix.

        Returns a list of (ticker, row_count) for freshly downloaded tickers
        with fewer than 90 rows when return_bad_tickers is True, else None.
        """
        bad_tickers = []
        cache_dir = self.data_path + self.prefix

        os.makedirs(cache_dir, exist_ok=True)

        cached_files = set(os.listdir(cache_dir))

        for ticker in self.tickers:
            ticker_label = ticker + '.csv'

            if ticker_label not in cached_files:
                temp_ticker = yf.Ticker(ticker)
                temp_hist = temp_ticker.history(start=self.data_start, interval=self.interval)
                temp_hist.dropna(axis=0, inplace=True)
                temp_hist.to_csv(cache_dir + ticker_label)

                if len(temp_hist) < 90:
                    bad_tickers.append((ticker, len(temp_hist)))
                # Pause between downloads (presumably to avoid hammering the data source).
                sleep(.5)

        if return_bad_tickers:
            return bad_tickers

        return None

    def build_columns(self):
        """Return the labels '<feature> <k>' for k = 1..hist_depth, period-major."""
        return [
            feature + ' ' + str(period + 1)
            for period in range(self.hist_depth)
            for feature in self.features
        ]

    def check_ticker(self, ticker, offset):
        """Load a ticker's cached CSV if it has at least `offset` rows.

        Returns the DataFrame, or False when the history is too short or the
        cached file is completely empty.
        """
        try:
            ticker_df = pd.read_csv(self.data_path + self.prefix + ticker + '.csv')
        except pd.errors.EmptyDataError:
            # A download that produced nothing at all leaves an unparsable file.
            return False
        if len(ticker_df) >= offset:
            return ticker_df
        return False

    def build_portfolio(self):
        """Fetch missing data and return {ticker: DataFrame} for usable tickers.

        A ticker is usable when it is not blacklisted and its cached history has
        at least train_depth + hist_depth + 66 rows.
        """
        # NOTE(review): the extra 60 + 6 rows look like head-room for a 5-year
        # backtest plus a margin -- confirm the intent.
        offset = self.train_depth + self.hist_depth + 60 + 6

        self.get_data()

        ticker_dict = {}

        for ticker in self.tickers:
            if ticker in self.blacklist:
                continue
            ticker_df = self.check_ticker(ticker, offset)
            if ticker_df is not False:
                ticker_dict[ticker] = ticker_df

        return ticker_dict

    def build_returns(self, symbols, date):
        """Return the (Close - Open) / Open return of each symbol for the bar at `date`."""
        returns = []
        for ticker in symbols:
            bar = self.portfolio[ticker].set_index('Date').loc[date]
            returns.append((bar['Close'] - bar['Open']) / bar['Open'])
        return returns

    def build_scaled_df(self, dataframe):
        """Standardise every column with a freshly fitted StandardScaler.

        The result keeps the column labels but gets a new 0..n-1 row index.
        """
        scaler = StandardScaler()
        scaled_array = scaler.fit_transform(dataframe)
        return pd.DataFrame(scaled_array, columns=dataframe.columns)

    def check_date(self, ticker, date):
        """Return True when `date` occurs in the ticker's 'Date' column."""
        return date in set(self.portfolio[ticker]['Date'])

    def build_machine(self, model, date, n=15):
        """Fit `model` on history before `date` and return the top-n predicted tickers.

        model -- object exposing fit(X, y) and predict(X)
        date  -- 'YYYY-MM-DD' string of the month to predict
        n     -- maximum number of tickers to return

        Training rows and candidate rows containing NaN are discarded. Returns
        the tickers with the highest predictions, best first; an empty list when
        no candidate row is usable.
        """
        train_df = self.build_train_df(date)
        scaled_train_df = self.build_scaled_df(train_df).dropna(axis=0)

        # The last column is the target, everything before it is a feature.
        X = scaled_train_df.values[:, :-1]
        y = scaled_train_df.values[:, -1]
        model.fit(X, y)

        test_df, symbols = self.build_test_df(date)
        # The candidates are standardised on their own cross-section.
        scaled_test_df = self.build_scaled_df(test_df)

        # Mirror the training clean-up: drop incomplete candidates together
        # with their ticker so rows and symbols stay aligned.
        complete_rows = scaled_test_df.notna().all(axis=1).to_numpy()
        X_test = scaled_test_df.values[complete_rows]
        symbols = [symbol for symbol, keep in zip(symbols, complete_rows) if keep]
        if not symbols:
            return []

        # Predict on the candidate rows (not on the training matrix).
        predicted_returns = list(model.predict(X_test))

        returns_dict = dict(zip(symbols, predicted_returns))

        top = sorted(returns_dict.items(), key=lambda x: x[1])[::-1][:n]
        return [x[0] for x in top]

    def backtest(self, model, start_date, end_date):
        """Run the monthly pick-and-hold backtest between two dates.

        For every month start in [start_date, end_date] the tickers lacking a
        bar for that month are blacklisted, the model is refitted, and the
        equal-weighted open-to-close return of the selected tickers is recorded
        (NaN when nothing was selected). One progress line is printed per month.

        Stores the per-ticker returns of each month in self.results and returns
        the list of monthly average returns.
        """
        months = list(pd.date_range(start_date, end_date, freq='MS').strftime('%Y-%m-%d'))

        overall_returns = []
        specific_returns = []
        for month in months:
            start_time = time()
            for ticker in self.tickers:
                if ticker not in self.blacklist and not self.check_date(ticker, month):
                    self.blacklist.add(ticker)

            symbols = self.build_machine(model, month)
            ticker_returns = self.build_returns(symbols, month)

            if ticker_returns:
                mean_return = sum(ticker_returns) / len(ticker_returns)
            else:
                mean_return = float('nan')
            overall_returns.append(mean_return)
            print(month, round(mean_return, 6), round(time() - start_time, 2))

            specific_returns.append(dict(zip(symbols, ticker_returns)))

        self.results = specific_returns
        return overall_returns

    def build_feature_vector(self, ticker, date, keep_pred=True):
        """Flatten one ticker's look-back window ending at `date` into a 1-row frame.

        The window is the label slice of the 'Date'-indexed history from
        `date` minus hist_depth months through `date`. Every row except the
        last contributes '<feature> <k>' columns; the last row contributes only
        'Target' (the target column's value) and only when keep_pred is True.

        Returns the 1-row DataFrame, or -1 after blacklisting the ticker when
        the window yields no columns.
        """
        ticker_df = self.portfolio[ticker]

        start_date_dt = datetime.strptime(date, '%Y-%m-%d') - relativedelta(months=self.hist_depth)
        start_date = start_date_dt.strftime('%Y-%m-%d')

        feature_df = ticker_df.set_index('Date')[start_date:date].reset_index(drop=True)[self.features]

        new_df_dict = {}
        last_row = len(feature_df) - 1

        for i in range(len(feature_df)):
            for col in feature_df.columns:
                if i < last_row:
                    new_df_dict[col + ' ' + str(i + 1)] = [feature_df[col].iloc[i]]
                elif col == self.target and keep_pred:
                    new_df_dict['Target'] = [feature_df[col].iloc[i]]

        new_df = pd.DataFrame.from_dict(new_df_dict)

        if len(new_df) == 0:
            self.blacklist.add(ticker)
            return -1

        if keep_pred:
            new_df = new_df[[col for col in list(new_df.columns) if col not in {'Target'}] + ['Target']]

        return new_df

    def build_train_df(self, date):
        """Stack the training vectors of all active tickers for the months before `date`.

        For each non-blacklisted ticker one vector is built for each of the
        train_depth months preceding `date`. Prints the time spent
        concatenating. The 'Target' column, when present, is always last.
        """
        date_dt = datetime.strptime(date, '%Y-%m-%d')
        train_dates = [
            (date_dt - relativedelta(months=(1 + i))).strftime('%Y-%m-%d')
            for i in range(self.train_depth)
        ]

        vector_list = []
        for ticker in self.tickers:
            # Checked once per ticker: a ticker blacklisted part-way through
            # still has its remaining months attempted.
            if ticker in self.blacklist:
                continue
            for train_start in train_dates:
                vector = self.build_feature_vector(ticker, train_start)
                if type(vector) != int:
                    vector_list.append(vector)

        start_time = time()
        feature_df = pd.concat(vector_list)
        print(time() - start_time)

        # Vectors of different widths make concat append new columns after
        # 'Target'; callers rely on the target being the final column.
        if 'Target' in feature_df.columns:
            ordered = [col for col in feature_df.columns if col != 'Target'] + ['Target']
            feature_df = feature_df[ordered]

        return feature_df.reset_index(drop=True)

    def build_test_df(self, date):
        """Stack the target-free vectors of all active tickers for `date`.

        Prints the time spent concatenating. Returns (frame, tickers) where
        tickers[i] is the ticker of row i.
        """
        vector_list = []
        index_list = []
        for ticker in self.tickers:
            if ticker in self.blacklist:
                continue
            vector = self.build_feature_vector(ticker, date, keep_pred=False)
            if type(vector) != int:
                vector_list.append(vector)
                index_list.append(ticker)

        start_time = time()
        test_df = pd.concat(vector_list)
        print(time() - start_time)
        return test_df.reset_index(drop=True), index_list

In [6]:
%%time
# Backtest window, in the 'YYYY-MM-DD' form used by the price CSVs.
start_date = '2015-01-01'
end_date = '2019-12-01'
# Index membership as of the start of the window; build_snp reads the CSVs
# saved by get_snp_store, so that function must have been run at least once.
universe = build_snp('January 1, 2015', '%B %d, %Y')

# Months of look-back per feature vector / months of training rows per ticker.
hist_depth = 12
train_depth = 6

# Constructing the Portfolio downloads every ticker that is not cached yet,
# which is what produces the "No data found" messages for delisted symbols.
port = Portfolio(
    universe, 
    hist_depth=hist_depth, 
    train_depth=train_depth, 
    features = ['Close', 'Volume']
)

- TEG: No data found for this date range, symbol may be delisted
- ANDV: No data found for this date range, symbol may be delisted
- BRCM: No data found for this date range, symbol may be delisted
- RAI: No data found for this date range, symbol may be delisted
- SNDK: No data found for this date range, symbol may be delisted
- CSC: No data found for this date range, symbol may be delisted
- XL: No data found for this date range, symbol may be delisted
- KRFT: No data found for this date range, symbol may be delisted
- HSP: No data found for this date range, symbol may be delisted
- YHOO: No data found for this date range, symbol may be delisted
- AET: No data found for this date range, symbol may be delisted
- TWC: No data found for this date range, symbol may be delisted
- WYN: No data found for this date range, symbol may be delisted
- GGP: No data found for this date range, symbol may be delisted
- SPLS: No data found for this date range, symbol may be delisted
- BF.B: No data foun

KeyboardInterrupt: 

In [7]:
# Scratch check: build a single training vector for AAPL as of 2015-01-01.
# NOTE(review): needs `port` from the Portfolio construction cell to exist in the kernel.


vec = port.build_feature_vector('AAPL', '2015-01-01')

NameError: name 'port' is not defined

In [None]:
vec

In [None]:
# Earlier attempt, kept for reference:
# new_vec = list(vec.iloc[:-1].values.flatten())
# new_vec.append(vec['Close'].iloc[-1])

# Flatten every row of `vec` except the last one into a flat Python list.
new_vec = list(vec.values[:-1].flatten())

In [None]:
new_vec

In [20]:
%%time
port.build_train_df('2015-01-01')

4.115517854690552
CPU times: user 13.9 s, sys: 38.2 ms, total: 14 s
Wall time: 14 s


Unnamed: 0,Close 1,Volume 1,Close 2,Volume 2,Close 3,Volume 3,Close 4,Volume 4,Close 5,Volume 5,...,Volume 8,Close 9,Volume 9,Close 10,Volume 10,Close 11,Volume 11,Close 12,Volume 12,Target
0,45.87,25502300.0,44.10,35798500.0,51.84,50804900.0,53.28,39031700.0,50.55,42417700.0,...,32657400.0,49.80,23803000.0,45.26,35610300.0,51.98,59152000.0,53.33,29448100.0,54.30
1,44.27,25012700.0,45.87,25502300.0,44.10,35798500.0,51.84,50804900.0,53.28,39031700.0,...,33520700.0,49.37,32657400.0,49.80,23803000.0,45.26,35610300.0,51.98,59152000.0,53.33
2,42.95,34405700.0,44.27,25012700.0,45.87,25502300.0,44.10,35798500.0,51.84,50804900.0,...,33499300.0,49.82,33520700.0,49.37,32657400.0,49.80,23803000.0,45.26,35610300.0,51.98
3,42.99,26804900.0,42.95,34405700.0,44.27,25012700.0,45.87,25502300.0,44.10,35798500.0,...,42417700.0,50.06,33499300.0,49.82,33520700.0,49.37,32657400.0,49.80,23803000.0,45.26
4,41.26,22686800.0,42.99,26804900.0,42.95,34405700.0,44.27,25012700.0,45.87,25502300.0,...,39031700.0,50.55,42417700.0,50.06,33499300.0,49.82,33520700.0,49.37,32657400.0,49.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2557,42.07,32228500.0,43.98,31869800.0,40.11,44831100.0,41.68,39759000.0,41.93,43647600.0,...,35423300.0,42.28,36674700.0,45.73,27846000.0,45.70,44781000.0,45.82,62790200.0,45.45
2558,42.65,37883600.0,42.07,32228500.0,43.98,31869800.0,40.11,44831100.0,41.68,39759000.0,...,26638900.0,41.16,35423300.0,42.28,36674700.0,45.73,27846000.0,45.70,44781000.0,45.82
2559,40.09,41976300.0,42.65,37883600.0,42.07,32228500.0,43.98,31869800.0,40.11,44831100.0,...,40244800.0,42.31,26638900.0,41.16,35423300.0,42.28,36674700.0,45.73,27846000.0,45.70
2560,37.20,39249200.0,40.09,41976300.0,42.65,37883600.0,42.07,32228500.0,43.98,31869800.0,...,43647600.0,43.25,40244800.0,42.31,26638900.0,41.16,35423300.0,42.28,36674700.0,45.73


In [3]:
%%time
# port.build_test_df('2015-01-01')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [7]:
np.std(range(10))

2.8722813232690143

In [1]:
list(range(30))[0:0]

[]

In [19]:
x = 1.1**5 * 100

In [22]:
(x/100)**(1/5)

1.1

In [25]:
set([1, 2, 3, 4, 5]).add(10)

In [29]:
-1 * 0

0