[FINM 33150] Regression Analysis and Quantitative Trading Strategies\
Winter 2022 | Professor Brian Boonstra

# Final Project Draft

_**Due:** Thursday, February 24th, at 11:00pm\
**Authors:** Ashley Tsoi (atsoi, Student ID: 12286230), Piyush Kontu (), Gauri Kant ()_

### 1. Fetch and clean data

#### 1-1. Import packages

In [1]:
import os
from pathlib import Path
import functools
import warnings

import quandl
import json
import pandas as pd
# import pandas_datareader.data as pdr
pd.set_option("display.precision", 4)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# import math
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta

# let plot display in the notebook instead of in a different window
%matplotlib inline 
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [21, 8]

#### 1-2. Define the functions to fetch data from Quandl

**1-2-1. Get personal API key** from ../data/APIs.json

In [2]:
# Load the personal API keys (this notebook reads APIs['Quandl']).
# The context manager closes the file even if the JSON fails to parse.
with open('../data/APIs.json', encoding='utf-8') as f:
    APIs = json.load(f)

**1-2-2. Define helper functions**

In [3]:
def assertCorrectDateFormat(date_text):
    # Validate that date_text parses as YYYY-MM-DD.
    # Returns None on success; raises ValueError with a fixed message otherwise.
    expected_format = '%Y-%m-%d'
    try:
        dt.datetime.strptime(date_text, expected_format)
    except ValueError:
        raise ValueError("Incorrect date format, should be YYYY-MM-DD")
    else:
        return None

def calcSixMonthsAgo(date_text):
    # Return the date six calendar months before date_text, formatted YYYY-MM-DD.
    # Raises ValueError (via assertCorrectDateFormat) if date_text is malformed.
    assertCorrectDateFormat(date_text)
    date_format = '%Y-%m-%d'
    parsed = dt.datetime.strptime(date_text, date_format)
    shifted = parsed + relativedelta(months=-6)
    return shifted.strftime(date_format)

def calcNextMonth(month_text):
    # Return the month following month_text, formatted YYYY-MM.
    #
    # INPUT         | DATA TYPE                 | DESCRIPTION
    # month_text    | string (YYYY-MM) / other  | month to advance; a string is parsed with '%Y-%m',
    #               |                           | anything else is used as-is
    #
    # NOTE(review): non-string inputs must support `+ relativedelta` and `.strftime`
    # (presumably datetime/date objects) -- confirm against callers.
    if isinstance(month_text, str):
        m = dt.datetime.strptime(month_text, '%Y-%m')
    else:
        m = month_text
    return (m + relativedelta(months=1)).strftime('%Y-%m')

def deleteCSV(sec):
    # Remove the file ../data_large/EOD/<sec> if it exists; do nothing otherwise.
    target = "../data_large/EOD/"+sec
    if not os.path.isfile(target):
        return
    os.remove(target)

**1-2-3. Define function** to retrieve raw data from Quandl

**Documentation:**
```
Zacks Fundamentals Collection B (ZFB)
https://data.nasdaq.com/databases/ZFB/documentation
https://data.nasdaq.com/databases/ZFB/usage/quickstart/python
```

In [4]:
# Define function that retrieves ZFB data from Quandl
@functools.lru_cache(maxsize=16) # Cache the function output
def getQuandlZFBData(from_table,secs,start_date,end_date,columns):
    # Get data from Quandl using quandl.get_table
    # NOTE: missing data for the inputted date will NOT return a row.
    # NOTE: because of lru_cache every argument must be hashable (pass tuples, not lists),
    #       and repeated calls return the SAME DataFrame object -- copy before mutating it.

    # INPUT         | DATA TYPE                 | DESCRIPTION
    # from_table    | string                    | FC, FR, MT, MKTV, SHRS, or HDM
    # secs          | string / tuple of string  | security ticker(s); 'all' or ('all',) loads every
    #               |                           | ticker in ../data/zacks-tickers.csv
    # start_date    | string (YYYY-MM-DD)       | start date of data (not applied to the MT table)
    # end_date      | string (YYYY-MM-DD)       | end date of data (same as or after start_date)
    # columns       | string / tuple of string  | names of the columns to return
    #
    # RETURNS: DataFrame from quandl.get_table; per_end_date / filing_date are converted
    #          to datetime when present.
    # RAISES:  ValueError if from_table is not one of the supported tables.

    # Validate before doing any work: previously an unknown table only printed a message
    # and then failed on the unbound `data` variable.
    dated_tables = ('FC','FR','MKTV','SHRS','HDM')
    if from_table != 'MT' and from_table not in dated_tables:
        raise ValueError("from_table is limited to FC, FR, MT, MKTV, SHRS and HDM")

    if secs=='all' or secs==("all",): secs = list(pd.read_csv('../data/zacks-tickers.csv').ticker.unique()) # import all tickers from zacks-tickers

    seclen = 1 if isinstance(secs, str) else len(secs)
    print(f"Quandl | START | Retriving Quandl data for {seclen:d} securities from the ZACKS/{from_table} table.\n")

    # A bare string would otherwise be split into single characters by list()
    column_list = [columns] if isinstance(columns, str) else list(columns)

    # Retrieve data using quandl.get_table
    quandl.ApiConfig.api_key = APIs['Quandl']

    if from_table == 'MT':
        # MT is requested without a per_end_date filter
        data = quandl.get_table('ZACKS/MT',
                                ticker = secs,
                                qopts = {'columns':column_list},
                                paginate = True)
    else:
        data = quandl.get_table('ZACKS/'+from_table,
                                ticker = secs,
                                per_end_date = {'gte':start_date, 'lte':end_date},
                                qopts = {'columns':column_list},
                                paginate = True)

        for date_col in ('per_end_date','filing_date'):
            if date_col in data.columns:
                data[date_col] = pd.to_datetime(data[date_col])

    print(f"Quandl | DONE  | Returning {len(data):d} rows of data from the ZACKS/{from_table} table.\n")

    return data



@functools.lru_cache(maxsize=16) # Cache the function output
def _getZFBDataFromQuandl(secs,start_date,end_date):
    # Download five Zacks tables (FC, FR, MT, MKTV, SHRS) and outer-merge them into one frame.
    # NOTE: missing data for the inputted date will NOT return a row.

    # INPUT         | DATA TYPE                 | DESCRIPTION
    # secs          | string / tuple of string  | security ticker(s)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data (same as or after start_date)

    def fetch(table, cols):
        # thin wrapper so each table request only states what differs
        return getQuandlZFBData(table,secs,start_date,end_date,cols)

    # Retrieve the individual tables (same request order as before: FC, FR, MT, MKTV, SHRS)
    fundamentals = fetch('FC', ('ticker','exchange','per_end_date','per_type','zacks_sector_code','basic_net_eps','diluted_net_eps','tot_lterm_debt','net_lterm_debt','filing_date'))
    ratios = fetch('FR', ('ticker','exchange','per_end_date','per_type','ret_invst','tot_debt_tot_equity'))
    master = fetch('MT', ('ticker','ticker_type','asset_type'))
    market_value = fetch('MKTV', ('ticker','per_end_date','per_type','mkt_val'))
    shares = fetch('SHRS', ('ticker','per_end_date','per_type','shares_out','avg_d_shares'))

    # Merge period-level tables first, then attach the per-ticker master table
    period_keys = ['ticker','per_end_date','per_type']
    fc_fr = fundamentals.merge(ratios, how='outer', on=['ticker','exchange','per_end_date','per_type'])
    mktv_shrs = market_value.merge(shares, how='outer', on=period_keys)
    period_level = fc_fr.merge(mktv_shrs, how='outer', on=period_keys)

    return period_level.merge(master, how='outer', on='ticker')



def _getZFBData(secs,start_date,end_date):
    # Return merged Zacks data in five tables: FC, FR, MT, MKTV, and SHRS.
    # One CSV per requested ticker is kept in ../data_large/Zacks; a ticker whose CSV
    # is missing is downloaded from Quandl and saved first, then every CSV is read back.
    #
    # INPUT         | DATA TYPE                 | DESCRIPTION
    # secs          | string / tuple of string  | security ticker(s) ('all' is handled downstream)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data
    #
    # RETURNS: concatenated DataFrame with per_end_date / filing_date as datetimes.

    path = "../data_large/Zacks"
    Path(path).mkdir(parents=True, exist_ok=True)

    if isinstance(secs, str):
        secs=(secs,)

    frames = []
    for s in secs:
        file_name = path+"/"+s+"_"+start_date+"_"+end_date+".csv"
        if not os.path.isfile(file_name):
            # download as CSV in local directory
            print(f"SAVE   | START | \"{s}\" does not exist in {path}. Saving from Quandl.\n")
            # Request only ticker `s`: the file is named after `s`, and passing the whole
            # `secs` tuple stored every ticker in every file (duplicating rows on concat).
            raw = _getZFBDataFromQuandl(s,start_date,end_date)
            raw.sort_values('per_end_date',ascending=True,ignore_index=True).set_index(['ticker','per_end_date']).to_csv(file_name)
            print("SAVE   | DONE  | \n")

        frames.append(pd.read_csv(file_name))

    zacks = pd.concat(frames)
    # CSV round-trip loses the datetime dtype, so restore it
    for date_col in ('per_end_date','filing_date'):
        if date_col in zacks.columns:
            zacks[date_col] = pd.to_datetime(zacks[date_col])
    if 'Unnamed: 0' in zacks.columns:
        zacks.drop('Unnamed: 0', axis=1, inplace=True)

    print("       | DONE  | Returning {:d} rows of ZACKS data.\n".format(len(zacks)))

    return zacks


In [6]:
# Example call: fetch Zacks data for a single ticker over 2010-2019
# (the result is cached as a CSV under ../data_large/Zacks by _getZFBData).
_getZFBData('SPX','2010-01-01','2019-12-31')


SAVE   | START | "SPX" does not exist in ../data_large/Zacks. Saving from Quandl.

Quandl | START | Retriving Quandl data for 1 securities from the ZACKS/FC table.

Quandl | DONE  | Returning 0 rows of data from the ZACKS/FC table.

Quandl | START | Retriving Quandl data for 1 securities from the ZACKS/FR table.

Quandl | DONE  | Returning 0 rows of data from the ZACKS/FR table.

Quandl | START | Retriving Quandl data for 1 securities from the ZACKS/MT table.

Quandl | DONE  | Returning 0 rows of data from the ZACKS/MT table.

Quandl | START | Retriving Quandl data for 1 securities from the ZACKS/MKTV table.

Quandl | DONE  | Returning 0 rows of data from the ZACKS/MKTV table.

Quandl | START | Retriving Quandl data for 1 securities from the ZACKS/SHRS table.

Quandl | DONE  | Returning 0 rows of data from the ZACKS/SHRS table.

SAVE   | DONE  | 

       | DONE  | Returning 0 rows of ZACKS data.



Unnamed: 0,ticker,per_end_date,zacks_sector_code,basic_net_eps,diluted_net_eps,tot_lterm_debt,net_lterm_debt,filing_date,exchange,ret_invst,tot_debt_tot_equity,mkt_val,per_type,shares_out,avg_d_shares,ticker_type,asset_type


**Documentation**
```
End of Day US Stock Prices (EOD)
https://data.nasdaq.com/databases/EOD/documentation
https://data.nasdaq.com/databases/EOD/usage/quickstart/python
```

In [39]:
# Define function that retrieves EOD data from Quandl
@functools.lru_cache(maxsize=16) # Cache the function output
def getQuandlEODData(sec,start_date,end_date,columns):
    # Get end-of-day price data from Quandl (QUOTEMEDIA/PRICES) using quandl.get_table
    # NOTE: missing data for the inputted date will NOT return a row.
    # NOTE: because of lru_cache every argument must be hashable (pass tuples, not lists),
    #       and repeated calls return the SAME DataFrame object -- copy before mutating it.

    # INPUT         | DATA TYPE                 | DESCRIPTION
    # sec           | string / tuple of string  | security ticker(s)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data (same as or after start_date)
    # columns       | string / tuple of string  | columns to return ('date' and 'ticker' are always added)
    #
    # RETURNS: DataFrame indexed by (date, ticker), sorted by that index.

    print(f"Quandl | START | Retriving Quandl data for security: {sec}\n")

    # A bare string would otherwise be split into single characters by list()
    requested = [columns] if isinstance(columns, str) else list(columns)
    # De-duplicate while keeping a stable column order (a set() gives arbitrary order)
    wanted_columns = list(dict.fromkeys(['date','ticker'] + requested))

    # Retrieve data using quandl.get_table
    quandl.ApiConfig.api_key = APIs['Quandl']
    data = quandl.get_table('QUOTEMEDIA/PRICES',
                            ticker = sec,
                            date = {'gte':start_date, 'lte':end_date},
                            qopts = {'columns':wanted_columns},
                            # without pagination get_table stops at its default row limit,
                            # which silently truncates multi-ticker / multi-year requests
                            paginate = True
                            )

    data.date = pd.to_datetime(data.date, unit='D')
    data.set_index(['date','ticker'],inplace=True)
    data.sort_index(inplace=True)

    print(f"Quandl | DONE  | Returning {len(data):d} dates of data for {sec}.\n")

    return data

In [40]:
# Tickers to request; passed as a tuple so the lru_cache-wrapped fetcher can hash it.
secs = ('SCHB','VEA','VWO','FM','GLD','SLV','USO','UNG','DBA','COW','GOVT','TIP','MBB','LQD','HYG','BNDX','EMB','VNQ','PSP','BIL')

# secs = ('VEA','VWO')
start_date,end_date = '2008-01-01','2022-01-01'

# Fetch adjusted close and adjusted volume for all tickers above and display the frame.
# NOTE(review): the recorded output reports 3526 rows for 20 tickers over 14 years and only
# shows BIL rows -- check whether the request is being truncated or filtered.
data = getQuandlEODData(secs,start_date,end_date,('adj_close','adj_volume'))
data

Quandl | START | Retriving Quandl data for security: ('SCHB', 'VEA', 'VWO', 'FM', 'GLD', 'SLV', 'USO', 'UNG', 'DBA', 'COW', 'GOVT', 'TIP', 'MBB', 'LQD', 'HYG', 'BNDX', 'EMB', 'VNQ', 'PSP', 'BIL')

Quandl | DONE  | Returning 3526 dates of data for ('SCHB', 'VEA', 'VWO', 'FM', 'GLD', 'SLV', 'USO', 'UNG', 'DBA', 'COW', 'GOVT', 'TIP', 'MBB', 'LQD', 'HYG', 'BNDX', 'EMB', 'VNQ', 'PSP', 'BIL').



Unnamed: 0_level_0,Unnamed: 1_level_0,adj_close,adj_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-01-02,BIL,85.2938,29450.0000
2008-01-03,BIL,85.2380,30500.0000
2008-01-04,BIL,85.2201,173200.0000
2008-01-07,BIL,85.2936,30300.0000
2008-01-08,BIL,85.2342,18350.0000
...,...,...,...
2021-12-27,BIL,91.4300,730397.0000
2021-12-28,BIL,91.4200,1932997.0000
2021-12-29,BIL,91.4200,1246326.0000
2021-12-30,BIL,91.4300,836968.0000


**1-2-4. Define function** to filter / clean raw data

**Requirements:**
```
- US Equities
- not in the automotive, financial or insurance sector over the entire period
- end-of-day adjusted closing prices are available over the entire period
- debt/market cap ratio is greater than 0.1
- has feasible calculation of the ratios over the entire period: 
  - debt to market cap, 
  - return on investment, and 
  - price to earnings. 
  Including for at least one PER END DATE no more than one year old. Debt ratio of zero is OK.
```

In [None]:
def getCleanZFBData(secs,start_date,end_date):
    # Fetch raw Zacks data via _getZFBData and apply the universe / data-quality filters.
    #
    # INPUT         | DATA TYPE                 | DESCRIPTION
    # secs          | string / tuple of string  | security ticker(s)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data
    #
    # RETURNS: DataFrame of quarterly rows (annual values used to fill quarterly gaps),
    #          sorted by 'date' (= filing_date), with the columns listed in column_order.
    # NOTE: most filters drop a ticker entirely if ANY of its rows violates the condition.

    # === GET RAW DATA ============================================

    zacks = _getZFBData(secs,start_date,end_date)
    zacks['date'] = zacks['filing_date']

    # === FILTER / CLEAN ==========================================

    # US Equities only
    zacks = zacks[zacks.exchange.isin(('NYSE','NASDAQ'))]   # select US stock exchanges
    zacks = zacks[zacks.ticker_type=='S']                   # S = Securities
    zacks = zacks[zacks.asset_type=='COM']                  # COM = Common stocks

    # remove tickers without filing dates (tickers without filing dates are impossible to join on)
    filingDate_filter = zacks[pd.isnull(zacks.filing_date)].ticker.unique()
    zacks = zacks[~zacks.ticker.isin(filingDate_filter)]

    # not in the automotive, financial or insurance sector for any date (NOTE: there might be sector changes)
    sector_filter = zacks[zacks.zacks_sector_code.isin((5,13))].ticker.unique() # 5 = automotive, 13 = finance (includes insurance)
    # explicit copy: the columns added below must land in this frame, not in a view of a filtered slice
    zacks = zacks[~zacks.ticker.isin(sector_filter)].copy()

    # Debt -- Use net debt where available, total debt otherwise
    zacks['debt'] = zacks.net_lterm_debt.fillna(zacks.tot_lterm_debt)

    # EPS -- use the basic version (GAAP) if no diluted number is available.
    # Floor at 0.001 (every value below 0.001, including negatives, becomes 0.001).
    # Assigned back explicitly: `zacks.eps.clip(..., inplace=True)` is a chained in-place
    # update that newer pandas versions may not propagate to the frame.
    zacks['eps'] = zacks.diluted_net_eps.fillna(zacks.basic_net_eps).clip(lower=0.001)

    # If have both quarterly & annual data for the same ticker & date, use quarterly
    ratio_cols = ['tot_debt_tot_equity','mkt_val','ret_invst','debt','eps','shares_out','avg_d_shares']
    zacks.set_index(['ticker','date'],inplace=True)
    zacks_quarterly = zacks[zacks.per_type=='Q'].copy()
    zacks_annual = zacks[zacks.per_type=='A']
    for c in ratio_cols:
        # fillna aligns on the (ticker, date) index, so only matching annual rows fill gaps
        zacks_quarterly[c] = zacks_quarterly[c].fillna(zacks_annual[c])
    zacks = zacks_quarterly.reset_index()

    # debt-to-market-cap ratio greater than 0.1 AND not null (filter all since we will have enough tickers)
    # NOTE(review): the column tested is tot_debt_tot_equity -- confirm it is the intended ratio.
    badDebtToMC_filter = zacks[(zacks.tot_debt_tot_equity<=0.1) | (pd.isnull(zacks.tot_debt_tot_equity))].ticker.unique()
    zacks = zacks[~zacks.ticker.isin(badDebtToMC_filter)]

    # other ratios are not null (ret_invst, mkt_val, eps, debt)
    nullRatio_filter = list(zacks[pd.isnull(zacks.ret_invst) | pd.isnull(zacks.mkt_val)].ticker.unique())
    nullRatio_filter += list(zacks[pd.isnull(zacks.eps) | pd.isnull(zacks.debt)].ticker.unique())
    zacks = zacks[~zacks.ticker.isin(set(nullRatio_filter))]

    column_order = ['ticker','date','per_end_date','filing_date',
                    'zacks_sector_code',
                    'basic_net_eps','diluted_net_eps','eps',
                    'tot_lterm_debt','net_lterm_debt','debt',
                    'tot_debt_tot_equity','ret_invst','mkt_val',
                    'shares_out','avg_d_shares']
    return zacks[column_order].sort_values('date',ascending=True,ignore_index=True)

In [None]:
@functools.lru_cache(maxsize=16) # Cache the function output
def getEodFundamentalData(secs,start_date,end_date):
    # Combine cleaned Zacks fundamentals with adjusted close prices and derive the ratios
    # PE, DE, ROI, 'combo' and their day-over-day changes.
    #
    # INPUT         | DATA TYPE                 | DESCRIPTION
    # secs          | string / tuple of string  | security ticker(s) (must be hashable for lru_cache)
    # start_date    | string (YYYY-MM-DD)       | first date returned
    # end_date      | string (YYYY-MM-DD)       | last date returned
    #
    # RETURNS: DataFrame restricted to [start_date, end_date] with the columns listed in `columns`.
    # Side effects: reads/writes ../data_large/clean/ZFB.csv and ../data_large/clean/prices.csv,
    #               and may write empty placeholder files under ../data_large/EOD/.
    # NOTE(review): when the cached CSVs exist, secs/start_date/end_date do not affect what is
    #               loaded from them -- delete the files to rebuild for a different request.
    
    before_start_date = calcSixMonthsAgo(start_date) # get data for 6 extra months before start_date to get data reported by start_date

    ZFB_file_name = '../data_large/clean/ZFB.csv'
    if not os.path.isfile(ZFB_file_name):
        zacks = getCleanZFBData(secs,before_start_date,end_date)
        # drop every ticker that has more than one row for the same (ticker, date)
        repeatFilingDate_filter = zacks.loc[zacks[['ticker','date']].duplicated()].ticker.unique()
        zacks = zacks[~zacks.ticker.isin(repeatFilingDate_filter)]
        zacks.set_index(['ticker','date']).to_csv(ZFB_file_name)
    else:
        # NOTE(review): read back without date parsing, so the date columns are plain strings
        # on this path -- confirm that is compatible with the joins below.
        zacks = pd.read_csv(ZFB_file_name)

    tickers = tuple(zacks.ticker.unique())
    noEOD_filter = set()
    prices_file_name = '../data_large/clean/prices.csv'
    if not os.path.isfile(prices_file_name):
        # NOTE(review): getAdjClose is not defined in the visible part of this notebook;
        # presumably it returns adj_close indexed by date and ticker -- verify.
        prices = getAdjClose(tickers,before_start_date,end_date)

        # end-of-day adjusted closing prices are available
        for sec in tickers:
            price = prices.iloc[prices.index.get_level_values('ticker') == sec]
            data_len = len(price)
            # NOTE(review): 1910 is hard-coded for one specific date range; other ranges need a different threshold.
            if data_len < 1910 or any(pd.isnull(price.adj_close.loc[start_date:end_date])): # 1910 = number of trading days in the period 2013-07-01 -- 2021-01-31
                # filter out the security
                noEOD_filter.add(sec)
                if data_len > 0: pd.DataFrame(columns=['date','ticker','adj_close']).to_csv('../data_large/EOD/'+sec) # make csv of filtered securities an empty table so we skip downloading next time
    
        prices = prices.iloc[~prices.index.get_level_values('ticker').isin(noEOD_filter)]

        prices.to_csv(prices_file_name)
    else:
        prices = pd.read_csv(prices_file_name,index_col=['ticker','date'])
    
    # noEOD_filter is only populated in the download branch above; with a cached prices file it is empty
    zacks = zacks[~zacks.ticker.isin(noEOD_filter)]

    # get per_end_date_price
    # NOTE(review): rename(index={...}) maps index LABELS, not level names; if the intent was to
    # rename the 'date' level to 'per_end_date', rename_axis would be needed -- confirm.
    zacks = zacks.join(prices.rename(index={'date':'per_end_date'}),on=['ticker','per_end_date']).rename(columns={'adj_close':'per_end_date_price'})
    
    # === JOIN PRICING DATA with FUNDAMENTAL DATA =================
    
    zacks.set_index(['ticker','date'],inplace=True)
    data = pd.concat([prices,zacks], axis=1)
    
    # === CALCULATE / RECALCULATE RATIOS ==========================

    # forward fill everything
    # NOTE(review): the fill, pct_change and diff below run over whole columns (no per-ticker
    # grouping), so values can carry across ticker boundaries -- confirm this is intended.
    data = data.transform(lambda v: v.ffill())

    # add returns
    data['return'] = data.adj_close.pct_change()

    # PE (price to earnings)
    data['PE'] = data.adj_close / data.eps

    # DE (debt to market cap)
    # reported ratio rescaled by (period-end price / current price)
    data['DE'] = data.tot_debt_tot_equity * data.per_end_date_price / data.adj_close

    # ROI (return on investment)
    # market value rescaled to the current price, then ROI rescaled by (debt + mkt_val) / (debt + daily mkt_val)
    data['mkt_val_daily'] = data.mkt_val * data.adj_close / data.per_end_date_price
    data['ROI'] = data.ret_invst * (data.debt+data.mkt_val) / (data.debt+data.mkt_val_daily)

    # Fourth ratio -- want low PE, low DE, and high ROI
    data['combo'] = -0.5*data.PE - 0.5*data.DE + 2*data.ROI

    # Get changes in ratios
    for c in ['PE','DE','ROI','combo']:
        data[c+'_delta'] = data[c].diff()

    # === FINAL CLEAN-UP ==========================================

    # Make sure the index is datetime
    # (converts index level 1; assumes the date level is second -- TODO confirm)
    data.index = data.index.set_levels(pd.to_datetime(data.index.levels[1]), level=1)
    
    # === RETURN DATA =============================================
    
    index_ticker = data.index.get_level_values('ticker').unique()
    print(f'Returning EOD + fundamental data for {len(index_ticker)} tickers.')

    columns = ['adj_close','return','PE','PE_delta','DE','DE_delta','ROI','ROI_delta','combo','combo_delta']
    index_date = data.index.get_level_values('date')
    return data[columns].loc[(index_date>=start_date) & (index_date<=end_date)]

#### 1-3. Fetch cleaned data using the functions above

**Dates:**
```
January 1, 2014 - January 31, 2021*
```

**1-3-1. Fetch data** 

In [None]:
# 'all' makes the Zacks fetcher load every ticker listed in ../data/zacks-tickers.csv.
# These module-level names replace the secs/start_date/end_date set in the earlier EOD example cell.
secs,start_date,end_date = ('all','2014-01-01','2021-01-31')
secData = getEodFundamentalData(secs,start_date,end_date)

Spot check the data against sample

In [None]:
# Spot check: compare LLY's ratios on 2020-06-30 with reference values
# (np.isclose with its default tolerances).
l = secData.loc[('LLY','2020-06-30')]
assert np.isclose(l.PE, 100.062297)
assert np.isclose(l.DE, 4.532301)
assert np.isclose(l.ROI, 7.127691)

### 2. Define function to get repo rate

In [None]:
# Define function that retrieves FRED data from Quandl
def getQuandlFredData(table,start_date,end_date):
    # Download one FRED dataset from Quandl using quandl.get
    # NOTE: missing data for the inputted date will NOT return a row.

    # INPUT         | DATA TYPE                 | DESCRIPTION
    # table         | string                    | FRED dataset code (requested as 'FRED/' + table)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data (same as or after start_date)

    print("Quandl | START | Retriving Quandl data for table: \n",table)

    # Authenticate, then request the dataset for the given window
    quandl.ApiConfig.api_key = APIs['Quandl']
    dataset_code = 'FRED/'+table
    series = quandl.get(dataset_code, start_date=start_date, end_date=end_date)

    row_count = len(series)
    print("Quandl | DONE  | Returning {:d} dates of data for {}.\n".format(row_count,table))
    return series

@functools.lru_cache(maxsize=16) # Cache the function output
def getRepo(start_date,end_date):
    # Build a frame with the TED spread, the 3-month T-bill rate, a derived 'repo' rate,
    # and the Fama-French daily risk-free rate (RF) for [start_date, end_date].

    # FRED series: TED spread and 3-month T-bill
    ted = getQuandlFredData("TEDRATE", start_date=start_date, end_date=end_date)
    ted = ted.rename(columns={'Date':'date','Value':'ted'})
    tbill = getQuandlFredData("DTB3", start_date=start_date, end_date=end_date)
    tbill = tbill.rename(columns={'Date':'date','Value':'tbill_3m'})

    # keep only dates present in both series
    # NOTE(review): rename(index={'Date':'date'}) maps index labels, not the index name -- confirm intent.
    repo = ted.merge(tbill, how='inner', on='Date')
    repo = repo.rename(index={'Date':'date'})
    # repo rate = funding rate minus 100bp
    # NOTE(review): 0.01 equals 100bp only if the rates are decimals rather than percent -- confirm units.
    repo['repo'] = repo.ted + repo.tbill_3m - 0.01

    # Fama-French daily factors: the first (unnamed) column holds YYYYMMDD dates
    def _parse_yyyymmdd(raw):
        return pd.to_datetime(str(raw), format='%Y%m%d')

    factors = pd.read_csv("../data/F-F_Research_Data_Factors_daily.csv")
    rf = factors[['Unnamed: 0','RF']].rename(columns={'Unnamed: 0':'date'})
    rf['date'] = rf.date.apply(_parse_yyyymmdd)
    rf = rf.set_index('date').loc[start_date:end_date]

    return pd.concat([repo,rf], axis=1)

In [None]:
# Attach the rate columns returned by getRepo to every (ticker, date) row, matching on 'date'.
secData = getEodFundamentalData(secs,start_date,end_date)
data = secData.join(getRepo('2014-01-01','2021-01-31'), on='date')
data

### 3. Trade simulation

Analyze performance of a top-and-bottom decile trading strategy. Now rank based on changes in your ratios rather than the ratios themselves. Play with the effects of sizing positions by rank.

In [None]:
def getDecilePosition(periodData,ratio,buyLow=True):

    # Return list of 'position' (one entry per row of periodData, in row order)
    # set lowest 3 deciles' position as 1,
    #     highest 3 deciles' position as -1
    #     middle 4 are 0
    # flip the positions if buyLow=False
    #
    # INPUT         | DATA TYPE   | DESCRIPTION
    # periodData    | DataFrame   | rows for one period; must expose 'ticker' (index level or column)
    #               |             | and a numeric column named `ratio`
    # ratio         | string      | column to rank on (per-ticker mean over the period)
    # buyLow        | bool        | True: long the low deciles; False: long the high deciles

    direction = 1 if buyLow else -1

    # rank tickers by their mean ratio over the period (decile labels 0..9)
    ticker_mean = periodData.groupby('ticker')[ratio].mean()
    decile = pd.qcut(ticker_mean, 10, labels=False).to_frame()

    # Apply the sign per element: `-1 * [list]` is sequence repetition and yields an
    # empty list, which made buyLow=False fail instead of flipping the positions.
    decile['position'] = [direction*(1 if d<3 else -1 if d>6 else 0) for d in decile[ratio]]

    return periodData.join(decile[['position']], on='ticker')['position'].tolist()

In [None]:
def tradeRatioDecile(data,ratio,buyLow=True):

    # Get positions for all securities, all dates, for the inputted ratio.
    # 1 = buy, 0 = no position, -1 = sell
    #
    # INPUT     | DATA TYPE   | DESCRIPTION
    # data      | DataFrame   | indexed by (ticker, date) with a datetime 'date' level and a `ratio` column
    # ratio     | string      | column to rank on
    # buyLow    | bool        | True: long the low deciles; False: long the high deciles
    #
    # RETURNS: copy of data indexed by (ticker, date) with two extra columns,
    #          <ratio>_position and <ratio>_signal (signal = change in position;
    #          the first row of each ticker carries the opening position).

    data = data.swaplevel(0,1).sort_index(level=0,ascending=True) # date is now at level 0
    data['position'] = 0

    # recalculate decile every month -- iterate over UNIQUE months and rank on `ratio`
    # (the previous draft looped over one entry per row and always ranked on 'PE')
    row_months = data.index.get_level_values('date').to_period('M')
    for m in row_months.unique():
        in_month = row_months == m
        periodData = data.loc[in_month, [ratio]]
        # getDecilePosition returns one position per row of periodData, in row order
        data.loc[in_month, 'position'] = getDecilePosition(periodData, ratio, buyLow)

    data = data.swaplevel(0,1).sort_index(level=0,ascending=True) # ticker is now at level 0
    # update signal for every security
    position_change = data.groupby(level='ticker')['position'].diff()
    data['signal'] = position_change.fillna(data['position'])

    return data.rename(columns={'signal':ratio+'_signal','position':ratio+'_position'})

In [None]:
# Maps each ratio column to its buyLow flag (True = go long the LOW deciles).
# Keys must match the column names produced by getEodFundamentalData exactly,
# hence lower-case 'combo' / 'combo_delta'.
buyInstuction = {
    # ratio : buyLow
    'PE': True,
    'PE_delta': True,
    'DE': True,
    'DE_delta': True,
    'ROI': False,
    'ROI_delta': True,
    'combo': False,
    'combo_delta': True,
}

@functools.lru_cache(maxsize=16) # Cache the function output
def tradeSimulation(secs,start_date,end_date,ratio,buyLow,K=1000000):
    # Draft of the monthly decile trade simulation for one ratio.
    #
    # INPUT         | DATA TYPE                 | DESCRIPTION
    # secs          | string / tuple of string  | security ticker(s) (must be hashable for lru_cache)
    # start_date    | string (YYYY-MM-DD)       | start date of data
    # end_date      | string (YYYY-MM-DD)       | end date of data
    # ratio         | string                    | column used for the decile ranking
    # buyLow        | bool                      | passed through to getDecilePosition
    # K             | number                    | starting capital (only used in the unreachable section below)
    #
    # NOTE(review): as written, the function returns `md` from the first iteration of the
    # second month loop; everything after that statement never runs.
    
    # Trade -- get monthly trade positions
    # Get positions for all securities, all dates, for the inputted ratio. 
    # 1 = buy, 0 = no position, -1 = sell

    secData = getEodFundamentalData(secs,start_date,end_date)
    data = secData.join(getRepo(start_date,end_date), on='date')
    data['signal'] = 0
    data['position'] = 0
    
    data = data.swaplevel(0,1).sort_index(level=0,ascending=True) # date is now at level 0
    # recalculate decile every month
    # month labels such as '2014-01' are used below to select that month's rows with .loc
    months = [str(m) for m in data.index.get_level_values('date').to_period('M').unique()]
    for m in months:
        periodData = data.loc[m,[ratio]]
        periodData['position'] = getDecilePosition(periodData,ratio,buyLow)
        # NOTE(review): the trailing comma makes lastDate a 1-element tuple -- confirm this is intended
        lastDate = max(periodData.index.get_level_values('date').unique()),
        periodData.loc[lastDate,'position'] = [0]*len(periodData.loc[lastDate]) # set all end-of-month position to 0
        data.loc[m,'position'] = periodData['position']
    
    # forward-fill any remaining gaps across the whole frame
    data.fillna(method='ffill',inplace=True)

    data = data.swaplevel(0,1).sort_index(level=0,ascending=True) # ticker is now at level 0
    # update signal for every security
    tickers = data.index.get_level_values('ticker').unique()
    for t in tickers:
        pos = data.loc[t]['position']
        # first signal = opening position, afterwards = change in position
        data.loc[t,'signal'] = [pos[0]] + pos.diff()[1:].tolist()
    
    tradeSim = pd.DataFrame(index=data.index.get_level_values('date').unique(),columns=['buy_total_price','sell_total_price','repo','cash','total_value','PnL'])

    data = data.swaplevel(0,1).sort_index(level=0,ascending=True) # date is now at level 0
    for m in months:
        md = data.loc[m,'adj_close'].sum()

        # NOTE(review): returns on the first month (looks like a debugging leftover);
        # tradeSim is never filled and the code below is unreachable.
        return md
        
    print('calculated trade signal and positions')

    return tradeSim
    
    # NOTE(review): unreachable from here on. The code below also uses names that are never
    # defined in this function (signal, position, position_quantities, positions, signals,
    # position_values, N_window, M, j, g, s) and indexes secs[0]/secs[1] as a two-security
    # pair -- it needs to be rewritten or removed before the early returns are dropped.
    # Initiate columns
    K_balances = [K]
    total_values = [K]
    PnL_daily_list, PnL_cumulative_list = [], []
    for i,row in tradeSim.iterrows(): # current columns: adj_close for both securities, spread, quantities to buy

        # Calculate present position value (but don't append to list yet since it may hit stop-loss limit)
        if signal: # if there's a new signal
            position_quantity = position*row['quantity'] # quantity = new position's quantity

        else: # if no signal, position quantity = previous position quantity (unless stop-loss limit is triggered later)
            position_quantity = position_quantities[-1]

        # Security buy amounts (but don't append to list yet since it may hit stop-loss limit)
        secX_position_value = position_quantity*row[secs[0]+'_adj_close']
        secY_position_value = -1*position_quantity*row[secs[1]+'_adj_close']
        position_value = secX_position_value + secY_position_value
        
        # Cash balances and total values (if doesn't hit stop-loss limit)
        K_balance = K_balances[-1] - position_value
        total_value = position_value + K_balance
        total_value_delta = total_value-total_values[-1]

        # PnLs (daily and cumulative)
        PnL_daily = total_value_delta/total_values[-1]
        PnL_cumulative = total_value/K - 1

        # update prev_position for next calculation
        prev_position = position
        
        # Append new variables into lists
        positions.append(position)
        signals.append(signal)
        position_quantities.append(position_quantity)
        position_values.append(position_value)
        K_balances.append(K_balance)
        total_values.append(total_value)
        PnL_daily_list.append(PnL_daily)
        PnL_cumulative_list.append(PnL_cumulative)

    # Save the data in tradeSim table
    tradeSim['position'] = positions[1:]
    tradeSim['position_value'] = position_values[1:]
    tradeSim['cash'] = K_balances[1:]
    tradeSim['total_value'] = total_values[1:]
    tradeSim['PnL_daily'] = PnL_daily_list
    tradeSim['PnL_cumulative'] = PnL_cumulative_list
    
    # print("trade  | DONE  | \n")

    # keep a record of params
    summary = {'data':  ['security_X','security_Y','start_date','end_date','N_window','M','j','g','s','K','final_value','PnL_daily_max','PnL_cumulative'],
               'value': [secs[0],secs[1],start_date,end_date,N_window,M,j,g,s,K,total_values[-1],max(PnL_daily_list),PnL_cumulative_list[-1]]
    }
    
    summary_df = pd.DataFrame(summary).set_index('data')
        
    return summary_df,tradeSim


In [None]:
# Run the simulation once per ratio and combine the results side by side.
trade = []
# buyInstuction maps ratio -> buyLow; iterating the dict itself yields only the keys,
# so .items() is needed to unpack (ratio, buyLow) pairs.
for r,buyLow in buyInstuction.items():
    trade.append(tradeSimulation(secs,start_date,end_date,r,buyLow))
trades = pd.concat(trade,axis=1)

In [None]:
def plotTradeSim(tradeSim):
    # Plot the cumulative return spread with buy / sell / stop-loss markers on the current figure.
    # tradeSim must provide the columns 'cumret_spread', 'signal' and 'stop_loss_trigger'.
    # Returns the pyplot module the plot was drawn with.

    plt.clf() # clear previous plots

    tp = plt

    tp.title('Cumulative Return')

    tp.plot(tradeSim['cumret_spread'], label='cumulative return spread')

    # (selector on the signal column, colour, marker size, legend label or None)
    signal_markers = [
        (lambda sig: sig>1,   'green', 6, None),
        (lambda sig: sig==1,  'green', 4, 'buy signal'),
        (lambda sig: sig==-1, 'red',   4, 'sell signal'),
        (lambda sig: sig<-1,  'red',   6, None),
    ]
    for select, colour, size, legend_label in signal_markers:
        style = {'color': colour, 'marker': 'o', 'markersize': size, 'linestyle': 'none'}
        if legend_label is not None:
            style['label'] = legend_label
        points = tradeSim['cumret_spread'][select(tradeSim['signal'])]
        tp.plot(points, **style)

    stop_loss_points = tradeSim['cumret_spread'][tradeSim['stop_loss_trigger']]
    tp.plot(stop_loss_points, color='blue', marker='o', markersize=6, linestyle='none', label='stop-loss signal')

    tp.legend()

    return tp