# Multi-Factor Alpha Competition
By James Zhang, Omkar Pathak, Suryaa (Fall 2022)

### Required imports and prerequisite packages for this project
Here are some resources to learn more about each one:


In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import pickle
import warnings
import math
import statsmodels
import sklearn
import random
import kneed
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import collections
from collections import OrderedDict
from statsmodels.tsa.stattools import coint
from IPython.core import display as ICD

from sif.siftools.backtesters import full_backtesters as fb
from sif.siftools import operators as op
from sif.siftools import metrics as metrics
from sif.sifinfra import sif_utils as su
from sif.siftools.abstractalpha import AbstractAlpha
#from sif.sifinsights.alpha_search import apply_metric_func

In [None]:
import warnings
# Suppress every Python warning for the rest of the session to reduce output
# spam in the notebook.
# NOTE(review): this is global, so warnings raised by imported libraries are
# hidden as well.
warnings.filterwarnings('ignore')

# Pairs Trading
Distance-based approach: the selection criterion is to choose the pairs that minimize the sum of squared deviations between the two normalized price series.
- Downside: the selected pairs have a small spread variance, which limits overall profitability.

correlation - reflects short-term linear dependence, but the problem is that two securities that are correlated don't necessarily share this equilibrium relationship that we desire. 
- we have no way of knowing that divergences will reverse back to equilibrium
- high divergence risk and potential losses
- high correlation does not imply high cointegration

cointegration guarantees that there exists an equilibrium, so we are more confident that divergences will even out over time

## Finding pairs
1. the most brute force solution to finding pairs is comparing every stock against every other stock to determine if there exists a cointegrated relationship
- computationally expensive
- this could lead to restrictive discovery of pairs purely within the same sector
--> how can we have flexible search combinations while not limiting the combinations of pairs to obviously similar solutions (ie. same sector)

## Machine Learning
1. Dimensionality reduction - find a compact representation for each security to reduce computation costs
 
PCA - transforms a large number of variables into a smaller number of uncorrelated variables called principal components. PCA will reduce 500 daily stock prices to 50 vars, and the stocks will be clustered on these components.


2. unsupervised learning - apply an appropriate clustering algo

OPTICS - Ordering Points To Identify the Clustering Structure or KMeans

3. define a set of rules (ARODS) to select pairs for trading

https://israeldi.github.io/coursework/EECS545/545_Final_Project.pdf


In [None]:
import requests
import json
from bs4 import BeautifulSoup
from newsapi.newsapi_client import NewsApiClient
import os
import requests
from datetime import datetime
import csv
import yfinance as yf

def obtainArticleContent(code, year, month, day):
    """Collect paragraph text from news articles about a ticker on one date.

    Args:
        code: Ticker symbol; it is turned into a search phrase by ``getQuery``.
        year, month, day: Calendar date (integers) used as the ``from`` date
            of the news query.

    Returns:
        A list with the text of every ``<p>`` tag found in the articles that
        could be downloaded, or ``None`` when the date falls on a weekend or
        the news query itself failed.

    Raises:
        ValueError: If ``year``/``month``/``day`` do not form a valid date.
    """
    when = datetime(year, month, day)
    # Weekend guard comes first so that no network lookup is made for dates
    # that are going to be skipped anyway.
    if when.weekday() >= 5:
        return None

    query = getQuery(code)
    # Zero-padded YYYY-MM-DD; plain str() concatenation produced e.g. "2022-1-5".
    date = when.strftime('%Y-%m-%d')

    # Keyword arguments: makeURL is declared as (query, date, pageSize), and
    # the previous positional call passed date and query the wrong way round.
    url = makeURL(query=query, date=date, pageSize=100)
    data = getData(url)
    if data is None:
        return None

    inputData = []
    for article in data:
        try:
            page = requests.get(dict(article)['url'], timeout=10)
        except (requests.RequestException, KeyError):
            print('page failed')
            # Skip this article; without this, `page` would be unbound or
            # still refer to the previous article's response.
            continue
        if page.status_code != 200:
            continue
        soup = BeautifulSoup(page.content, "lxml")
        for tag in soup.find_all('p'):
            try:
                inputData.append(tag.get_text())
            except Exception:
                # Best effort, as before: one bad paragraph must not abort
                # the whole scrape.
                print('character error')
    return inputData

def getData(url):
    """Fetch ``url`` and return the ``articles`` entry of its JSON body.

    Args:
        url: Fully-formed request URL (see ``makeURL``).

    Returns:
        The value stored under ``'articles'`` in the decoded JSON response,
        or ``None`` if the request failed, the body was not valid JSON, or
        the payload has no ``'articles'`` entry.
    """
    try:
        # Timeout so a stalled connection cannot hang the caller forever.
        response = requests.get(url, timeout=10)
        return response.json()['articles']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Still best-effort (None on failure), but no longer a bare except
        # that would also swallow KeyboardInterrupt or programming errors.
        return None

def makeURL(query, date, pageSize):
    """Build the NewsAPI ``/v2/everything`` request URL.

    Args:
        query: Search phrase (e.g. a company name). It is percent-encoded, so
            characters such as ``&``, ``#`` or spaces cannot corrupt the URL.
        date: Value for the ``from`` parameter.
        pageSize: Number of results requested per page.

    Returns:
        The complete request URL as a string, with parameters in the order
        q, from, sortBy, pageSize, apiKey, language.
    """
    import os
    from urllib.parse import urlencode

    # NOTE(security): an API key is committed in source. It is kept as the
    # fallback so existing behaviour is unchanged, but NEWSAPI_KEY in the
    # environment takes precedence; the committed key should be rotated.
    api_key = os.environ.get('NEWSAPI_KEY', '13bd628fa8b548738d3b113d9442574e')

    params = [
        ('q', query),
        ('from', date),
        ('sortBy', 'popularity'),
        ('pageSize', pageSize),
        ('apiKey', api_key),
        ('language', 'en'),
    ]
    return 'https://newsapi.org/v2/everything?' + urlencode(params)


def getQuery(code):
    """Return the news search phrase for ticker ``code``.

    The phrase is the name returned by ``get_symbol`` cut at the first comma
    (the whole name when it contains no comma).
    """
    long_name = get_symbol(code)
    before_comma, _, _ = long_name.partition(',')
    return before_comma

def get_symbol(symbol):
    """Look up, print and return the ``longName`` of a ticker via yfinance.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.

    Returns:
        The value stored under ``'longName'`` in the ticker's ``info``.

    Raises:
        KeyError: If the ticker's ``info`` has no ``'longName'`` entry.
    """
    # Read `.info` once and reuse the value instead of evaluating the lookup
    # separately for the print and for the return.
    long_name = yf.Ticker(symbol).info['longName']
    print(long_name)
    return long_name

import mygrad as mg
import numpy as np
from mynn.layers.dense import dense
from mygrad.nnet.initializers import glorot_normal
from mynn.optimizers.adam import Adam
from mygrad.nnet.losses import softmax_crossentropy
from gensim.models.keyedvectors import KeyedVectors
from noggin import create_plot

# Globally switch off mygrad's memory guarding for everything below.
# NOTE(review): presumably done to avoid the guard's overhead during
# inference — confirm against the mygrad documentation.
mg.turn_memory_guarding_off()

class RNN:
    """GRU layer followed by a dense read-out.

    The nine GRU tensors (``U*``, ``W*``, ``b*`` for the ``z``, ``r`` and
    ``h`` groups) are drawn from a standard normal distribution; the dense
    layer uses Glorot-normal initialisation.
    """

    def __init__(self, dim_input, dim_recurrent, dim_output):
        """Create the dense read-out and the randomly initialised GRU tensors.

        Args:
            dim_input: Size of each input vector.
            dim_recurrent: Size of the hidden state.
            dim_output: Size of the dense layer's output.
        """
        # The dense layer is built first and the GRU tensors follow in the
        # order z, r, h, so the sequence of random draws stays as it was.
        self.fc_h2y = dense(dim_recurrent, dim_output, weight_initializer=glorot_normal)

        input_shape = (dim_input, dim_recurrent)
        hidden_shape = (dim_recurrent, dim_recurrent)

        # "z" group (presumably the update gate — see mygrad's gru docs)
        self.Uz = mg.Tensor(np.random.randn(*input_shape))
        self.Wz = mg.Tensor(np.random.randn(*hidden_shape))
        self.bz = mg.Tensor(np.random.randn(dim_recurrent))

        # "r" group (presumably the reset gate)
        self.Ur = mg.Tensor(np.random.randn(*input_shape))
        self.Wr = mg.Tensor(np.random.randn(*hidden_shape))
        self.br = mg.Tensor(np.random.randn(dim_recurrent))

        # "h" group (presumably the candidate hidden state)
        self.Uh = mg.Tensor(np.random.randn(*input_shape))
        self.Wh = mg.Tensor(np.random.randn(*hidden_shape))
        self.bh = mg.Tensor(np.random.randn(dim_recurrent))

    def __call__(self, x):
        """Run ``x`` through the GRU and feed the last entry of its output
        (index ``-1`` along the first axis) to the dense layer."""
        gru_weights = (
            self.Uz, self.Wz, self.bz,
            self.Ur, self.Wr, self.br,
            self.Uh, self.Wh, self.bh,
        )
        hidden_states = mg.nnet.layers.gru(x, *gru_weights)
        return self.fc_h2y(hidden_states[-1])

    @property
    def parameters(self):
        """Dense-layer parameters followed by the nine GRU tensors."""
        gru_weights = (
            self.Uz, self.Wz, self.bz,
            self.Ur, self.Wr, self.br,
            self.Uh, self.Wh, self.bh,
        )
        return self.fc_h2y.parameters + gru_weights

def to_glove(sentence):
    """Convert a sentence into exactly ``MAXLEN`` 50-dimensional vectors.

    Words are split on whitespace, lower-cased and looked up in the
    module-level ``glove`` mapping; words missing from it are skipped. The
    result is truncated to ``MAXLEN`` entries or padded at the end with zero
    vectors of length 50.

    Args:
        sentence: Text to embed.

    Returns:
        A list of ``MAXLEN`` vectors.

    Raises:
        NameError: If the module-level ``glove`` or ``MAXLEN`` has not been
            set (previously a missing ``glove`` was swallowed by a bare
            ``except`` and silently produced all-zero output).
    """
    vectors = []
    for word in sentence.split():
        if len(vectors) >= MAXLEN:
            # Anything past MAXLEN would be truncated anyway.
            break
        try:
            vectors.append(glove[word.lower()])
        except KeyError:
            # Out-of-vocabulary word: skip it.
            continue

    # Pad with fresh zero vectors up to MAXLEN (no-op when already full).
    vectors.extend(np.zeros(50) for _ in range(MAXLEN - len(vectors)))
    return vectors


def predict(sentence):
    """Print the label the model assigns to a single sentence.

    NOTE: a second function named ``predict`` is defined further down in this
    file and rebinds the name, so this single-sentence version is not
    reachable by name once the whole file has been executed.
    """
    embedded = np.array(to_glove(sentence))
    # Reshape to (1, 100, 50) and swap the first two axes before the model call.
    batch = embedded.reshape(1, 100, 50)
    w = np.ascontiguousarray(np.swapaxes(batch, 0, 1))
    label_index = np.argmax(model(w))
    pred = Keys[label_index]
    print(pred)


def predict(multiple_sentences):
    """Classify each sentence and reduce the results to a single boolean.

    Every sentence is embedded with ``to_glove`` and run through the
    module-level ``model``; sentences whose arg-max class is ``1`` are counted
    as "good".

    Args:
        multiple_sentences: Iterable of sentence strings.

    Returns:
        ``False`` when ``good_count / 800 > 8``, otherwise ``True``
        (so an empty input returns ``True``).
    """
    good = 0
    for sentence in multiple_sentences:
        vectors = to_glove(sentence)
        # Reshape to (1, 100, 50) and swap the first two axes before the model call.
        w = np.ascontiguousarray(np.swapaxes(np.array(vectors).reshape(1, 100, 50), 0, 1))
        if np.argmax(model(w)) == 1:
            good += 1

    # NOTE(review): threshold kept exactly as written; it only trips when more
    # than 6400 sentences are classified "good", which looks unintended —
    # confirm the intended ratio.
    # The original returned the undefined names `false`/`true`, which raised
    # NameError on every call.
    return not (good / 800 > 8)


# Sentiment model and word vectors, loaded on first use and then reused, so
# the weights file and the GloVe table are not re-read on every call.
_SENTIMENT_STATE = {}


def initPrediction(code, month, day, year):
    """Fetch the news for a ticker/date and return the sentiment flag.

    Args:
        code: Ticker symbol forwarded to ``obtainArticleContent``.
        month, day, year: Calendar date of interest. Note the order: it
            differs from ``obtainArticleContent(code, year, month, day)``.

    Returns:
        Whatever ``predict`` returns for the collected paragraphs. When no
        articles are available (``obtainArticleContent`` returned ``None``),
        ``predict`` is called with an empty list.
    """
    # predict() and to_glove() read these as module-level names; assigning
    # them as plain locals (as before) left them undefined there.
    global model, glove, MAXLEN

    inputData = obtainArticleContent(code, year, month, day)
    print('got data')

    if not _SENTIMENT_STATE:
        # NOTE(security): allow_pickle=True executes pickled code on load;
        # only ever point this at a trusted, locally produced model file.
        params = np.load("model.npy", allow_pickle=True)
        net = RNN(50, 16, 2)
        (
            net.fc_h2y.weight,
            net.fc_h2y.bias,
            net.Uz,
            net.Wz,
            net.bz,
            net.Ur,
            net.Wr,
            net.br,
            net.Uh,
            net.Wh,
            net.bh,
        ) = [params[i] for i in range(11)]
        vectors = KeyedVectors.load_word2vec_format("glove.6B.50d.txt.w2v", binary=False)
        # Store both together so a failed load never leaves a half-filled cache.
        _SENTIMENT_STATE.update(model=net, glove=vectors)

    model = _SENTIMENT_STATE['model']
    glove = _SENTIMENT_STATE['glove']
    MAXLEN = 100

    # None means "weekend or failed fetch"; treat it as "no articles" rather
    # than letting predict() fail while iterating over None.
    return predict(inputData if inputData is not None else [])

In [None]:
class PairsTrading_PValue_ML(AbstractAlpha):
    """Cointegration pairs-trading alpha whose entries are gated by news sentiment.

    Pairs are re-selected whenever ``day_counter`` reaches zero by running a
    cointegration test on close prices and keeping the ``npairs`` pairs with
    the smallest p-values below 0.05. On the other days each pair's spread
    z-score decides whether to enter or exit, and ``initPrediction`` must
    agree before a position is opened.
    """

    # Only the first 150 columns are tried as the first leg and the first 200
    # as the second leg: the full search was too slow. Both bounds are clamped
    # to the number of available columns.
    _PAIR_SEARCH_FIRST = 150
    _PAIR_SEARCH_SECOND = 200

    def __init__(self, reset, npairs):
        """
        Args:
            reset: Value ``day_counter`` is set to each time pairs are re-formed.
            npairs: Maximum number of pairs kept by ``form_pairs``.

        Raises:
            OSError: If ``constituents_csv.csv`` cannot be opened.
        """
        self.name = 'Pairs Trading - P Value'
        self.lookback = 252
        self.factor_list = ['close']
        self.universe_size = 500

        self.pairs = None
        self.keyPairs = []
        self.reset = reset
        self.npairs = npairs
        self.holdings = np.zeros(self.universe_size)
        self.day_counter = 0
        self.count = 0

        # Text mode (csv.reader needs str rows on Python 3), closed by the
        # context manager, and stored on the instance because generate_day
        # reads it.
        # NOTE(review): the original used delimiter='.', which looks like a
        # typo for a comma-separated file — confirm against the actual file.
        with open('constituents_csv.csv', newline='') as csvfile:
            self.csvFileArray = list(csv.reader(csvfile))

    def zscore(self, series):
        """Return ``series`` centred on its mean and divided by ``np.std(series)``."""
        return (series - series.mean()) / np.std(series)

    # this function was created by Delanie MacKensie and posted on Quanttopia
    def form_pairs(self, df):
        """Select up to ``npairs`` cointegrated column pairs from ``df``.

        Args:
            df: Close prices, one column per asset. Rows containing NaN are
                dropped before testing.

        Returns:
            A list of ``[i, j]`` column positions (``i < j``) ordered by
            ascending cointegration p-value; only pairs with p-value < 0.05
            are eligible.
        """
        df = pd.DataFrame(df).dropna()
        n = df.shape[1]
        keys = df.keys()

        # (pvalue, [i, j]) tuples. A list rather than a dict keyed by p-value,
        # so two pairs with an identical p-value no longer overwrite each other.
        candidates = []
        for i in range(min(self._PAIR_SEARCH_FIRST, n)):
            S1 = df[keys[i]]
            for j in range(i + 1, min(self._PAIR_SEARCH_SECOND, n)):
                pvalue = coint(S1, df[keys[j]])[1]
                if pvalue < 0.05:
                    candidates.append((pvalue, [i, j]))

        candidates.sort(key=lambda item: item[0])
        return [pair for _, pair in candidates[:self.npairs]]

    def generate_day(self, day, data):
        """Update holdings for one day and return them through ``op.weight``.

        Args:
            day: Current date. Its ``month``, ``day`` and ``year`` attributes
                are read (assumes a datetime-like object — TODO confirm).
            data: Mapping whose ``'close'`` entry holds the close prices.

        Returns:
            ``op.weight(self.holdings)``.
        """
        # creating new pairs
        if self.day_counter == 0:
            self.day_counter = self.reset
            self.pairs = self.form_pairs(data['close'])
            # NOTE(review): positions held in pairs from the previous round
            # are left untouched here and are no longer monitored afterwards.
            return op.weight(self.holdings)

        prices = pd.DataFrame(data['close'])
        for first, second in self.pairs:
            spread = prices[first] - prices[second]

            # the z-score tells us how far from the mean the latest spread is
            z_score = self.zscore(spread).tail(1).values[0]

            if abs(z_score) < 0.5:
                # exit the trade
                self.holdings[first] = 0
                self.holdings[second] = 0
                continue
            if not (z_score > 1.0 or z_score < -1.0):
                # Between the thresholds (or NaN): keep the position and skip
                # the sentiment lookup, whose result would not be used.
                continue

            # NOTE(review): assumes row `first` of the CSV describes universe
            # column `first` and that the ticker is the row's first field —
            # confirm (a header row would shift this by one).
            signal = initPrediction(
                self.csvFileArray[first][0], day.month, day.day, day.year
            )

            if z_score > 1.0:
                # enter the trade, short the FIRST, long SECOND
                if signal:
                    self.holdings[first] = -1
                    self.holdings[second] = 1
            elif not signal:
                # enter the trade; long the FIRST, short SECOND
                self.holdings[first] = 1
                self.holdings[second] = -1

        # at the end of the trading day, decrement day_counter
        self.day_counter -= 1
        return op.weight(self.holdings)

# Use below for faster testing

In [None]:
class PairsTrading_PValue(AbstractAlpha):
    """Cointegration pairs-trading alpha driven purely by the spread z-score.

    Pairs are re-selected whenever ``day_counter`` reaches zero by running a
    cointegration test on close prices and keeping the ``npairs`` pairs with
    the smallest p-values below 0.05. On the other days each pair's spread
    z-score decides whether to enter, exit or keep the position.
    """

    # Only the first 150 columns are tried as the first leg and the first 200
    # as the second leg: the full search was too slow. Both bounds are clamped
    # to the number of available columns.
    _PAIR_SEARCH_FIRST = 150
    _PAIR_SEARCH_SECOND = 200

    def __init__(self, reset, npairs):
        """
        Args:
            reset: Value ``day_counter`` is set to each time pairs are re-formed.
            npairs: Maximum number of pairs kept by ``form_pairs``.
        """
        self.name = 'Pairs Trading - P Value'
        self.lookback = 252
        self.factor_list = ['close']
        self.universe_size = 500

        self.pairs = None
        self.keyPairs = []
        self.reset = reset
        self.npairs = npairs
        self.holdings = np.zeros(self.universe_size)
        self.day_counter = 0
        self.count = 0

    def zscore(self, series):
        """Return ``series`` centred on its mean and divided by ``np.std(series)``."""
        return (series - series.mean()) / np.std(series)

    # this function was created by Delanie MacKensie and posted on Quanttopia
    def form_pairs(self, df):
        """Select up to ``npairs`` cointegrated column pairs from ``df``.

        Args:
            df: Close prices, one column per asset. Rows containing NaN are
                dropped before testing.

        Returns:
            A list of ``[i, j]`` column positions (``i < j``) ordered by
            ascending cointegration p-value; only pairs with p-value < 0.05
            are eligible.
        """
        df = pd.DataFrame(df).dropna()
        n = df.shape[1]
        keys = df.keys()

        # (pvalue, [i, j]) tuples. A list rather than a dict keyed by p-value,
        # so two pairs with an identical p-value no longer overwrite each other.
        candidates = []
        for i in range(min(self._PAIR_SEARCH_FIRST, n)):
            S1 = df[keys[i]]
            for j in range(i + 1, min(self._PAIR_SEARCH_SECOND, n)):
                pvalue = coint(S1, df[keys[j]])[1]
                if pvalue < 0.05:
                    candidates.append((pvalue, [i, j]))

        candidates.sort(key=lambda item: item[0])
        return [pair for _, pair in candidates[:self.npairs]]

    def generate_day(self, day, data):
        """Update holdings for one day and return them through ``op.weight``.

        Args:
            day: Current date (not used by this alpha).
            data: Mapping whose ``'close'`` entry holds the close prices.

        Returns:
            ``op.weight(self.holdings)``.
        """
        # creating new pairs
        if self.day_counter == 0:
            self.day_counter = self.reset
            self.pairs = self.form_pairs(data['close'])
            # NOTE(review): positions held in pairs from the previous round
            # are left untouched here and are no longer monitored afterwards.
            return op.weight(self.holdings)

        prices = pd.DataFrame(data['close'])
        for first, second in self.pairs:
            spread = prices[first] - prices[second]

            # the z-score tells us how far from the mean the latest spread is
            z_score = self.zscore(spread).tail(1).values[0]

            # enter the trade, short the FIRST, long SECOND
            if z_score > 1.0:
                self.holdings[first] = -1
                self.holdings[second] = 1
            # exit the trade
            elif abs(z_score) < 0.5:
                self.holdings[first] = 0
                self.holdings[second] = 0
            # enter the trade; long the FIRST, short SECOND
            elif z_score < -1.0:
                self.holdings[first] = 1
                self.holdings[second] = -1

        # at the end of the trading day, decrement day_counter
        self.day_counter -= 1
        return op.weight(self.holdings)

In [None]:
# Backtest one alpha: PairsTrading_PValue with reset=120 and npairs=5.
alphas = [PairsTrading_PValue(120, 5)]
# , PairsTrading_CointScore(120,5) 'Pairs Trading - Cointegration Score'
# NOTE(review): `backtester` is not defined anywhere in the visible source;
# presumably it is built from `fb` (full_backtesters) in another cell — confirm.
returns, holdings = backtester.backtest(alphas, processes=None)

# Plot the summary for the alpha, selected by the `name` it sets in __init__.
metrics.summary_plot(returns, source=['Pairs Trading - P Value'])