In [378]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

# Data

Let's pull some starting data from NASDAQ: the company listings for the NASDAQ, NYSE and AMEX exchanges, plus the ETF finder results.

In [413]:
# Company-listing CSV exports from the NASDAQ screener, one URL per exchange.
nasdaq = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download'
nyse = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download'
amex = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=amex&render=download'

# Download each listing and stack the three tables row-wise into a single frame.
exchange_tables = list(map(pd.read_csv, (nasdaq, nyse, amex)))
info = pd.concat(exchange_tables)

In [414]:
# MarketCap is read as text with a "$" prefix and an "M"/"B" magnitude suffix;
# rewrite the suffixes as exponents ("M" -> "e6", "B" -> "e9") so that
# pd.to_numeric can parse the values.
# The dtype guard makes this cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped.
# regex=False forces literal matching, so "$" is never interpreted as a regex
# end-of-string anchor regardless of the installed pandas version's default.
if not pd.api.types.is_numeric_dtype(info['MarketCap']):
    info['MarketCap'] = pd.to_numeric(
        info['MarketCap']
        .str.replace("$", "", regex=False)
        .str.replace("M", "e6", regex=False)
        .str.replace("B", "e9", regex=False)
    )

In [415]:
# ETF finder export: one row per fund.
etf_listing = pd.read_csv('https://www.nasdaq.com/investing/etfs/etf-finder-results.aspx?download=Yes')

# Left-join the company listing onto the funds by ticker; columns that exist in
# both tables keep their name for the ETF side and get an "_info" suffix for
# the company-listing side.
bonds = pd.merge(etf_listing, info, on='Symbol', how='left', suffixes=('', '_info'))

# Encode the textual change direction as a signed number.
direction_codes = {'up': 1, 'down': -1, 'unch': 0}
bonds['NetChangeDirectionNum'] = bonds['NetChangeDirection'].map(direction_codes)

In [416]:
# Numeric feature columns used to compare funds with each other.
num_cols = [
    'LastSale',
    'NetChange',
    'NetChangeDirectionNum',
    'PercentChange',
    '1YrPercentChange',
    'MarketCap',
]
# Names of the scaled counterparts of the columns above.
num_cols_normed = [name + '_normed' for name in num_cols]

In [417]:
# Seed every "<col>_normed" column with the raw values of "<col>"; the scaling
# itself is applied to these columns in a later cell.
normed_seed = {}
for col in num_cols:
    normed_seed[col + '_normed'] = bonds[col]
bonds = bonds.assign(**normed_seed)

In [418]:
# normalize
def normalize_minmax(df):
    """Min-max scale ``df`` so the minimum maps to 0 and the maximum to 1.

    The minimum is subtracted and the result divided by the range
    (``max - min``), using whatever ``df.min()`` / ``df.max()`` return
    (per column for a DataFrame). A constant column has a zero range,
    so the division is by zero for it.
    """
    lowest = df.min()
    span = df.max() - lowest
    return (df - lowest) / span
def normalize_mean(df):
    """Standardise ``df``: subtract the mean, then divide by the standard deviation.

    Uses whatever ``df.mean()`` / ``df.std()`` return (per column for a
    DataFrame).
    """
    centered = df - df.mean()
    return centered / df.std()

In [419]:
# Min-max scale the *_normed columns and write the result back over them.
normed_block = bonds[num_cols_normed]
bonds[num_cols_normed] = normalize_minmax(normed_block)

In [424]:
def top_n(df, n=10, i=0, num_cols=num_cols_normed, cosine_similarity=cosine_similarity, ignore_original=True):
    """Return the rows of ``df`` most cosine-similar to the row at position ``i``.

    Only the columns of ``num_cols`` that are non-NaN for the reference row are
    used. Missing values in those columns are replaced by the column mean on a
    local copy, so ``df`` itself is never modified and repeated calls cannot
    influence each other's choice of feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``num_cols`` feature columns.
    n : int
        Number of neighbours to return (the reference row is not counted).
    i : int
        Positional (``iloc``) index of the reference row.
    num_cols : list of str
        Candidate feature columns.
    cosine_similarity : callable
        ``f(X, Y)`` returning a pairwise similarity matrix whose row ``i``
        holds the similarities of row ``i`` to every row.
    ignore_original : bool
        If true, row ``i`` is excluded and up to ``n`` rows are returned;
        otherwise the ``n + 1`` best-scoring rows are returned, which normally
        includes row ``i`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows in descending similarity order, with an
        added ``cosine_sim`` column.

    Raises
    ------
    ValueError
        If every column in ``num_cols`` is NaN for the reference row.
    """
    my_bond = df.iloc[i]
    # Compare only on features the reference row actually has a value for.
    num_cols = [col for col in num_cols if not np.isnan(my_bond[col])]
    if not num_cols:
        raise ValueError(f"Row {i} has no non-NaN value in any of the requested columns")
    print(f"Performing Cosine Similarity with {num_cols}")

    # Impute on a local frame instead of writing back into `df`: filling the
    # caller's table would make later calls see fabricated (mean) values for
    # their reference row and silently change which columns they compare on.
    features = df[num_cols]
    features = features.fillna(features.mean())

    # TODO: allow column weighting for cosine_similarity
    cosine_sim = cosine_similarity(features, features)

    # Similarity of every row to the reference row, ranked best-first.
    # A stable sort keeps tied rows in their original order.
    scores = np.asarray(cosine_sim[i])
    ranked = np.argsort(-scores, kind='stable')

    if ignore_original:
        # Drop the reference row by position rather than assuming it is ranked
        # first; a tie or floating-point noise can put another row ahead of it.
        row_pos = i if i >= 0 else len(df) + i
        ranked = ranked[ranked != row_pos][:n]
    else:
        ranked = ranked[:n + 1]

    top_ten = df.iloc[ranked].copy()
    top_ten['cosine_sim'] = scores[ranked]
    return top_ten

In [511]:
# Closest matches to the fund at row position 1. ignore_original=False keeps
# the best-scoring row (the fund itself in the output below) in the table.
top = top_n(bonds, n=5, i=1, ignore_original=False, num_cols=num_cols_normed)
top

Performing Cosine Similarity with ['LastSale_normed', 'NetChange_normed', 'NetChangeDirectionNum_normed', 'PercentChange_normed', '1YrPercentChange_normed']


Unnamed: 0,Symbol,Name,LastSale,NetChange,NetChangeDirection,PercentChange,1YrPercentChange,Name_info,LastSale_info,MarketCap,...,Summary Quote,Unnamed: 8,NetChangeDirectionNum,LastSale_normed,NetChange_normed,NetChangeDirectionNum_normed,PercentChange_normed,1YrPercentChange_normed,MarketCap_normed,cosine_sim
1,QDEF,FlexShares Quality Dividend Defensive Index Fund,45.69,-0.0674,down,0.15,2.62,,,,...,,,-1,0.120588,0.847467,0.0,0.008808,0.360462,,1.0
4,QDYN,FlexShares Quality Dynamic Index Fund,44.5614,-0.082,down,0.18,3.44,,,,...,,,-1,0.117409,0.847028,0.0,0.01057,0.363859,,0.999986
2,QDF,FlexShares Quality Dividend Index Fund,45.52,-0.0833,down,0.18,0.89,,,,...,,,-1,0.120109,0.846989,0.0,0.01057,0.353297,,0.999974
1320,HYLS,First Trust High Yield Long/Short ETF,48.22,-0.08,down,0.17,2.4,First Trust High Yield Long/Short ETF,48.22,1250000000.0,...,https://www.nasdaq.com/symbol/hyls,,-1,0.127715,0.847089,0.0,0.009982,0.359551,0.016587,0.999969
200,SHYG,iShares 0-5 Year High Yield Corporate Bond ETF,46.57,-0.11,down,0.24,0.65,,,,...,,,-1,0.123067,0.846187,0.0,0.014093,0.352303,,0.999949
256,SPHB,Invesco S&P 500 High Beta ETF,43.23,-0.04,down,0.09,1.22,,,,...,,,-1,0.113659,0.84829,0.0,0.005285,0.354664,,0.999948


# Visualize

In [512]:
import cufflinks as cf
# NOTE(review): presumably switches cufflinks/plotly to offline rendering — confirm.
# `cf` is not referenced anywhere else in the visible cells.
cf.go_offline()

# Plotly pieces used by make_polar_plot: trace/layout/figure classes and the
# offline notebook renderer.
import plotly.graph_objs as go
from plotly.offline import iplot

def make_polar_plot(df, num_cols):
    """Render one filled polar trace per row of ``df`` over ``num_cols``.

    Each row becomes a ``go.Scatterpolar`` trace whose angular categories are
    the ``num_cols`` column names and whose radii are that row's values. The
    row's ``Symbol`` is used both as the legend name and as the hover text.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot; must contain ``Symbol`` and every column in ``num_cols``.
    num_cols : list of str
        Columns to place around the polar axis.

    Returns
    -------
    Whatever ``iplot`` returns for the assembled figure.
    """
    features = df[num_cols]
    axis_labels = features.columns
    symbols = df['Symbol'].values

    # One semi-transparent, self-filled trace per row.
    traces = []
    for symbol, radii in zip(symbols, features.values):
        trace = go.Scatterpolar(
            r=radii,
            theta=axis_labels,
            name=symbol,
            text=symbol,
            fill='toself',
            opacity=0.5,
        )
        traces.append(trace)

    layout = go.Layout(
        polar={'radialaxis': {'visible': True}},
        showlegend=True,
    )

    return iplot(go.Figure(data=traces, layout=layout))

In [513]:
# Overlay the selected funds on one polar chart of their scaled features.
make_polar_plot(df=top, num_cols=num_cols_normed)