In [1]:
import csv
import numpy

### Read ETF holding data
I'll be comparing two popular Canadian dividend ETFs by looking at their holdings:

CDZ - [iShares S&P/TSX Canadian Dividend Aristocrats Index ETF](https://www.blackrock.com/ca/individual/en/products/239834/ishares-sptsx-canadian-dividend-aristocrats-index-fund)   
XEI - [iShares S&P/TSX Composite High Dividend Index ETF](https://www.blackrock.com/ca/individual/en/products/239846/ishares-sptsx-equity-income-index-etf)

Read in all the holdings from each ETF and use them to create `index -> name` and `name -> index` mappings.

In [2]:
def read_holdings(path, start_row=11):
    """Return the rows of an ETF holdings CSV as a list of dicts.

    The first ``start_row - 1`` lines of the file are skipped; line
    ``start_row`` is then consumed by ``csv.DictReader`` as the column header
    and every following line becomes one dict keyed by those column names.

    Parameters
    ----------
    path : str
        Location of the holdings CSV file.
    start_row : int
        1-based line number at which the holdings table (header line first)
        begins.

    Raises
    ------
    StopIteration
        If the file has fewer than ``start_row - 1`` lines.
    """
    # 'rb' is the mode the Python 2 csv module expects. On Python 3 the csv
    # module needs a text-mode file instead: open(path, newline='').
    with open(path, 'rb') as csv_file:
        for _ in range(start_row - 1):
            next(csv_file)  # builtin next() instead of the Py2-only .next()
        return list(csv.DictReader(csv_file, delimiter=','))


CDZ_data = read_holdings('data/CDZ_holdings.csv')
XEI_data = read_holdings('data/XEI_holdings.csv')

# Find all the holdings between the two ETFs
# (these are set comprehensions, not dict comprehensions).
CDZ_holding_names = {row['Ticker'] for row in CDZ_data}
XEI_holding_names = {row['Ticker'] for row in XEI_data}
combined_holdings = CDZ_holding_names.union(XEI_holding_names)

# Sets have no defined iteration order, so sort once and build both mappings
# from the same sequence: every ticker gets a stable, reproducible index.
ordered_holdings = sorted(combined_holdings)
index_to_name = dict(enumerate(ordered_holdings))
name_to_index = {name: i for i, name in enumerate(ordered_holdings)}

Create a vector representation for each ETF, using its holdings:  

$\textrm{XEI} = \begin{bmatrix}4.82\\0\\\vdots\\2.77\\0\end{bmatrix} = \begin{bmatrix}\textrm{BCE INC (4.82%)}\\ \textrm{METRO INC (0%)}\\\vdots\\\textrm{INTER PIPELINE LTD (2.77%)}\\\textrm{TOROMONT INDUSTRIES LTD (0%)}\end{bmatrix}$

In [3]:
n_holdings = len(combined_holdings)

# One slot per distinct ticker across both ETFs; a ticker that an ETF does
# not hold keeps its initial weight of 0.
CDZ = numpy.zeros(n_holdings)
XEI = numpy.zeros(n_holdings)

for weights, rows in ((CDZ, CDZ_data), (XEI, XEI_data)):
    for row in rows:
        # The csv module yields strings, so convert explicitly (as the sector
        # aggregation does). Accumulate rather than assign so that a ticker
        # listed on more than one row keeps its total weight instead of only
        # the last row's value.
        weights[name_to_index[row['Ticker']]] += float(row['Weight (%)'])
    

Use [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) to determine how similar the two ETFs are by holdings.

$$\textrm{similarity} = \cos(\theta) = \frac{A \cdot B}{\lVert A \rVert \times \lVert B \rVert}$$

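Cosine similarity returns 1 when the two vectors point in the same direction (complete positive correlation), 0 when they are orthogonal (completely uncorrelated), and -1 when they point in opposite directions (complete negative correlation).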

In [4]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between the vectors ``a`` and ``b``.

    Computed as ``a . b / (||a|| * ||b||)``.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length. Lists and tuples are accepted as well as
        numpy arrays.

    Returns
    -------
    numpy.float64
        1 for vectors pointing the same way, 0 for orthogonal vectors and -1
        for vectors pointing in opposite directions. The value is undefined
        (numpy yields ``nan``) when either vector has zero norm.
    """
    # asarray is a no-op for ndarrays and lets plain sequences be passed too.
    a = numpy.asarray(a)
    b = numpy.asarray(b)
    return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))

# Similarity of the two ETFs measured on their per-ticker weight vectors.
holding_correlation = cosine_similarity(CDZ, XEI)
# Parenthesised single-argument print: same output on Python 2, and valid
# syntax on Python 3 as well.
print("Holding correlation: {}".format(holding_correlation))

Holding correlation: 0.52298900804


Do the same thing for the holdings' sectors, summing the weights of all holdings within each sector.

In [5]:
# Find all the sectors between the two ETFs
# (these are set comprehensions, not dict comprehensions).
CDZ_sector_names = {row['Sector'] for row in CDZ_data}
XEI_sector_names = {row['Sector'] for row in XEI_data}
combined_sectors = CDZ_sector_names.union(XEI_sector_names)

# Use sector-specific mapping names instead of rebinding `index_to_name` /
# `name_to_index`: those hold the ticker mappings that the holdings-vector
# cell looks tickers up in, and overwriting them here makes that cell fail
# with a KeyError if it is re-run after this one.
# Sorting gives every sector a stable, reproducible index.
ordered_sectors = sorted(combined_sectors)
index_to_sector = dict(enumerate(ordered_sectors))
sector_to_index = {sector: i for i, sector in enumerate(ordered_sectors)}

n_sectors = len(combined_sectors)
# Despite the "counts" in the names, these hold the summed weight (%) of the
# holdings in each sector, not a number of holdings.
CDZ_sector_counts = numpy.zeros(n_sectors)
XEI_sector_counts = numpy.zeros(n_sectors)

for totals, rows in ((CDZ_sector_counts, CDZ_data),
                     (XEI_sector_counts, XEI_data)):
    for row in rows:
        totals[sector_to_index[row['Sector']]] += float(row['Weight (%)'])

sector_correlation = cosine_similarity(CDZ_sector_counts, XEI_sector_counts)
# Parenthesised single-argument print: same output on Python 2, and valid
# syntax on Python 3 as well.
print("Sector correlation: {}".format(sector_correlation))

Sector correlation: 0.916196528432


Average the holding and sector correlations to get an idea of how similar the two ETFs are.

In [6]:
# Overall score: unweighted mean of the holding-level and sector-level
# cosine similarities.
similarity = numpy.mean([holding_correlation, sector_correlation])
# Parenthesised single-argument print: same output on Python 2, and valid
# syntax on Python 3 as well.
print("Similarity: {}".format(similarity))

Similarity: 0.719592768236
