In [2]:
# Example from Programming Collective Intelligence, Chapter 10

### Yahoo stocks analysis ###

In [3]:
import nnmf
import urllib2
from numpy import *

# Ticker symbols to analyse; the columns of the volume matrix follow this order.
tickers = [
    'YHOO', 'AVP', 'BIIB', 'BP',
    'CL', 'CVX', 'MSFT', 'EXPE',
    'ORCL', 'PG', 'XOM', 'AMGN',
]

# Filled in by the download cell: ticker symbol -> list of daily volumes.
prices = dict()
# Date strings, taken from the first ticker that is downloaded.
dates = None
# Upper bound on the number of days used; lowered when a ticker has less history.
shortest = 300

Download the daily trading volumes for each ticker, then factorize the resulting day-by-ticker matrix into 5 features:

In [4]:
# Download the daily history of every ticker and keep its volume column.
# NOTE(review): this legacy Yahoo CSV endpoint may no longer be served --
# confirm before relying on a fresh run of this cell.
for t in tickers:
    print(t)
    url = ('http://ichart.finance.yahoo.com/table.csv?s=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996' % t
           + '&ignore=.csv')
    # urllib2 responses cannot be used in a `with` statement, so close the
    # connection explicitly, even when reading fails part-way through.
    response = urllib2.urlopen(url)
    try:
        rows = response.readlines()
    finally:
        response.close()

    # Drop the header line and blank lines; split each record only once.
    records = [r.split(',') for r in rows[1:] if r.strip() != '']

    # Column 5 of every record is the volume field.
    prices[t] = [float(rec[5]) for rec in records]
    if len(prices[t]) < shortest:
        shortest = len(prices[t])

    # The dates of the first ticker label the rows for all of them.
    if not dates:
        dates = [rec[0] for rec in records]

# One row per day (limited to the shortest history), one column per ticker.
l1 = [[prices[ticker][j] for ticker in tickers]
      for j in range(shortest)]

w, h = nnmf.factorize(matrix(l1), pc=5)

YHOO
AVP
BIIB
BP
CL
CVX
MSFT
EXPE
ORCL
PG
XOM
AMGN
3.09090862659e+18
1.28449428498e+17
4.61389672212e+16
3.69136295069e+16
3.37461207739e+16


In [5]:
# Dump both factor matrices: h first, then w.
for factor in (h, w):
    print(factor)

[[  5.87018230e+06   6.36744345e+05   1.52376761e+06   1.54639284e+06
    1.38675211e+06   3.64946234e+06   1.14050766e+07   6.59894696e+05
    4.97489660e+07   3.25571155e+06   8.98056995e+06   3.73943226e+06]
 [  3.03202012e+02   4.25175640e+05   1.29489543e+04   1.91479587e+04
    1.12899720e+05   5.79694151e+04   2.78345504e+07   4.17489334e+03
    1.77730955e+06   1.15164137e+05   1.26906105e+05   1.95280301e+04]
 [  3.23762785e+07   6.81427632e+05   7.48743944e+05   1.02938142e+06
    1.11103966e+06   2.34222078e+06   1.05860437e+07   1.39586860e+05
    4.32305185e+06   1.86223051e+06   5.41211541e+06   2.77169189e+06]
 [  4.56436937e+06   1.32351092e+06   1.61606733e+06   1.80429123e+06
    2.22779095e+06   4.62910940e+06   3.51197075e+07   5.00740667e+05
    6.31917072e+06   4.23351218e+06   1.01510402e+07   4.72498828e+06]
 [  9.28475148e+05   1.18133593e+06   1.61200216e+06   1.61711416e+06
    2.21626136e+06   5.19904790e+06   4.13071991e+07   8.50425555e+05
    1.24375883e+

For each feature, list the stocks ranked by that feature's volume contribution (all 12 tickers here), followed by the three dates most strongly associated with the feature:

In [6]:
# Report, for every row of h (one feature each), the stocks ranked by their
# value in that row and the three days with the largest value in the matching
# column of w.
# Sizes come from the matrices themselves: w has only as many rows as the
# shortest price history, so a fixed 300 could index past its end, and a fixed
# 12 would break as soon as the ticker list changes.
n_features, n_stocks = shape(h)
n_days = shape(w)[0]

for i in range(n_features):
    print("Feature %d" % i)
    # Every stock paired with its value for this feature, largest first.
    ol = sorted(((h[i, j], tickers[j]) for j in range(n_stocks)), reverse=True)
    for entry in ol:
        print(entry)
    print("")
    # The three days on which this feature is strongest.
    porder = sorted(((w[d, i], d) for d in range(n_days)), reverse=True)
    print([(p[0], dates[p[1]]) for p in porder[0:3]])
    print("")

Feature 0
(49748965.987069026, 'ORCL')
(11405076.5800968, 'MSFT')
(8980569.9537781663, 'XOM')
(5870182.3017073134, 'YHOO')
(3739432.261421578, 'AMGN')
(3649462.3436591015, 'CVX')
(3255711.5540102059, 'PG')
(1546392.8403563192, 'BP')
(1523767.6134920272, 'BIIB')
(1386752.1073220181, 'CL')
(659894.69636210147, 'EXPE')
(636744.34489510383, 'AVP')

[(3.9683951286164234, '2005-12-16'), (3.8963168117738314, '2006-09-20'), (2.6821056116306994, '2006-03-21')]

Feature 1
(27834550.412227415, 'MSFT')
(1777309.5512959384, 'ORCL')
(425175.64044171182, 'AVP')
(126906.10513709947, 'XOM')
(115164.13726523756, 'PG')
(112899.72025447352, 'CL')
(57969.415116767348, 'CVX')
(19528.030117175047, 'AMGN')
(19147.958661356941, 'BP')
(12948.954332253656, 'BIIB')
(4174.893339694604, 'EXPE')
(303.2020124044916, 'YHOO')

[(17.780056602386207, '2006-04-28'), (3.4582513834600821, '2006-05-04'), (2.9607724253166539, '2006-07-21')]

Feature 2
(32376278.51927872, 'YHOO')
(10586043.682543918, 'MSFT')
(5412115.412555014