In [1]:
## S&P 500 STOCK CLUSTERING AND NEURAL NETWORK ANALYSIS ##
# This script downloads the most recent listing of stock tickers of the S&P 500 from Wikipedia
# It then downloads the basic stock prices dating back to January 1st, 2015
# These data points are aggreagated into basic average summaries as well as processed for neural network analysis.
# At its most fundimental level, the algorithm uses the previous n days of adjusted close data to predict a rise
# or fall in price the following day/week/month.

# It is important to note that the S&P 500 does not mimic the behavior of all stocks. Separately, the metrics by which
# we evaluate our model are constantly improving. At this point, there is a theoretical lower-bound of 50% accuracy:
# did the stock go up or down. However we also seek to gauge how much the stock increased or decreased by increasing
# the number of neurons in the output layer or by running an entirely separate anlaysis--such a discussion is ongoing.

In [None]:
# Test from new machine

In [1]:
import datetime
import urllib.request
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas.io.data import DataReader
import re

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


In [2]:
# COLLECT STOCK TICKERS
wiki_page = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
wiki_page = str(wiki_page.read())

In [3]:
ticker_locations_part_one = [m.start() for m in re.finditer('XNYS', wiki_page)]
ticker_locations_part_two = [m.start() for m in re.finditer('.com/symbol/', wiki_page)]
all_tickers = []
for t in range(0,len(ticker_locations_part_one)): # t=1
    ticker = wiki_page[ticker_locations_part_one[t]:ticker_locations_part_one[t]+15]
    ticker = re.search('XNYS:(.+?)>',ticker).group(1).replace('"','')
    all_tickers.append(ticker)
for t in range(0,len(ticker_locations_part_two)):
    ticker = wiki_page[ticker_locations_part_two[t]:ticker_locations_part_two[t]+25]
    ticker = re.search('com/symbol/(.+?)>',ticker).group(1).replace('"','')
    all_tickers.append(ticker)

In [4]:
symbols_list = all_tickers

symbols=[]
for ticker in symbols_list:
    try:
        r = DataReader(ticker, "yahoo", start=datetime.datetime(2014, 12, 30))
        # add a symbol column
        r['Symbol'] = ticker 
        symbols.append(r)
    except:
        print(ticker, "has been renamed")
# concatenate all the dfs
df = pd.concat(symbols)
#define cell with the columns that i need
cell= df[['Symbol','Open','High','Low','Adj Close','Volume']]
#changing sort of Symbol (ascending) and Date(descending) setting Symbol as first column and changing date format
cell = cell.reset_index().sort(['Symbol', 'Date'], ascending=[1,0]).set_index(['Symbol'])



In [5]:
last_close = []
five_day_avg = []
one_month_avg = []
two_month_avg = []
three_month_avg = []
six_month_avg = []
twelve_month_avg = []

last_close_vol = []
five_day_avg_vol = []
one_month_avg_vol = []
two_month_avg_vol = []
three_month_avg_vol = []
six_month_avg_vol = []
twelve_month_avg_vol = []

for i in range(0,len(symbols_list)):
    ticker = symbols_list[i]
    subset = cell.loc[ticker]

    last_close.append(subset.iloc[2,:]["Adj Close"])
    five_day_avg.append(subset.iloc[2:7,:]["Adj Close"].mean())
    one_month_avg.append(subset.iloc[2:32,:]["Adj Close"].mean())
    two_month_avg.append(subset.iloc[2:62,:]["Adj Close"].mean())
    three_month_avg.append(subset.iloc[2:92,:]["Adj Close"].mean())
    six_month_avg.append(subset.iloc[2:182,:]["Adj Close"].mean())
    twelve_month_avg.append(subset.iloc[2:365,:]["Adj Close"].mean())

    last_close_vol.append(subset.iloc[2,:]["Volume"])
    five_day_avg_vol.append(subset.iloc[2:7,:]["Volume"].mean())
    one_month_avg_vol.append(subset.iloc[2:32,:]["Volume"].mean())
    two_month_avg_vol.append(subset.iloc[2:62,:]["Volume"].mean())
    three_month_avg_vol.append(subset.iloc[2:92,:]["Volume"].mean())
    six_month_avg_vol.append(subset.iloc[2:182,:]["Volume"].mean())
    twelve_month_avg_vol.append(subset.iloc[2:365,:]["Volume"].mean())

ticker_data = {'Ticker':ticker,'Last Close':last_close,'Five Day Avg':five_day_avg,'One Month Avg':one_month_avg,
               'Two Month Avg':two_month_avg,'Three Month Avg':three_month_avg,'Six Month Avg':six_month_avg,
               'Twelve Month Avg':twelve_month_avg,'Last Close Vol':last_close_vol,'Five Day Avg Vol':five_day_avg_vol,
               'One Month Avg Vol':one_month_avg_vol,'Two Month Avg Vol':two_month_avg_vol,
               'Three Month Avg Vol':three_month_avg_vol,'Six Month Avg Vol':six_month_avg_vol,
               'Twelve Month Avg Vol':one_month_avg_vol}
ticker_outputs = DataFrame(data=ticker_data, index=symbols_list)
ticker_outputs = ticker_outputs[['Last Close','Five Day Avg','One Month Avg','Two Month Avg','Three Month Avg',
                                'Six Month Avg','Twelve Month Avg','Last Close Vol','Five Day Avg Vol',
                                'One Month Avg Vol','Two Month Avg Vol','Three Month Avg Vol','Six Month Avg Vol',
                                'Twelve Month Avg Vol']]

In [6]:
ticker_outputs

Unnamed: 0,Last Close,Five Day Avg,One Month Avg,Two Month Avg,Three Month Avg,Six Month Avg,Twelve Month Avg,Last Close Vol,Five Day Avg Vol,One Month Avg Vol,Two Month Avg Vol,Three Month Avg Vol,Six Month Avg Vol,Twelve Month Avg Vol
MMM,171.740005,172.550003,169.448359,171.545037,173.538146,170.474946,158.263707,2527700,1619420.0,2.084290e+06,2.000773e+06,1.820780e+06,1.896367e+06,2.084290e+06
ABT,38.070000,38.736000,39.578000,40.557642,41.633536,40.709507,41.776857,14874500,7984100.0,8.777217e+06,8.401897e+06,9.022787e+06,9.354002e+06,8.777217e+06
ABBV,60.799999,60.507999,60.227333,61.441068,62.702799,61.252596,59.381514,8927300,6824640.0,8.924340e+06,7.482665e+06,7.144542e+06,7.705035e+06,8.924340e+06
ACN,119.430000,120.338000,117.866333,115.941643,114.869449,114.325441,106.818903,2617300,1774640.0,2.382750e+06,2.647772e+06,2.396938e+06,2.381797e+06,2.382750e+06
AYI,251.410004,253.332001,241.262000,248.363298,255.951827,252.055662,227.914748,376300,344900.0,5.155800e+05,4.905667e+05,3.919678e+05,4.524694e+05,5.155800e+05
AAP,169.720001,170.438000,150.722000,149.896579,154.297239,155.451629,160.768291,815400,768400.0,1.388257e+06,1.189442e+06,1.158363e+06,1.058542e+06,1.388257e+06
AES,11.450000,11.648000,11.713584,12.000745,12.037330,11.653731,10.943528,5452600,4873220.0,5.554253e+06,5.725610e+06,5.252691e+06,5.030362e+06,5.554253e+06
AET,130.839996,130.032001,117.446666,115.720600,116.357758,115.473715,112.466189,3989100,2378260.0,3.113517e+06,2.534213e+06,2.295780e+06,2.534973e+06,3.113517e+06
AMG,148.100006,147.063998,140.762000,143.160833,143.105667,151.260167,161.465345,592200,575520.0,6.173633e+05,5.536567e+05,5.066411e+05,5.429783e+05,6.173633e+05
AFL,71.379997,71.400000,70.397657,70.975030,71.400244,69.490504,64.170855,2120900,1372940.0,1.830577e+06,1.724838e+06,1.708648e+06,1.920805e+06,1.830577e+06


In [7]:
ticker_outputs.loc['aapl']

Last Close              1.105200e+02
Five Day Avg            1.113140e+02
One Month Avg           1.116926e+02
Two Month Avg           1.120855e+02
Three Month Avg         1.101278e+02
Six Month Avg           1.043781e+02
Twelve Month Avg        1.064396e+02
Last Close Vol          3.576500e+07
Five Day Avg Vol        2.601264e+07
One Month Avg Vol       3.407368e+07
Two Month Avg Vol       3.821263e+07
Three Month Avg Vol     3.577176e+07
Six Month Avg Vol       3.645178e+07
Twelve Month Avg Vol    3.407368e+07
Name: aapl, dtype: float64

In [8]:
cell.loc['A']

Unnamed: 0_level_0,Date,Open,High,Low,Adj Close,Volume
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2016-12-02,43.270000,44.090000,43.270000,44.029999,2153200
A,2016-12-01,44.080002,44.099998,42.919998,43.209999,2823100
A,2016-11-30,44.790001,44.900002,43.939999,43.980000,2228000
A,2016-11-29,43.599998,44.630001,43.520000,44.480000,1951900
A,2016-11-28,44.520000,44.689999,43.580002,43.599998,2240100
A,2016-11-25,44.740002,45.009998,44.619999,44.889999,877100
A,2016-11-23,44.180000,44.779999,43.970001,44.650002,1926300
A,2016-11-22,44.939999,45.160000,43.770000,44.459999,2798200
A,2016-11-21,45.220001,45.240002,44.650002,44.970001,2537000
A,2016-11-18,45.910000,46.200001,45.009998,45.040001,2845700


In [45]:
correct_values = np.array([[0.0]])
estimation_values = np.array([[0.0],[0.0],[0.0],[0.0],[0.0]])
stock_subset = cell.loc[stock]
    
for i in range(0,(len(stock_subset)-5)):
    if stock_subset.iloc[i,4] > stock_subset.iloc[i+1,4]:
        correct_values = np.append(correct_values, np.array([[1.0]]), axis=0)
    else:
        correct_values = np.append(correct_values, np.array([[0.0]]), axis=0)
    estimation_values = np.append(estimation_values, np.array([[stock_subset.iloc[i+1,4]],
                                                               [stock_subset.iloc[i+2,4]],
                                                               [stock_subset.iloc[i+3,4]],
                                                               [stock_subset.iloc[i+4,4]],
                                                               [stock_subset.iloc[i+5,4]]]))
    # current problem I'm solving: trying to get the estimation values to be blocked in fives

In [10]:
# """Need to create a structure that makes training data in the form: [day1, day2, day3, day4, day5, result] for each 
# ticker/stock. Then I can sample 400 of the 505 as training data, 55 for testing and the last 50 for validation. Each
# ticker will have about 36 months worth of data (365*3-5 = 1090 training/testing/validation values)."""
# training_data_tickers = np.random.choice(symbols_list, 400, replace=False) # 400 samples
# testing_data_tickers = np.random.choice(list(set(symbols_list)-set(training_data)),55,replace=False) # 55 samples
# validation_data_tickers = list(set(symbols_list)-set(training_data)-set(testing_data)) # 50 samles

# for ticker in training_data_tickers:
#     stock_backlog = cell.loc[ticker]
#     for i in range(1,len(stock_backlog)):
#         print(i)
    
# print(stock_backlog)
# range(len(stock_backlog),1)

In [30]:
# NEURAL NETWORK IN-DATA FORMAT:
# (array[[a.],[b.],[c.],[d.],[e.]],dtype=fload32,array([[1.],[0.]]))
# First array: previous five days of adjusted closing data
# Second array: if day 6 > day 5 close, first box is 0. and second is 1., otherwise 1. and 0. respectively

stock_subset.iloc[0,4]

37.900002000000001

In [27]:
stock_subset

Unnamed: 0_level_0,Date,Open,High,Low,Adj Close,Volume
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABT,2016-12-02,37.680000,38.080002,37.599998,37.900002,7250800
ABT,2016-12-01,38.119999,38.299999,37.380001,37.599998,9001300
ABT,2016-11-30,38.750000,38.750000,38.070000,38.070000,14874500
ABT,2016-11-29,38.860001,39.020000,38.660000,38.730000,7024800
ABT,2016-11-28,39.220001,39.220001,38.700001,38.740002,5719300
ABT,2016-11-25,39.080002,39.209999,38.939999,39.160000,2733000
ABT,2016-11-23,38.040001,39.130001,37.900002,38.980000,9568900
ABT,2016-11-22,39.419998,39.610001,37.810001,38.099998,16224100
ABT,2016-11-21,39.849998,40.130001,39.639999,39.759998,8605500
ABT,2016-11-18,40.310001,40.419998,39.799999,39.840000,7003900


In [46]:
correct_values

array([[ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
      

In [53]:
estimation_values[0:15]

array([  0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
        37.599998,  38.07    ,  38.73    ,  38.740002,  39.16    ,
        38.07    ,  38.73    ,  38.740002,  39.16    ,  38.98    ])