In [482]:
from pandas_datareader import data
import re



def get_data(symbol, start, end):
    df = data.DataReader(symbol, 'yahoo', start, end)
    df.columns.values[-2] = 'Adj_Close'
    df.columns = df.columns + '_' +symbol
    df['return_%s' %symbol] = df['Adj_Close_%s' %symbol].pct_change()
    return df

# utils
def get_symbol(df):
    return df.columns.values[0].split('_')[-1]

def get_digits(df, string):
    symbol = get_symbol(df)
    name = '|'.join(df.columns.values)
    match = re.findall(r'\|([\w]+)\_(\d+)\_', name)
    for m in match:
        if m[0] == string:
            return m[0] + '_' + m[1]
        
def merge_dataframe(df_list):
    ret = pd.concat(df_list, axis=1)
    return ret

# feature generation

def delta_return(df, delta):
    symbol = get_symbol(df)
    df['delta_%d_%s' %(delta, symbol)] = df['Adj_Close_%s' %symbol].pct_change(delta)

def moving_average(df, window):
    symbol = get_symbol(df)
    df['mvavg_%d_%s' %(window, symbol)] = df['return_%s' %symbol].rolling(window).mean()

# slicing
def get_adj_close(df):
    symbol = get_symbol(df)
    df = df['Adj_Close_%s' %symbol]
    return df

def get_features(df, features=['mvavg', 'delta']):
    symbol = get_symbol(df)
    feature_name = [get_digits(df, i) for i in features]
    
    col_list = [i + '_' + symbol for i in feature_name]
    col_list.append('return_%s' %symbol)
    
    return df[col_list]
    

In [489]:
symbols = ['^IXIC', '^GDAXI', '^FTSE', '^FCHI', '^HSI', '^N225']

symbol_dict = {
    'TOT': 'Total',
    'XOM': 'Exxon',
    'CVX': 'Chevron',
    'COP': 'ConocoPhillips',
    'VLO': 'Valero Energy',
    'MSFT': 'Microsoft',
    'IBM': 'IBM',
    'TWX': 'Time Warner',
    'CMCSA': 'Comcast',
    'CVC': 'Cablevision',
    'YHOO': 'Yahoo',
    'DELL': 'Dell',
    'HPQ': 'HP',
    'AMZN': 'Amazon',
    'TM': 'Toyota',
    'CAJ': 'Canon',
    'SNE': 'Sony',
    'F': 'Ford',
    'HMC': 'Honda',
    'NAV': 'Navistar',
    'NOC': 'Northrop Grumman',
    'BA': 'Boeing',
    'KO': 'Coca Cola',
    'MMM': '3M',
    'MCD': 'McDonald\'s',
    'PEP': 'Pepsi',
    'K': 'Kellogg',
    'UN': 'Unilever',
    'MAR': 'Marriott',
    'PG': 'Procter Gamble',
    'CL': 'Colgate-Palmolive',
    'GE': 'General Electrics',
    'WFC': 'Wells Fargo',
    'JPM': 'JPMorgan Chase',
    'AIG': 'AIG',
    'AXP': 'American express',
    'BAC': 'Bank of America',
    'GS': 'Goldman Sachs',
    'AAPL': 'Apple',
    'SAP': 'SAP',
    'CSCO': 'Cisco',
    'TXN': 'Texas Instruments',
    'XRX': 'Xerox',
    'WMT': 'Wal-Mart',
    'HD': 'Home Depot',
    'GSK': 'GlaxoSmithKline',
    'PFE': 'Pfizer',
    'SNY': 'Sanofi-Aventis',
    'NVS': 'Novartis',
    'KMB': 'Kimberly-Clark',
    'R': 'Ryder',
    'GD': 'General Dynamics',
    'RTN': 'Raytheon',
    'CVS': 'CVS',
    'CAT': 'Caterpillar',
    'DD': 'DuPont de Nemours'}

symbol = list(symbol_dict)

start = '2004-01-01'
end = '2017-01-01'

qoutes = []
for s in symbols:
    qoutes.append(get_data(s, start, end))
    
for q in qoutes:
    delta_return(q, 9)
    moving_average(q, 9)

In [490]:
features = []
for q in qoutes:
    features.append(get_features(q))

In [491]:
predictor = merge_dataframe(features)
predictor['return_^IXIC'] = predictor['return_^IXIC'].shift(-1)
predictor = predictor.dropna()

In [492]:
out = pd.DataFrame()
out['up_down'] = predictor['return_^IXIC']
out.loc[predictor['return_^IXIC']>0, ['up_down']] = 1
out.loc[predictor['return_^IXIC']<0, ['up_down']] = 0

In [495]:
predictor = predictor.drop(['mvavg_9_^IXIC', 'delta_9_^IXIC', 'return_^IXIC'], axis=1)

In [496]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

In [497]:
target = le.fit(out.up_down).transform(out.up_down)

In [498]:
len(out)

2477

In [499]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
clf.fit(predictor[:2350], target[:2350])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [500]:
clf.score(predictor[2350:], target[2350:])

0.45669291338582679

In [501]:
target[30:31]

array([1])

In [502]:
from sklearn.manifold import TSNE

clf = TSNE(n_components=2, early_exaggeration=2000)
embedded = clf.fit_transform(predictor.transpose())

In [503]:
len(embedded)

15

In [504]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import show
from bokeh.models import HoverTool
from bokeh.io import output_notebook
output_notebook()

In [505]:
figure = bp.figure()

figure.scatter(x=[i[0] for i in embedded], y=[i[1] for i in embedded])

show(figure)