In [34]:
#https://www.youtube.com/watch?v=Z-5wNWgRJpk
import numpy as np
import pandas as pd
import pickle
from collections import Counter
from sklearn import svm,model_selection as cross_validation,neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [35]:
def process_data_for_labels(ticker, hm_days=7, csv_path='sp500_joined_closes.csv'):
    """Load the joined close prices and add forward-return columns for one ticker.

    For every horizon ``i`` in ``1..hm_days`` a column named ``'<ticker>_<i>d'``
    is added, holding ``(close[t + i] - close[t]) / close[t]`` for ``ticker``.

    Parameters
    ----------
    ticker : str
        Name of the column in the CSV whose forward returns are computed.
    hm_days : int, optional
        Number of forward horizons to generate. Defaults to 7, the value that
        was previously hard-coded.
    csv_path : str, optional
        Path of the CSV to read; its first column is used as the index.
        Defaults to the file name that was previously hard-coded.

    Returns
    -------
    tickers : list
        Column names of the CSV as read, i.e. before the forward-return
        columns are appended.
    df : pandas.DataFrame
        The loaded frame with missing values replaced by 0 and the
        forward-return columns appended.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path, index_col=0)
    tickers = df.columns.values.tolist()
    # Missing closes become 0. A 0 close used as the divisor below yields
    # +/-inf (non-zero future close) or NaN (0/0); the NaN is zero-filled at
    # the end, but the infinities are left in place for the caller to handle.
    df = df.fillna(0)

    for i in range(1, hm_days + 1):
        future_close = df[ticker].shift(-i)
        df['{}_{}d'.format(ticker, i)] = (future_close - df[ticker]) / df[ticker]

    # The last `i` rows of each horizon have no future close (shift leaves
    # NaN there); they are reported as a 0 return.
    df = df.fillna(0)
    return tickers, df

In [36]:
# Run the label-preparation step for XOM; the returned (tickers, df) tuple is
# shown as the cell output.
process_data_for_labels('XOM')

(['MMM',
  'ABT',
  'ABBV',
  'ABMD',
  'ACN',
  'ATVI',
  'ADBE',
  'AMD',
  'AAP',
  'AES',
  'AMG',
  'AFL',
  'A',
  'APD',
  'AKAM',
  'ALK',
  'ALB',
  'ARE',
  'ALXN',
  'ALGN',
  'ALLE',
  'AGN',
  'ADS',
  'LNT',
  'ALL',
  'GOOGL',
  'GOOG',
  'MO',
  'AMZN',
  'AEE',
  'AAL',
  'AEP',
  'AXP',
  'AIG',
  'AMT',
  'AWK',
  'AMP',
  'ABC',
  'AME',
  'AMGN',
  'APH',
  'APC',
  'ADI',
  'ANSS',
  'ANTM',
  'AON',
  'AOS',
  'APA',
  'AIV',
  'AAPL',
  'AMAT',
  'APTV',
  'ADM',
  'ARNC',
  'ANET',
  'AJG',
  'AIZ',
  'T',
  'ADSK',
  'ADP',
  'AZO',
  'AVB',
  'AVY',
  'BHGE',
  'BLL',
  'BAC',
  'BK',
  'BAX',
  'BBT',
  'BDX',
  'BBY',
  'BIIB',
  'BLK',
  'HRB',
  'BA',
  'BKNG',
  'BWA',
  'BXP',
  'BSX',
  'BHF',
  'BMY',
  'AVGO',
  'BR',
  'CHRW',
  'COG',
  'CDNS',
  'CPB',
  'COF',
  'CAH',
  'KMX',
  'CCL',
  'CAT',
  'CBOE',
  'CBRE',
  'CBS',
  'CELG',
  'CNC',
  'CNP',
  'CTL',
  'CERN',
  'CF',
  'SCHW',
  'CHTR',
  'CVX',
  'CMG',
  'CB',
  'CHD',
  'CI',
  'XEC

In [37]:
def buy_sell_hold(*args, requirement=0.02):
    """Turn a sequence of forward returns into a buy / sell / hold label.

    The values are examined in the order given, and the first one that
    crosses the threshold decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (e.g. 0.03 for +3%), one per look-ahead
        horizon.
    requirement : float, optional, keyword-only
        Threshold a change must exceed to trigger a label. Defaults to 0.02,
        the value that was previously hard-coded.

    Returns
    -------
    int
        1 if the first value crossing the threshold is above ``requirement``,
        -1 if it is below ``-requirement``, and 0 if no value crosses it
        (including when no values are given).
    """
    # Label as soon as the price moves by more than `requirement` (2% by
    # default) within the predefined look-ahead window.
    for change in args:
        if change > requirement:
            return 1
        if change < -requirement:
            return -1
    return 0
    

In [38]:
def extract_featuresets(ticker):
    """Build the feature matrix and label vector for one ticker.

    Labels come from applying ``buy_sell_hold`` row by row to the ticker's
    1- to 7-day forward-return columns. Features are the per-row percentage
    changes of every ticker's close price.

    Parameters
    ----------
    ticker : str
        Ticker whose buy / sell / hold target is generated.

    Returns
    -------
    x : numpy.ndarray
        Percentage changes of all ticker columns, with +/-inf and NaN
        replaced by 0.
    y : numpy.ndarray
        Values of the ``'<ticker>_target'`` column.
    df : pandas.DataFrame
        The labelled frame after rows containing +/-inf have been dropped.
    """
    tickers, df = process_data_for_labels(ticker)

    target_col = '{}_target'.format(ticker)
    horizon_cols = ['{}_{}d'.format(ticker, day) for day in range(1, 8)]

    # One label per row, derived from that row's seven forward returns.
    df[target_col] = [
        buy_sell_hold(*row) for row in zip(*(df[col] for col in horizon_cols))
    ]

    labels = df[target_col].values.tolist()
    print('Data spread:', Counter(str(label) for label in labels))

    # Zero-fill first so the only NaNs left afterwards are the converted
    # infinities; dropna then removes exactly the rows that contained +/-inf.
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Features: row-over-row percentage change of every ticker's close.
    returns = df[tickers].pct_change()
    returns = returns.replace([np.inf, -np.inf], 0).fillna(0)

    x = returns.values
    y = df[target_col].values

    return x, y, df

In [39]:
# Build features and labels for XOM; the function prints the label
# distribution and the returned (x, y, df) tuple is shown as the cell output.
extract_featuresets('XOM')

Data spread: Counter({'1': 1719, '-1': 1453, '0': 1105})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.039735  , -0.02857103,  0.        , ...,  0.        ,
         -0.04842357,  0.        ],
        [ 0.02896509, -0.00183829,  0.        , ...,  0.        ,
         -0.00118335,  0.        ],
        ...,
        [-0.00469483, -0.00984436, -0.00368005, ...,  0.00155497,
         -0.01479618, -0.00521221],
        [ 0.00185314,  0.00235478,  0.00738722, ...,  0.00426932,
         -0.01132175,  0.00336823],
        [ 0.00089695,  0.00261008, -0.00175351, ..., -0.00289849,
          0.00584252, -0.00167846]]),
 array([1, 1, 1, ..., 0, 0, 0], dtype=int64),
                    MMM        ABT       ABBV        ABMD         ACN  \
 Date                                                                   
 2000-01-03   27.481184   6.730005   0.000000   18.250000    0.000000   
 2000-01-04   26.389219   6.537722   0.000000   17.812500    0.000000   
 2000-01-05   27.153585   6.525703 

In [46]:
def do_ml(ticker, test_size=0.25, random_state=None):
    """Train a voting ensemble on one ticker's feature set and report accuracy.

    The feature set from ``extract_featuresets`` is split into train and test
    parts. A ``VotingClassifier`` combining a linear SVC, a k-nearest-neighbours
    classifier and a random forest is fitted on the training part and then
    evaluated on the test part.

    Parameters
    ----------
    ticker : str
        Ticker whose buy / sell / hold target is predicted.
    test_size : float, optional
        Fraction of samples held out for testing. Defaults to 0.25, the value
        that was previously hard-coded.
    random_state : int or None, optional
        Seed passed to the train/test split, the linear SVC and the random
        forest. The default ``None`` keeps the previous unseeded behaviour;
        pass an integer to make a run reproducible.

    Returns
    -------
    float
        Fraction of test samples whose predicted label equals the true label.
    """
    x, y, _ = extract_featuresets(ticker)
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    clf = VotingClassifier([('lsvc', svm.LinearSVC(random_state=random_state)),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier(random_state=random_state))])

    clf.fit(x_train, y_train)

    # Predict once and derive the accuracy from those predictions, instead of
    # calling score() and predict() separately (each runs a full prediction).
    predictions = clf.predict(x_test)
    confidence = float(np.mean(predictions == y_test))
    print('Accuracy', confidence)

    print('Predicited spread:', Counter(predictions))

    return confidence

In [48]:
# Train and evaluate the voting classifier for BAC; the function prints the
# accuracy and predicted label distribution and returns the accuracy.
do_ml('BAC')

Data spread: Counter({'1': 1812, '-1': 1645, '0': 820})




Accuracy 0.4102803738317757
Predicited spread: Counter({-1: 589, 1: 460, 0: 21})


0.4102803738317757