In [1]:
import pandas as pd
import numpy as np
import pandas_datareader
from pandas_datareader import data as pdr
from datetime import datetime, timedelta
import fix_yahoo_finance as yf

def stockData(sList, stDate, endDate):
    """Take list of stocks and pull data for that stock and create technical indicator variables
    sList requires list of stocks, dates should be entered in format 'YYYY-MM-DD' """
    #trailing measures require a certain number of days to calculate. This makes sure we get the necessary datapoints
    started = datetime.strptime(stDate, "%Y-%m-%d")
    backdate = started - timedelta(days=30)
   
    data = []
    for element in sList:
        temp = pdr.get_data_yahoo(element, start = backdate, end = endDate)
    #add an indentifier
        temp['Symbol'] = element
        pd.to_datetime(temp.index)

        #Subtract day's close by prior day's close
        temp['price_change'] = (temp['Adj Close'] - temp['Adj Close'].shift())

        #next day price movement (target variable)
        temp['price_chg_nextday'] = (temp['price_change'].shift(-1))
        temp['perc_change'] = temp['price_chg_nextday'] / temp['Adj Close']
        #Calculate if a period was a rise or not over previous periods
        
        temp.loc[temp['perc_change'] >= .01, 'size_of_change'] = 1
        temp.loc[temp['perc_change'] <= -.01, 'size_of_change'] = -1
        temp.loc[(temp['perc_change'] < .01) & (temp['perc_change'] > -.01), 'size_of_change'] = 0
        
        temp.loc[temp['perc_change'] > 0, 'stock_up_down'] = 1
        temp.loc[temp['perc_change'] <= 0, 'stock_up_down'] = 0
        

        #numpy sign function gives 1 for rise, 0 for no change and -1 for decline. makes it easy to multiply volumes
        temp['sign'] = np.sign(temp['price_change'].dropna())

        #next day sign (target variable)
        temp['sign_nextday'] = (temp['sign'].shift(-1))

     
        #Calculate if a period was a rise or not over previous periods
        temp['Rise'] = [1 if x == 1 else 0 for x in temp['sign']]

        #On Basis Volume (OBV calculation)
        temp['OBV'] = (temp['Volume'] * temp['sign']).cumsum()
        
        #Magnitude Recent Direction - Volume X Sign
        temp['MAG'] = (temp['Volume'] * temp['sign'])

        #Psychological Line(PSY) caluclation is the number of increasing days over a specified period, 12 in this case
        temp['PSY12'] = ((temp['Rise'].rolling(window=12, center=False).sum())/12)*100

        #Rolling mean of the last 6 adjusted closing prices for the BIAS calculation
        SMA6 = temp['Adj Close'].rolling(window=6, center=False).mean()

        temp['BIAS6'] = ((temp['Adj Close'] - SMA6)/ SMA6)*100

        #for loop to calculate the average return over a given number of periods
        for i in range(1,6):
            temp['ASY' + str(i)] =(temp['price_change'].rolling(window=i, center=False).sum())/i

        #Ticknor indicators: https://parsproje.com/tarjome/modiriyat/492.pdf
        #Rolling mean of the last 10 adjusted closing price
        temp['SMA10'] = (temp['Adj Close'].rolling(window=10, center=False).mean())
        #Exponential Moving Average: weighting the more recent values more
        temp['EMA10'] = temp['Adj Close'].ewm(span = 10, adjust=False, min_periods=10).mean()

        #Rolling mean of the last 5 adjusted closing prices 
        temp['SMA5'] = temp['Adj Close'].rolling(window=5, center=False).mean()
        #Exponential Moving Average: weighting the more recent values more
        rest5 = temp.SMA5[5:]
        temp['EMA5'] = temp['Adj Close'].ewm(span=5, adjust=False, min_periods=5).mean()

        #Relative Strength Index
        up, down = temp['price_change'].copy(), temp['price_change'].copy()
        up[up < 0] = 0
        down[down > 0] = 0
        rUp = up.ewm(com=13,  adjust=False, min_periods = 13).mean()
        rDown = down.ewm(com=13, adjust=False, min_periods = 13).mean().abs()

        RSI = 100 - 100 / (1 + rUp / rDown)
        temp['RSI'] = RSI

        #Williams%R
        high = temp['High'].rolling(window=14, center=False).max()
        low = temp['Low'].rolling(window=14, center=False).min()
        temp['WilliamsR%'] = (high - temp['Close'])/ (high - low)*-100

        #Stochastic K%
        temp['stochasticK%'] = (temp['Close'] - low) / (high - low) * 100

        #Stochastic D%
        temp['stochasticD%'] = (temp['stochasticK%'].rolling(window=3, center=False).mean())

        #add data to list 
        data.append(temp)
    #convert to dataframe    
    df = pd.concat(data)
    #drop rows for which we will not have all data points calculated
    df = df.dropna()
    #dates above what user entered
    df = df.loc[stDate : endDate]
    #order and limit columns to those needed
    df = df.loc[:, ['Symbol', 'MAG', 'OBV', 'PSY12', 'BIAS6', 'ASY1','ASY2','ASY3','ASY4','ASY5', 'SMA10', 'EMA10', 
                'SMA5', 'EMA5', 'RSI', 'WilliamsR%', 'stochasticK%', 'stochasticD%', 'stock_up_down']]
    
    #Add date as column
    df['Day'] = pd.to_datetime(df.index)
    df['Day'] = df.Day.apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    
    return(df)

def getForm8Kdata():
    #actual code to download the Form 8-K's is on github
    #https://github.com/Capstone-CUNY/Capstone-Main/Download_Form8K.py
    #All the data is downloaded and preprocessed
    
    items8kdf = pd.read_csv("https://raw.githubusercontent.com/Capstone-CUNY/Capstone-Main/master/ItemsFiled8K.csv", delimiter='|')
    items8kdf['date'] = items8kdf.date.astype(int)

    items8kdf["value"] = 1
    items8kdf = pd.pivot_table(items8kdf, values="value", index=['date', 'ticker'], columns = 'item', fill_value=0)
    items8kdf['date'] = items8kdf.index.get_level_values('date')
    items8kdf['ticker'] = items8kdf.index.get_level_values('ticker')
    items8kdf = items8kdf.reset_index(drop=True)
    items8kdf.rename(columns={'date': 'Day', 'ticker': 'Symbol'}, inplace=True)
    return(items8kdf)

#https://dataanalysiswithpandas.blogspot.com/2016/08/technical-indicator-with-pandas-and.html

In [2]:
#List of automaker ticker symbols
companies = pd.read_csv("https://raw.githubusercontent.com/Capstone-CUNY/Capstone-Main/master/Car_Ticks_8k.csv")
companies

Unnamed: 0,Ticker,Name,Country,parent,Comments
0,TSLA,Tesla,USA,,
1,F,Ford,USA,Ford Motor Co,owns Ford and Lincoln
2,FCAU,Fiat Chrysler,USA,Fiat Chrysler Automobiles,"owns Alfa Romeo, Chrysler, Dodge, Fiat, Jeep, ..."
3,GM,GM,USA,General Motors,"owns Buick, Cadillac, Chevrolet, and GMC"
4,DAI.DE,Mercedes,GER,Daimler AG,owns Mercedes-Benz and Smart
5,BMWYY,BMW,GER,BMW Group,"owns BMW, Mini, and Rolls-Royce"
6,VWAGY,Volkswagen,GER,Volkswagen Group,"owns Audi, Bentley, Bugatti, Lamborghini, Por..."
7,TM,Toyota,JPN,Toyota Motor Corp,owns Lexus and Toyota
8,HYMTF,Hyundai,KOR,Hyundai Motor Group,"owns Genesis, Hyundai, and Kia"


In [3]:
#Get Form 8K details
form8kFiledDf = getForm8Kdata()

In [4]:
#Pass the ticker list to the technical indicator function
df = stockData('F', '2013-01-01', '2018-12-31')
df.shape

(1481, 20)

In [5]:
#Companies that filed Form 8K
form8kFiledDf.head()

item,Item101,Item102,Item103,Item201,Item202,Item203,Item204,Item205,Item206,Item301,...,Item601,Item602,Item603,Item604,Item605,Item701,Item801,Item901,Day,Symbol
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,20080103,F
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,20080103,GM
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,20080104,F
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,20080104,GM
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,20080116,F


In [6]:
df.head()

Unnamed: 0_level_0,Symbol,MAG,OBV,PSY12,BIAS6,ASY1,ASY2,ASY3,ASY4,ASY5,SMA10,EMA10,SMA5,EMA5,RSI,WilliamsR%,stochasticK%,stochasticD%,stock_up_down,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-02,F,75274700.0,583925300.0,83.333333,2.897253,0.194139,0.128131,0.113894,0.079597,0.124248,9.629173,9.712278,10.028318,9.984799,73.756023,-3.555552,96.444448,96.520809,1.0,20130102
2013-01-03,F,121284700.0,705210000.0,91.666667,3.498658,0.2019,0.19802,0.152721,0.135895,0.104058,9.768175,9.84683,10.132375,10.140637,76.826018,-8.988755,91.011245,93.704741,1.0,20130103
2013-01-04,F,54669900.0,759879900.0,91.666667,3.311744,0.085418,0.143659,0.160486,0.135895,0.1258,9.911059,9.972448,10.258175,10.273001,77.998628,-4.868918,95.131082,94.195592,0.0,20130104
2013-01-07,F,-43482400.0,716397500.0,83.333333,1.384001,-0.108714,-0.011648,0.059535,0.093186,0.086973,10.039966,10.05546,10.345149,10.325006,72.939911,-10.546858,89.453142,91.865156,0.0,20130107
2013-01-08,F,-46336200.0,670061300.0,75.0,0.175094,-0.062123,-0.085419,-0.028473,0.02912,0.062124,10.155671,10.112085,10.407273,10.338968,70.140445,-15.217365,84.782635,89.788953,1.0,20130108


In [7]:
df = pd.merge(df, form8kFiledDf, how='outer',on=['Day', 'Symbol'])
df = df.fillna(0)

In [8]:
df.head()

Unnamed: 0,Symbol,MAG,OBV,PSY12,BIAS6,ASY1,ASY2,ASY3,ASY4,ASY5,...,Item504,Item507,Item601,Item602,Item603,Item604,Item605,Item701,Item801,Item901
0,F,75274700.0,583925300.0,83.333333,2.897253,0.194139,0.128131,0.113894,0.079597,0.124248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,F,121284700.0,705210000.0,91.666667,3.498658,0.2019,0.19802,0.152721,0.135895,0.104058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,F,54669900.0,759879900.0,91.666667,3.311744,0.085418,0.143659,0.160486,0.135895,0.1258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,F,-43482400.0,716397500.0,83.333333,1.384001,-0.108714,-0.011648,0.059535,0.093186,0.086973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,F,-46336200.0,670061300.0,75.0,0.175094,-0.062123,-0.085419,-0.028473,0.02912,0.062124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [10]:
df['stock_up_down'].value_counts()

0.0    1950
1.0     739
Name: stock_up_down, dtype: int64

In [11]:
df.to_csv('test.csv')

In [10]:
#Check number of up days compared to down or flat days. Adjust for imbalance
bal = df['stock_up_down'].value_counts()
mult = bal.loc[0,]/bal.loc[1]
mult

1.7682926829268293

In [11]:
weight = {0: 1., 
          1: mult}

In [12]:
# Move stock_up_down to end
df = df[[column for column in df if column not in ['stock_up_down']] + ['stock_up_down']]

In [13]:
#Preproccesing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Create X and Y variables
dataset = df.values
features = dataset[:,1:len(df.columns)-1].astype(float)
target = dataset[:,len(df.columns)-1: len(df.columns)].astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.33, random_state=42)
#https://www.datacamp.com/community/tutorials/deep-learning-python?utm_source=adwords_ppc&utm_campaignid=1565261270&utm_adgroupid=67750485268&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=1t2&utm_creative=295208661505&utm_targetid=aud-299261629574:dsa-473406573755&utm_loc_interest_ms=&utm_loc_physical_ms=9060446&gclid=CjwKCAjwx7DeBRBJEiwA9MeX_CEsfxirk8KaFqoEW_9X7UhP7ufasCyuetSADYr7hR80e4U8o2g98xoCIosQAvD_BwE

# Define the scaler 
scaler = StandardScaler().fit(X_train)

# Scale the train set
X_train = scaler.transform(X_train)

# Scale the test set
X_test = scaler.transform(X_test)

In [192]:
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

#function to perform a grid search

def neural(optimizer='rmsprop'):
    #set seed
    seed = 123
    np.random.seed(seed)

    number_of_features = len(df.columns)-2
    
    network = models.Sequential()
    network.add(layers.Dense(units=number_of_features, activation='relu', input_shape=(number_of_features,)))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=number_of_features, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=1, activation='sigmoid'))

    #Compile neural network
    network.compile(loss='binary_crossentropy', 
                    optimizer=optimizer, 
                    metrics=['accuracy']) 

    return network


#https://chrisalbon.com/deep_learning/keras/adding_dropout/

In [193]:
neural_network = KerasClassifier(build_fn=neural, verbose=0)
#https://chrisalbon.com/deep_learning/keras/tuning_neural_network_hyperparameters/

In [194]:
# hyperparameter metrics
epochs = [5, 10]
batches = [5, 10, 100]
optimizers = ['rmsprop', 'adam']

hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

In [195]:
#grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

grid_result = grid.fit(features, target)

In [196]:
#best parameters
grid_result.best_params_

{'batch_size': 5, 'epochs': 5, 'optimizer': 'rmsprop'}

In [197]:
#new function incorporating the parameters
def nnet(X, Y):
    #set seed
    seed = 123
    np.random.seed(seed)

    number_of_features = len(df.columns)-2


    network = models.Sequential()
    network.add(layers.Dense(units=number_of_features, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=number_of_features, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=1, activation='sigmoid'))

    #Compile neural network
    network.compile(loss='binary_crossentropy', 
                    optimizer='rmsprop', 
                    metrics=['accuracy']) 


    #Training model
    history = network.fit(X, 
                          Y, 
                          epochs=5, 
                          verbose=0, 
                          batch_size=5,
                          class_weight=weight) 

    scores = network.evaluate(X, Y)
    print("\n%s: %.2f%%" % (network.metrics_names[1], scores[1]*100))
    return network





#https://chrisalbon.com/deep_learning/keras/adding_dropout/

In [198]:
nnet_trained = nnet(X_train, Y_train)


acc: 65.79%


In [199]:
score = nnet_trained.evaluate(X_test, Y_test,verbose=1)

print(score)

[0.5486137653280188, 0.6444444445327476]
