In [25]:
import pandas as pd
import numpy as np
import pandas_datareader
from pandas_datareader import data as pdr
from datetime import datetime, timedelta
import fix_yahoo_finance as yf

import requests
from bs4 import BeautifulSoup as soup

def stockData(sList, stDate, endDate):
    """Take list of stocks and pull data for that stock and create technical indicator variables
    sList requires list of stocks, dates should be entered in format 'YYYY-MM-DD' """
    #trailing measures require a certain number of days to calculate. This makes sure we get the necessary datapoints
    started = datetime.strptime(stDate, "%Y-%m-%d")
    backdate = started - timedelta(days=30)
   
    data = []
    for element in sList:
        temp = pdr.get_data_yahoo(element, start = backdate, end = endDate)
    #add an indentifier
        temp['Symbol'] = element
        pd.to_datetime(temp.index)

        #Subtract day's close by prior day's close
        temp['price_change'] = (temp['Adj Close'] - temp['Adj Close'].shift())

        #next day price movement (target variable)
        temp['price_chg_nextday'] = (temp['price_change'].shift(-1))
        temp['perc_change'] = temp['price_chg_nextday'] / temp['Adj Close']
        #Calculate if a period was a rise or not over previous periods
        
        temp.loc[temp['perc_change'] >= .01, 'size_of_change'] = 1
        temp.loc[temp['perc_change'] <= -.01, 'size_of_change'] = -1
        temp.loc[(temp['perc_change'] < .01) & (temp['perc_change'] > -.01), 'size_of_change'] = 0
        
        temp.loc[temp['perc_change'] > 0, 'stock_up_down'] = 1
        temp.loc[temp['perc_change'] <= 0, 'stock_up_down'] = 0
        

        #numpy sign function gives 1 for rise, 0 for no change and -1 for decline. makes it easy to multiply volumes
        temp['sign'] = np.sign(temp['price_change'].dropna())

        #next day sign (target variable)
        temp['sign_nextday'] = (temp['sign'].shift(-1))

     
        #Calculate if a period was a rise or not over previous periods
        temp['Rise'] = [1 if x == 1 else 0 for x in temp['sign']]

        #On Basis Volume (OBV calculation)
        temp['OBV'] = (temp['Volume'] * temp['sign']).cumsum()
        
        #Magnitude Recent Direction - Volume X Sign
        temp['MAG'] = (temp['Volume'] * temp['sign'])

        #Psychological Line(PSY) caluclation is the number of increasing days over a specified period, 12 in this case
        temp['PSY12'] = ((temp['Rise'].rolling(window=12, center=False).sum())/12)*100

        #Rolling mean of the last 6 adjusted closing prices for the BIAS calculation
        SMA6 = temp['Adj Close'].rolling(window=6, center=False).mean()

        temp['BIAS6'] = ((temp['Adj Close'] - SMA6)/ SMA6)*100

        #for loop to calculate the average return over a given number of periods
        for i in range(1,6):
            temp['ASY' + str(i)] =(temp['price_change'].rolling(window=i, center=False).sum())/i

        #Ticknor indicators: https://parsproje.com/tarjome/modiriyat/492.pdf
        #Rolling mean of the last 10 adjusted closing price
        temp['SMA10'] = (temp['Adj Close'].rolling(window=10, center=False).mean())
        #Exponential Moving Average: weighting the more recent values more
        temp['EMA10'] = temp['Adj Close'].ewm(span = 10, adjust=False, min_periods=10).mean()

        #Rolling mean of the last 5 adjusted closing prices 
        temp['SMA5'] = temp['Adj Close'].rolling(window=5, center=False).mean()
        #Exponential Moving Average: weighting the more recent values more
        rest5 = temp.SMA5[5:]
        temp['EMA5'] = temp['Adj Close'].ewm(span=5, adjust=False, min_periods=5).mean()

        #Relative Strength Index
        up, down = temp['price_change'].copy(), temp['price_change'].copy()
        up[up < 0] = 0
        down[down > 0] = 0
        rUp = up.ewm(com=13,  adjust=False, min_periods = 13).mean()
        rDown = down.ewm(com=13, adjust=False, min_periods = 13).mean().abs()

        RSI = 100 - 100 / (1 + rUp / rDown)
        temp['RSI'] = RSI

        #Williams%R
        high = temp['High'].rolling(window=14, center=False).max()
        low = temp['Low'].rolling(window=14, center=False).min()
        temp['WilliamsR%'] = (high - temp['Close'])/ (high - low)*-100

        #Stochastic K%
        temp['stochasticK%'] = (temp['Close'] - low) / (high - low) * 100

        #Stochastic D%
        temp['stochasticD%'] = (temp['stochasticK%'].rolling(window=3, center=False).mean())

        #add data to list 
        data.append(temp)
    #convert to dataframe    
    df = pd.concat(data)
    #drop rows for which we will not have all data points calculated
    df = df.dropna()
    #dates above what user entered
    df = df.loc[stDate : endDate]
    #order and limit columns to those needed
    df = df.loc[:, ['Symbol', 'MAG', 'OBV', 'PSY12', 'BIAS6', 'ASY1','ASY2','ASY3','ASY4','ASY5', 'SMA10', 'EMA10', 
                'SMA5', 'EMA5', 'RSI', 'WilliamsR%', 'stochasticK%', 'stochasticD%', 'stock_up_down']]
    
    #Add date as column
    df['Day'] = pd.to_datetime(df.index)
    df['Day'] = df.Day.apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    
    return(df)

def getForm8Kdata(sList, stDate, endDate):
    #Initialize variables
    data = []
    form8kdf = pd.DataFrame([])
    #Set quarters, SEC stores data in quarters
    quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
    
    #Loop for years
    for iYear in range(stDate,endDate,1):
        #Loop of quarters
        for quarter in quarters:
            #Quarter url
            secUrl = "https://www.sec.gov/Archives/edgar/daily-index/" + str(iYear) + "/"+ quarter + '/'

            #Get page and extract tables
            pageData = requests.get(secUrl)
            html_content = soup(pageData.content, 'html.parser')
            tables = html_content.findAll("table")

            #Loop through tables and get idx files
            for table in tables:
                 if table.findParent("table") is None:
                     df = pd.read_html(str(table))[0]

            #Loop through each idx file and save it to local drive
            #Read only master files, it pipe(|) seperated and easy to read
            for index, row in df.iterrows():
                if row[0].startswith( 'master' ):
                    idxUrl = "https://www.sec.gov/Archives/edgar/daily-index/" + str(iYear) + "/" + quarter + '/' + row[0]
                    file_content = requests.get(idxUrl)

                    #Get matching recored, that is 8-K filing and belongs to interested companies
                    records = [tuple((str(line) + '|' + ticker + '|1').rstrip().split('|')) 
                                for line in file_content.iter_lines() if ((str(line).find("|edgar/data") > 0) and (str(line).find("|8-K") > 0)) 
                                    for company, ticker in sList.items() if (str(line).find(company) > 0)]
                    if records:
                        data = pd.DataFrame(records, columns=['cik', 'company', 'type', 'Day', 'path', 'Symbol', 'form8K'])
                        form8kdf = form8kdf.append(data)


    form8kdf = form8kdf[['Day', 'Symbol', 'form8K']]
    form8kdf = form8kdf.drop_duplicates()
    form8kdf['Day'] = form8kdf.Day.astype(int)
    return(form8kdf)

#https://dataanalysiswithpandas.blogspot.com/2016/08/technical-indicator-with-pandas-and.html

In [3]:
#List of automaker ticker symbols
companies = pd.read_csv("https://raw.githubusercontent.com/Capstone-CUNY/Capstone-Main/master/Car_Ticks_8k.csv")
companies.head()

Unnamed: 0,Ticker,Name,Country,parent,Comments
0,TSLA,Tesla,USA,,
1,F,Ford,USA,Ford Motor Co,owns Ford and Lincoln
2,FCAU,Fiat Chrysler,USA,Fiat Chrysler Automobiles,"owns Alfa Romeo, Chrysler, Dodge, Fiat, Jeep, ..."
3,GM,GM,USA,General Motors,"owns Buick, Cadillac, Chevrolet, and GMC"
4,DAI.DE,Mercedes,GER,Daimler AG,owns Mercedes-Benz and Smart


In [7]:
#Set interested companies
stockList = {'Ford Motor':'F', 
             'Fiat':'FCAU', 'Chrysler':'FCAU', 
             'General Motors':'GM', 
             'Mercedes':'DAI.DE', 'Daimler':'DAI.DE',
             'BMW':'BMWYY',
             'Volkswagen':'VWAGY', 'Bentley':'VWAGY', 'Bugatti':'VWAGY', 'Lamborghini':'VWAGY', 'Porsche':'VWAGY',
             'Hyundai':'HYMTF',
             'Toyota':'TM', 
             'Tesla':'TSLA'
            }

#Get Form 8K details
form8kFiledDf = getForm8Kdata(stockList, 2013, 2019)

In [31]:
#Pass the ticker list to the function
df = stockData(companies['Ticker'].tolist(), '2013-01-01', '2018-12-31')
df.shape

(13229, 20)

In [23]:
#Companies that filed Form 8K
form8kFiledDf.head()

Unnamed: 0,Day,Symbol,form8K
0,20130403,GM,1
0,20130404,FCAU,1
0,20130408,GM,1
0,20130411,TM,1
0,20130417,TM,1


In [32]:
df.head()

Unnamed: 0_level_0,Symbol,MAG,OBV,PSY12,BIAS6,ASY1,ASY2,ASY3,ASY4,ASY5,SMA10,EMA10,SMA5,EMA5,RSI,WilliamsR%,stochasticK%,stochasticD%,stock_up_down,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-02,TSLA,1194800.0,1685400.0,66.666667,3.994905,1.490002,1.07,0.556667,0.4425,0.216,34.164,34.192091,33.946,34.289795,42.769193,-14.426188,85.573812,45.901659,0.0,20130102
2013-01-03,TSLA,-742000.0,943400.0,58.333333,2.014671,-0.59,0.450001,0.516666,0.27,0.236,34.182,34.297166,34.182,34.449863,39.573171,-25.185189,74.814811,65.703303,0.0,20130103
2013-01-04,TSLA,-674000.0,269400.0,50.0,0.530908,-0.369999,-0.48,0.176668,0.295,0.142001,34.161,34.315863,34.324001,34.433243,37.671961,-42.857098,57.142902,72.510508,0.0,20130104
2013-01-07,TSLA,-442000.0,-172600.0,41.666667,0.038842,-0.060001,-0.215,-0.34,0.1175,0.224,34.152,34.320252,34.548,34.402162,37.358514,-45.306133,54.693867,62.217193,0.0,20130107
2013-01-08,TSLA,-1284000.0,-1456600.0,33.333333,-2.10251,-0.66,-0.360001,-0.363333,-0.42,-0.038,34.12,34.203842,34.510001,34.161441,34.006715,-72.244894,27.755106,46.530625,0.0,20130108


In [33]:
df = pd.merge(df, form8kFiledDf, how='outer',on=['Day', 'Symbol'])
df = df[['OBV', 'PSY12', 'BIAS6', 'ASY1','ASY2','ASY3','ASY4','ASY5', 'SMA10', 'EMA10', 
                'SMA5', 'EMA5', 'RSI', 'WilliamsR%', 'stochasticK%', 'stochasticD%', 'stock_up_down','form8K']]

df['form8K'] = df['form8K'].replace(np.nan, 0)
df['form8K'] = df['form8K'].astype(int)
df = df.dropna()

In [34]:
df.head()

Unnamed: 0,OBV,PSY12,BIAS6,ASY1,ASY2,ASY3,ASY4,ASY5,SMA10,EMA10,SMA5,EMA5,RSI,WilliamsR%,stochasticK%,stochasticD%,stock_up_down,form8K
0,1685400.0,66.666667,3.994905,1.490002,1.07,0.556667,0.4425,0.216,34.164,34.192091,33.946,34.289795,42.769193,-14.426188,85.573812,45.901659,0.0,0
1,943400.0,58.333333,2.014671,-0.59,0.450001,0.516666,0.27,0.236,34.182,34.297166,34.182,34.449863,39.573171,-25.185189,74.814811,65.703303,0.0,0
2,269400.0,50.0,0.530908,-0.369999,-0.48,0.176668,0.295,0.142001,34.161,34.315863,34.324001,34.433243,37.671961,-42.857098,57.142902,72.510508,0.0,0
3,-172600.0,41.666667,0.038842,-0.060001,-0.215,-0.34,0.1175,0.224,34.152,34.320252,34.548,34.402162,37.358514,-45.306133,54.693867,62.217193,0.0,0
4,-1456600.0,33.333333,-2.10251,-0.66,-0.360001,-0.363333,-0.42,-0.038,34.12,34.203842,34.510001,34.161441,34.006715,-72.244894,27.755106,46.530625,0.0,0


In [35]:
df['stock_up_down'].value_counts()

0.0    7053
1.0    6176
Name: stock_up_down, dtype: int64

In [36]:
#Check number of up days compared to down or flat days. Adjust for imbalance
bal = df['stock_up_down'].value_counts()
mult = bal.loc[0,]/bal.loc[1]
mult

1.1420012953367875

In [37]:
weight = {0: 1., 
          1: mult}

In [38]:
#Preproccesing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Create X and Y variables
dataset = df.values
X = dataset[:,0:len(df.columns)-1].astype(float)
Y = dataset[:,len(df.columns)-1: len(df.columns)].astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
#https://www.datacamp.com/community/tutorials/deep-learning-python?utm_source=adwords_ppc&utm_campaignid=1565261270&utm_adgroupid=67750485268&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=1t2&utm_creative=295208661505&utm_targetid=aud-299261629574:dsa-473406573755&utm_loc_interest_ms=&utm_loc_physical_ms=9060446&gclid=CjwKCAjwx7DeBRBJEiwA9MeX_CEsfxirk8KaFqoEW_9X7UhP7ufasCyuetSADYr7hR80e4U8o2g98xoCIosQAvD_BwE

# Define the scaler 
scaler = StandardScaler().fit(X_train)

# Scale the train set
X_train = scaler.transform(X_train)

# Scale the test set
X_test = scaler.transform(X_test)

In [39]:
dataset

array([[ 1.68540000e+06,  6.66666667e+01,  3.99490486e+00, ...,
         4.59016592e+01,  0.00000000e+00,  0.00000000e+00],
       [ 9.43400000e+05,  5.83333333e+01,  2.01467127e+00, ...,
         6.57033027e+01,  0.00000000e+00,  0.00000000e+00],
       [ 2.69400000e+05,  5.00000000e+01,  5.30908273e-01, ...,
         7.25105081e+01,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-2.60100000e+05,  0.00000000e+00,  0.00000000e+00, ...,
         5.44749431e-14,  0.00000000e+00,  0.00000000e+00],
       [-2.60100000e+05,  0.00000000e+00,  0.00000000e+00, ...,
         5.44749431e-14,  0.00000000e+00,  0.00000000e+00],
       [-2.60100000e+05,  0.00000000e+00,  0.00000000e+00, ...,
         5.44749431e-14,  0.00000000e+00,  0.00000000e+00]])

In [40]:
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers



def neural(X, Y):
    #set seed
    seed = 123
    np.random.seed(seed)

    # 
    number_of_features = len(df.columns)-1


    network = models.Sequential()
    network.add(layers.Dense(units=16, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=8, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=1, activation='sigmoid'))

    #Compile neural network
    network.compile(loss='binary_crossentropy', 
                    optimizer='rmsprop', 
                    metrics=['accuracy']) 


    #Training model
    history = network.fit(X, 
                          Y, 
                          epochs=50, 
                          verbose=0, 
                          batch_size=25,
                          class_weight=weight) 

    scores = network.evaluate(X, Y)
    print("\n%s: %.2f%%" % (network.metrics_names[1], scores[1]*100))
    return network





#https://chrisalbon.com/deep_learning/keras/adding_dropout/

In [41]:
nnet = neural(X_train, Y_train)


acc: 97.59%


In [42]:
score = nnet.evaluate(X_test, Y_test,verbose=1)

print(score)

[0.11007259108989653, 0.9773247824095281]
