In [None]:
#################################LOGISTIC REGRESSION MODEL FOR STOCKS####################################################

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import datetime


In [None]:
# Plan for this notebook:
#   1. Create helper functions that give easy access to the stock data.
#   2. Fit a logistic regression to estimate gain/loss probabilities
#      (L1 regularization is a candidate).
#   3. Given stock data for a period of time, predict one time step into the future.
#
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load 'sp500_stock_data' when it comes from a trusted source.
with open('sp500_stock_data', 'rb') as bin_file:
    # The pickle holds a 2-tuple that is unpacked into (sp500_stocks, stock_data).
    # The `with` block closes the file even when unpickling raises.
    sp500_stocks, stock_data = pickle.load(bin_file)

In [None]:
sp500_stocks

In [None]:
# One row per sector; the single 'Symbol' column holds the list of tickers
# that belong to that sector.
stocks_by_sector = sp500_stocks.groupby('Sector')[['Symbol']].agg(list)
stocks_by_sector.head()

In [None]:
# Example of the lookup used later when building sector-average features:
# the list of tickers stored for a single sector.
sector = stocks_by_sector.loc['Energy', 'Symbol']
sector

In [None]:
"""set of features we will be using:
1: three day daily percent change for close/ variance of.
2: seven day daily percent change for close/(high-low)
3: perform based on top 3/sectors/and individual stock if trying to predict a trend for one company
4: """


In [None]:
stock_data['GOOGL']

In [None]:
##### trailing variance of the adjusted closing price #####

def variance_calc(num_days, company):
    """Return the trailing variance of a company's adjusted close as a list.

    num_days: number of rows in each window
    company: ticker used as a key into stock_data

    Entry i is np.var of the num_days closes that precede row i
    (close[i - num_days: i]); row i itself is not part of its own window.
    The first num_days entries are 0 because no full window exists yet.
    """
    prices = stock_data[company]['Adj. Close'].tolist()
    return [
        0 if day < num_days else np.var(np.array(prices[day - num_days: day]))
        for day in range(len(prices))
    ]

def expo_moving_avg(num_days, company):
    """Return a list with the num_days exponential moving average (EMA) of
    the company's adjusted closing price.

    num_days: EMA span in rows; must be at least 1
    company: ticker used as a key into stock_data

    The first num_days entries are 0 (warm-up filler). The recursion is seeded
    with the simple average of the first num_days closes; every later entry is
    prev_ema + (close - prev_ema) * 2 / (num_days + 1).
    """
    close_price = stock_data[company]['Adj. Close'].tolist()
    multiplier = 2 / (num_days + 1)
    ema = []
    for i, price in enumerate(close_price):
        if i < num_days:
            ema.append(0)
            continue
        if i == num_days:
            # Seed with the simple moving average so the EMA starts at the
            # price level instead of climbing up from the 0 warm-up filler.
            prev_ema = sum(close_price[:num_days]) / num_days
        else:
            prev_ema = ema[-1]
        ema.append((price - prev_ema) * multiplier + prev_ema)
    return ema

# n-day percent change of the adjusted close
def close_percent_change(num_days, company):
    """num_days: look-back length in rows (an integer such as 3 or 7)
       company: company ticker
       returns a list whose entry i is the fractional change of the adjusted
       close versus num_days rows earlier,
       (close[i] - close[i - num_days]) / close[i - num_days];
       the first num_days entries are 0"""
    prices = stock_data[company]['Adj. Close'].tolist()
    return [
        0 if day < num_days
        else (prices[day] - prices[day - num_days]) / prices[day - num_days]
        for day in range(len(prices))
    ]

#computes the n-day Chaikin money flow
def chaiken_money_flow(num_days, company):
    """Return the num_days Chaikin money flow of a company as a list.

    num_days: number of rows in each window
    company: ticker used as a key into stock_data

    For every row the money-flow multiplier is
    ((close - low) - (high - close)) / (high - low), and the money-flow
    volume is that multiplier times the volume. Entry i of the result is the
    sum of money-flow volume over rows [i - num_days, i) divided by the sum of
    volume over the same rows. The first num_days entries are 0.

    A row whose high equals its low gets a multiplier of 0 (instead of the
    0/0 = NaN that would otherwise spread into every window containing it),
    and a window with zero total volume yields 0.
    """
    frame = stock_data[company]
    close = np.asarray(frame['Adj. Close'], dtype=float)
    high = np.asarray(frame['Adj. High'], dtype=float)
    low = np.asarray(frame['Adj. Low'], dtype=float)
    volume = np.asarray(frame['Adj. Volume'], dtype=float)

    price_range = high - low
    numerator = (close - low) - (high - close)
    # Divide only where the range is non-zero; the other slots keep the 0 from `out`.
    mfm = np.divide(numerator, price_range,
                    out=np.zeros_like(numerator), where=price_range != 0)
    mfv = mfm * volume

    cmf = []
    for i in range(len(close)):
        if i < num_days:
            cmf.append(0)
            continue
        window_volume = volume[i - num_days: i].sum()
        if window_volume == 0:
            cmf.append(0)
        else:
            cmf.append(float(mfv[i - num_days: i].sum() / window_volume))
    return cmf
#computes the n-day relative strength index
def rsi(num_days, company):
    """Return the num_days relative strength index of a company as a list.

    num_days: number of rows in each window
    company: ticker used as a key into stock_data

    For row i the window is the num_days adjusted closes before it
    (close[i - num_days: i]). Consecutive rises inside the window are summed
    as gains; every other step (falls and unchanged prices) is summed as a
    loss. The relative strength is (gains / num_days) / (losses / num_days);
    when the losses sum to 0 the denominator is .001 / num_days instead.
    Each relative strength rs is mapped to 100 - 100 / (1 + rs). Rows before
    num_days use rs = 0, which maps to 0.
    """
    prices = stock_data[company]['Adj. Close'].tolist()
    strengths = []
    for day in range(len(prices)):
        if day < num_days:
            strengths.append(0)
            continue
        window = prices[day - num_days: day]
        ups, downs = [], []
        for before, after in zip(window, window[1:]):
            if before < after:
                ups.append(after - before)
            else:
                downs.append(before - after)
        # An empty `downs` also sums to 0, so one check covers both cases.
        down_total = sum(downs)
        if down_total != 0:
            strength = (sum(ups) / num_days) / (down_total / num_days)
        else:
            strength = (sum(ups) / num_days) / (.001 / num_days)
        strengths.append(strength)
    return [100 - (100 / (1 + strength)) for strength in strengths]

#%R = (Highest High – Close) / (Highest High – Lowest Low) X -100

def williams_R_indicator(num_days, company):
    """Return the num_days Williams %R of a company as a list.

    num_days: number of rows in each window
    company: ticker used as a key into stock_data

    For row i the highest high (hh) and lowest low (ll) are taken over the
    num_days rows before it (rows [i - num_days, i); row i's own high and low
    are not part of the window), and the entry is
    (hh - close[i]) / (hh - ll) * -100.
    The first num_days entries are 0, and a window whose highest high equals
    its lowest low also yields 0 because the ratio is undefined there.
    """
    high = stock_data[company]['Adj. High'].tolist()
    low = stock_data[company]['Adj. Low'].tolist()
    close = stock_data[company]['Adj. Close'].tolist()
    r_ind = []
    for i in range(len(high)):
        if i < num_days:
            r_ind.append(0)
            continue
        hh = max(high[i - num_days: i])
        ll = min(low[i - num_days: i])
        if hh == ll:
            # Flat window: avoid dividing by zero.
            r_ind.append(0)
        else:
            # Scale by -100 as in the formula above (not by num_days).
            r_ind.append((hh - close[i]) / (hh - ll) * -100)
    return r_ind
    
    
    
#CHO

In [None]:
# Remove these two tickers from stock_data. pop() with a default keeps the cell
# re-runnable: `del` raises KeyError once the key is already gone.
for ticker in ('AXP', 'SYY'):
    stock_data.pop(ticker, None)

In [None]:
# Each entry is (new column name, indicator function, look-back in rows); the
# columns are added in this order.
# Note: the '10 CMF' column is computed with a 20-row look-back.
indicator_specs = [
    ('3 day percent change', close_percent_change, 3),
    ('7 day percent change', close_percent_change, 7),
    ('30 day percent change', close_percent_change, 30),
    ('10 ema', expo_moving_avg, 10),
    ('3 ema', expo_moving_avg, 3),
    ('30 day var', variance_calc, 30),
    ('10 CMF', chaiken_money_flow, 20),
    ('20 will', williams_R_indicator, 20),
    ('20 rsi', rsi, 20),
]

for company in sp500_stocks['Symbol']:
    # Only tickers that actually have price data get indicator columns.
    if company not in stock_data.keys():
        continue
    print(company)
    for column, indicator, look_back in indicator_specs:
        stock_data[company][column] = pd.Series(indicator(look_back, company), index = stock_data[company].index)

In [None]:
########### Generalized percent change features + top 3 unweighted / weighted ###########
############# flag stocks whose date range does not fit the study window #############

# A stock is flagged when its first row is after `cutoff` AND its last row is
# not dated 2018-03-27.
# NOTE(review): to_del is only collected here; nothing in this cell removes the
# flagged tickers from stock_data, and the next cell resets to_del. Confirm
# whether the filter was meant to be applied (and whether `and` should be `or`).
cutoff = datetime.date(2013, 11, 28)
to_del = []
for stock in stock_data.keys():
    dates = stock_data[stock].index
    first = dates[0].date()
    last = dates[-1].date()
    print(first, last, cutoff)
    starts_late = first > cutoff
    ends_off_day = last != datetime.date(2018, 3, 27)
    if starts_late and ends_off_day:
        to_del.append(stock)
        print(first, last)
len(stock_data)


In [None]:
# Keep the last 1000 rows of every stock, and only those stocks whose trimmed
# frame is exactly 1000 rows by 21 columns. Rejected tickers are listed in to_del.
stocks_for_grouping = {}
to_del = []
for stock, frame in stock_data.items():
    recent = frame.iloc[-1000:]
    if recent.shape == (1000, 21):
        stocks_for_grouping[stock] = recent
    else:
        to_del.append(stock)

In [None]:
stocks_for_grouping['AAPL'].shape, stock_data['AAPL'].shape

In [None]:

# Columns kept for modelling: the indicator columns plus the raw 'Close'.
feature_matrix = ['3 day percent change', '7 day percent change', '30 day percent change', '10 ema', '3 ema', '30 day var', '10 CMF', '20 will', '20 rsi','Close']
for stock in stocks_for_grouping.keys():
    # .copy() gives every stock its own frame, so the columns assigned to it in
    # later cells are not written into a slice of another frame
    # (which pandas reports as SettingWithCopyWarning).
    stocks_for_grouping[stock] = stocks_for_grouping[stock][feature_matrix].copy()
# Show a preview rather than all 1000 rows.
stocks_for_grouping['AAPL'].head()

In [None]:
#stocks_by_sector = sp500_stocks.groupby(sector).agg(lambda x: list(x))[['Symbol']]

# label encoding: gain == 1, loss == 0
def is_loss(company):
    """Return the gain/loss labels of a company as a list of 0/1 values.

    company: ticker used as a key into stocks_for_grouping

    Entry i is 0 when the 'Close' of row i - 1 is greater than or equal to the
    'Close' of row i, and 1 otherwise. The first row has no predecessor and is
    labelled 1.
    """
    closes = stocks_for_grouping[company]['Close'].tolist()
    return [
        1 if day == 0 else (0 if closes[day - 1] >= closes[day] else 1)
        for day in range(len(closes))
    ]

# Attach the 0/1 labels to every stock as a 'gain/loss' column.
for company, frame in stocks_for_grouping.items():
    frame['gain/loss'] = pd.Series(is_loss(company), index = frame.index)

#stocks_for_grouping['MO']

In [None]:
def sector_percent_change(sector, num_days):
    """Return the sector-wide average percent change as a list.

    sector: a label of stocks_by_sector's index
    num_days: 3 or 7 selects the matching '<n> day percent change' column;
        any other value selects '30 day percent change'

    The chosen column is summed over every ticker of the sector that is present
    in stocks_for_grouping and divided by the number of such tickers. Missing
    values in the result are replaced by 0.

    Raises ValueError when none of the sector's tickers is in stocks_for_grouping.
    """
    column = {
        3: '3 day percent change',
        7: '7 day percent change',
    }.get(num_days, '30 day percent change')

    companies = stocks_by_sector.loc[sector].tolist()[0]
    members = [
        stocks_for_grouping[company][column]
        for company in companies
        if company in stocks_for_grouping
    ]
    if not members:
        raise ValueError('no stocks of sector %r are available' % (sector,))

    # Accumulate as a pandas Series (not a fixed-length NumPy array) so the
    # result keeps .fillna() and works for any number of rows.
    p_sum = None
    for series in members:
        p_sum = series.astype(float) if p_sum is None else p_sum + series

    # fillna returns a new Series; its result has to be used.
    return (p_sum / len(members)).fillna(0).tolist()
                
#    sample_stock = pd.Series(np.zeroes(stocks_data['AAPL']['3 day percent change'], )
#stocks_by_sector
stocks_for_grouping['AMZN']

In [None]:
# Add '<n> sector_avg' columns (n = 3, 7, 30) to every stock, holding the
# average percent change of the stock's sector.
for sector in stocks_by_sector.index:
    companies_by_sector = stocks_by_sector.loc[sector].tolist()[0]
    members = [company for company in companies_by_sector if company in stocks_for_grouping]
    if not members:
        continue
    for elem in [3, 7, 30]:
        # The sector average is the same for every member, so compute it once
        # per (sector, horizon) instead of once per company.
        sector_avg = sector_percent_change(sector, elem)
        for company in members:
            frame = stocks_for_grouping[company]
            frame[str(elem) + ' sector_avg'] = pd.Series(sector_avg, index = frame.index).fillna(0)

In [None]:
########## symbol tables for the weighted / unweighted top performers ##########

# Boolean masks over sp500_stocks picking the three tickers of each group.
weighted = sp500_stocks['Symbol'].isin(['AAPL', 'AMZN', 'MSFT'])
non_weighted = sp500_stocks['Symbol'].isin(['NFLX', 'XL', 'TRIP'])
top_3_weighted = sp500_stocks.loc[weighted, 'Symbol']
top_3_nweighted = sp500_stocks.loc[non_weighted, 'Symbol']
top_3_nweighted

In [None]:
def top_3_percent_change(stocks, num_days):
    """Return the average percent change of a group of stocks as a list.

    stocks: iterable of tickers; each must be a key of stocks_for_grouping
        (a missing ticker raises KeyError)
    num_days: 30 or 7 selects the matching '<n> day percent change' column;
        any other value selects '3 day percent change'

    The chosen column is summed over the given tickers and divided by the
    number of tickers (3 for the top-3 groups). Missing values in the result
    are replaced by 0.

    Raises ValueError when `stocks` is empty.
    """
    column = {
        30: '30 day percent change',
        7: '7 day percent change',
    }.get(num_days, '3 day percent change')

    # Accumulate as a pandas Series (not a fixed-length NumPy array) so the
    # result keeps .fillna() and works for any number of rows.
    sum_stocks, used = None, 0
    for company in stocks:
        series = stocks_for_grouping[company][column]
        sum_stocks = series.astype(float) if sum_stocks is None else sum_stocks + series
        used += 1
    if sum_stocks is None:
        raise ValueError('stocks must contain at least one ticker')

    # Divide by the number of tickers actually summed rather than a fixed 3.
    return (sum_stocks / used).fillna(0).tolist()

# Average percent change of the two top-3 groups for each horizon. The values
# do not depend on the company they are attached to, so compute every
# (group, horizon) pair once and reuse it for all stocks.
# Each horizon gets its own column ('3 nw_top3', '7 w_top3', ...), following
# the '<n> sector_avg' naming; writing every horizon to one shared column
# would keep only the last horizon's values.
top3_features = {}
for elem in [3, 7, 30]:
    top3_features[str(elem) + ' nw_top3'] = top_3_percent_change(top_3_nweighted, elem)
    top3_features[str(elem) + ' w_top3'] = top_3_percent_change(top_3_weighted, elem)

for company in stocks_for_grouping.keys():
    for column, values in top3_features.items():
        stocks_for_grouping[company][column] = pd.Series(values, index = stocks_for_grouping[company].index)

In [None]:
stocks_for_grouping['AAPL']

In [None]:
def train_model(company_ticker, test_size=100):
    """Fit a logistic regression on one stock and evaluate it on its last rows.

    company_ticker: key of stocks_for_grouping
    test_size: number of trailing samples held out for testing (default 100,
        must be at least 1); everything before them is used for training

    Returns a tuple (score, predictions, actual values):
        score       -- clf.score on the held-out samples
        predictions -- predicted gain/loss labels for the held-out samples
        actual      -- the true gain/loss labels of those samples
    """
    frame = stocks_for_grouping[company_ticker]
    features = frame.drop(['gain/loss'], axis=1).values
    labels = frame['gain/loss'].values

    # The features of row t are built from prices up to and including row t
    # (percent changes, EMAs, 'Close'), while the label of row t says whether
    # row t closed above row t - 1. Pairing them on the same row would let the
    # model read the answer from its inputs, so the features of row t are
    # paired with the label of row t + 1: a one-step-ahead prediction.
    X, y = features[:-1], labels[1:]

    X_train, X_test = X[:-test_size], X[-test_size:]
    y_train, y_test = y[:-test_size], y[-test_size:]

    clf = LogisticRegression(max_iter=10000, penalty='l2', solver = 'liblinear')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return clf.score(X_test, y_test), y_pred, y_test

In [None]:
train_model('GOOGL')[0], train_model('AMZN')[0], train_model('FB')[0]


In [None]:
## Run the model for the three tickers graphed in the following cells. Each
## result is the (prediction score, predictions, actual values) tuple that
## train_model returns.
goog_inf, fb_inf, amzn_inf = (train_model(ticker) for ticker in ('GOOG', 'FB', 'AMZN'))


In [None]:
# Predicted versus actual gain/loss labels for the held-out rows of the GOOG model.
act = goog_inf[2]
pred = goog_inf[1]
x = list(range(len(pred)))  # one tick per held-out row instead of a fixed 100
# Raw strings keep the literal backslash without an invalid '\L' escape sequence.
df = pd.DataFrame(np.c_[pred, act], index=x, columns = [r'GOOGLE GAIN\LOSS PREDICTIONS', r'GOOGLE GAIN\LOSS'])

fig, ax = plt.subplots(figsize = (8,4))
sns.lineplot(data=df, ax=ax)
ax.set_xlabel('Days Ahead')
ax.set_ylabel(r'Gain\Loss on given day')
ax.set_title(r'Google gain\loss Predictions versus Values')
fig.savefig('google.png', dpi = 300)

In [None]:
# Predicted versus actual gain/loss labels for the held-out rows of the FB model.
act = fb_inf[2]
pred = fb_inf[1]
x = list(range(len(pred)))  # one tick per held-out row instead of a fixed 100
# Raw strings keep the literal backslash without an invalid '\L' escape sequence.
df = pd.DataFrame(np.c_[pred, act], index=x, columns = [r'FB GAIN\LOSS PREDICTIONS', r'FB GAIN\LOSS'])

fig, ax = plt.subplots(figsize = (8,4))
sns.lineplot(data=df, ax=ax)
ax.set_xlabel('Days Ahead')
ax.set_ylabel(r'Gain\Loss on given day')
ax.set_title(r'FB gain\loss Predictions versus Values')
fig.savefig('fb.png', dpi = 300)

In [None]:
# Predicted versus actual gain/loss labels for the held-out rows of the AMZN model.
act = amzn_inf[2]
pred = amzn_inf[1]
x = list(range(len(pred)))  # one tick per held-out row instead of a fixed 100
# Raw strings keep the literal backslash without an invalid '\L' escape sequence.
df = pd.DataFrame(np.c_[pred, act], index=x, columns = [r'AMZN GAIN\LOSS PREDICTIONS', r'AMZN GAIN\LOSS'])

fig, ax = plt.subplots(figsize = (8,4))
sns.lineplot(data=df, ax=ax)
ax.set_xlabel('Days Ahead')
ax.set_ylabel(r'Gain\Loss on given day')
ax.set_title(r'Amazon gain\loss Predictions versus Values')
fig.savefig('amzn.png', dpi = 300)