In [49]:
# Import pandas as pd
import pandas as pd
from yahoofinancials import YahooFinancials as YF
from pandas_datareader import data
import time
from tqdm import tqdm
from utils import *
import matplotlib.pyplot as plt
%matplotlib ipympl



In [61]:
def convert_time(epoch):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

def getStockData(tickers, days_back):
    stocks = {}
    epoch_time = int(time.time())
    day_epoch = 60*60*24
    for tick in tqdm(tickers):
        try:
            stock_data = data.DataReader(tick, 
                        start=convert_time(epoch_time - (days_back* day_epoch)), 
                        end=convert_time(epoch_time), 
                        data_source='yahoo')
            stocks[tick] = stock_data 
        except:
            print("Skipping stock for {}, bad data :<".format(tick))
    return stocks

def getSP500Tickers():
    table=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    df = table[0]
    df.to_csv('S&P500-Info.csv')
    df.to_csv("S&P500-Symbols.csv", columns=['Symbol'])
    tickers = df['Symbol']
    return tickers


def getPortReturns(stocks):
    df = pd.DataFrame()
    #df.index = stocks[list(stocks.keys())[0]].index
    for stock in list(stocks):
        df[stock] = stocks[stock]['simple_returns']
    return df.dropna()

def dict_2_panel(stocks_lagged):  
    df = pd.DataFrame()
    for stock in list(stocks_lagged):
        stocks_lagged[stock]['ticker'] = stock
        df = df.append(stocks_lagged[stock])
    return df.dropna()

def addMACD(stocks):
    for stock in list(stocks):
        exp1 = stocks[stock]['Adj Close'].ewm(span=12, adjust=False).mean()
        exp2 = stocks[stock]['Adj Close'].ewm(span=26, adjust=False).mean()
        stocks[stock]['macd'] = exp1-exp2
        stocks[stock]['macd_signal'] = stocks[stock]['macd'].ewm(
            span=9, adjust=False).mean()
    return(stocks)


def computeRSI(data, time_window):
    diff = data.diff(1).dropna()        # diff in one field(one day)

    # this preservers dimensions off diff values
    up_chg = 0 * diff
    down_chg = 0 * diff

    # up change is equal to the positive difference, otherwise equal to zero
    up_chg[diff > 0] = diff[diff > 0]

    # down change is equal to negative deifference, otherwise equal to zero
    down_chg[diff < 0] = diff[diff < 0]
    up_chg_avg = up_chg.ewm(com=time_window-1, min_periods=time_window).mean()
    down_chg_avg = down_chg.ewm(
        com=time_window-1, min_periods=time_window).mean()

    rs = abs(up_chg_avg/down_chg_avg)
    rsi = 100 - 100/(1+rs)
    return rsi


def addRSI(stocks, time_window):
    for stock in list(stocks):
        stocks[stock]["RSI"] = computeRSI(
            stocks[stock]["Adj Close"], time_window)
    return(stocks)


def addBB(stocks, time_window):
    for stock in list(stocks):
        stocks[stock]['MA20'] = stocks[stock]['Adj Close'].rolling(
            window=time_window).mean()
        stocks[stock]['20dSTD'] = stocks[stock]['Adj Close'].rolling(
            window=time_window).std()
        stocks[stock]['UpperBB'] = stocks[stock]['MA20'] + \
            (stocks[stock]['20dSTD'] * 2)
        stocks[stock]['LowerBB'] = stocks[stock]['MA20'] - \
            (stocks[stock]['20dSTD'] * 2)
        stocks[stock]['LowerBB_dist'] = stocks[stock]['LowerBB'] - \
            stocks[stock]['MA20']
        stocks[stock]['UpperBB_dist'] = stocks[stock]['MA20'] - \
            stocks[stock]['UpperBB']
    return(stocks)


def addReturns(stocks):
    for stock in list(stocks):
        stocks[stock]['simple_returns'] = stocks[stock]['Adj Close'].pct_change()
        stocks[stock]['log_returns'] = np.log(
            stocks[stock]['simple_returns']+1)
        stocks[stock]['cum_daily_return'] = (
            (1 + stocks[stock]['simple_returns']).cumprod() - 1)
    return(stocks)


def addVol(stocks, periods):
    for stock in list(stocks):
        stocks[stock]['volatility'] = stocks[stock]['simple_returns'].rolling(
            periods).std() * np.sqrt(periods)
    return stocks


def lagFeatures(stocks, features, periods, returns):
    # sets the columns we want in our final df
    cols_wanted = features + returns
    stocks_lagged = stocks.copy()
    print(f'The columns wanted are {cols_wanted}')
    for stock in list(stocks):
        stocks_lagged[stock][features] = stocks_lagged[stock][features].shift(
            periods)
        stocks_lagged[stock] = pd.DataFrame(
            stocks_lagged[stock], columns=cols_wanted)
    return(stocks_lagged)


def getRandomWeights(numstocks):
    weights = np.random.rand(numstocks)
    return (weights/np.sum(weights))


def getPortWeightedReturns(port_ret, weights):
    assert(len(port_ret.columns) == len(weights))
    return port_ret.iloc[:, 0:len(weights)].mul(weights, axis=1).sum(axis=1)
# getRandomWeights(50)


def getPortWeightedVol(port_ret, weights):
    cov_mat = port_ret.cov()
    #cov_mat_annual = cov_mat * 252
    # cov_mat_annual
    port_vol = np.sqrt(np.dot(weights.T, np.dot(cov_mat, weights)))
    return port_vol


def getPortWeightedAnnualReturn(port_ret, weights):
    returns = getPortWeightedReturns(port_ret, weights)

    mean_return_daily = np.mean(returns)
    # Calculate the implied annualized average return
    mean_return_annualized = ((1+mean_return_daily)**252)-1
    return(mean_return_annualized)



In [52]:
def getSP500Tickers():
    table=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    df = table[0]
    df.to_csv('S&P500-Info.csv')
    df.to_csv("S&P500-Symbols.csv", columns=['Symbol'])
    tickers = df['Symbol']
    return tickers


In [57]:
from utils import *
tickers = getSP500Tickers()[:50]
stocks = getStockData(tickers, 50)
stocks = addBB(stocks,20)        
stocks= addMACD(stocks)
stocks = addRSI(stocks,14)
stocks = addReturns(stocks)
stocks = addVol(stocks,50)
stocks.keys()
features = ['High', 'Low', 'Open', 'Close', 'Volume', 'MA20', '20dSTD', 'UpperBB', 'LowerBB', 'macd', 'macd_signal', 'RSI']
stocks_lagged = lagFeatures(stocks,features,1, ['log_returns', 'simple_returns'])
df = dict_2_panel(stocks_lagged)
port_returns = getPortReturns(stocks)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:26<00:00,  1.89it/s]


The columns wanted are ['High', 'Low', 'Open', 'Close', 'Volume', 'MA20', '20dSTD', 'UpperBB', 'LowerBB', 'macd', 'macd_signal', 'RSI', 'log_returns', 'simple_returns']


In [62]:
df = dict_2_panel(stocks_lagged)

In [63]:
df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,MA20,20dSTD,UpperBB,LowerBB,macd,macd_signal,RSI,log_returns,simple_returns,ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-12-14,,,,,,,,,,,,,,,MMM
2020-12-15,176.199997,172.990005,175.669998,173.080002,3700100.0,,,,,0.0,0.0,,0.009202,0.009244,MMM
2020-12-16,175.059998,172.550003,174.389999,174.679993,2270600.0,,,,,0.127635,0.025527,,0.00417,0.004179,MMM
2020-12-17,175.850006,174.350006,174.610001,175.410004,1949900.0,,,,,0.284413,0.077304,,0.007044,0.007069,MMM
2020-12-18,177.460007,175.389999,176.0,176.649994,2327800.0,,,,,0.502921,0.162428,,-0.001303,-0.001302,MMM


In [19]:
port_returns.head()

Unnamed: 0_level_0,MMM,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AMD,AAP,AES,...,ANSS,ANTM,AON,AOS,APA,AAPL,AMAT,APTV,ADM,ANET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-23,-0.005331,-0.015437,0.000356,0.010485,0.006838,0.003343,0.010132,0.031982,0.011705,0.011759,...,0.010527,0.018318,-0.000771,0.000364,-0.016949,-0.006134,-0.012156,0.001398,0.003676,0.000654
2020-10-26,-0.021437,0.011226,-0.00498,0.024518,-0.045581,-0.006787,-0.027226,0.003294,-0.022686,-0.00339,...,-0.027921,-0.035429,-0.024794,-0.018205,-0.077586,8.7e-05,-0.023626,-0.020939,-0.01311,-0.024473
2020-10-27,-0.030874,-0.003853,-0.014419,0.007596,0.006523,0.005839,0.007071,-0.040739,-0.005291,-0.011662,...,0.003645,-0.002902,-0.013306,-0.016688,-0.024533,0.013472,-0.011427,-0.01935,-0.016214,0.008043
2020-10-28,-0.015525,-0.019893,-0.026841,-0.025851,-0.030454,-0.024086,-0.045115,-0.03144,-0.026793,-0.041298,...,-0.04389,-0.043763,-0.023962,-0.029606,-0.067066,-0.046312,-0.025667,-0.028248,-0.01668,-0.029304
2020-10-29,-0.000315,-0.013343,0.002236,-0.100318,0.011686,-0.015694,0.00906,0.021204,-0.002596,0.008937,...,0.009155,-0.043877,-0.040575,0.042578,0.051348,0.03705,0.038032,0.050764,0.008077,-0.005969


In [34]:
#port_returns.plot()
df['simple_returns'].plot(style='.')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x255e015de88>

In [2]:
# Calculate the daily returns of the adjusted close price
StockPrices['Returns'] = StockPrices['Adjusted'].pct_change()

# Check the first five rows of StockPrices
print(StockPrices.head())

# Plot the returns column over time
StockPrices['Returns'].plot()
plt.show()

NameError: name 'StockPrices' is not defined

In [37]:
# Convert the decimal returns into percentage returns
percent_return = df['simple_returns']*100

# Drop the missing values
returns_plot = percent_return.dropna()

# Plot the returns histogram
plt.hist(returns_plot, bins=75)
plt.show()

<IPython.core.display.Javascript object>

In [None]:
# Import numpy as np
import numpy as np

# Calculate the average daily return of the stock
mean_return_daily = np.mean(StockPrices['Returns'])
print(mean_return_daily)

# Calculate the implied annualized average return
mean_return_annualized = ((1+mean_return_daily)**252)-1
print(mean_return_annualized)

# Calculate the standard deviation of daily return of the stock
sigma_daily = np.std(StockPrices['Returns'])
print(sigma_daily)

# Calculate the daily variance
variance_daily = sigma_daily**2
print(variance_daily)

# Annualize the standard deviation
sigma_annualized = sigma_daily*np.sqrt(252)
print(sigma_annualized)

# Calculate the annualized variance
variance_annualized = sigma_annualized**2
print(variance_annualized)

# Import skew from scipy.stats
from scipy.stats import skew

# Drop the missing values
clean_returns = StockPrices['Returns'].dropna()

# Calculate the third moment (skewness) of the returns distribution
returns_skewness = skew(clean_returns)
print(returns_skewness)

# Import kurtosis from scipy.stats
from scipy.stats import kurtosis

# Calculate the excess kurtosis of the returns distribution
excess_kurtosis = kurtosis(clean_returns)
print(excess_kurtosis)

# Derive the true fourth moment of the returns distribution
fourth_moment = excess_kurtosis +3
print(fourth_moment)

# Run the Shapiro-Wilk test on the stock returns
shapiro_results = shapiro(clean_returns)
print("Shapiro results:", shapiro_results)

# Extract the p-value from the shapiro_results
p_value = shapiro_results
print("P-value: ", p_value)

# if p value is less than 0.05, null is rejected, so the data is non-normal