In [45]:
# Provides ways to work with large multidimensional arrays
import numpy as np 
# Allows for further data manipulation and analysis
import pandas as pd 
import matplotlib.pyplot as plt # Plotting
import matplotlib.dates as mdates # Styling dates
%matplotlib inline

# pip install numpy
# conda install -c anaconda pandas
# conda install -c conda-forge matplotlib

import datetime as dt # For defining dates

import time

# In Powershell Prompt : conda install -c conda-forge multitasking
# pip install -i https://pypi.anaconda.org/ranaroussi/simple yfinance

import yfinance as yf

# To show all your output File -> Preferences -> Settings Search for Notebook
# Notebook Output Text Line Limit and set to 100

# Used for file handling like deleting files
import os

# conda install -c conda-forge cufflinks-py
# conda install -c plotly plotly
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Use Plotly locally
cf.go_offline()

from plotly.subplots import make_subplots

# New Imports
# Used to get data from a directory
import os
from os import listdir
from os.path import isfile, join

# Used for calculating regressions
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

import warnings
warnings.simplefilter("ignore")

### Constants

In [46]:
PATH = "/Users/oppoudel/dev/Python4Finance/Update/"

# Start end date defaults
S_DATE = "2017-02-01"
E_DATE = "2022-12-06"
S_DATE_DT = pd.to_datetime(S_DATE)
E_DATE_DT = pd.to_datetime(E_DATE)

### Get Stocks as list

In [47]:
files = [x for x in listdir(PATH) if isfile(join(PATH, x))]
tickers = [os.path.splitext(x)[0] for x in files]
tickers

# On MacOS Only
# tickers.remove('.DS_Store')
# 2907 total stocks
tickers.sort()
len(tickers)

2907

### Function that returns a dataframe from CSV

In [48]:
def get_df_from_csv(folder, ticker):
    try:
        df = pd.read_csv(folder + ticker + '.csv', index_col='Date', 
                         parse_dates=True)
    except FileNotFoundError:
        pass
        print("File Doesn't Exist")
    else:
        return df

### Save dataframe to CSV

In [49]:
def save_dataframe_to_csv(df, folder, ticker):
    df.to_csv(folder + ticker + '.csv')

### Add Daily Returns to dataframe

In [50]:
# We calculate a percentage rate of return for each day to compare investments.
# Simple Rate of Return = (End Price - Beginning Price) / Beginning Price OR (EP / BP) - 1

# Shift provides the value from the previous day
# NaN is displayed because there was no previous day price for the 1st calculation
def add_daily_return_to_df(df, ticker):
    df['daily_return'] = (df['Close'] / df['Close'].shift(1)) - 1
    # Save data to a CSV file
    # For Windows
    df.to_csv(PATH + ticker + '.csv')
    # For MacOS
    # df.to_csv(PATH + ticker + '.csv')
    return df  

### Update Daily Returns

In [51]:
for x in tickers:
    try:
        print("Working on :", x)
        new_df = get_df_from_csv(PATH, x)
        new_df = add_daily_return_to_df(new_df, x)
        save_dataframe_to_csv(new_df, PATH, x)
    except Exception as ex:
        print(ex)

Working on : A
Working on : AA
Working on : AAL
Working on : AAME
Working on : AAOI
Working on : AAON
Working on : AAP
Working on : AAPL
Working on : AAT
Working on : AAWW
Working on : ABBV
Working on : ABC
Working on : ABCB
Working on : ABEO
Working on : ABG
Working on : ABIO
Working on : ABM
Working on : ABMD
Working on : ABR
Working on : ABT
Working on : ABTX
Working on : AC
Working on : ACAD
Working on : ACBI
Working on : ACC
Working on : ACCO
Working on : ACER
Working on : ACGL
Working on : ACHC
Working on : ACHV
Working on : ACIW
Working on : ACLS
Working on : ACM
Working on : ACMR
Working on : ACN
Working on : ACNB
Working on : ACOR
Working on : ACRE
Working on : ACRS
Working on : ACRX
Working on : ACTG
Working on : ACU
Working on : ACY
Working on : ADBE
Working on : ADC
Working on : ADES
Working on : ADI
Working on : ADM
Working on : ADMA
Working on : ADMP
Working on : ADMS
Working on : ADNT
Working on : ADP
Working on : ADS
Working on : ADSK
Working on : ADTN
Working on : ADUS

In [52]:
df = get_df_from_csv(PATH, 'AAPL')
df.head()

Unnamed: 0_level_0,Close,daily_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-21,33.581406,
2017-04-24,33.904781,0.00963
2017-04-25,34.114864,0.006196
2017-04-26,33.914227,-0.005881
2017-04-27,33.940193,0.000766


### Regression

In [53]:
def calc_projected_roi(ticker):
    a_df = get_df_from_csv(PATH, ticker) 

    a_df = a_df.asfreq('d') # Change frequency to day
    a_df.index # Check frequency
    a_df = a_df.fillna(method='ffill') # Fill missing values

    # Delete unnamed column
    a_df.drop(a_df.columns[a_df.columns.str.contains('unnamed',case = False)],
          axis = 1, inplace = True)

    # Delete daily return column
    a_df = a_df.drop(['daily_return'], axis=1)
    
    # Figure out optimum lags which will be 1 or 2 for this data set
    lags = ar_select_order(a_df, maxlag=30)

    # Create our model using whole data set
    model = AutoReg(a_df['Close'], lags.ar_lags)
    model_fit = model.fit()

    # Define training and testing area
    print("Length :",len(a_df)) # 1826 observations
    train_df = a_df.iloc[50:1460] # 80% minus 1st 50
    test_df = a_df.iloc[1460:] # Last 20%

    # Define training model for 500 days (Play with Number & Test)
    # and White's covariance estimator
    train_model = AutoReg(a_df['Close'], 500).fit(cov_type="HC0")

    # Define start and end for prediction 
    start = len(train_df)
    end = len(train_df) + len(test_df) - 1

    prediction = train_model.predict(start=start, end=end, dynamic=True)

    # Predict 60 days into the future
    forecast = train_model.predict(start=end, end=end+30, dynamic=True)

    # Get starting price of prediction
    s_price = forecast.head(1).iloc[0]

    # Get the last price of prediction
    e_price = forecast.iloc[-1]

    # Get return over prediction
    return (e_price - s_price) / s_price

In [54]:
def get_proj_rois():
    # Will hold all tickers & stock rois
    ticker = []
    roi = []
    
    for x in tickers:
        print("Working on :", x)
        try:
            the_roi = calc_projected_roi(x)
        except Exception as ex:
            print("Stock Data Corrupted")
        else:
            ticker.append(x)
            print("ROI :", the_roi)
            roi.append(the_roi)
        
    return pd.DataFrame({'Ticker':ticker, 'ROI':roi})

In [55]:
proj_roi_df = get_proj_rois()
proj_roi_df

Working on : A
Length : 1826
ROI : 0.007893854034674474
Working on : AA
Length : 1826
ROI : 0.05537536594309513
Working on : AAL
Length : 1826
ROI : -0.07772329485755199
Working on : AAME
Length : 1826
ROI : 0.15549854925506948
Working on : AAOI
Length : 1826
ROI : 0.2837188806887125
Working on : AAON
Stock Data Corrupted
Working on : AAP
Length : 1826
ROI : -0.006699726505738451
Working on : AAPL
Length : 1826
ROI : 0.023305746561951352
Working on : AAT
Length : 1826
ROI : -0.008225358155372257
Working on : AAWW
Length : 1826
ROI : -0.0038043471077357288
Working on : ABBV
Length : 1826
ROI : 0.03469240613527972
Working on : ABC
Length : 1826
ROI : 0.03053774813432565
Working on : ABCB
Length : 1826
ROI : -0.026659171277047068
Working on : ABEO
Length : 1826
ROI : 0.7657556721919921
Working on : ABG
Length : 1826
ROI : -0.0746740878392042
Working on : ABIO
Length : 1826
ROI : 0.05078321208144962
Working on : ABM
Length : 1826
ROI : -0.022784870289560295
Working on : ABMD
Length : 1826


Unnamed: 0,Ticker,ROI
0,A,0.007894
1,AA,0.055375
2,AAL,-0.077723
3,AAME,0.155499
4,AAOI,0.283719
...,...,...
2763,ZSAN,0.900838
2764,ZTS,0.022099
2765,ZUMZ,0.078837
2766,ZYNE,-0.267576


In [56]:
proj_roi_df.sort_values(by=['ROI'], ascending=False)[0:20]

Unnamed: 0,Ticker,ROI
483,CEI,3.34646
56,ADXS,2.580588
2409,TEUM,1.716982
1331,JAGX,1.703884
55,ADVM,1.391088
203,ARDX,1.216265
1214,HUSA,1.077588
2298,SPPI,0.972564
276,AXAS,0.970485
2763,ZSAN,0.900838
