In [None]:
# ==============================================================================
#                      Project : Predicting Stock Price
# ==============================================================================


# Clear Screen
# (ESC followed by "[2J" is written to stdout)
print("\x1b[2J")


# Banner
# Two blank lines, the fifteen rows of the ASCII-art dollar signs, two blank
# lines; joined with newlines so that a single print emits the whole banner.
print('\n'.join((
    '',
    '',
    '           # #                 # #                 # #      ',
    '           # #                 # #                 # #      ',
    '          $$$$$               $$$$$               $$$$$     ',
    '        $$ # # $$           $$ # # $$           $$ # # $$   ',
    '       $$$ # #             $$$ # #             $$$ # #      ',
    '        $$$# #              $$$# #              $$$# #      ',
    '          $$$#                $$$#                $$$#      ',
    '           #$$$                #$$$                #$$$     ',
    '           # #$$$              # #$$$              # #$$$   ',
    '           # # $$$             # # $$$             # # $$$  ',
    '       $$$ # #  $$$$       $$$ # #  $$$$       $$$ # #  $$$$',
    '        $$$# # $$$$         $$$# # $$$$         $$$# # $$$$ ',
    '          $$$$$$$             $$$$$$$             $$$$$$$   ',
    '           # #                 # #                 # #      ',
    '           # #                 # #                 # #      ',
    '',
    '',
)))




In [None]:
import os

In [None]:
# ==============================================================================
# Enabling Logging
# ==============================================================================

import logging

# Logger Settings:
logging.basicConfig(level=logging.WARNING)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

def setup_logger(name, log_file, level=logging.INFO):
    """Return the logger called ``name``, writing to ``log_file``.

    Safe to call repeatedly (e.g. when a notebook cell is re-run): if the
    logger already has a file handler for the same file, that handler is
    closed and replaced instead of a second one being stacked on top of it.
    Handlers for other files are left in place, so one logger can still be
    attached to several files by calling this function once per file.

    Parameters
    ----------
    name : str
        Logger name passed to ``logging.getLogger``.
    log_file : str
        Path of the file the handler appends to.
    level : int, optional
        Level set on the logger (default ``logging.INFO``).

    Returns
    -------
    logging.Logger
        The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)

    # logging.getLogger returns the same object for the same name, so handlers
    # added by earlier calls are still attached; drop the ones that target the
    # same file, otherwise every record would be written once per call.
    for stale_handler in list(logger.handlers):
        if (isinstance(stale_handler, logging.FileHandler)
                and stale_handler.baseFilename == handler.baseFilename):
            logger.removeHandler(stale_handler)
            stale_handler.close()

    logger.addHandler(handler)
    return logger

def _remove_stale_log(log_file):
    """Delete the log file left by a previous run, if there is one.

    Uses ``os.remove`` rather than shelling out to ``rm``, so it works on any
    platform. Removal is best effort: a missing file is not an error, and any
    other failure is reported as a warning instead of stopping the notebook.
    """
    try:
        os.remove(log_file)
    except FileNotFoundError:
        pass  # first run: nothing to clean up
    except OSError as err:
        logging.warning('Could not remove old log file %s : %s', log_file, err)


# Phases Logger
phases_logger_file = 'log_phases.log'
_remove_stale_log(phases_logger_file)
phases_logger = setup_logger('Phases', phases_logger_file)
phases_logger.info('Project : Predicting Stock Price')
phases_logger.info('Phases Logger Init')

# Details Logger
details_logger_file = 'log_details.log'
_remove_stale_log(details_logger_file)
details_logger = setup_logger('Details', details_logger_file)
details_logger.info('Project : Predicting Stock Price')
details_logger.info('Details Logger Init')


In [None]:
# ==============================================================================
# Importing Basic Packages
# ==============================================================================

phases_logger.info('Importing Basic Packages : Start')

details_logger.info('Import : re')
import re

details_logger.info('Import : os')
import os

details_logger.info('Import : sys')
import sys

phases_logger.info('Importing Basic Packages : End')


In [None]:
# ==============================================================================
# Importing Project Specific Packages
# ==============================================================================

phases_logger.info('Importing Project Specific Packages : Start')

# Identifying Customer Targets (Python)
details_logger.info('Import : inline')
%matplotlib inline

# import packages for text processing and machine learning
details_logger.info('Import : pandas')
import pandas as pd  # DataFrame structure and operations
from pandas.plotting import scatter_matrix  # scatter plot matrix

details_logger.info('Import : numpy')
import numpy as np  # arrays and numerical processing
import matplotlib.pyplot as plt  # 2D plotting

details_logger.info('Import : scipy')
from scipy.stats import uniform  # for training-and-test split

details_logger.info('Import : patsy')
import patsy  # translate model specification into design matrices

details_logger.info('Import : seaborn')
import seaborn as sns  # PROVIDES TRELLIS AND SMALL MULTIPLE PLOTTING

# import user-defined module
# details_logger.info('import evaluate_classifier')
# import evaluate_classifier as eval

# FOLLOWING PACKAGE BEST IMPORTED AND INSTALLED VIA CONDA PROMPT
# conda install -c conda-forge mlxtend

# Association Rules Mining
# details_logger.info('import mlxtend')
# from mlxtend.frequent_patterns import apriori            # EASY ASSOCIATION RULES PACKAGE FROM RABST
# from mlxtend.frequent_patterns import association_rules

details_logger.info('Import : queue')
from queue import Queue

details_logger.info('Import : threading')
import threading

details_logger.info('Import : time')
import time

details_logger.info('Import : sklearn')
from sklearn.tree import DecisionTreeRegressor  # machine learning tree
from sklearn.ensemble import RandomForestRegressor # ensemble method
from sklearn.metrics import mean_squared_error

details_logger.info('Import : statsmodels')
import statsmodels.api as sm  # logistic regression
import statsmodels.formula.api as smf  # R-like model specification
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf

details_logger.info('Import : pmdarima')
import pmdarima as pm

phases_logger.info('Project Specific Package Load : End')

In [None]:
# ==============================================================================
# Present Working Directory
# ==============================================================================

# This will print the current directory : Debugging purposes
details_logger.info('PWD: ' + os.getcwd())


In [None]:
# ==============================================================================
# Information
# ==============================================================================

# ------------------------------------------------------------------------------
# Your directory tree should look like the following, where the symlink
# 'stock_market_data' points to the real 'stock_market_data' directory two levels up

# MY_PROJECT_DIR
# ├── BITBUCKET_CHECKOUT_DIR_000
# │   └── predictingstockprice
# │       ├── OtherTSVs
# │       ├── lib
# │       └── stock_market_data -> ../../stock_market_data
# └── stock_market_data
#     ├── forbes2000
#     │   ├── csv
#     │   └── json
#     ├── nasdaq
#     │   ├── csv
#     │   └── json
#     ├── nyse
#     │   ├── csv
#     │   └── json
#     └── sp500
#         ├── csv
#         └── json

# ------------------------------------------------------------------------------

In [None]:
# ==============================================================================
# User Settings
# ==============================================================================

phases_logger.info('User Settings : Start')


# Resolve the stock_market_data link found in the current directory
# stock_market_data -> ../../stock_market_data
#
stock_market_data_path = os.path.realpath('stock_market_data')


## Uncomment one of the following to override the stock_market_data_path

# USER_SETTING_HERE
#
# If the tree structure does not work for you, you may have to use one of the following paths instead:
## Ayush:
# stock_market_data_path = 'Add_your_path_here'
## CB:
# stock_market_data_path = '/Users/cbgarrett/Documents/ist718project/stock_market_data'
## Richard:
# stock_market_data_path = 'Add_your_path_here'
## Niranjan:
# stock_market_data_path = '/Users/niranjanjuvekar/MyStuff/Education_Niranjan/Data_Science_Syracuse_University/12_IST_718/Project/stock_market_data'


# Record the effective settings in the details log
#
details_logger.info('User Settings:')
details_logger.info('User Setting : stock_market_data_path = %s', stock_market_data_path)

# USER_SETTING_HERE
# 
workOnMiniDataFrame = True


# One flag per price/volume field: True means the matching dataFrame is to be
# constructed. The mini and the full configuration are kept as separate
# branches so they can be tuned independently.
if workOnMiniDataFrame:
    construct_Low_df           = False
    construct_Open_df          = True
    construct_Volume_df        = False
    construct_High_df          = False
    construct_Close_df         = False
    construct_AdjustedClose_df = False
else:
    # USER_SETTING_HERE
    #
    construct_Low_df           = False
    construct_Open_df          = True
    construct_Volume_df        = False
    construct_High_df          = False
    construct_Close_df         = False
    construct_AdjustedClose_df = False


# Log every flag as 'User Setting : <name> = True|False'
for settingName, settingValue in (
    ('construct_Low_df', construct_Low_df),
    ('construct_Open_df', construct_Open_df),
    ('construct_Volume_df', construct_Volume_df),
    ('construct_High_df', construct_High_df),
    ('construct_Close_df', construct_Close_df),
    ('construct_AdjustedClose_df', construct_AdjustedClose_df),
):
    details_logger.info('User Setting : %s = %s', settingName, bool(settingValue))


# breakAtIteration is a debugging aid:
# breakAtIteration = 0 means go through all files
#
# USER_SETTING_HERE
#
breakAtIteration = 500 if workOnMiniDataFrame else 0


details_logger.info('User Setting : breakAtIteration = %s', breakAtIteration)


# Once the dataFrame has been processed, it can be
# written to the disk in TSV and/or CSV format
#
# USER_SETTING_HERE
#
write_CSV, write_TSV = False, False


details_logger.info('User Setting : write_CSV = %s', write_CSV)
details_logger.info('User Setting : write_TSV = %s', write_TSV)


# USER_SETTINGS_HERE
#
# maxThreads = 2
maxThreads = 6


# Number of models to be run
if workOnMiniDataFrame:
    # N = 4
    N = 500
else:
    # USER_SETTING_HERE
    #
    # N = df.shape[1]-2
    # N = 10000
    # N = 1000
    # N = 1500
    # N = 100
    # N = 10
    N = 1 # Run only one model (Default)


# Result containers, all keyed by column name
ARIMA_models_dict1 = dict()
ARIMA_forecast_dict1 = dict()
ARIMA_model_residuals_dict1 = dict()

phases_logger.info('User Settings : End')

In [None]:
''' GOOD CODE

# ==============================================================================
# DataFrame Construction
# ==============================================================================

phases_logger.info('Data Frames Construction : Start')


# Initializing variables in needed for constructing the dataFrames
# Dictionary of all CSVs is CSV_dict
#
CSV_dict = {}
iterationNumber=0
breakOuterLoop = False
iter1TickerSymbol = ''


# Date Column of all the dataFrames
#
if construct_Low_df:
    Low_df           = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')
if construct_Open_df:
    Open_df          = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')
if construct_Volume_df:
    Volume_df        = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')
if construct_High_df:
    High_df          = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')
if construct_Close_df:
    Close_df         = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')
if construct_AdjustedClose_df:
    AdjustedClose_df = pd.read_csv('OtherTSVs/AllDates.csv', sep=',', header='infer')

    
# Find all the CSV files from the stock_market_data directory
#
for root, dirs, files in os.walk(stock_market_data_path):
    for file in files:
        if file.endswith(".csv"):
            details_logger.info('ConstructDF: Root = ' + root)
            details_logger.info('ConstructDF: File = ' + file)
            details_logger.info('ConstructDF: FullFilePath = ' + os.path.join(root, file))
            tickerSymbol = re.sub('.csv','',file)
            details_logger.info('ConstructDF: tickerSymbol = ' + tickerSymbol)

            
            # Process only those files whose corresponding
            # ticker symbols are not already processed
            # 
            if tickerSymbol not in CSV_dict.keys():
                iterationNumber += 1
                details_logger.info('ConstructDF: iterationNumber = ' + str(iterationNumber) + ' : tickerSymbol = ' + tickerSymbol)
                CSV_dict[tickerSymbol] = pd.read_csv(os.path.join(root, file), sep=',', header='infer')
                
                
                # Process the datatype of the columns here (If needed)
                #
                
                
                # Merging the dataFrames
                #
                if construct_Low_df:
                    Low_df = pd.merge(Low_df, CSV_dict[tickerSymbol][['Date', 'Low']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                if construct_Open_df:
                    Open_df = pd.merge(Open_df, CSV_dict[tickerSymbol][['Date', 'Open']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                if construct_Volume_df:
                    Volume_df = pd.merge(Volume_df, CSV_dict[tickerSymbol][['Date', 'Volume']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                if construct_High_df:
                    High_df = pd.merge(High_df, CSV_dict[tickerSymbol][['Date', 'High']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                if construct_Close_df:
                    Close_df = pd.merge(Close_df, CSV_dict[tickerSymbol][['Date', 'Close']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                if construct_AdjustedClose_df:
                    AdjustedClose_df = pd.merge(AdjustedClose_df, CSV_dict[tickerSymbol][['Date', 'Adjusted Close']],  how='left', left_on=['Date'], right_on = ['Date'], suffixes=('', '_'+tickerSymbol))
                
                # Save the first tickerSymbol (for column name renaming later)
                if iterationNumber == 1:
                    iter1TickerSymbol = tickerSymbol                    
                
                # Testing Purposes : Break the iterations at given threshold
                if breakAtIteration == iterationNumber:
                    details_logger.info('ConstructDF: Breaking inner loop on iteration ' + str(iterationNumber))
                    breakOuterLoop = True
                    break
                    
    # If inner loop is broken, outer loop also needs to be broken
    if breakOuterLoop:
        details_logger.info('ConstructDF: Breaking outer loop')
        break

# Rename the columns
details_logger.info('ConstructDF: Renaming dataFrame columns')
if construct_Low_df:
    Low_df.rename(columns=lambda x: re.sub('Low_','',x), inplace=True)
    Low_df.rename(columns=lambda x: re.sub('Low',iter1TickerSymbol,x), inplace=True)

if construct_Open_df:
    Open_df.rename(columns=lambda x: re.sub('Open_','',x), inplace=True)
    Open_df.rename(columns=lambda x: re.sub('Open',iter1TickerSymbol,x), inplace=True)

if construct_Volume_df:
    Volume_df.rename(columns=lambda x: re.sub('Volume_','',x), inplace=True)
    Volume_df.rename(columns=lambda x: re.sub('Volume',iter1TickerSymbol,x), inplace=True)

if construct_High_df:
    High_df.rename(columns=lambda x: re.sub('High_','',x), inplace=True)
    High_df.rename(columns=lambda x: re.sub('High',iter1TickerSymbol,x), inplace=True)

if construct_Close_df:
    Close_df.rename(columns=lambda x: re.sub('Close_','',x), inplace=True)
    Close_df.rename(columns=lambda x: re.sub('Close',iter1TickerSymbol,x), inplace=True)

if construct_AdjustedClose_df:
    AdjustedClose_df.rename(columns=lambda x: re.sub('Adjusted Close_','',x), inplace=True)
    AdjustedClose_df.rename(columns=lambda x: re.sub('Adjusted Close',iter1TickerSymbol,x), inplace=True)
        
details_logger.info('ConstructDF: Data Frames Construction Done')
phases_logger.info('Data Frames Construction : End')

'''

In [None]:
''' GOOD CODE

# ==============================================================================
# Sampling Constructed DataFrames
# ==============================================================================

phases_logger.info('Sampling Constructed DataFrames : Start')

print('--------------------------------------------------')

if construct_Low_df:
    print(Low_df)
    print('--------------------------------------------------')
if construct_Open_df:
    print(Open_df)
    print('--------------------------------------------------')
if construct_Volume_df:
    print(Volume_df)
    print('--------------------------------------------------')
if construct_High_df:
    print(High_df)
    print('--------------------------------------------------')
if construct_Close_df:
    print(Close_df)
    print('--------------------------------------------------')
if construct_AdjustedClose_df:
    print(AdjustedClose_df)
    print('--------------------------------------------------')

phases_logger.info('Sampling Constructed DataFrames : End')
'''

In [None]:
''' GOOD CODE
# ==============================================================================
# Writing Constructed DataFrames
# ==============================================================================

phases_logger.info('Writing Constructed DataFrames : Start')

if construct_Low_df:
    if write_TSV:
        Low_df.to_csv('Low_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        Low_df.to_csv('Low_df.csv', sep=',', encoding='utf-8')

if construct_Open_df:
    if write_TSV:
        Open_df.to_csv('Open_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        Open_df.to_csv('Open_df.csv', sep=',', encoding='utf-8')

if construct_Volume_df:
    if write_TSV:
        Volume_df.to_csv('Volume_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        Volume_df.to_csv('Volume_df.csv', sep=',', encoding='utf-8')

if construct_High_df:
    if write_TSV:
        High_df.to_csv('High_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        High_df.to_csv('High_df.csv', sep=',', encoding='utf-8')

if construct_Close_df:
    if write_TSV:
            Close_df.to_csv('Close_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        Close_df.to_csv('Close_df.csv', sep=',', encoding='utf-8')

if construct_AdjustedClose_df:
    if write_TSV:
        AdjustedClose_df.to_csv('AdjustedClose_df.tsv', sep='\t', encoding='utf-8')
    if write_CSV:
        AdjustedClose_df.to_csv('AdjustedClose_df.csv', sep=',', encoding='utf-8')

phases_logger.info('Writing Constructed DataFrames : End')
'''

In [None]:
''' SAMPLE CODE : UNCOMMENT WITH DISCRETION

# ==============================================================================
# Sample code to get smaller dataframe out of Open_df
# ==============================================================================


phases_logger.info('Mini Data Frame Construction : Start')

miniDF_tickerSymbols_Energy       = ['XOM', 'XEL', 'PLUG', 'FCEL']
miniDF_tickerSymbols_Retail       = ['COST', 'MCD', 'SBUX', 'TJX', 'DLTR']
miniDF_tickerSymbols_Banks        = ['BAC', 'V', 'WFC', 'JPM', 'MA', 'C', 'AXP']
miniDF_tickerSymbols_HealthPharma = ['UNH', 'JNJ', 'LLY', 'ABBV', 'PFE', 'RHHBY', 'MRK', 'TMO', 'AZN', 'ABT', 'WBA', 'RIGL']
miniDF_tickerSymbols_Technology   = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'NVDA', 'CSCO']
miniDF_tickerSymbols_Automotive   = ['TM', 'TSLA', 'GM', 'F', 'AZO', 'HMC', 'NIO', 'CAR', 'PII', 'GT']
miniDF_tickerSymbols_Cannabis     = ['INCR']


miniDF_tickerSymbols = ['Date', 'WeekDay']


miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Energy)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Retail)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Banks)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_HealthPharma)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Technology)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Automotive)
miniDF_tickerSymbols.extend(miniDF_tickerSymbols_Cannabis)


miniOpenDF_tickerSymbols = Open_df[miniDF_tickerSymbols]
miniOpenDF_tickerSymbols
                                     
phases_logger.info('Mini Data Frame Construction : Start')
'''

In [None]:
# TEMPORARY HACK

import zipfile

# Unpack every member of the archive into the current working directory
with zipfile.ZipFile('Open_df.csv.zip') as openDfArchive:
    openDfArchive.extractall(path=os.getcwd())


In [None]:
miniOpenDF_tickerSymbols = pd.read_csv('Open_df.csv', sep=',', header='infer')
# miniOpenDF_tickerSymbols

print(miniOpenDF_tickerSymbols.columns)

# miniOpenDF_tickerSymbols = miniOpenDF_tickerSymbols[['CSCO', 'PLCE']]

# Normalize the separator and parse the dates. pd.to_datetime returns a new
# Series, so the result has to be assigned back for 'Date' (and the index
# built from it below) to actually hold timestamps rather than strings.
miniOpenDF_tickerSymbols['Date'] = pd.to_datetime(
    miniOpenDF_tickerSymbols['Date'].str.replace('-', '/', regex=False)
)

# Show a sample only; the full frame is too large to be useful as output
print(miniOpenDF_tickerSymbols.head())

miniOpenDF_tickerSymbols = miniOpenDF_tickerSymbols.set_index('Date')

# print(miniOpenDF_tickerSymbols)

# Forward-fill: every empty cell takes the last value seen above it in the
# same column. Cells above a column's first value have nothing to copy and
# stay empty.
miniOpenDF_tickerSymbols = miniOpenDF_tickerSymbols.ffill()

In [None]:
def dropNan (df):
    """Return the rows of ``df`` that have no missing value in any data column.

    The columns 'Unnamed: 0', 'Date' and 'WeekDay' are not checked, so a
    missing value in one of them never causes a row to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in which every checked column is non-missing.
        When there is no column to check, ``df`` itself is returned.
    """
    ignoreColumns = ['Unnamed: 0', 'Date', 'WeekDay']
    checkedColumns = [column for column in df.columns if column not in ignoreColumns]
    if not checkedColumns:
        return df
    # A single vectorized pass instead of re-filtering the frame per column
    return df.dropna(subset=checkedColumns)

In [None]:
''' Example:
mydf = dropNan(miniOpenDF_tickerSymbols[['CSCO', 'SBUX']])
mydf
'''

In [None]:
phases_logger.info('Initialize for Models Generation : Start')
# Containers shared by the model-generation cells

# Threads, keyed by model number
t = dict()

# Queue holding at most maxThreads entries
q = Queue(maxThreads)

# Models, keyed by column name
model = dict()

nan_value = float('nan')

phases_logger.info('Initialize for Models Generation : End')


In [None]:
# Create Model Function
phases_logger.info('Create Model Functions : Start')

def createARIMAModel1(modelNumber):
    """Fit an ARIMA(2, 0, 2) model on one column and store a 30-step forecast.

    Meant to run as a thread target. The caller puts one item on ``q`` before
    starting the thread; this function takes one item off ``q`` when it
    finishes, whether it succeeded or failed, so a failing model can never
    leave the queue permanently full.

    Parameters
    ----------
    modelNumber : int
        Selects the column ``miniOpenDF_tickerSymbols.columns[modelNumber+3]``.

    Side effects
    ------------
    Stores, under the column name, the unfitted model in ``model``, the fit
    result in ``ARIMA_models_dict1`` and the forecast (a pandas Series) in
    ``ARIMA_forecast_dict1``.

    Raises
    ------
    Exception
        Whatever column selection, fitting or forecasting raises is logged to
        the details log and re-raised.
    """
    try:
        details_logger.info('Model number : ' + str(modelNumber))

        # NOTE(review): the +3 offset presumably skips leading non-ticker
        # columns; confirm it is still right now that 'Date' is the index.
        columnName = miniOpenDF_tickerSymbols.columns[modelNumber+3]
        X_sel = miniOpenDF_tickerSymbols[[columnName]]

        # Turn blank strings into NaN first, then drop the missing rows, so
        # no missing value is left when the column is cast to float.
        X_sel = X_sel.replace("", nan_value).dropna()
        details_logger.info('Model number : ' + str(modelNumber)
                            + ' : column = ' + str(columnName)
                            + ' : shape = ' + str(X_sel.shape))

        model[columnName] = ARIMA(X_sel.astype(float), order=(2, 0, 2))
        ARIMA_models_dict1[columnName] = model[columnName].fit()

        # Forecast for the next 30 steps
        ARIMA_forecast_dict1[columnName] = pd.Series(
            ARIMA_models_dict1[columnName].forecast(steps=30)
        )
    except Exception:
        details_logger.exception('Model number : ' + str(modelNumber) + ' : failed')
        raise
    finally:
        # Always give the queue slot back
        q.get()


phases_logger.info('Create Model Functions : End')

In [None]:
phases_logger.info('Creating Threads : Start')

# One thread object per model number, stored in t; none is started here
for modelNumber in range(N):
    details_logger.info('t[modelNumber] = %s', modelNumber)
    t[modelNumber] = threading.Thread(
        target=createARIMAModel1,
        args=(modelNumber,),
    )

phases_logger.info('Creating Threads : End')

In [None]:
phases_logger.info('Making Models in Threads : Start')
details_logger.info('Making Models in Threads')

# q is a bounded queue, so it limits how many threads run at once: an item is
# put on it before each thread starts, and put() blocks while the queue is
# full, i.e. until a running thread takes an item off again. No polling or
# sleeping is needed.
for modelNumber in range(0,N,1):
    details_logger.info('Thread Spawn : ' + str(modelNumber) + ' : Starting')
    if q.full():
        details_logger.info('Q : full : waiting for a free slot')

    details_logger.info('Q : put' + str(modelNumber))
    q.put(modelNumber)
    t[modelNumber].start()
    details_logger.info('Thread Spawn : ' + str(modelNumber) + ' : Done')

phases_logger.info('Making Models in Threads : End')

In [None]:
phases_logger.info('Joining Threads : Start')

# Wait for each thread in model-number order, logging each one once it is done
for modelNumber in range(N):
    finishedThread = t[modelNumber]
    finishedThread.join()
    details_logger.info('Thread Join : %s', modelNumber)

phases_logger.info('Joining Threads : End')

In [None]:
#Richie