<a href="https://colab.research.google.com/github/marvin-hansen/SP-contest/blob/master/SP500.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install requirements

In [0]:
# Pin imgaug to version 0.2.7.
# NOTE(review): the reason for this exact pin is not stated in the notebook —
# presumably a compatibility issue with a preinstalled package; confirm before changing.
!pip install imgaug==0.2.7 

# Upgrade pandas to the newest available release.
!pip install --upgrade pandas 

# Upgrade numpy to the newest available release.
!pip install --upgrade numpy 

# Upgrade fast.ai to the newest available release.
# NOTE(review): none of the upgrades above are pinned, so the installed versions
# depend on when this cell is run — consider pinning for reproducibility.
!pip install --upgrade fastai


Restart the runtime after the installs above so that the upgraded packages are loaded.

# Imports

In [0]:
import platform
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import datetime, DataFrame
from pandas.io.parsers import TextFileReader

# File & io
from enum import Enum, unique
from pathlib import Path
from urllib.request import urlretrieve

# fast ai 
import fastai
from fastai import *
from fastai.imports import *
from fastai.basics import *
from fastai.tabular import *
from fastai.metrics import *

import torch
print("Done")

In [0]:
print("* Python Version: " + str(platform.python_version()))
print("* Pandas Version: " + str(pd.__version__))
print("* Numpy Version: " + str(np.__version__))

print("* FastAI Version: " + str(fastai.__version__))
print("* PyTorch Version: " + str(torch.__version__))
print()
!nvcc --version

In [0]:
# Report whether PyTorch can use a GPU and select the device for this session.
# The CUDA runtime is only queried inside the 'cuda' branch below, so this cell
# also runs on CPU-only runtimes instead of raising before the fallback.
print("Cuda available: " + str(torch.cuda.is_available()))
# Note: this flag reports whether the cuDNN backend is enabled.
print("Cuda enabled:" + str(torch.backends.cudnn.enabled))

#https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()


#Additional Info when using cuda
if device.type == 'cuda':
    # Index of the GPU PyTorch is currently using.
    gpu_index = torch.cuda.current_device()
    # memory_cached was renamed to memory_reserved in newer PyTorch releases;
    # prefer the new name and fall back to the old one where it does not exist.
    reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached

    print("GPU used: " + torch.cuda.get_device_name(gpu_index))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(gpu_index)/1024**3,1), 'GB')
    print('Cached:   ', round(reserved_bytes(gpu_index)/1024**3,1), 'GB')

# Tools 

In [0]:
## file utils
@unique
class Data(Enum):
    """Identifiers for the S&P 500 CSV datasets handled by this notebook.

    ``get_path`` maps each member to a file name. ``@unique`` rejects two
    members sharing the same value.
    """
    # Raw reference datasets (the only ones get_path has a web URL for)
    SP500_50Y_RAW = 0
    SP500_90Y_RAW = 1
    # Generated datasets
    SP500_ALL = 2
    SP500_TRAIN = 3
    SP500_TEST = 4
    SP500_VALID = 5
    

In [0]:
def get_path(data_name: Data, url: bool):
    """
    Resolve a dataset to its local CSV path or, for raw datasets, download it.

    The local path is ``data_folder + <file name>.csv``. ``data_folder`` is
    empty by default, so files live in the current working directory; update
    ``data_folder`` to set a different location.

    ONLY the "raw" datasets have a web URL pointing to the official reference
    file. all, train, test and valid are generated files and have no URL.

    NOTE: despite its name, the ``url`` flag does not return a URL. When it is
    True the raw file is fetched over HTTP and the response body is returned
    as bytes.

    :param data_name: Enum - Dataset
    :param url: False to get the local file path; True to download the dataset
    :return: local path (str) when ``url`` is False. When ``url`` is True, the
             downloaded file content (bytes) for raw datasets and "" for
             datasets that have no URL. "" for an unknown ``data_name``.
    :raises requests.HTTPError: if the server answers with an error status
    """
    data_folder = ""  # "Data/"
    frmt = ".csv"
    raw_base_url = "https://raw.githubusercontent.com/marvin-hansen/SP-contest/master/Data/"

    # File name (without extension) of every known dataset.
    file_stems = {
        Data.SP500_50Y_RAW: "SP500-50Y-raw",
        Data.SP500_90Y_RAW: "SP500-90Y-raw",
        Data.SP500_ALL: "SP500-all",
        Data.SP500_TRAIN: "SP500-train",
        Data.SP500_TEST: "SP500-test",
        Data.SP500_VALID: "SP500-valid",
    }
    # Only these datasets can be downloaded; the remote file name equals the local one.
    downloadable = (Data.SP500_50Y_RAW, Data.SP500_90Y_RAW)

    stem = file_stems.get(data_name) if isinstance(data_name, Data) else None
    if stem is None:
        return ""

    file_name = stem + frmt
    if not url:
        return data_folder + file_name

    # Generated datasets have nothing to download. Returning "" (instead of
    # requesting an empty URL, which always fails) keeps them all consistent.
    if data_name not in downloadable:
        return ""

    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(raw_base_url + file_name, timeout=30)
    # Fail loudly rather than handing an HTTP error page to the CSV parser.
    response.raise_for_status()
    return response.content

In [0]:
def load_csv_file(data_name: Data, url: bool):
    """ Loads the S&P 500 index file for the given dataset into a data frame.

    The location (or downloaded content) is obtained from ``get_path``.

    :param data_name: Enum - dataset to load
    :param url: True to parse the content downloaded by ``get_path``,
                False to read the local file
    :return: pandas data frame. The local file is read with the "Date" column
             as index; downloaded content is read without an index column, so
             "Date" stays a regular column.
    :raises ValueError: if ``url`` is True but the dataset has no downloadable content
    """
    # Local import: the notebook never imports io explicitly.
    import io

    source = get_path(data_name=data_name, url=url)

    # infer_datetime_format is intentionally not passed: it only has an effect
    # together with parse_dates and is deprecated in recent pandas releases.
    if not url:
        return pd.read_csv(source, index_col="Date")

    # get_path returns "" for datasets without a web URL; report that clearly
    # instead of failing on "".decode(...).
    if not isinstance(source, bytes):
        raise ValueError("No downloadable content available for dataset: " + str(data_name))

    return pd.read_csv(io.StringIO(source.decode('utf-8')))

In [0]:
def load_data(data: Data, force_download: bool = False):
    """ Returns the requested dataset, read from a local file or from the web.

    @depends: Data - Enum that specifies available datasets
    @depends: get_path - adjust the local file path and URL's there.
    @depends: load_csv_file - does the actual reading.

    The local file is used when it exists and no download is forced. In every
    other case the web version is loaded. The downloaded data is NOT written
    to disk, so an existing local copy is left untouched.

    :param data: dataset to load
    :param force_download: Load the web-version even if a local copy exists. FALSE by default.
    :return: pandas dataframe
    """
    local_file = Path(get_path(data_name=data, url=False))

    # Guard clause: the local copy wins unless a download was requested.
    if not force_download and local_file.exists():
        print("Load data from local  file")
        return load_csv_file(data_name=data, url=False)

    print("Load from URL")
    downloaded = load_csv_file(data_name=data, url=True)
    return downloaded

In [0]:
def save_train_test_valid(df: DataFrame, split_ratio: float, valid_size: int, verbose: bool):
    """
    Splits a dataframe into train, test, and validation and stores each set in a different file.

    NaN values are replaced with zero first. The last ``valid_size`` rows form
    the validation set and are removed before the remaining rows are split
    into train (first part) and test (second part), so the three sets do not
    overlap.

    NOTE(review): taking the *last* rows as the "latest" data assumes df is
    ordered oldest to newest — confirm against the source file.

    :param df: pandas data frame
    :param split_ratio: fraction of the non-validation rows that goes into train
    :param valid_size: number of rows in the validation set; zero or less for none
    :param verbose: prints out set sizes and file paths when set to true
    :return: void
    """
    if verbose:
        print("Save data to file.. ")

    # replace NaN with zero
    df = df.fillna(0)

    train, test, valid = _split_train_test_valid(df, split_ratio, valid_size)

    # store validation set (only when one was requested)
    valid_file = None
    if valid is not None:
        valid_file = get_path(data_name=Data.SP500_VALID, url=False)
        valid.to_csv(valid_file)

    # store train dataset
    train_file = get_path(data_name=Data.SP500_TRAIN, url=False)
    train.to_csv(train_file)

    # store test dataset
    test_file = get_path(data_name=Data.SP500_TEST, url=False)
    test.to_csv(test_file)

    if verbose:
        print()
        print('All data : %d' % (len(df)))
        print('Training data: %d' % (len(train)))
        print('Testing data: %d' % (len(test)))
        if valid is not None:
            print('Validation data: %d' % (len(valid)))
        print()
        print("Stored train data in file: ")
        print(train_file)
        print()
        print("Stored test data in file: ")
        print(test_file)
        if valid is not None:
            print()
            print("Stored validation data in file: ")
            print(valid_file)
        print()
        print("Done! All data are saved")


def _split_train_test_valid(df, split_ratio, valid_size):
    """Partition rows in order into (train, test, valid); valid is None when valid_size <= 0."""
    if valid_size > 0:
        # Never cut before the first row, even if valid_size exceeds len(df).
        cutoff = max(len(df) - valid_size, 0)
        valid = df.iloc[cutoff:]
        remaining = df.iloc[:cutoff]
    else:
        valid = None
        remaining = df

    split = int(len(remaining) * split_ratio)
    return remaining.iloc[:split], remaining.iloc[split:], valid

In [0]:
def drop_feature(df, col_name):
    """
    Returns a new data frame without the given column(s).

    :param df: pandas data frame; it is not modified
    :param col_name: a single column label or a list of labels to remove
    :return: data frame without the named column(s)
    """
    reduced = df.drop(col_name, axis="columns")
    return reduced

# Load, split & save data into train, test & validation sets

In [0]:
verbose = False

# Load the raw reference dataset from the web.
df = load_data(data=Data.SP500_50Y_RAW, force_download=True)
print("Load raw data: Done!")

# Close and Adj. Close have about the same values, thus adj. close isn't needed
df = drop_feature(df, "Adj Close")
print("Remove feature Adj Close: Done!")

# That's required to categorify date.
# infer_datetime_format is not passed: it is deprecated in recent pandas
# releases, which the install cell upgrades to.
df['Date'] = pd.to_datetime(df['Date'])
print("Convert Date to datetime: Done!")

# That's required to capture trends, seasonality, and time patterns in the dataset.
# drop=False keeps the original "Date" column next to the generated date parts.
add_datepart(df, "Date", drop=False)
print("Categorify Date: Done!")

if verbose:
    print("Inspect Data: ")
    print(df.info())
    print()
    print(df.tail(3).T)

split = 0.80  # for a 80/20 split
valid_size = 30  # last month for validation. Set to zero for no validation data

save_train_test_valid(df=df, split_ratio=split, valid_size=valid_size, verbose=True)
# Report completion only after the split has actually been written.
print("Done:Split into Train & Test ")

# Load train & test set

In [0]:
# Load the train, test and validation datasets, in that order.
train_df, test_df, valid_df = [
    load_data(data=dataset)
    for dataset in (Data.SP500_TRAIN, Data.SP500_TEST, Data.SP500_VALID)
]

In [0]:
# Check lengths: row counts of the train, test and validation frames
len(train_df),len(test_df), len(valid_df)

In [0]:
# Show the last five training rows, transposed so every column is displayed as a row
print("Show Train dataset")
train_df.tail(5).T

In [0]:
# Name of the target column; every other column is kept as a feature.
y_name = 'Close'

# Training target and features
y_train = train_df[y_name]
X_train = drop_feature(train_df, y_name)

# Test target and features: both are taken from the test frame so that the
# target rows line up with the feature rows.
y_test = test_df[y_name]
X_test = drop_feature(test_df, y_name)

In [0]:
# Preview the first rows of the test features, transposed for readability
X_test.head().T