In [3]:
import os
import sys
#import requests
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.pylabtools import figsize
figsize(16, 6)

In [6]:
p = Path(os.getcwd()).parents[0]
# data.to_csv('{base_dir}/data-sets/AirPassengers.csv'.format(base_dir=str(p)), index=False)
data = pd.read_csv(
    '{base_dir}/data-sets/yahoofinance-stock-data.csv'.format(base_dir=str(p)), 
    index_col=0, 
    parse_dates=True, 
    infer_datetime_format=True
)
data.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1995-01-03,3.976563,4.000975,3.9375,3.984375,2.669861,41721600
1995-01-04,4.0,4.023438,3.953125,3.976563,2.664625,41865600
1995-01-05,3.984375,4.0625,3.976563,4.007813,2.685566,60480000


In [7]:
quotes = data.copy()

In [9]:
from __future__ import print_function

import datetime
from matplotlib import cm, pyplot as plt
from matplotlib.dates import YearLocator, MonthLocator

from hmmlearn.hmm import GaussianHMM

# Unpack quotes
dates = np.array(quotes.index, dtype=int)
close_v = np.array(quotes.Close)
volume = np.array(quotes.Volume)[1:]

# Take diff of close value. Note that this makes
# ``len(diff) = len(close_t) - 1``, therefore, other quantities also
# need to be shifted by 1.
diff = np.diff(close_v)
dates = dates[1:]
close_v = close_v[1:]

# Pack diff and volume for training.
X = np.column_stack([diff, volume])

# Make an HMM instance and execute fit
model = GaussianHMM(n_components=4, covariance_type="diag", n_iter=1000).fit(X)

# Predict the optimal sequence of internal hidden state
hidden_states = model.predict(X)

print("Transition matrix")
print(model.transmat_)
print()

print("Means and vars of each hidden state")
for i in range(model.n_components):
    print("{0}th hidden state".format(i))
    print("mean = ", model.means_[i])
    print("var = ", np.diag(model.covars_[i]))
    print()


Transition matrix
[[8.94061368e-01 6.55870617e-05 1.05873029e-01 1.55428563e-08]
 [1.18876356e-03 9.71712125e-01 4.98746794e-17 2.70991119e-02]
 [1.79287903e-01 1.47790537e-03 8.01778971e-01 1.74552207e-02]
 [2.57190128e-18 3.59843040e-01 2.03633297e-01 4.36523662e-01]]

Means and vars of each hidden state
0th hidden state
mean =  [5.94364779e-03 4.77266881e+07]
var =  [4.90192554e-01 1.51827269e+14]

1th hidden state
mean =  [3.34813078e-02 7.99622920e+07]
var =  [7.24384769e-02 1.02020472e+15]

2th hidden state
mean =  [4.48276033e-02 6.76674025e+07]
var =  [2.46873501e+00 4.18619000e+14]

3th hidden state
mean =  [-6.13425034e-01  1.80532887e+08]
var =  [6.49390474e+00 8.72915532e+15]



In [10]:
fig, axs = plt.subplots(model.n_components, sharex=True, sharey=True)
colours = cm.rainbow(np.linspace(0, 1, model.n_components))
for i, (ax, colour) in enumerate(zip(axs, colours)):
    # Use fancy indexing to plot data in each state.
    mask = hidden_states == i
    ax.plot_date(dates[mask], close_v[mask], ".-", c=colour)
    ax.set_title("{0}th hidden state".format(i))
    # Format the ticks.
    ax.xaxis.set_major_locator(YearLocator())
    ax.xaxis.set_minor_locator(MonthLocator())
    ax.grid(True)
plt.show()


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


OverflowError: signed integer is greater than maximum

In [12]:
"""
Usage: analyse_data.py --company=<company>
"""
import warnings
import logging
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Supress warning in hmmlearn
warnings.filterwarnings("ignore")
# Change plot style to ggplot (for better and more aesthetic visualisation)
plt.style.use('ggplot')


class StockPredictor(object):
    def __init__(self, company, test_size=0.33,
                 n_hidden_states=4, n_latency_days=10,
                 n_steps_frac_change=50, n_steps_frac_high=10,
                 n_steps_frac_low=10):
        self._init_logger()

        self.company = company
        self.n_latency_days = n_latency_days

        self.hmm = GaussianHMM(n_components=n_hidden_states)

        self._split_train_test_data(test_size)

        self._compute_all_possible_outcomes(
            n_steps_frac_change, n_steps_frac_high, n_steps_frac_low)

    def _init_logger(self):
        self._logger = logging.getLogger(__name__)
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        handler.setFormatter(formatter)
        self._logger.addHandler(handler)
        self._logger.setLevel(logging.DEBUG)

    def _split_train_test_data(self, test_size):
        data = pd.read_csv(
            'data/company_data/{company}.csv'.format(company=self.company))
        _train_data, test_data = train_test_split(
            data, test_size=test_size, shuffle=False)

        self._train_data = _train_data
        self._test_data = test_data

    @staticmethod
    def _extract_features(data):
        open_price = np.array(data['open'])
        close_price = np.array(data['close'])
        high_price = np.array(data['high'])
        low_price = np.array(data['low'])

        # Compute the fraction change in close, high and low prices
        # which would be used a feature
        frac_change = (close_price - open_price) / open_price
        frac_high = (high_price - open_price) / open_price
        frac_low = (open_price - low_price) / open_price

        return np.column_stack((frac_change, frac_high, frac_low))

    def fit(self):
        self._logger.info('>>> Extracting Features')
        feature_vector = StockPredictor._extract_features(self._train_data)
        self._logger.info('Features extraction Completed <<<')

        self.hmm.fit(feature_vector)

    def _compute_all_possible_outcomes(self, n_steps_frac_change,
                                       n_steps_frac_high, n_steps_frac_low):
        frac_change_range = np.linspace(-0.1, 0.1, n_steps_frac_change)
        frac_high_range = np.linspace(0, 0.1, n_steps_frac_high)
        frac_low_range = np.linspace(0, 0.1, n_steps_frac_low)

        self._possible_outcomes = np.array(list(itertools.product(
            frac_change_range, frac_high_range, frac_low_range)))

    def _get_most_probable_outcome(self, day_index):
        previous_data_start_index = max(0, day_index - self.n_latency_days)
        previous_data_end_index = max(0, day_index - 1)
        previous_data = self._test_data.iloc[previous_data_end_index: previous_data_start_index]
        previous_data_features = StockPredictor._extract_features(
            previous_data)

        outcome_score = []
        for possible_outcome in self._possible_outcomes:
            total_data = np.row_stack(
                (previous_data_features, possible_outcome))
            outcome_score.append(self.hmm.score(total_data))
        most_probable_outcome = self._possible_outcomes[np.argmax(
            outcome_score)]

        return most_probable_outcome

    def predict_close_price(self, day_index):
        open_price = self._test_data.iloc[day_index]['open']
        predicted_frac_change, _, _ = self._get_most_probable_outcome(day_index)
        return open_price * (1 + predicted_frac_change)

    def predict_close_prices_for_days(self, days, with_plot=False):
        predicted_close_prices = []
        for day_index in tqdm(range(days)):
            predicted_close_prices.append(self.predict_close_price(day_index))

        if with_plot:
            test_data = self._test_data[0: days]
            days = np.array(test_data['date'], dtype="datetime64[ms]")
            actual_close_prices = test_data['close']

            fig = plt.figure()

            axes = fig.add_subplot(111)
            axes.plot(days, actual_close_prices, 'bo-', label="actual")
            axes.plot(days, predicted_close_prices, 'r+-', label="predicted")
            axes.set_title('{company}'.format(company=self.company))

            fig.autofmt_xdate()

            plt.legend()
            plt.show()

        return predicted_close_prices


In [15]:
"""
Usage: get_data.py --year=<year>
"""
import requests
import os

year = 2018

# Create directory if not present
year_directory_name = 'data/{year}'.format(year=year)
if not os.path.exists(year_directory_name):
    os.makedirs(year_directory_name)

# Fetching file list for the corresponding year
year_data_files = requests.get(
    'http://data.pystock.com/{year}/index.txt'.format(year=year)).text.strip().split('\n')

for data_file_name in year_data_files:
    file_location = '{year_directory_name}/{data_file_name}'.format(year_directory_name=year_directory_name,
                                                                    data_file_name=data_file_name)

    with open(file_location, 'wb+') as data_file:
        print('>>> Downloading \t {file_location}'.format(
            file_location=file_location))
        data_file_content = requests.get('http://data.pystock.com/{year}/{data_file_name}'.format(
            year=year, data_file_name=data_file_name)
        ).content
        print('<<< Download Completed \t {file_location}'.format(
            file_location=file_location))
        data_file.write(data_file_content)


>>> Downloading 	 data/2018/<!DOCTYPE html>
<<< Download Completed 	 data/2018/<!DOCTYPE html>
>>> Downloading 	 data/2018/<html>
<<< Download Completed 	 data/2018/<html>
>>> Downloading 	 data/2018/  <head>
<<< Download Completed 	 data/2018/  <head>


FileNotFoundError: [Errno 2] No such file or directory: 'data/2018/    <meta http-equiv="Content-type" content="text/html; charset=utf-8">'

In [14]:
stock_predictor = StockPredictor(company='FB')
stock_predictor.fit()
stock_predictor.predict_close_prices_for_days(500, with_plot=True)


FileNotFoundError: [Errno 2] File b'data/company_data/FB.csv' does not exist: b'data/company_data/FB.csv'