In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import os

In [2]:
# Run PyTorch on the first CUDA device when one is visible, otherwise on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [3]:
def load_data(path, Index=None):
    '''
    Load a price CSV into a DataFrame and add date-offset columns.

    Args:
        path: path of csv file to open
        Index: optional value of the "Index" column to keep (which company
            to analyze); when None every row is kept

    Returns:
        DataFrame holding the CSV columns plus
            numerical_date: "Date" parsed as a datetime (format %Y-%m-%d)
            x: whole days elapsed since the earliest date in the returned rows
        The original row index of the CSV is preserved.
    '''
    df = pd.read_csv(path)

    if Index is not None:
        # Copy the filtered rows so the column assignments below write to an
        # independent frame instead of a slice of the full table.
        df = df[df["Index"] == Index].copy()

    df["numerical_date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df["x"] = (df["numerical_date"] - df["numerical_date"].min()).dt.days

    return df

In [4]:
# Relative path: resolved against the kernel's current working directory.
DATA_PATH = "../data/raw/indexProcessed.csv"
INDEX_NAME = "NYA"

df = load_data(DATA_PATH, INDEX_NAME)
df.head()  # preview only, rather than rendering the whole frame

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/indexProcessed.csv'

In [None]:
# keep track of duration and slope
# slope = (d[i] - d[j]) / (i - j)
# duration = i - j

def _fit_segment(x, y, start, end):
    '''
    Fit one least-squares line through points start..end (inclusive).

    Returns (slope, intercept, error); error is the sum of squared residuals,
    or 0 when lstsq reports none (too few points or a rank-deficient fit).
    '''
    seg_x, seg_y = x[start:end + 1], y[start:end + 1]
    A = np.vstack([seg_x, np.ones(len(seg_x))]).T
    coeffs, residuals = np.linalg.lstsq(A, seg_y, rcond=None)[:2]
    slope, intercept = coeffs
    error = residuals[0] if residuals.size else 0
    return slope, intercept, error


def convert_to_trend(d, max_error):
    '''
    Segment the series into linear trends and describe each segment.

    bottom_up is run over consecutive windows of the data; every segment it
    returns is then summarised by a least-squares line.

    Args:
        d: table exposing shape[0] and "x" / "Open" columns
        max_error: error threshold forwarded to bottom_up and best_line

    Returns:
        list with one [duration, slope, intercept, error] entry per segment,
        where duration is the difference in x between the segment end points.
        An empty input yields an empty list. Wrap the result in
        torch.tensor(..., dtype=torch.float) if a tensor is needed.
    '''
    len_x = d.shape[0]
    x = np.array(d["x"])
    y = np.array(d["Open"])

    if len_x == 0:
        return []

    # set buffer and lower and upper bounds
    # buffer ensures that there is enough data for 5 to 6 segments as specified in original paper
    # Every size is kept >= 1 so the window always advances, even when the
    # series has fewer than 7 points (len_x // 7 == 0 used to loop forever).
    w = max(len_x // 7, 1)
    lower_bound = max(w // 2, 1)
    upper_bound = 2 * w

    sequences = []

    start, end = 0, min(w, len_x)
    while start < len_x:
        # bottom_up
        sequences += bottom_up(x, y, start, end, max_error)

        # slide window
        start = end
        if start < len_x:
            # best_line works with absolute indices: give it an absolute limit
            # and turn its answer back into a window length before clamping.
            next_end = best_line(x, y, start, max_error, start + upper_bound)
            window_len = int(min(upper_bound, max(lower_bound, next_end - start)))
            end = min(start + window_len, len_x)

    last = len_x - 1
    trends = []
    for seg_start, seg_end in sequences:
        # A segment may name an end index past the data; clamp before indexing x.
        seg_end = min(seg_end, last)
        slope, intercept, error = _fit_segment(x, y, seg_start, seg_end)
        trends.append([x[seg_end] - x[seg_start], slope, intercept, error])

    return trends
        
        
def __calculate_error(x, y, i, j):
    '''
    Sum of squared residuals of the least-squares line through points i..j (inclusive).

    Returns 0 when lstsq reports no residuals for the slice.
    '''
    window = slice(i, j + 1)
    xs = x[window]
    design = np.vstack([xs, np.ones(len(xs))]).T
    residuals = np.linalg.lstsq(design, y[window], rcond=None)[1]

    if len(residuals) == 0:
        return 0
    return residuals[0]


def __calculate_slope(x, y, i, j):
    '''
    Slope of the least-squares line through points i..j (inclusive).

    Falls back to 0 if reading the fit result raises IndexError.
    '''
    window = slice(i, j + 1)
    xs = x[window]
    design = np.vstack([xs, np.ones(len(xs))]).T
    try:
        slope, _intercept = np.linalg.lstsq(design, y[window], rcond=None)[0]
    except IndexError:
        slope = 0

    return slope


def __calculate_intercept(x, y, i, j):
    '''
    Intercept of the least-squares line through points i..j (inclusive).

    Falls back to 0 if reading the fit result raises IndexError.
    '''
    window = slice(i, j + 1)
    xs = x[window]
    design = np.vstack([xs, np.ones(len(xs))]).T
    try:
        _slope, intercept = np.linalg.lstsq(design, y[window], rcond=None)[0]
    except IndexError:
        intercept = 0

    return intercept


def best_line(x, y, i, max_error, upper_bound, step=15):
    '''
    Grow a line fit that starts at index i until its error exceeds max_error.

    The candidate end index j advances `step` points at a time. Growth stops
    once the least-squares error of points i..j-1 exceeds max_error, or once
    j reaches upper_bound.

    Args:
        x, y: 1-D arrays of coordinates
        i: index of the first point of the fit
        max_error: largest acceptable sum of squared residuals
        upper_bound: limit compared against the absolute end index j
        step: number of points added per iteration (default 15)

    Returns:
        The exclusive end index j, as an absolute position in x. Equals i when
        i >= upper_bound; may exceed upper_bound by up to step - 1.

    Raises:
        ValueError: if step is smaller than 1 (the loop could never advance)
    '''
    if step < 1:
        raise ValueError("step must be at least 1")

    error = 0
    j = i
    while error <= max_error and j < upper_bound:
        j += step
        # __calculate_error slices x[i:j+1] itself, so hand it the full arrays
        # with an inclusive end of j - 1. Pre-slicing here made the helper
        # index the already-shortened arrays a second time and fit the wrong
        # (usually empty) set of points.
        error = __calculate_error(x, y, i, j - 1)

    return j


def bottom_up(x, y, i, j, max_error, verbose=True):
    '''
    Bottom-up piecewise-linear segmentation of the points in [i, j).

    Starts from short segments [k, k+2] for every second k in the window and
    repeatedly merges the adjacent pair whose combined least-squares error is
    smallest, stopping when that smallest error is no longer below max_error.

    Args:
        x, y: 1-D arrays of coordinates
        i: first index of the window
        j: exclusive end index of the window
        max_error: a merge happens only while its error is below this value
        verbose: print a progress line for the window (default True)

    Returns:
        list of [start, end] index pairs with inclusive ends, each end clamped
        to the last valid index of x; neighbouring segments share an end point.
    '''
    if verbose:
        print(f"Performing bottom_up with i={i}, j={j}, max_error={max_error}")

    last = len(x) - 1
    # Clamp both the window and every segment end so no segment points past the data.
    segments = [[k, min(k + 2, last)] for k in range(i, min(j, len(x)), 2)]

    while len(segments) > 1:
        best_idx, best_error = None, float("inf")
        # Every adjacent pair is a merge candidate (stride 1); scanning with
        # stride 2 skipped half of the neighbours and left segments unmerged.
        for idx in range(len(segments) - 1):
            curr_error = __calculate_error(x, y, segments[idx][0], segments[idx + 1][1])
            if curr_error < best_error:
                best_idx, best_error = idx, curr_error

        if best_idx is None or not best_error < max_error:
            break

        segments[best_idx] = [segments[best_idx][0], segments[best_idx + 1][1]]
        segments.pop(best_idx + 1)

    return segments

In [None]:
# Threshold on the sum of squared residuals used when growing and merging segments.
MAX_ERROR = 1000

converted_data = convert_to_trend(df, MAX_ERROR)

In [None]:
# Single least-squares line over the whole series: y ≈ m * x + c
x, y = np.array(df["x"]), np.array(df["Open"])

# Design matrix with columns [x, 1]
A = np.vstack([x, np.ones(x.shape[0])]).T
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
m, c

In [None]:
# Overlay the fitted line c + m * x on the raw Open series.
x_vals = x  # x is already a NumPy array; no extra copy needed
y_vals = c + m * x_vals

fig, ax = plt.subplots()
ax.plot(x, y, label="Open")
ax.plot(x_vals, y_vals, '--', label="Least-squares fit")
ax.set(
    title="Open price with least-squares trend line",
    xlabel="Days since first date",
    ylabel="Open",
)
ax.legend();