In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import datetime

In [2]:
class DataLoader:
    def __init__(self, path, file_type=None, index=None, index_col=None, date_col=None, date_format=None): 
        self.path = path
        self.data = None
    
        # Load in data 
        if self.path is not None and file_type is not None: 
            self.load_data(file_type)
        
        # Filter by index if required
        if index is not None: 
            if index_col is not None: 
                self.filter_index(index, index_col)
            else: 
                self.filter_index(index)
        
        # Format date if required 
        if date_col is not None:
            if date_format is not None: 
                self.date_index(date_col, date_format)
            else:
                self.date_index(date_col)
        
    def load_data(self, file_type="csv"):
        """
        Loads data into dataframe object based on file type
        :param file_type: File type of data file 
        :return: None
        """
        if file_type == "csv":
            self.data = pd.read_csv(self.path)
        elif file_type == "json":
            self.data = pd.read_json(self.path)
            
    def filter_index(self, index, index_col="Index"):
        """
        Filters for only certain indices of data 
        :param index: String of index name to filter for 
        :return: True if index column exists and filter is applied, False otherwise 
        """
        if self.data is not None and index_col in self.data.columns: 
            self.data = self.data[self.data[index_col] == index]
            return True
        return False
    
    def date_index(self, date_col, date_format="%Y-%m-%d"):
        """
        Turns a string date column into a date object and calculates an index based on date
        :param date_col: Name of date column 
        :param date_format: String format of string date 
        :return: True if date column exists and data processing is applied, False otherwise 
        """
        if self.data is not None and date_col in self.data.columns:
            self.data['numeric_date'] = self.data[date_col].apply(lambda date: datetime.datetime.strptime(date, date_format))
            self.data['date_index'] = (self.data["numeric_date"] - self.data["numeric_date"].min()).dt.days
            return True
        return False

In [7]:
class LinearApproximation:
    def __init__(self, max_error, min_segment_length, data=None, target_col=None, date_index=None):
        self.d = None
        self.transformed_d = None
        self.x = None
        self.y = None
        self.len_data = None
        self.max_idx = None

        self.max_error = max_error
        self.min_segment_length = min_segment_length
        
        if data is not None and target_col is not None:
            # Add data if provided info
            if date_index is not None:
                self.add_data(data, target_col, date_index)
            else:
                self.add_data(data, target_col)
    
    def add_data(self, data, target_col, date_index="date_index"):
        """
        Load in data to process with a date index column and target prediction column 
        :param data: Dataframe to use 
        :param target_col: Colum name containing target data 
        :param date_index: Column name containing date index values 
        :return: None
        """
        self.d = data 
        self.x = np.array(self.d[date_index])
        self.y = np.array(self.d[target_col])
        self.len_data = self.d.shape[0]
        self.max_idx = len(self.x) - 1


    def process_data(self):
        """
        Transform original data to pandas dataframe containing information about trends
        trends[i] = [trend_duration[i], trend_slope[i], original data points that make up trends[i]]

        :return: Dataframe of processed data
        """

        # set buffer and lower and upper bounds
        w = self.len_data // 7  # buffer ensures that there is enough data for 5 to 6 segments as specified in original paper
        lower_bound = w // 2
        upper_bound = int(2 * w)

        sequences = []

        i = 0
        while i < self.len_data:
            # bottom_up
            sequences += self.bottom_up(i, w)

            # slide window
            i = w
            w = min(i + int(min(upper_bound, max(lower_bound, self.best_line(i, upper_bound)))), self.len_data)

            print(f"{round(i/self.len_data * 100, 2)}% data processed \n")

        trends = [[None, None, None] for _ in range(len(sequences))]
        for idx, seq in enumerate(sequences):
            trends[idx][0] = seq[1] - seq[0]  # duration
            trends[idx][1] = self.__calculate_slope(self.x[seq[0]:seq[1]+1], self.y[seq[0]:seq[1]+1])
            trends[idx][2] = [self.y[i] for i in range(seq[0], seq[1])]

        #     return torch.tensor(trends, dtype=torch.float)
        self.transformed_d = pd.DataFrame(trends, columns=["trend_duration", "trend_slope", "trend_points"])
        return self.transformed_d


    def best_line(self, i, upper_bound):
        """
        Calculates end index of current window

        :param i: starting index of current window
        :param upper_bound: maximum size of window
        :return: ending index of current window
        """
        error = 0
        j = i
        while error <= self.max_error and j < i + upper_bound:
            j += self.min_segment_length
            curr_x, curr_y = self.x[i:j], self.y[i:j]
            error = self.__calculate_error(curr_x, curr_y)

        return j


    def bottom_up(self, i, j):
        """
        Performs bottom up algorithm on data[i:j] as described in: http://www.cs.ucr.edu/~eamonn/icdm-01.pdf
        and returns list of segments represented by indices

        :param i: starting index of current window
        :param j: ending index of current window
        :return: segments (2-D list)
                 segments[i] = [starting index of segments[i], ending index of segments[i]]
        """
        # print(f"Performing bottom_up with i={i}, j={j}, max_error={self.max_error}")
        # segment_increase = self.min_segment_length//4
        segments = [[k, k + 2] for k in range(i, j, 2)]

        fully_merged = False
        while not fully_merged:

            min_merge_error, min_merge_idx = float("inf"), None
            min_seg_length, min_seg_idx = float("inf"), None

            # get min error
            for idx in range(0, len(segments) - 1, 2):
                sub_i, sub_j = segments[idx][0], segments[idx+1][1]+1
                curr_x, curr_y = self.x[sub_i:sub_j], self.y[sub_i:sub_j]
                curr_error = self.__calculate_error(curr_x, curr_y)

                if curr_error < min_merge_error:
                    min_merge_error = curr_error
                    min_merge_idx = idx

            # get min segment length
            for idx in range(len(segments)):
                if segments[idx][1] - segments[idx][0] < min_seg_length:
                    min_seg_length = segments[idx][1] - segments[idx][0]
                    min_seg_idx = idx

            # find spots to merge if necessary
            replace, first_half, second_half = None, None, None
            if min_merge_error < self.max_error:
                segments[min_merge_idx] = [segments[min_merge_idx][0], min(self.max_idx, segments[min_merge_idx+1][1])]
                segments.pop(min_merge_idx+1)

            elif min_seg_length < self.min_segment_length:
                if min_seg_idx == len(segments) - 1:
                    min_seg_idx -= 1
                    
                segments[min_seg_idx] = [segments[min_seg_idx][0], min(self.max_idx, segments[min_seg_idx+1][1])]
                segments.pop(min_seg_idx+1)
                      
            else: 
                fully_merged = True

        return segments


    def __calculate_error(self, x, y):
        """
        Calculates least squared error of linear approximation of x, y

        :param x: inputs to linear approximation
        :param y: targets of linear approximation
        :return: error
        """
        A = np.vstack([x, np.ones(len(x))]).T
        try:
            error = np.linalg.lstsq(A, y, rcond=None)[1][0]
        except IndexError:
            error = 0

        return error


    def __calculate_slope(self, x, y):
        """
        Calculates slope of linear approximation of x, y

        :param x: inputs to linear approximation
        :param y: targets of linear approximation
        :return: slope
        """
        A = np.vstack([x, np.ones(len(x))]).T
        try:
            m, c = np.linalg.lstsq(A, y, rcond=None)[0]
        except IndexError:
            m = 0

        return m


    def __calculate_intercept(self, x, y):
        """
        Calculates intercept of linear approximation of x, y

        :param x: inputs to linear approximation
        :param y: targets of linear approximation
        :return: intercept
        """
        A = np.vstack([x, np.ones(len(x))]).T
        try:
            m, c = np.linalg.lstsq(A, y, rcond=None)[0]
        except IndexError:
            c = 0

        return c


    def save_to_csv(self, file_path):
        """
        Saves transformed data to csv file for use

        :return: boolean
        """
        try:
            self.transformed_d.to_csv(file_path)
            return True

        except AttributeError:
            print("Transformed data does not exist: failed to save to csv file")
            return False


In [8]:
df_hparams = {
    "path": "../data/raw/indexProcessed.csv",
    "file_type": "csv", 
    "index": "NYA", 
    "date_col": "Date"
}

dl = DataLoader(**df_hparams)
dl.data.head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume,CloseUSD,numeric_date,date_index
8492,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.0,528.690002,1965-12-31,0
8493,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.0,527.210022,1966-01-03,3
8494,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.0,527.840027,1966-01-04,4
8495,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.0,531.119995,1966-01-05,5
8496,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.0,532.070007,1966-01-06,6


In [10]:
la_hparams = {
    "max_error": 100,
    "min_segment_length": 10,
    "data": dl.data, 
    "target_col": "Close"
}

la = LinearApproximation(**la_hparams)

In [12]:
la.process_data()

14.28% data processed 

28.64% data processed 

57.2% data processed 

85.77% data processed 

100.0% data processed 



Unnamed: 0,trend_duration,trend_slope,trend_points
0,20,0.347832,"[528.690002, 527.210022, 527.840027, 531.11999..."
1,22,-0.502117,"[535.669983, 533.340027, 529.109985, 531.44000..."
2,10,-0.737391,"[512.830017, 514.940002, 513.349976, 506.38000..."
3,16,0.762770,"[504.26001, 505.850006, 508.279999, 512.190002..."
4,16,-0.129549,"[525.940002, 525.72998, 524.140015, 524.349976..."
...,...,...,...
895,16,12.045573,"[14839.05957, 14975.42969, 15069.59961, 15226...."
896,16,28.251844,"[15010.46973, 15327.76953, 15277.01953, 15199...."
897,16,27.710624,"[15551.55957, 15346.53027, 15276.55957, 15410...."
898,16,16.326452,"[16000.15039, 16116.84961, 16186.29004, 16107...."
