In [None]:
import pandas_datareader as pdr
from datetime import datetime
import json
from os import listdir
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import csv

In [None]:
# Date window for the download: from 1 January 1970 up to today's date.
start_date = datetime(year=1970, month=1, day=1)
end_date = datetime.now().date()

# JSON file holding the ticker list, and the folder prefix for per-ticker CSVs.
tickers_path, dst_path = 'data/tickers/sp500.json', 'data/stocks/'

In [None]:
def download_data(tickers_path, dst_path, start_date=datetime(1970, 1, 1), end_date=None, columns=None):
    """Download price history for every ticker in a JSON list and save one CSV per ticker.

    Args:
        tickers_path: Path to a JSON file containing a list of ticker symbols.
        dst_path: Prefix prepended to ``<ticker>.csv`` (plain string
            concatenation, so a directory needs its trailing separator).
        start_date: First date requested from ``pdr.get_data_yahoo``.
        end_date: Last date requested. ``None`` (default) means "today",
            resolved when the function is called rather than when it was
            defined.
        columns: Column names to keep in each CSV; must contain ``'Close'``.
            ``None`` (default) keeps only ``'Close'``.

    Returns:
        ``(success, fail)``: two lists of ticker symbols (after the
        ``"."`` -> ``"-"`` replacement) that were saved / that raised.

    Raises:
        AssertionError: if ``'Close'`` is not in ``columns``.
    """
    if end_date is None:
        end_date = datetime.now().date()
    if columns is None:
        columns = ['Close']

    assert 'Close' in columns, "'Close' must be one of the requested columns"

    with open(tickers_path, 'r') as f:
        tickers = json.load(f)

    success = []
    fail = []

    for ticker in tickers:
        # Dots become dashes (e.g. "BRK.B" -> "BRK-B"); presumably the symbol
        # format the data source expects -- confirm against the provider.
        ticker = ticker.replace(".", "-")

        try:
            df = pdr.get_data_yahoo(symbols=ticker, start=start_date, end=end_date)

            # Keep only the requested columns, in the order the source returned them.
            df = df[[column for column in df.columns if column in columns]]

            download_path = dst_path + ticker + '.csv'
            df.to_csv(download_path, index=True)

            success.append(ticker)
        except Exception:
            # Best effort: record the failure and move on to the next ticker.
            # KeyboardInterrupt / SystemExit are deliberately NOT caught, so a
            # long download can still be interrupted.
            fail.append(ticker)

    return success, fail

In [None]:
def merge_csv(src_path, dst_path, filename='dataset', how='outer'):
    """Merge every per-ticker CSV under ``src_path`` into one date-indexed CSV.

    Each input file is expected to have a ``Date`` column and a ``Close``
    column; ``Close`` is renamed to the file name without its ``.csv``
    suffix. Files are merged on ``Date`` in alphabetical file-name order, the
    result is sorted by date and written to ``dst_path + filename + '.csv'``.

    Args:
        src_path: Prefix of the input files (plain string concatenation, so a
            directory needs its trailing separator).
        dst_path: Prefix of the output file (same concatenation rule).
        filename: Output file name without extension.
        how: Join type forwarded to ``pd.merge``.

    Raises:
        IndexError: if ``src_path`` contains no ``.csv`` file.
    """

    def _read(path):
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines='warn'` is its replacement (skip the line, emit a
        # warning). Fall back to the old keyword on older pandas.
        try:
            return pd.read_csv(path, quoting=csv.QUOTE_NONE, on_bad_lines='warn')
        except TypeError:
            return pd.read_csv(path, quoting=csv.QUOTE_NONE, error_bad_lines=False)

    # Sorted so the column order of the result does not depend on the
    # arbitrary order returned by listdir; match on the real extension.
    names = sorted(name for name in listdir(src_path) if name.endswith('.csv'))
    if not names:
        raise IndexError('no .csv files found in ' + repr(src_path))

    merged = None
    for name in names:
        ticker = name[:-len('.csv')]
        frame = _read(src_path + name).rename({'Close': ticker}, axis=1)
        if merged is None:
            merged = frame
        else:
            merged = pd.merge(merged, frame, how=how, on='Date')

    merged['Date'] = pd.to_datetime(merged['Date'])
    merged = merged.sort_values(by='Date').set_index('Date')

    merged.to_csv(dst_path + filename + '.csv')

In [None]:
def dataa(dataset_path, data_size, input_tickers, output_tickers, step_size=0, input_size=60, output_size=20, feature_range=(0,1)):
    """Build scaled sliding-window samples from the merged dataset CSV.

    The last ``input_size`` rows of the file are set aside as the prediction
    window; the ``data_size`` rows before them form the training frame, from
    which every column containing a NaN is dropped.

    Args:
        dataset_path: CSV file to read.
        data_size: Number of rows (before the prediction window) to use.
        input_tickers: Column names used as model inputs. Names missing from
            the training frame are silently ignored.
        output_tickers: Column names used as model targets. NOTE: this list
            is sorted in place (kept for backward compatibility).
        step_size: Stride between consecutive windows; ``0`` means
            ``input_size`` (non-overlapping input windows).
        input_size: Rows per input window.
        output_size: Rows per target window.
        feature_range: Forwarded to ``MinMaxScaler``.

    Returns:
        ``(scaler, x, y, dic, x_predic)`` where
        ``scaler`` is the scaler fitted on the target columns (presumably
        what callers need to invert predictions -- confirm against the
        caller),
        ``x`` has shape ``(samples, input_size, n_input_columns)``,
        ``y`` has shape ``(samples, output_size, n_output_columns)``,
        ``dic`` maps each output ticker (sorted) to an empty list, and
        ``x_predic`` is the prediction window scaled with the same
        parameters as the training inputs.
    """
    if step_size==0:
        step_size = input_size

    # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` is
    # its replacement. Fall back to the old keyword on older pandas.
    try:
        df = pd.read_csv(dataset_path, quoting=csv.QUOTE_NONE, on_bad_lines='warn')
    except TypeError:
        df = pd.read_csv(dataset_path, quoting=csv.QUOTE_NONE, error_bad_lines=False)

    predict_df = df[-input_size:]
    df = df[-data_size-input_size:-input_size]
    df = df.dropna(axis=1)

    # Select columns in the order they appear in the dataset.
    wanted_in = set(input_tickers)
    wanted_out = set(output_tickers)
    in_columns = [column for column in df.columns if column in wanted_in]
    out_columns = [column for column in df.columns if column in wanted_out]

    in_df = df[in_columns]
    out_df = df[out_columns]
    # Use exactly the training input columns: columns dropped above for
    # containing NaN must not reappear in the prediction window.
    predict_df = predict_df[in_columns]

    # Separate scalers for inputs and targets; the prediction window reuses
    # the fitted input scaler instead of being re-fitted on its own range.
    in_scaler = MinMaxScaler(feature_range=feature_range)
    out_scaler = MinMaxScaler(feature_range=feature_range)
    in_dataset_scaled = in_scaler.fit_transform(in_df.values)
    out_dataset_scaled = out_scaler.fit_transform(out_df.values)
    x_predic = in_scaler.transform(predict_df.values)

    x = []
    y = []

    # Window ending at row i feeds the target starting at row i. The last
    # valid i is n_rows - output_size, hence the +1 on the exclusive stop.
    last_start = in_dataset_scaled.shape[0] - output_size
    for i in range(input_size, last_start + 1, step_size):
        x.append(in_dataset_scaled[i-input_size:i,:])
        y.append(out_dataset_scaled[i:i+output_size,:])

    x, y = np.array(x), np.array(y)

    # In-place sort preserved on purpose: callers may rely on the list being
    # sorted after this call.
    output_tickers.sort()

    dic = {ticker:[] for ticker in output_tickers}

    return out_scaler, x, y, dic, x_predic

In [None]:
def train_val_test_split(x, y, train_ratio=0.7, validation_ratio=0.15, test_ratio=0.15, random_state=None):
    """Split ``x``/``y`` into train, validation and test subsets.

    Two chained ``train_test_split`` calls: the first holds out
    ``1 - train_ratio`` of the samples, the second divides that hold-out
    between validation and test in the proportion
    ``validation_ratio : test_ratio``.

    Args:
        x, y: Sample arrays with the same first dimension.
        train_ratio: Share of samples kept for training.
        validation_ratio: Share intended for validation.
        test_ratio: Share intended for testing.
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            split is reproducible; ``None`` (default) keeps the previous
            unseeded behaviour.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``.

    NOTE(review): ``train_test_split`` shuffles by default; if the samples are
    time-ordered windows this mixes periods across the subsets -- confirm
    that is intended.
    """
    holdout_share = 1 - train_ratio
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=holdout_share, random_state=random_state
    )

    test_share = test_ratio / (test_ratio + validation_ratio)
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=test_share, random_state=random_state
    )

    return x_train, y_train, x_val, y_val, x_test, y_test

In [None]:
def get_tickers_in_range(df, day_range):
    """List the columns whose most recent complete history falls in a range.

    A column counts as "complete over n rows" when it has no NaN in the last
    ``n`` rows of ``df``.

    Args:
        df: Frame with one column per series, oldest row first.
        day_range: ``(lower, upper)``. ``upper`` may be the string ``'MAX'``
            to disable the upper bound.

    Returns:
        With ``upper == 'MAX'``: every column complete over the last
        ``lower - 1`` rows. Otherwise: the columns complete over the last
        ``lower - 1`` rows but NOT over the last ``upper - 1`` rows, in the
        column order of ``df``.

    The shape of each intermediate frame is printed.
    """
    lower, upper = day_range

    # tail(0) yields zero rows (so every column qualifies), whereas the slice
    # df[-0:] would select the whole frame.
    recent = df.tail(lower - 1).dropna(axis=1)
    print(recent.shape)

    if upper == 'MAX':
        return list(recent.columns)

    longer = df.tail(upper - 1).dropna(axis=1)
    print(longer.shape)

    # Preserve column order so the result is deterministic between runs.
    complete_over_longer = set(longer.columns)
    tickers_list = [column for column in recent.columns if column not in complete_over_longer]

    return tickers_list

In [None]:
# Load the merged dataset and index it by date. set_index returns a new frame,
# so the result has to be assigned; otherwise 'Date' stays an ordinary column
# and shows up among the ticker columns downstream.
df = pd.read_csv('dataset.csv')
df = df.set_index('Date')
df.shape

In [None]:
# (lower, upper) row-count buckets; 'MAX' leaves the last bucket open-ended.
day_ranges = [
    (188, 360),
    (360, 651),
    (651, 1000),
    (1000, 2000),
    (2000, 5000),
    (5000, 'MAX'),
]

# One ticker list per bucket, evaluated in the order listed above.
(tickers1, tickers2, tickers3,
 tickers4, tickers5, tickers6) = [get_tickers_in_range(df, day_range) for day_range in day_ranges]


In [None]:
def array_to_dict(dic, arr, tickers):
    """Return a copy of ``dic`` with one slice of ``arr`` stored per ticker.

    For every index ``i`` along axis 1 of ``arr``, the entry ``tickers[i]`` of
    the copy is set to ``arr[0, i, :]``. ``dic`` itself is not modified.
    """
    result = dic.copy()
    result.update({tickers[pos]: arr[0, pos, :] for pos in range(arr.shape[1])})
    return result

In [None]:
# Toy fixtures for trying out the helpers in this notebook.
dic = {'A':0, 'B':0}
tickers = ['A', 'B']

# Shape (1, 2, 5): one sample, one row per ticker, five time steps.
real_close = np.array(
    [
        [
            [1,1,1,1,1],
            [2,2,2,2,2]
        ]
    ]
)

predict = np.array(
    [
        [
            [2,2,2,2,2],
            [3,3,3,3,3]
        ]
    ]
)

# Row p holds n**p for n = 2..6 (5**3 is 125; the previous 123 was a typo).
x = np.array([
    [1, 1, 1, 1, 1],
    [2, 3, 4, 5, 6],
    [4, 9, 16, 25, 36],
    [8, 27, 64, 125, 216]
])

In [None]:
def return_log(arr1, arr2):
    """Return the element-wise difference ``log(arr1) - log(arr2)``."""
    log_first = np.log(arr1)
    log_second = np.log(arr2)
    return np.subtract(log_first, log_second)


In [None]:
from math import prod

def to_return_log(arr, k, axis):
    """Compute ``k``-step log differences of ``arr`` along ``axis``.

    For each start position ``i`` in ``0 .. arr.shape[axis] - k - 1`` the
    slice at ``i + k`` is compared with the slice at ``i`` through
    ``return_log``. The results are stacked along a new leading axis, so the
    differenced axis ends up first in the returned array.
    """
    window_count = arr.shape[axis] - k
    return np.array([
        return_log(np.take(arr, start + k, axis=axis), np.take(arr, start, axis=axis))
        for start in range(window_count)
    ])

    

In [None]:
# 3-step log differences between the rows of the toy array x.
to_return_log(x, k=3, axis=0)