# Data Preparation

## Modules

In [1]:
import numpy as np
import pandas as pd
import pickle

## Tickers

In [2]:
tickers_trad = ['PETR4.SA', 'VALE3.SA']

In [3]:
ticker_bench = '^BVSP'

## Data

In [4]:
stock_data = {}
for ticker in tickers_trad:
    df = pd.read_csv(f'../data/{ticker}.csv', index_col = 'Date', parse_dates = True)
    stock_data[ticker] = df

## Split data

In [17]:
# train_time_range = ("2023-01-01", "2024-04-30")
# validation_time_range = ("2024-05-01", "2024-08-31")
# test_time_range = ("2024-09-01", "2024-10-31")

train_time_range = ("2020-01-01", "2024-01-31")
validation_time_range = ("2024-02-01", "2024-06-30")
test_time_range = ("2024-07-01", "2024-10-31")

In [18]:
train_data = {}
valid_data = {}
test_data = {}

In [19]:
for ticker, df in stock_data.items():
    train_data[ticker] = df.loc[train_time_range[0]:train_time_range[1]]
    valid_data[ticker] = df.loc[validation_time_range[0]:validation_time_range[1]]
    test_data[ticker] = df.loc[test_time_range[0]:test_time_range[1]]

In [20]:
train_data[ticker].shape, valid_data[ticker].shape, test_data[ticker].shape

((1015, 6), (102, 6), (88, 6))

## Add technical indicators

In [21]:
def add_technical_indicators(df):
    # RSI 14
    delta = df['Close'].diff()
    up = delta.where(delta > 0, 0)
    down = -delta.where(delta < 0, 0)
    rs = up.rolling(window=14).mean() / down.rolling(window=14).mean()
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD
    df['EMA12'] = df['Close'].ewm(span=12, adjust=False).mean()
    df['EMA26'] = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = df['EMA12'] - df['EMA26']
    df['Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    
    # RSI
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # CCI 20
    tp = (df['High'] + df['Low'] + df['Close']) / 3
    sma_tp = tp.rolling(window=20).mean()
    mean_dev = tp.rolling(window=20).apply(lambda x: np.mean(np.abs(x - x.mean())))
    df['CCI'] = (tp - sma_tp) / (0.015 * mean_dev)
    
    # ADX 14
    high_diff = df['High'].diff()
    low_diff = df['Low'].diff()
    df['+DM'] = np.where((high_diff > low_diff) & (high_diff > 0), high_diff, 0)
    df['-DM'] = np.where((low_diff > high_diff) & (low_diff > 0), low_diff, 0)
    tr = pd.concat([df['High'] - df['Low'], np.abs(df['High'] - df['Close'].shift(1)), np.abs(df['Low'] - df['Close'].shift(1))], axis=1).max(axis=1)
    atr = tr.ewm(span=14, adjust=False).mean()
    df['+DI'] = 100 * (df['+DM'].ewm(span=14, adjust=False).mean() / atr)
    df['-DI'] = 100 * (df['-DM'].ewm(span=14, adjust=False).mean() / atr)
    dx = 100 * np.abs(df['+DI'] - df['-DI']) / (df['+DI'] + df['-DI'])
    df['ADX'] = dx.ewm(span=14, adjust=False).mean()

    # drop NaN values
    df.dropna(inplace=True)

    # keep Open, High, Low, Close, Volume, MACD, Signal, RSI, CCI, ADX
    df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'MACD', 'Signal', 'RSI', 'CCI', 'ADX']]

    return df

In [22]:
# add technical indicators to the training data for each stock
for ticker, df in train_data.items():
    train_data[ticker] = add_technical_indicators(df)

# add technical indicators to the validation data for each stock
for ticker, df in valid_data.items():
    valid_data[ticker] = add_technical_indicators(df)

# add technical indicators to the test data for each stock
for ticker, df in test_data.items():
    test_data[ticker] = add_technical_indicators(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RSI'] = 100 - (100 / (1 + rs))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EMA12'] = df['Close'].ewm(span=12, adjust=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EMA26'] = df['Close'].ewm(span=26, adjust=False).mean()
A value is trying to be set on a copy of a slice f

## Save data

In [23]:
with open('../data/train.pickle', 'wb') as handle:
    pickle.dump(train_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
with open('../data/valid.pickle', 'wb') as handle:
    pickle.dump(valid_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
with open('../data/test.pickle', 'wb') as handle:
    pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)