In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the feature table and turn the 'Date' column into timestamps.
df = pd.read_csv("features.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# Preview the first rows as this cell's output.
df.head()

Unnamed: 0,Date,Return,Open-Close,Open-Low,Open-High,Close-MA 20D,RSI 14D,Normalized Volume,Realized Volatility 30D,VIX,VIX Move,VIX Open-Close
0,1995-01-03,-0.000348,-0.000218,-0.004377,0.000131,0.007283,0.53269,-0.556965,0.100032,14.25,0.079545,0.011356
1,1995-01-04,0.003485,0.003441,-0.00342,0.003463,0.009975,0.563792,0.273094,0.099786,13.53,-0.050526,-0.024513
2,1995-01-05,-0.000803,-0.000847,-0.002127,0.001237,0.008364,0.554631,0.115542,0.097703,13.5,-0.002217,-0.014599
3,1995-01-06,0.000739,0.000652,-0.001977,0.004583,0.008066,0.561685,0.143239,0.081286,13.13,-0.027407,-0.039503
4,1995-01-09,0.000326,0.000347,-0.002019,0.002388,0.0067,0.564956,-0.229424,0.081226,13.33,0.015232,-0.014782


# Data preparation and TimeGAN training

In [None]:
def prep_data(data: np.ndarray, window_len: int, scaler, seed=None) -> list:

    """
    Scale `data`, cut it into overlapping windows and return them shuffled.

    Args:
      -  data = array accepted by `scaler.fit` / `scaler.transform`; windows
                are cut along its first axis
      -  window_len = length of window (consecutive rows per window)
      -  scaler = object with `fit(data)` (returning the fitted scaler) and
                  `transform(data)`, e.g. from sklearn.preprocessing;
                  it is fitted on `data` as a side effect
      -  seed = optional seed for the shuffle; when None (default) the global
                NumPy random state is used, exactly as before

    Returns:
      - processed: python list of windows `scaled_data[i:i+window_len]` in
                   random order; empty when `data` has fewer than
                   `window_len` rows
    """
    # normalize data (fit and transform on the same array)
    scaled_data = scaler.fit(data).transform(data)

    # group data into windows of length window_len; the "+ 1" keeps the final
    # window, the one that ends on the last row of the data
    n_windows = max(len(data) - window_len + 1, 0)
    windows = [scaled_data[start:start + window_len] for start in range(n_windows)]

    # reorder the data
    if seed is None:
        order = np.random.permutation(n_windows)
    else:
        order = np.random.default_rng(seed).permutation(n_windows)

    return [windows[j] for j in order]


In [None]:
# define minmax scaler
scaler = MinMaxScaler()

# Reload the raw features so this cell does not depend on what earlier cells
# did to `df`, then parse the dates.
df = pd.read_csv("features.csv")
df['Date'] = pd.to_datetime(df['Date'])

# set index to date, in chronological order, so the consecutive-row windows
# cut by prep_data follow time. 'Date' is guaranteed to be a column here (the
# line above would have raised otherwise), so no try/except is used: any
# failure should surface instead of silently leaving the frame unindexed.
df = df.set_index('Date').sort_index()

# prep data: drop the features that are not used, then scale, window, shuffle
df = df.drop(columns=['Close-MA 20D', 'RSI 14D', 'Realized Volatility 30D', 'VIX Move'])
data = prep_data(df.values, 30, scaler)  # 30 = window length; same value as seq_len in the config cell


In [None]:
# --- sequence shape ---
n_seq = 7             # features per timestep
seq_len = 30          # timesteps per sequence

# --- network / loss settings ---
hidden_dim = 24       # hidden units for generator (GRU & LSTM)
gamma = 1             # discriminator loss

# --- generic GAN arguments ---
batch_size = 128
learning_rate = 0.0005
noise_dim = 32        # used by generator as a starter dimension
dim = 128             # UNUSED
data_dim = 28         # UNUSED
beta_1 = 0            # UNUSED
beta_2 = 1            # UNUSED

# Positional order: batch_size, lr, beta_1, beta_2, noise_dim, data_dim, layers_dim
gan_args = [
    batch_size,
    learning_rate,
    beta_1,
    beta_2,
    noise_dim,
    data_dim,
    dim,
]

In [None]:
!pip install ydata-synthetic==0.3.0
from ydata_synthetic.synthesizers.timeseries import TimeGAN




In [None]:
# Build TimeGAN from the values in the configuration cell. `gamma` is taken
# from the config variable rather than a repeated literal, so editing the
# configuration actually changes the model.
synth = TimeGAN(model_parameters=gan_args, hidden_dim=hidden_dim, seq_len=seq_len, n_seq=n_seq, gamma=gamma)

# Train on the prepared windows, then persist the trained synthesizer.
synth.train(data, train_steps=500)
synth.save('synth.pkl')

# Request synthetic sequences, passing the number of training windows.
synth_data = synth.sample(len(data))

Emddeding network training: 100%|██████████| 500/500 [04:06<00:00,  2.03it/s]
Supervised network training: 100%|██████████| 500/500 [04:00<00:00,  2.08it/s]
Joint networks training: 100%|██████████| 500/500 [23:39<00:00,  2.84s/it]
Synthetic data generation: 100%|██████████| 53/53 [00:16<00:00,  3.22it/s]


In [None]:
import pickle as pkl

# Convert the sampled sequences to a NumPy array before writing them out.
synth_data = np.array(synth_data)

# Serialize the array to 'generated.pkl' in binary mode.
with open('generated.pkl', 'wb') as out_file:
    pkl.dump(synth_data, out_file)