In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle as pkl

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.gridspec as gridspec

# Visualize the Data

In [None]:
# Load the feature table and convert the Date column to timestamps in one chain.
df = (
    pd.read_csv("./data/features.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
df.head(n=5)

In [None]:
# Return and intraday-range columns, multiplied by 1000 (y-axis labelled ‰),
# zoomed to roughly the first month of 1995.
fig, ax = plt.subplots(figsize=(14, 7))

# column name -> legend label
per_mille_series = {
    'Return': 'returns',
    'Open-Close': 'open-close',
    'Open-Low': 'open-low',
    'Open-High': 'open-high',
}
for column, label in per_mille_series.items():
    ax.plot(df['Date'], df[column] * 1000, label=label)

ax.set_xlabel('Date')
ax.set_ylabel('‰')
ax.set_xlim(pd.to_datetime('1995-01-03'), pd.to_datetime('1995-02-05'))
ax.legend()
plt.show()

In [None]:
# Normalized trading volume over the first five years of the sample.
fig, ax = plt.subplots(figsize=(14, 7))

ax.plot(df['Date'], df['Normalized Volume'], label='Normalized Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Normalized Volume')
ax.set_xlim(pd.to_datetime('1995-01-03'), pd.to_datetime('2000-02-05'))
ax.legend()
plt.show()

# Single layer MLP GAN

In [None]:
def prep_data(data: np.ndarray, window_len: int, scaler) -> list:
    """
    Scale ``data`` and cut it into randomly ordered sliding windows.

    Args:
      -  data = array whose first axis is the row (time) axis; in this
         notebook it is ``df.values`` sorted by date
      -  window_len = number of consecutive rows in each window
      -  scaler = object with ``fit``/``transform`` (e.g. a
         sklearn.preprocessing scaler); ``fit`` is called on the full ``data``

    Returns:
      - processed: python list of scaled windows, each ``window_len`` rows
        long, in random order. Empty when ``data`` has fewer than
        ``window_len`` rows.
    """
    # normalize data
    scaled_data = scaler.fit(data).transform(data)

    # Every full window, including the one that ends on the last row
    # (start index len(data) - window_len), hence the "+ 1".
    n_windows = max(len(data) - window_len + 1, 0)
    windows = [scaled_data[start:start + window_len] for start in range(n_windows)]

    # reorder the data
    order = np.random.permutation(len(windows))
    return [windows[i] for i in order]


A very simple GAN implementation serves as our baseline. The next models will use convolutions and TimeGAN.

In [None]:
def train_gan(true_data, training_steps, generator, discriminator, lr = 0.001):
    """Train a generator/discriminator pair with full-batch BCE updates.

    Each step draws uniform noise with the same shape as ``true_data``,
    updates the generator so the discriminator labels its output as real,
    then updates the discriminator on the real batch and on the (detached)
    samples produced before the generator step.

    Args:
      - true_data: 2-D array-like (n_samples, n_features) of real samples
      - training_steps: number of generator + discriminator updates
      - generator: module mapping (n_samples, n_features) noise to samples
      - discriminator: module returning probabilities of shape (n_samples, 1)
      - lr: Adam learning rate used for both optimizers

    Returns:
      - generator, discriminator: the same modules, trained in place
      - gen_data_loss: per-step discriminator loss on generated data (floats)
      - true_data_loss: per-step discriminator loss on real data (floats)
    """
    # Optimizers
    generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
    discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)

    # binary cross entropy loss
    loss = nn.BCELoss()
    true_data = torch.as_tensor(true_data, dtype=torch.float32)
    n_samples = true_data.shape[0]

    # Targets are identical at every step, so build them once.
    true_labels = torch.ones((n_samples, 1))
    fake_labels = torch.zeros((n_samples, 1))

    gen_data_loss = []
    true_data_loss = []
    for i in range(training_steps):

        # ---- generator update ----
        generator_optimizer.zero_grad()

        # Uniform noise in [0, 1), one vector per real sample.
        noise = torch.rand(true_data.shape[0], true_data.shape[1])
        generated_data = generator(noise)

        # The generator is rewarded when the discriminator outputs "real".
        generator_discriminator_out = discriminator(generated_data)
        generator_loss = loss(generator_discriminator_out, true_labels)
        generator_loss.backward()
        generator_optimizer.step()

        # ---- discriminator update ----
        # zero_grad also clears the gradients the generator pass left on
        # the discriminator's parameters.
        discriminator_optimizer.zero_grad()
        true_discriminator_out = discriminator(true_data)
        true_discriminator_loss = loss(true_discriminator_out, true_labels)

        # detach() keeps this step from back-propagating into the generator.
        generator_discriminator_out = discriminator(generated_data.detach())
        generator_discriminator_loss = loss(generator_discriminator_out, fake_labels)
        discriminator_loss = (true_discriminator_loss + generator_discriminator_loss) / 2

        # Store plain floats: keeping the tensors would retain each step's
        # autograd graph and they cannot be plotted directly.
        gen_data_loss.append(generator_discriminator_loss.item())
        true_data_loss.append(true_discriminator_loss.item())

        discriminator_loss.backward()
        discriminator_optimizer.step()

        if i % 30 == 0:
            print("epoch: ", i, "discriminator loss: ",  discriminator_loss.item())
    return generator, discriminator, gen_data_loss, true_data_loss
        

In [None]:
# Standardize every feature column (swap in MinMaxScaler() for [0, 1] scaling).
scaler = StandardScaler()
df = pd.read_csv("./data/features.csv")
df['Date'] = pd.to_datetime(df['Date'])
# Index by date and sort chronologically so windows follow time order.
# The frame was just re-read from disk, so 'Date' is always a column here;
# any failure should surface instead of being silently swallowed.
df = df.set_index('Date').sort_index()

# prep data: drop the columns not used by the GAN, then build 30-row windows
df = df.drop(['Close-MA 20D', 'RSI 14D','Realized Volatility 30D','VIX Move'], axis=1)
data = prep_data(df.values, 30, scaler)

# check size
print(len(data), data[0].shape)
data = np.array(data)
# flatten each (window_len, n_features) window into one row for the MLPs
reshaped_data = data.reshape(len(data), data.shape[1] * data.shape[2])

print(data.shape)

In [None]:
class Generator2(nn.Module):
    """MLP generator: ``input_length`` -> 500 -> 1200 -> 800 -> ``input_length``.

    Hidden layers use in-place ReLU; the output layer is linear.
    """

    def __init__(self, input_length: int):
        super().__init__()
        widths = [input_length, 500, 1200, 800]
        stack = []
        # Hidden blocks are created in order, each a Linear followed by ReLU.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU(inplace=True))
        # Project back to the input width without an activation.
        stack.append(nn.Linear(widths[-1], input_length))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        return self.net(x)

In [None]:
class Discriminator2(nn.Module):
    """MLP discriminator: ``input_length`` -> 500 -> 400 -> 400 -> 1.

    Hidden layers use in-place ReLU; a final sigmoid squashes the single
    output into (0, 1).
    """

    def __init__(self, input_length: int):
        super().__init__()
        widths = [input_length, 500, 400, 400]
        stack = []
        # Hidden blocks are created in order, each a Linear followed by ReLU.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU(inplace=True))
        # Single-unit head followed by the sigmoid.
        stack.append(nn.Linear(widths[-1], 1))
        stack.append(nn.Sigmoid())
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        return self.net(x)

In [None]:
# Both networks work on flattened windows, so they share the same input width.
generator2 = Generator2(input_length=reshaped_data.shape[1])
discriminator2 = Discriminator2(input_length=reshaped_data.shape[1])
gen2, disc2, gen_loss2, disc_loss2 = train_gan(
    true_data=reshaped_data,
    training_steps=500,
    generator=generator2,
    discriminator=discriminator2,
    lr=5e-5,
)

In [None]:
# Per-step discriminator BCE on generated vs. real batches.
# float() accepts plain numbers and 0-d tensors alike, including tensors that
# still track gradients (which cannot be converted to numpy for plotting).
gen_loss_values = [float(v) for v in gen_loss2]
true_loss_values = [float(v) for v in disc_loss2]

plt.plot(gen_loss_values)
plt.plot(true_loss_values)
plt.legend(["Generated Data", "True Data"])
plt.xlabel("Epoch")
plt.ylabel("Binary Cross-Entropy")
plt.show()

In [None]:
# Draw one uniform noise vector per real window and map it through the generator.
noise = torch.rand(reshaped_data.shape[0], reshaped_data.shape[1])
with torch.no_grad():  # inference only, so skip building the autograd graph
    generated_data2 = gen2(noise)
# Restore the (n_windows, window_len, n_features) layout of `data`.
generated_data2 = generated_data2.reshape(data.shape[0], data.shape[1], data.shape[2])
generated_data2 = generated_data2.numpy()

In [None]:
# First feature of the first generated window, across its time steps.
plt.plot(generated_data2[0][:, 0], label='generated data for GAN2')
plt.legend()
plt.show()

In [None]:
# Same feature for generated window 1000, to compare against the first one.
plt.plot(generated_data2[1000][:, 0], label='generated data for GAN2')
plt.legend()
plt.show()

There is still a problem: if we generate 1000 examples, they are all the same, which suggests the generator has collapsed to a single output (mode collapse).

# Analyzing Results

In [None]:
# Rebuild the scaler and the windowed data for the analysis section.
# This repeats the preparation done before training; prep_data shuffles, so
# the window order in `data` differs from the order used during training.
# Standardize every feature column (swap in MinMaxScaler() for [0, 1] scaling).
scaler = StandardScaler()
df = pd.read_csv("./data/features.csv")
df['Date'] = pd.to_datetime(df['Date'])
# Index by date and sort chronologically so windows follow time order.
# The frame was just re-read from disk, so 'Date' is always a column here;
# any failure should surface instead of being silently swallowed.
df = df.set_index('Date').sort_index()

# prep data: drop the columns not used by the GAN, then build 30-row windows
df = df.drop(['Close-MA 20D', 'RSI 14D','Realized Volatility 30D','VIX Move'], axis=1)
data = prep_data(df.values, 30, scaler)

# check size
print(len(data), data[0].shape)
data = np.array(data)
# flatten each (window_len, n_features) window into one row
reshaped_data = data.reshape(len(data), data.shape[1] * data.shape[2])

print(data.shape)

In [None]:
# Preview the date-indexed frame after the unused columns were dropped.
df.head(n=5)

## Visualizing synthetic data for GAN2

In [None]:
cols = [
    "Return","Open-Close",'Open-Low',"Open-High","Normalized Volume", "VIX", "VIX Open-close"
]

# Plot one randomly chosen window per feature, real vs. synthetic, after
# mapping both back to the original feature units with the fitted scaler.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))
axes = axes.flatten()

# Pick a window index that is valid for both arrays (axis 0 is the window axis).
obs = np.random.randint(min(len(data), len(generated_data2)))

# The scaler was fitted on 2-D (rows, features) input, so collapse the window
# and time axes before inverting and restore the 3-D layout afterwards.
n_features = data.shape[-1]
unscaled_data = scaler.inverse_transform(
    data.reshape(-1, n_features)
).reshape(data.shape)
unscaled_generated2 = scaler.inverse_transform(
    generated_data2.reshape(-1, n_features)
).reshape(generated_data2.shape)

for j, col in enumerate(cols):
    frame = pd.DataFrame({'Real': unscaled_data[obs][:, j],
                          'Synthetic': unscaled_generated2[obs][:, j]})
    # secondary_y must name an existing column to put it on the right-hand axis
    frame.plot(ax=axes[j],
               title=col,
               secondary_y='Synthetic', style=['-', '--'])

# Hide the grid cells that have no feature assigned.
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)
fig.tight_layout()


## PCA for GAN2

In [None]:
# Window length is axis 1 of `data` (30 for the windows built above).
seq_len = data.shape[1]
sample_size = 100
# get random indexes
idx = np.random.permutation(len(data))[:sample_size]
real_sample = data[idx]
synthetic_sample = generated_data2[idx]

# Windows are laid out as (sample, time, feature). Move time to the last axis
# before flattening so that every row is one feature's seq_len-step series;
# reshaping directly would mix several time steps and features in each row.
real_data_reduced = real_sample.transpose(0, 2, 1).reshape(-1, seq_len)
synth_data_reduced = synthetic_sample.transpose(0, 2, 1).reshape(-1, seq_len)

n_components = 2
pca = PCA(n_components=n_components)

# Fit on real data only, then project both sets into the same 2-D space.
pca.fit(real_data_reduced)

pca_real = pd.DataFrame(pca.transform(real_data_reduced))
pca_synth = pd.DataFrame(pca.transform(synth_data_reduced))

# Real rows first, synthetic rows second (used by the t-SNE cell).
data_reduced = np.concatenate((real_data_reduced, synth_data_reduced), axis=0)

In [None]:

# Scatter the first two principal components of real and synthetic rows.
fig = plt.figure(constrained_layout=True, figsize=(10, 5))
spec = gridspec.GridSpec(ncols=2, nrows=1, figure=fig)

ax = fig.add_subplot(spec[0, 0])
ax.set_title('PCA on baseline GAN')

# PCA scatter plot
ax.scatter(
    pca_real.iloc[:, 0].values,
    pca_real.iloc[:, 1].values,
    c='black',
    alpha=0.2,
    label='Original',
)
ax.scatter(
    pca_synth.iloc[:, 0],
    pca_synth.iloc[:, 1],
    c='red',
    alpha=0.2,
    label='Synthetic',
)

ax.legend()

Better, but the same problem remains: if I generate 6000 samples, they are all exactly the same, which is why the synthetic data collapses to a single point in the PCA plot. At least this time the points are nearer to the true distribution.

## t-SNE for GAN2

In [None]:
# TSNE is imported in the first cell of the notebook.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=n_components, n_iter=300)

tsne_results = pd.DataFrame(tsne.fit_transform(data_reduced))

# data_reduced stacks the real rows first and the synthetic rows after them,
# so the split point is the number of real rows rather than a fixed 700.
n_real = len(real_data_reduced)

fig = plt.figure(constrained_layout=True, figsize=(10, 5))
spec = gridspec.GridSpec(ncols=2, nrows=1, figure=fig)

ax2 = fig.add_subplot(spec[0,0])
ax2.set_title('t-SNE on baseline GAN')

# t-SNE scatter plot
ax2.scatter(tsne_results.iloc[:n_real, 0].values, tsne_results.iloc[:n_real, 1].values,
            c='black', alpha=0.2, label='Original')
ax2.scatter(tsne_results.iloc[n_real:, 0], tsne_results.iloc[n_real:, 1],
            c='red', alpha=0.2, label='Synthetic')

ax2.legend()

In [None]:
# Persist the real windows and the synthetic windows for later reuse.
for path, payload in (("data.pkl", data), ("gen_data.pkl", generated_data2)):
    with open(path, "wb") as f:
        pkl.dump(payload, f)

# Show the first two synthetic windows to compare them by eye.
print(generated_data2[0], generated_data2[1], sep="\n")