In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from grid_search_parallelized import GAN,Generator, Discriminator, Params
from utils import *
from tqdm import tqdm
import json
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
# Relative directory prefix (with trailing slash) that is prepended to the
# file name of every figure saved by this notebook.
IMAGES_PATH = 'images/'

## importing data

In [None]:
# Load the training set: the CSV has no header row and its first column is
# used as the index; the remaining columns are then named stock1..stock4.
header = ["stock1", "stock2", "stock3", "stock4"]
file_path = 'data_train_log_return.csv'
df_train = pd.read_csv(
    file_path,
    index_col=0,
    header=None,
)
df_train.columns = header

In [None]:
# Summary statistics of the training data, emitted as a LaTeX table.
train_summary = df_train.describe()
print(train_summary.to_latex())

## plotting distribution and correlation matrix

In [None]:
# Second-order statistics of the training data; both are reused by later cells.
correlation_matrix = df_train.corr()
cov_matrix = df_train.cov()

# One panel per column on a 2x2 grid (filled row by row): density histogram
# with a KDE overlay.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
for ax, column_name in zip(axes.flat, df_train.columns):
    sns.histplot(
        df_train[column_name],
        kde=True,
        stat='density',
        label='Original Data',
        color='blue',
        alpha=0.5,
        ax=ax,
    )
    ax.set_title(f'{column_name} Distribution')
    ax.set_xlabel(column_name)
    ax.legend()

fig.savefig(IMAGES_PATH + 'true_data_distribution.png')

### correlation matrix

In [None]:
# Annotated heatmap of the training-data correlation matrix, colour scale
# centred on zero, written to the images folder.
plt.figure(figsize=(10, 8))
palette = sns.diverging_palette(h_neg=20, h_pos=220)
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap=palette,
    center=0,
)
plt.title('Original data - Log returns correlation')
plt.tight_layout()
plt.savefig(IMAGES_PATH + 'true_data_correlation_matrix.png')

## 1. Analytical solution using Cholesky decomposition

In [None]:

# Draw as many synthetic rows as there are training rows. `cholesky` is not
# defined in this notebook (presumably provided by `utils`); it receives the
# sample count and the training covariance matrix.
n_rows = df_train.shape[0]
synthetic_data = pd.DataFrame(
    cholesky(n_rows, cov_matrix),
    columns=df_train.columns,
)
synthetic_data.head()

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# (frame, legend label, colour) for the two histograms overlaid in each panel.
layers = [
    (df_train, 'Original Data', 'blue'),
    (synthetic_data, 'Synthetic Data', 'orange'),
]

# Panels are filled row by row, one per training column.
for ax, column_name in zip(axes.flat, df_train.columns):
    for frame, label, color in layers:
        sns.histplot(
            frame[column_name],
            kde=True,
            label=label,
            stat='density',
            color=color,
            alpha=0.5,
            ax=ax,
        )
    ax.set_title(f'{column_name} Distribution')
    ax.set_xlabel(column_name)
    ax.legend()

plt.tight_layout()
plt.show()
fig.savefig(IMAGES_PATH + 'cholesky_synthetic_data')

In [None]:

# Correlation heatmap of the synthetic sample, styled like the one drawn for
# the original data.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    synthetic_data.corr(),
    annot=True,
    fmt='.2f',
    cmap=sns.diverging_palette(h_neg=20, h_pos=220),
    center=0,
)
heatmap_ax.set(title='synthetic data - Log returns correlation')
plt.tight_layout()
plt.savefig(IMAGES_PATH + 'cholesky_synthetic_correlation_matrix.png')

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
axs = axs.ravel()  # 1-D view so panels can be paired with columns directly

for ax, column in zip(axs, df_train.columns):
    # (series, legend label, colour) for the two curves of this panel.
    curves = [
        (df_train[column], 'True Distribution', 'blue'),
        (synthetic_data[column], 'Synthetic Distribution', 'red'),
    ]
    for series, label, color in curves:
        # `compute_cdf` is an external helper; it returns the x and y
        # coordinates that are plotted as the CDF curve.
        xs, ys = compute_cdf(series)
        ax.plot(xs, ys, label=label, color=color)

    ax.set_title(f'CDF of {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('CDF')
    ax.legend()

fig.tight_layout()
fig.show()
fig.savefig(IMAGES_PATH + 'cholesky_cdf_synthetic_correlation_matrix.png')

In [None]:
# Repeated evaluation of the Cholesky sampler: 200 times, draw 410 training
# rows, generate 410 synthetic rows from that sample's covariance, and record
# both metrics. `AndersonDarling` and `kendall_tau_distance` are helpers that
# are not defined in this notebook.
test_size = 410
ad, kendall = [], []
for trial in tqdm(range(200)):
    test_sample = df_train.sample(test_size)
    cov_matrix_sample = test_sample.cov()
    synthetic_data = cholesky(test_size, cov_matrix_sample)
    synthetic_frame = pd.DataFrame(synthetic_data, columns=test_sample.columns)
    ad.append(AndersonDarling(synthetic_data, test_sample.values))
    kendall.append(kendall_tau_distance(synthetic_frame, test_sample))



In [None]:
# Gather the per-iteration metrics collected in `ad` / `kendall` and draw one
# histogram per metric. matplotlib is already imported in the notebook's
# import cell, so it is not re-imported here.
simulations = pd.DataFrame({'AndersonDarling': ad, 'Kendall': kendall})
df = simulations.copy()  # both names are kept; `df` stays an independent frame

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

# (column, panel title, x-axis label) for each of the two stacked panels.
panels = [
    ('AndersonDarling', 'Histogram of Anderson-Darling Test Results', 'Anderson-Darling Statistic'),
    ('Kendall', 'Histogram of Kendall Tau Distance Results', 'Kendall Tau Distance'),
]
for ax, (metric, title, xlabel) in zip(axes, panels):
    ax.hist(df[metric], bins=20, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

fig.savefig(IMAGES_PATH + 'cholesky_iterations_metrics')

## 2. Gaussian mixture models

In [None]:

# Random-split search for a Gaussian mixture model.
#
# For each of `iterations` train/test splits (seeded by the iteration number)
# a fresh GMM is fitted on the train part, a synthetic sample of the same size
# as the test part is drawn, and the Anderson-Darling distance to the test
# part decides whether this fit becomes the new best.
#
# Fixes relative to the previous version of this cell:
#   * the `if best_weights is None` test was inverted, so `weights_init` was
#     only ever passed as None and the best weights were never reused;
#   * the `else` branch built a GaussianMixture but discarded it;
#   * after the loop `gmm` was the LAST fitted model, not the best one.
iterations = 200
n_components = 10
best_anderling = 1e6
best_kendall = 1e6
best_weights = None
best_gmm = None
for i in tqdm(range(iterations)):
    train, test = train_test_split(df_train, test_size=0.6, random_state=i)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # weights_init is None on the first pass (scikit-learn then initialises
    # the weights itself); afterwards the fit starts from the mixture weights
    # of the best model found so far.
    candidate = GaussianMixture(n_components=n_components,
                                covariance_type="full",
                                weights_init=best_weights)
    candidate.fit(train)

    # Draw as many synthetic rows as there are rows in the held-out part.
    synthetic_data, _ = candidate.sample(test.shape[0])
    synthetic_data = pd.DataFrame(synthetic_data, columns=test.columns)
    distance = AndersonDarling(test.values, synthetic_data.values)
    kendall = kendall_tau_distance(test, synthetic_data)
    if distance < best_anderling:
        print('new best', distance)
        best_anderling = distance
        best_kendall = kendall
        best_weights = candidate.weights_
        best_gmm = candidate

if best_gmm is None:
    # Only reachable if no distance was ever below the initial bound
    # (e.g. every distance was NaN).
    raise RuntimeError('no GMM fit improved on the initial Anderson-Darling bound')

# Later cells sample from `gmm`; bind it to the best model found.
gmm = best_gmm


In [None]:
# Sample as many rows from the GMM as there are training rows and compare the
# per-column CDF curves.
gmm_draw, _ = gmm.sample(df_train.shape[0])
synthetic_data = pd.DataFrame(gmm_draw, columns=df_train.columns.tolist())

fig, axs = plt.subplots(2, 2, figsize=(10, 8))
axs = axs.ravel()  # 1-D view so panels can be paired with columns directly

for ax, column in zip(axs, df_train.columns):
    # (series, legend label, colour) for the two curves of this panel.
    curves = [
        (df_train[column], 'True Distribution', 'blue'),
        (synthetic_data[column], 'Synthetic Distribution', 'red'),
    ]
    for series, label, color in curves:
        xs, ys = compute_cdf(series)
        ax.plot(xs, ys, label=label, color=color)

    ax.set_title(f'CDF of {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('CDF')
    ax.legend()

fig.tight_layout()
fig.show()
fig.savefig(IMAGES_PATH + 'GMM_cdf_synthetic_correlation_matrix.png')

In [None]:
# Repeated evaluation of the fitted GMM: 200 times, draw 410 training rows and
# 410 GMM samples, and record both metrics. `AndersonDarling` and
# `kendall_tau_distance` are helpers that are not defined in this notebook.
test_size = 410
ad = []
kendall = []
for i in tqdm(range(200)):
    test_sample = df_train.sample(test_size)
    synthetic_data, _ = gmm.sample(test_size)
    synthetic_frame = pd.DataFrame(synthetic_data, columns=test_sample.columns)
    ad.append(AndersonDarling(synthetic_data, test_sample.values))
    kendall.append(kendall_tau_distance(synthetic_frame, test_sample))

# matplotlib is already imported in the notebook's import cell, so it is not
# re-imported here; the metrics frame is built once.
simulations = pd.DataFrame({'AndersonDarling': ad, 'Kendall': kendall})
df = simulations.copy()  # both names are kept; `df` is read by the next cell

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

# (column, panel title, x-axis label) for each of the two stacked panels.
panels = [
    ('AndersonDarling', 'Histogram of Anderson-Darling Test Results', 'Anderson-Darling Statistic'),
    ('Kendall', 'Histogram of Kendall Tau Distance Results', 'Kendall Tau Distance'),
]
for ax, (metric, title, xlabel) in zip(axes, panels):
    ax.hist(df[metric], bins=20, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

fig.savefig(IMAGES_PATH + 'GMM_iterations_metrics')

In [None]:
# Summary statistics of the metric frame `df`, emitted as a LaTeX table.
metrics_summary = df.describe()
print(metrics_summary.to_latex())

In [None]:

# Fresh GMM sample with as many rows as the training set, overlaid on the
# training histograms column by column.
gmm_draw, _ = gmm.sample(df_train.shape[0])
synthetic_data = pd.DataFrame(gmm_draw, columns=df_train.columns)
feature_names = df_train.columns
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# (frame, legend label, colour) for the two histograms overlaid in each panel.
layers = [
    (df_train, 'Original Data', 'blue'),
    (synthetic_data, 'Synthetic Data', 'orange'),
]

# Panels are filled row by row, one per training column.
for ax, column_name in zip(axes.flat, df_train.columns):
    for frame, label, color in layers:
        sns.histplot(
            frame[column_name],
            kde=True,
            label=label,
            stat='density',
            color=color,
            alpha=0.5,
            ax=ax,
        )
    ax.set_title(f'{column_name} Distribution')
    ax.set_xlabel(column_name)
    ax.legend()

plt.tight_layout()
plt.show()
fig.savefig(IMAGES_PATH + 'GMM_synthetic_data')

In [None]:

# Correlation heatmap of the current GMM sample held in `synthetic_data`.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    synthetic_data.corr(),
    annot=True,
    fmt='.2f',
    cmap=sns.diverging_palette(h_neg=20, h_pos=220),
    center=0,
)
heatmap_ax.set(title='GMM synthetic data - Log returns correlation')
plt.tight_layout()
plt.savefig(IMAGES_PATH + 'GMM_synthetic_correlation_matrix.png')

In [None]:
# Summary statistics of the current synthetic sample, as a LaTeX table.
synthetic_summary = synthetic_data.describe()
print(synthetic_summary.to_latex())

## 3. GAN Approach

In [41]:
# NOTE(review): every line of this cell is commented out, so running it does
# nothing; the table displayed beneath it is presumably output saved from an
# earlier run with the code enabled — confirm.
# NOTE(review): the disabled code uses `os.listdir` / `os.path`, but `os` is
# not imported explicitly in the notebook's import cell — verify before
# re-enabling.
# def aggregate_results(results_dir='results_old'):
#     all_results = []
#     # Loop through each combination folder in the results directory
#     for combi_name in tqdm(os.listdir(results_dir)):
#         combi_dir = os.path.join(results_dir, combi_name)
#         if os.path.isdir(combi_dir):
#             csv_file = os.path.join(combi_dir, 'results.csv')
#             if os.path.exists(csv_file):
#                 # Read the CSV file and append it to the list
#                 df = pd.read_csv(csv_file)
#                 df['Combination'] = combi_name  # Optionally, add a column indicating the combination
#                 all_results.append(df)

#     # Concatenate all dataframes into one
#     df =  pd.concat(all_results, ignore_index=True)
#     df = df.sort_values(by ='mean_anderling_distance')
#     return df


# results = aggregate_results('results').set_index('name')
# results.to_csv('submission/grid_search_results.csv')
# results.sample(10)

Unnamed: 0_level_0,g_number_layer,g_number_neuron,g_hidden_activation,g_output_activation,d_number_layer,d_number_neuron,d_hidden_activation,d_output_activation,latent_dim,mean_anderling_distance,mean_kendall_tau,Combination
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Generator_16_relu_softplus_Discriminator_8_relu_softplus_LatentDim_100,1,[16],['relu'],softplus,1,[8],['relu'],softplus,100,1.146847,0.074504,Generator_16_relu_softplus_Discriminator_8_rel...
Generator_16_softplus_16_softplus_softplus_Discriminator_16_softplus_16_softplus_sigmoid_LatentDim_20,2,"[16, 16]","['softplus', 'softplus']",softplus,2,"[16, 16]","['softplus', 'softplus']",sigmoid,20,39.332153,0.159008,Generator_16_softplus_16_softplus_softplus_Dis...
Generator_8_leaky_relu_8_leaky_relu_softplus_Discriminator_64_leaky_relu_softplus_LatentDim_50,2,"[8, 8]","['leaky_relu', 'leaky_relu']",softplus,1,[64],['leaky_relu'],softplus,50,7.404184,0.045845,Generator_8_leaky_relu_8_leaky_relu_softplus_D...
Generator_32_leaky_relu_softplus_Discriminator_8_softplus_8_softplus_sigmoid_LatentDim_100,1,[32],['leaky_relu'],softplus,2,"[8, 8]","['softplus', 'softplus']",sigmoid,100,2.324328,0.036774,Generator_32_leaky_relu_softplus_Discriminator...
Generator_64_softplus_64_softplus_softplus_Discriminator_8_softplus_8_softplus_softplus_LatentDim_20,2,"[64, 64]","['softplus', 'softplus']",softplus,2,"[8, 8]","['softplus', 'softplus']",softplus,20,25.638776,0.097676,Generator_64_softplus_64_softplus_softplus_Dis...
Generator_64_relu_64_relu_softplus_Discriminator_64_softplus_softplus_LatentDim_20,2,"[64, 64]","['relu', 'relu']",softplus,1,[64],['softplus'],softplus,20,11.646904,0.041475,Generator_64_relu_64_relu_softplus_Discriminat...
Generator_32_softplus_32_softplus_softplus_Discriminator_32_relu_32_relu_sigmoid_LatentDim_100,2,"[32, 32]","['softplus', 'softplus']",softplus,2,"[32, 32]","['relu', 'relu']",sigmoid,100,10.142577,0.09336,Generator_32_softplus_32_softplus_softplus_Dis...
Generator_64_softplus_softplus_Discriminator_8_leaky_relu_sigmoid_LatentDim_100,1,[64],['softplus'],softplus,1,[8],['leaky_relu'],sigmoid,100,69.971989,0.057828,Generator_64_softplus_softplus_Discriminator_8...
Generator_8_relu_softplus_Discriminator_64_leaky_relu_64_leaky_relu_sigmoid_LatentDim_20,1,[8],['relu'],softplus,2,"[64, 64]","['leaky_relu', 'leaky_relu']",sigmoid,20,35.413362,0.057346,Generator_8_relu_softplus_Discriminator_64_lea...
Generator_64_softplus_64_softplus_softplus_Discriminator_8_softplus_softplus_LatentDim_20,2,"[64, 64]","['softplus', 'softplus']",softplus,1,[8],['softplus'],softplus,20,56.742621,0.101814,Generator_64_softplus_64_softplus_softplus_Dis...


## Loading weights

The best model's parameters have been moved to the `submission` folder.

In [42]:
# Read the stored model configuration, then pull out the latent size and the
# generator / discriminator sub-configurations used to rebuild the networks.
with open("submission/model_params.json", "r") as json_file:
    config = json.load(json_file)

latent_dim = config['latent_dim']
g_config, d_config = config['generator'], config['discriminator']

config

{'generator': {'num_layers': 1,
  'neurons_per_layer': [16],
  'hidden_activation': ['relu'],
  'output_activation': 'softplus'},
 'discriminator': {'num_layers': 1,
  'neurons_per_layer': [64],
  'hidden_activation': ['leaky_relu'],
  'output_activation': 'sigmoid'},
 'latent_dim': 100}

In [None]:

# Parameter object for the GAN wrapper: the latent size comes from the loaded
# config, and the epoch count is zeroed because the weights are already trained.
opt = Params()
opt.latent_dim = latent_dim
opt.n_epochs = 0

def generate_noise(n_samples, latent_dim=None, rho=0.75, rng=None):
    """Draw correlated Gaussian noise and append its squares and cubes.

    The base noise is sampled from a zero-mean multivariate normal whose
    covariance entry (i, j) is ``rho ** |i - j|``: ones on the diagonal and a
    deterministic geometric decay away from it (no random entries).

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    latent_dim : int, optional
        Size of the base noise vector. When omitted, the notebook-level
        ``opt.latent_dim`` is used, as before.
    rho : float, optional
        Base of the covariance decay; the default 0.75 is the value that was
        previously hard-coded.
    rng : numpy.random.Generator, optional
        Source of randomness, for reproducible draws. When omitted, NumPy's
        global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, 3 * latent_dim)`` laid out along axis 1
        as ``[noise, noise**2, noise**3]``.
    """
    if latent_dim is None:
        latent_dim = opt.latent_dim

    positions = np.arange(latent_dim)
    covariance_matrix = rho ** np.abs(np.subtract.outer(positions, positions))

    sampler = np.random if rng is None else rng
    noise = sampler.multivariate_normal(mean=np.zeros(latent_dim),
                                        cov=covariance_matrix,
                                        size=n_samples)
    return np.concatenate([noise, noise ** 2, noise ** 3], axis=1)

# Rebuild the generator from the stored configuration and restore its weights.
generator = Generator(latent_dim, output_shape=opt.shape_data, **g_config)
generator.model.load_weights('submission/generator_weights.h5')

# The discriminator is built from its configuration only (no weights are
# loaded for it here); both networks are then wrapped in the GAN object.
discriminator = Discriminator(opt.shape_data, **d_config)
gan = GAN(generator, discriminator, opt)

In [None]:
# One generated row per training row, labelled with the training column names.
n_rows = df_train.shape[0]
noise = generate_noise(n_rows)
synthetic_data_gan = pd.DataFrame(
    generator.model.predict(noise),
    columns=df_train.columns,
)
synthetic_data_gan

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
axs = axs.ravel()  # 1-D view so panels can be paired with columns directly

for ax, column in zip(axs, df_train.columns):
    # The training column is multiplied by 100 before comparison; presumably
    # the generator output is on that scale (the submission cell divides the
    # generated values by 100) — confirm.
    curves = [
        (df_train[column] * 100, 'True Distribution', 'blue'),
        (synthetic_data_gan[column], 'Synthetic Distribution', 'red'),
    ]
    for series, label, color in curves:
        xs, ys = compute_cdf(series)
        ax.plot(xs, ys, label=label, color=color)

    ax.set_title(f'CDF of {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('CDF')
    ax.legend()

fig.tight_layout()
fig.show()
fig.savefig(IMAGES_PATH + 'GAN_cdf_synthetic.png')

In [None]:
# NOTE(review): this repeats the Cholesky evaluation loop inside the GAN
# section; the following cell resets `ad` and `kendall` before filling them
# with GAN results, so the values collected here are not used by it.
test_size = 410
ad, kendall = [], []
for trial in tqdm(range(200)):
    test_sample = df_train.sample(test_size)
    cov_matrix_sample = test_sample.cov()
    synthetic_data = cholesky(test_size, cov_matrix_sample)
    synthetic_frame = pd.DataFrame(synthetic_data, columns=test_sample.columns)
    ad.append(AndersonDarling(synthetic_data, test_sample.values))
    kendall.append(kendall_tau_distance(synthetic_frame, test_sample))



In [None]:
# Repeated evaluation of the generator: 200 times, draw 410 training rows
# (multiplied by 100) and 410 generated rows, and record both metrics.
# `AndersonDarling` and `kendall_tau_distance` are helpers that are not
# defined in this notebook.
test_size = 410
ad = []
kendall = []
for i in tqdm(range(200)):
    test_sample = df_train.sample(test_size) * 100
    noise = generate_noise(test_size)
    synthetic_data = generator.model.predict(noise)
    synthetic_frame = pd.DataFrame(synthetic_data, columns=test_sample.columns)

    ad.append(AndersonDarling(synthetic_data, test_sample.values))
    kendall.append(kendall_tau_distance(synthetic_frame, test_sample))

# matplotlib is already imported in the notebook's import cell, so it is not
# re-imported here; the metrics frame is built once.
simulations = pd.DataFrame({'AndersonDarling': ad, 'Kendall': kendall})
df = simulations.copy()  # both names are kept; `df` is read by the next cell

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

# (column, panel title, x-axis label) for each of the two stacked panels.
panels = [
    ('AndersonDarling', 'Histogram of Anderson-Darling Test Results', 'Anderson-Darling Statistic'),
    ('Kendall', 'Histogram of Kendall Tau Distance Results', 'Kendall Tau Distance'),
]
for ax, (metric, title, xlabel) in zip(axes, panels):
    ax.hist(df[metric], bins=20, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

fig.savefig(IMAGES_PATH + 'GAN_iterations_metrics')

In [None]:
# Summary statistics of the metric frame `df`, emitted as a LaTeX table.
metrics_summary = df.describe()
print(metrics_summary.to_latex())

In [None]:
# Fresh generator sample with as many rows as the training set, overlaid on
# the training histograms (training values multiplied by 100) per column.
noise = generate_noise(df_train.shape[0])
synthetic_data_gan = pd.DataFrame(
    generator.model.predict(noise),
    columns=df_train.columns,
)
feature_names = df_train.columns
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Panels are filled row by row, one per training column.
for ax, column_name in zip(axes.flat, df_train.columns):
    # (series, legend label, colour) for the two overlaid histograms.
    layers = [
        (df_train[column_name] * 100, 'Original Data', 'blue'),
        (synthetic_data_gan[column_name], 'Synthetic Data', 'orange'),
    ]
    for series, label, color in layers:
        sns.histplot(
            series,
            kde=True,
            label=label,
            stat='density',
            color=color,
            alpha=0.5,
            ax=ax,
        )
    ax.set_title(f'{column_name} Distribution')
    ax.set_xlabel(column_name)
    ax.legend()

plt.tight_layout()
plt.show()
fig.savefig(IMAGES_PATH + 'GAN_synthetic_data')

In [None]:

# Correlation heatmap of the generator sample held in `synthetic_data_gan`.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    synthetic_data_gan.corr(),
    annot=True,
    fmt='.2f',
    cmap=sns.diverging_palette(h_neg=20, h_pos=220),
    center=0,
)
heatmap_ax.set(title='synthetic data - Log returns correlation')
plt.tight_layout()
plt.savefig(IMAGES_PATH + 'GAN_synthetic_correlation_matrix.png')

## Generating submission files

In [None]:
# Generate the 410-row sample: the generator output is divided by 100 and then
# passed to the external `compare` helper together with 410 training rows.
n2 = 410
noise = generate_noise(n2)
generated = generator.model.predict(noise)
synthetic_data_gan = pd.DataFrame(generated, columns=df_train.columns) / 100
compare(synthetic_data_gan, df_train.sample(n2))

In [None]:
# Store the noise matrix currently held in `noise` so that the generated
# sample can be rebuilt from it later; the DataFrame index is written too.
noise = pd.DataFrame(data=noise)
noise_path = 'submission/noise.csv'
noise.to_csv(noise_path)
noise.head()

In [None]:
# Write the generated sample (index included) to the submission folder.
submission_path = 'submission/synthetic_data.csv'
synthetic_data_gan.to_csv(submission_path)

### Reproducing results

In [43]:
# Repeats the data and configuration loading done earlier in the notebook.
header = ["stock1", "stock2", "stock3", "stock4"]
file_path = 'data_train_log_return.csv'
df_train = pd.read_csv(
    file_path,
    index_col=0,
    header=None,
)
df_train.columns = header

# Stored model configuration: latent size plus one sub-config per network.
with open("submission/model_params.json", "r") as json_file:
    config = json.load(json_file)

latent_dim = config['latent_dim']
g_config, d_config = config['generator'], config['discriminator']

# Parameter object for the GAN wrapper; the epoch count is zeroed because the
# weights are already trained.
opt = Params()
opt.latent_dim = latent_dim
opt.n_epochs = 0

def generate_noise(n_samples, latent_dim=None, rho=0.75, rng=None):
    """Draw correlated Gaussian noise and append its squares and cubes.

    The base noise is sampled from a zero-mean multivariate normal whose
    covariance entry (i, j) is ``rho ** |i - j|``: ones on the diagonal and a
    deterministic geometric decay away from it (no random entries).

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    latent_dim : int, optional
        Size of the base noise vector. When omitted, the notebook-level
        ``opt.latent_dim`` is used, as before.
    rho : float, optional
        Base of the covariance decay; the default 0.75 is the value that was
        previously hard-coded.
    rng : numpy.random.Generator, optional
        Source of randomness, for reproducible draws. When omitted, NumPy's
        global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, 3 * latent_dim)`` laid out along axis 1
        as ``[noise, noise**2, noise**3]``.
    """
    if latent_dim is None:
        latent_dim = opt.latent_dim

    positions = np.arange(latent_dim)
    covariance_matrix = rho ** np.abs(np.subtract.outer(positions, positions))

    sampler = np.random if rng is None else rng
    noise = sampler.multivariate_normal(mean=np.zeros(latent_dim),
                                        cov=covariance_matrix,
                                        size=n_samples)
    return np.concatenate([noise, noise ** 2, noise ** 3], axis=1)

# Rebuild the generator from the stored configuration and restore its weights.
generator = Generator(latent_dim, output_shape=opt.shape_data, **g_config)
generator.model.load_weights('submission/generator_weights.h5')

# The discriminator is built from its configuration only (no weights are
# loaded for it here); both networks are then wrapped in the GAN object.
discriminator = Discriminator(opt.shape_data, **d_config)
gan = GAN(generator, discriminator, opt)

# Regenerate the sample from the noise matrix stored on disk (first CSV
# column is the index) and divide the generator output by 100.
noise = pd.read_csv('submission/noise.csv', index_col=0).values
predicted = generator.model.predict(noise)
synthetic_data = pd.DataFrame(predicted, columns=df_train.columns) / 100
synthetic_data





Unnamed: 0,stock1,stock2,stock3,stock4
0,0.008924,0.001562,0.008722,0.009693
1,0.029719,0.023891,0.018772,0.019281
2,0.016568,0.004815,0.006206,0.012109
3,0.031298,0.058209,0.002368,0.007062
4,0.028111,0.008061,0.004144,0.034699
...,...,...,...,...
405,0.014071,0.037233,0.006712,0.009907
406,0.006035,0.001339,0.009284,0.004527
407,0.017424,0.018642,0.003280,0.002421
408,0.035604,0.014567,0.005901,0.020795
