### Importació de llibreries

In [None]:
import yfinance as yf
import datetime
import os
import pandas as pd
import numpy as np
import keras
import graphviz
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Reshape, Flatten, Conv1D, Conv1DTranspose, LeakyReLU, Concatenate
from keras.optimizers.legacy import Adam, Adamax
from numpy import expand_dims, ones, zeros, vstack
from numpy.random import randn, randint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.layers import LSTM
from tensorflow.keras.utils import plot_model
from IPython.display import clear_output

from keras.layers import Input, RepeatVector, TimeDistributed, Permute
from matplotlib import pyplot as plt

### Càrrega de dades

In [None]:
# Make the Graphviz executables discoverable on Windows.
# PATH entries must be plain directories: a trailing '\*' wildcard is not
# expanded, so the directory itself is appended (once, even if the cell is
# re-run).
graphviz_bin = 'C:\\Program Files\\Graphviz\\bin'
if graphviz_bin not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + graphviz_bin

# Project layout: the 'data' and 'src' folders live next to the folder this
# notebook runs from.
cwd = os.getcwd()
cwd_parent = os.path.dirname(cwd)
data_path = os.path.join(cwd_parent, 'data')
src_path = os.path.join(cwd_parent, 'src')
# The CSV below is written into data_path, so make sure it exists.
os.makedirs(data_path, exist_ok=True)

# Date range of the price history to download
start_date = "2005-01-01"
end_date = "2023-12-31"

start_year = int(start_date[:4])
end_year = int(end_date[:4])
company = 'MMM'

csv_path = os.path.join(data_path, f'{company}_1d_{start_year}_to_{end_year}.csv')

# Download the price history at daily ('1d') resolution
s_and_p_data = yf.download(company, start=start_date, end=end_date, interval='1d')
# Make sure the index holds datetime values
s_and_p_data.index = pd.to_datetime(s_and_p_data.index)
# Turn the date index into a regular 'Date' column
s_and_p_data.reset_index(inplace=True)
# Work on a copy so the downloaded frame keeps only the original columns
data_features = s_and_p_data.copy()
# Day of the week derived from the date (pandas convention: Monday == 0)
data_features['Weekday'] = data_features['Date'].dt.dayofweek
data_features['Company'] = company
# NOTE: the (unnamed) integer index is written as the first CSV column, so it
# comes back as an 'Unnamed: 0' column when the file is read again.
data_features.to_csv(csv_path)

# Load the csv file that was just written
data = pd.read_csv(csv_path)


## Apliquem una rolling window amb numpy a les dades

In [None]:
# Feature matrix with one row per trading day and columns
# [Close, Volume, Weekday]
data_array = np.array(data_features[['Close', 'Volume', 'Weekday']])

window_len = 20

# Scale only the 'Volume' column (index 1); 'Close' and 'Weekday' keep their
# raw values
scaler = MinMaxScaler()
data_array[:, 1] = scaler.fit_transform(data_array[:, 1].reshape(-1, 1)).flatten()

# Each sample is a window of `window_len` consecutive rows and its target is
# the 'Close' value of the row right after the window.
# There are exactly len(data_array) - window_len such windows; iterating one
# fewer time (as before) left the last sample filled with zeros.
n_windows = len(data_array) - window_len
x_array = np.zeros((n_windows, window_len, data_array.shape[1]))
y_array = np.zeros(n_windows)
for row in range(n_windows):
    x_array[row] = data_array[row:row + window_len]
    y_array[row] = data_array[row + window_len][0]

print(data_array)
# Sanity check: the target of window 0 is the last 'Close' of window 1
print(y_array[0] == x_array[1, -1, 0])

## Separem entre entrenament i test

In [None]:
# Proporcions de les seccions
sections = [(0.15, 'train'), (0.05, 'test'), (0.15, 'train'), (0.05, 'test'), 
            (0.15, 'train'), (0.05, 'test'), (0.15, 'train'), (0.05, 'test'), 
            (0.15, 'train')]

def split_data_into_sections(x_array, y_array, sections):
    """
    Cut the data into consecutive slices and route each slice to train or test.

    The slices are taken in order from the start of the arrays. Each slice
    covers ``int(proportion * len(x_array))`` rows. A slice whose label is
    neither 'train' nor 'test' is skipped, but its rows are still consumed.

    Parameters:
    - x_array: Sequence of feature samples.
    - y_array: Sequence of targets, aligned with x_array.
    - sections: Iterable of (proportion, label) tuples.

    Returns:
    - x_train, y_train, x_test, y_test: Python lists with the selected samples.
    """
    x_train, y_train = [], []
    x_test, y_test = [], []
    n_rows = len(x_array)
    start = 0

    for fraction, kind in sections:
        stop = start + int(fraction * n_rows)
        window = slice(start, stop)
        # The cursor always advances, even when the slice is not kept
        start = stop

        if kind == 'train':
            features_out, targets_out = x_train, y_train
        elif kind == 'test':
            features_out, targets_out = x_test, y_test
        else:
            continue

        features_out.extend(x_array[window])
        targets_out.extend(y_array[window])

    return x_train, y_train, x_test, y_test

# Split the windows into train/test and turn each returned list into a numpy
# array in one step, so the four splits have the same type as the input.
x_train, y_train, x_test, y_test = (
    np.array(part)
    for part in split_data_into_sections(x_array, y_array, sections)
)

## Definim funcions de la C-GAN i la pròpia C-GAN

In [None]:
def generate_latent_points(latent_dim, n_samples):
    """Draw a batch of standard-normal latent vectors.

    Note: a later definition with the same name in this file replaces this one.

    Parameters:
    - latent_dim: Size of each latent vector.
    - n_samples: Number of latent vectors to draw.

    Returns:
    - Array of shape (n_samples, latent_dim).
    """
    # Draw every value in one flat call, then fold it into a batch
    flat_noise = randn(n_samples * latent_dim)
    return flat_noise.reshape(n_samples, latent_dim)

# The discriminator predicts real or fake
# y_discriminator = 1/0
# labels_discriminator = what conditions the generated value
# X_discriminator = the generated value itself

def generate_real_samples(x_data, y_data, n_samples):
    """Pick a random batch of real samples, all tagged as real (1).

    Note: a later definition with the same name in this file replaces this one.

    Parameters:
    - x_data: Array whose first axis indexes the samples.
    - y_data: Array aligned with x_data along the first axis.
    - n_samples: Number of rows to draw (with replacement).

    Returns:
    - [x_data rows, y_data rows], array of ones with shape (n_samples, 1).
    """
    # Random row positions; the same positions are used for both arrays
    picks = randint(0, x_data.shape[0], n_samples)
    # 1 means "real" for the discriminator
    real_flags = ones((n_samples, 1))
    return [x_data[picks], y_data[picks]], real_flags


def define_discriminator(n_labels, win_len, opt, loss):
    """Build and compile the conditional discriminator.

    The model takes the conditioning window and one candidate value and
    outputs a single sigmoid score per sample.

    Parameters:
    - n_labels: Number of features per time step of the conditioning window.
    - win_len: Number of time steps of the conditioning window.
    - opt: Optimizer passed to ``compile``.
    - loss: Loss passed to ``compile``.

    Returns:
    - Compiled Keras model with inputs [window (win_len, n_labels), value (1,)]
      and an output of shape (batch, 1).
    """
    # Conditioning window branch
    in_feat = Input(shape=(win_len, n_labels))
    in_feat_1 = LSTM(win_len * n_labels, return_sequences=True)(in_feat)
    in_feat_2 = LSTM(win_len * n_labels, return_sequences=False)(in_feat_1)
    in_feat_3 = Dense(win_len * n_labels)(in_feat_2)
    in_feat_4 = Reshape((win_len, n_labels))(in_feat_3)

    # Candidate value branch (a real target or the generator output),
    # projected to the same (win_len, n_labels) shape as the window branch
    x_disc = Input(shape=(1,))
    x_disc_1 = Dense(n_labels * win_len)(x_disc)
    x_disc_2 = Reshape((win_len, n_labels))(x_disc_1)

    # Merge both branches along the feature axis: (win_len, 2 * n_labels)
    merge = Concatenate()([in_feat_4, x_disc_2])

    # Dense acts on the last axis only, so applying it to the 3-D tensor
    # directly would give one score per time step, (batch, win_len, 1), while
    # the training targets have shape (n_samples, 1). Flatten first so the
    # model emits exactly one real/fake score per sample.
    flat = Flatten()(merge)
    out_layer = Dense(1, activation='sigmoid')(flat)

    # Define model
    model = Model([in_feat, x_disc], out_layer)

    # Compile model
    model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
    return model


def define_generator(latent_dim, n_labels, win_len):
    """Build the conditional generator (returned uncompiled).

    Parameters:
    - latent_dim: Length of the noise vector.
    - n_labels: Number of features per time step of the conditioning window.
    - win_len: Number of time steps of the conditioning window.

    Returns:
    - Keras model with inputs [window (win_len, n_labels), noise (latent_dim, 1)]
      and a single ReLU output unit.
    """
    # Conditioning branch: encode the window and fold it to (latent_dim, win_len)
    window_in = Input(shape=(win_len, n_labels))
    window_seq = LSTM(win_len * n_labels, return_sequences=True)(window_in)
    window_code = LSTM(latent_dim * win_len, return_sequences=False)(window_seq)
    window_grid = Reshape((latent_dim, win_len))(window_code)

    # Noise branch: encode the noise sequence back to (latent_dim, 1)
    noise_in = Input(shape=(latent_dim, 1))
    noise_code = LSTM(latent_dim, return_sequences=False)(noise_in)
    noise_act = LeakyReLU(alpha=0.2)(noise_code)
    noise_grid = Reshape((latent_dim, 1))(noise_act)

    # Join noise and condition on the last axis: (latent_dim, win_len + 1)
    joined = Concatenate()([noise_grid, window_grid])
    hidden_seq = LSTM(50, return_sequences=True)(joined)
    hidden_act = LeakyReLU(alpha=0.2)(hidden_seq)
    hidden_vec = LSTM(100, return_sequences=False)(hidden_act)
    # ReLU on the output keeps the generated value non-negative
    prediction = Dense(1, activation='relu')(hidden_vec)

    return Model([window_in, noise_in], prediction)



# Combined generator + discriminator model used to update the generator
def define_gan(generator, discriminator, opt, loss, latent_dim, win_len, n_labels):
    """Chain generator and discriminator into one compiled model.

    Parameters:
    - generator: Model mapping [window, noise] to a generated value.
    - discriminator: Model mapping [window, value] to a validity score.
    - opt: Optimizer passed to ``compile``.
    - loss: Loss passed to ``compile``.
    - latent_dim: Length of the noise vector.
    - win_len: Number of time steps of the conditioning window.
    - n_labels: Number of features per time step of the conditioning window.

    Returns:
    - Compiled Keras model with inputs [window, noise] and the discriminator
      score as output.
    """
    # Mark the discriminator as non-trainable before it is wired into the
    # combined model
    discriminator.trainable = False

    z_in = Input(shape=(latent_dim,))
    cond_in = Input(shape=(win_len, n_labels))

    # The same window conditions both the generator and the discriminator
    score = discriminator([cond_in, generator([cond_in, z_in])])

    combined = Model([cond_in, z_in], score)
    combined.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
    return combined


# Latent (noise) input for the generator
def generate_latent_points(latent_dim, n_samples):
    """Return an (n_samples, latent_dim) array of standard-normal noise.

    Parameters:
    - latent_dim: Size of each latent vector.
    - n_samples: Number of latent vectors to draw.
    """
    total_values = latent_dim * n_samples
    # One flat draw, reshaped into one row per sample
    return randn(total_values).reshape(n_samples, latent_dim)

# Fake samples produced by the generator, tagged with 0
def generate_fake_samples(generator, latent_dim, n_samples, y_data):
    """Run the generator on fresh noise, conditioned on ``y_data``.

    Parameters:
    - generator: Model with a ``predict`` method taking [condition, noise].
    - latent_dim: Length of each noise vector.
    - n_samples: Number of noise vectors to draw.
    - y_data: Conditioning input handed to the generator as-is.

    Returns:
    - [generated values, y_data], array of zeros with shape (n_samples, 1).
    """
    noise = generate_latent_points(latent_dim, n_samples)
    generated = generator.predict([y_data, noise], verbose=0)
    # 0 means "fake" for the discriminator
    fake_flags = zeros((n_samples, 1))
    return [generated, y_data], fake_flags

# Real samples drawn from the dataset, tagged with 1
def generate_real_samples(x_data, y_data, n_samples):
    """Draw ``n_samples`` random rows (with replacement) from the dataset.

    Parameters:
    - x_data: Array of conditioning samples, indexed along the first axis.
    - y_data: Array of real values aligned with x_data.
    - n_samples: Number of rows to draw.

    Returns:
    - [selected x_data rows, selected y_data rows], array of ones with shape
      (n_samples, 1).
    """
    # Random row positions shared by both arrays
    chosen = randint(0, x_data.shape[0], n_samples)
    conditions = x_data[chosen]
    real_values = y_data[chosen]
    # 1 means "real" for the discriminator
    return [conditions, real_values], ones((n_samples, 1))

# Plot the three feature columns of a window in stacked subplots
def plot_all(labels, real_value, prediction):
    """Show the Close, Volume and Weekday columns of ``labels``.

    Parameters:
    - labels: 2-D array; columns 0, 1 and 2 are plotted.
    - real_value: Not used by this function.
    - prediction: Not used by this function.
    """
    fig, axs = plt.subplots(3)
    # One subplot per column, in column order
    column_names = ('Close', 'Volume', 'Weekday')
    for column, (axis, name) in enumerate(zip(axs, column_names)):
        axis.plot(labels[:, column], label=name)
    plt.show()

# Train the generator and discriminator
def train(g_model, d_model, gan_model, x_train, y_train, latent_dim, exp_name, company, n_epochs=10000, n_batch=128, save_ratio=25, last_epoch=0):
    """Run the adversarial training loop, plotting losses and saving checkpoints.

    For every batch the discriminator is updated on a half batch of real
    samples and a half batch of generated samples, then the generator is
    updated through the combined model. Every ``save_ratio`` epochs (except
    epoch 0) a real-vs-generated scatter plot and the three models are saved
    under ``<cwd>/<company>/<exp_name>/``.

    Parameters:
    - g_model: Generator model.
    - d_model: Discriminator model.
    - gan_model: Combined model used to update the generator.
    - x_train: Array of conditioning samples.
    - y_train: Array of real values aligned with x_train.
    - latent_dim: Length of the noise vectors.
    - exp_name: Experiment folder name.
    - company: Company folder name, also used as file-name prefix.
    - n_epochs: Epoch index at which training stops (exclusive).
    - n_batch: Batch size; half of it is used for each discriminator update.
    - save_ratio: Number of epochs between checkpoints.
    - last_epoch: Epoch index at which training starts.

    Relies on the module-level ``cwd`` as root of the output folders.
    """
    # Both output folders are created here, so saving a plot or a checkpoint
    # cannot fail because the caller did not create the folder beforehand.
    experiment_dir = os.path.join(cwd, company, exp_name)
    images_dir = os.path.join(experiment_dir, 'images')
    models_dir = os.path.join(experiment_dir, 'models')
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    bat_per_epo = int(x_train.shape[0] / n_batch)
    half_batch = int(n_batch / 2)
    d_loss_1_hist = []
    d_loss_2_hist = []
    g_loss_hist = []
    # Manually enumerate epochs
    for i in range(last_epoch, n_epochs):
        print(f'Epoch {i}/{n_epochs}')
        for j in range(bat_per_epo):
            print(f'Epoch: {i}/{n_epochs}, Batch: {j}/{bat_per_epo}')
            # Replace the previous cell output once new output arrives
            clear_output(wait=True)
            # Get randomly selected 'real' samples
            [labels_real, x_real], y_real = generate_real_samples(x_train, y_train, half_batch)
            # Update discriminator model weights on the real half batch
            d_loss1, _ = d_model.train_on_batch([labels_real, x_real], y_real)
            d_loss_1_hist.append(d_loss1)
            # Generate 'fake' examples conditioned on the same windows
            [x_fake, _], y_fake = generate_fake_samples(g_model, latent_dim, half_batch, labels_real)
            # Reshape x_fake to have the same shape as x_real
            x_fake = x_fake.reshape(x_real.shape)
            # Update discriminator model weights on the fake half batch
            d_loss2, _ = d_model.train_on_batch([labels_real, x_fake], y_fake)
            d_loss_2_hist.append(d_loss2)
            # Prepare points in latent space as input for the generator
            x_gan = generate_latent_points(latent_dim, half_batch)
            # Inverted labels: the generator is rewarded when fakes score as real
            y_gan = ones((half_batch, 1))
            # Update the generator via the discriminator's error
            g_loss, _ = gan_model.train_on_batch([labels_real, x_gan], y_gan)
            g_loss_hist.append(g_loss)
            # Plot the loss history after each batch
            plt.plot(d_loss_1_hist, label='discriminator real')
            plt.plot(d_loss_2_hist, label='discriminator fake')
            plt.plot(g_loss_hist, label='generator')
            plt.legend(loc = 'upper right')
            plt.show()

        # Periodic evaluation plot and checkpoint
        if (i) % save_ratio == 0 and i != 0:
            # Scatter plot of 32 real values against the generated ones
            [eval_labels_real, eval_x_real], _ = generate_real_samples(x_train, y_train, 32)
            [eval_x_fake, _], _ = generate_fake_samples(g_model, latent_dim, 32, eval_labels_real)
            print(f'Epoch {i}/{n_epochs}')
            plt.scatter(y = eval_x_real, x = range(len(eval_x_real)), color='red', label='Real')
            plt.scatter(y = eval_x_fake, x = range(len(eval_x_fake)), color='blue', label='Predicted')
            # Dotted vertical line joining each real value with its prediction
            for a in range(len(eval_x_real)):
                min_val = min(eval_x_real[a], eval_x_fake[a])
                max_val = max(eval_x_real[a], eval_x_fake[a])
                plt.vlines(x=a, ymin=min_val, ymax=max_val, color='black', linestyles='dotted')
            plt.legend()
            # Save before show so the figure is written with its content
            plt.savefig(os.path.join(images_dir, f'plot_epoch_{i}.png'))
            plt.show()
            # Save the models
            g_model.save(os.path.join(models_dir, f'{company}_generator_{i}.h5'))
            d_model.save(os.path.join(models_dir, f'{company}_discriminator{i}.h5'))
            gan_model.save(os.path.join(models_dir, f'{company}_gan_{i}.h5'))

# Directional accuracy: did the prediction get the up/down move right?
def evaluate_prediction(predicted_value_array, real_value_array, today_price_array):
    """
    Count how often the predicted value and the real value fall on the same
    side of today's price.

    A prediction is correct when both values are greater than or equal to
    today's price, or when both are strictly below it.

    Parameters:
    - predicted_value_array: Sequence with the predicted values
    - real_value_array: Sequence with the real values
    - today_price_array: Sequence with the reference (today's) prices

    Returns:
    - correct_predictions: Number of correct predictions
    - total_predictions: Total number of predictions
    - accuracy: Percentage of correct predictions, rounded to 2 decimals
      (0.0 when there are no predictions)
    """
    total_predictions = len(predicted_value_array)
    # Nothing to evaluate: avoid dividing by zero
    if total_predictions == 0:
        return 0, 0, 0.0

    correct_predictions = 0
    for i, predicted in enumerate(predicted_value_array):
        real = real_value_array[i]
        today = today_price_array[i]
        both_up = (predicted >= today) and (real >= today)
        both_down = (predicted < today) and (real < today)
        if both_up or both_down:
            correct_predictions += 1

    # Accuracy as a percentage
    accuracy = correct_predictions / total_predictions
    accuracy = round(100*accuracy, 2)

    return correct_predictions, total_predictions, accuracy

In [None]:
# Hyper-parameters
latent_dim = 50
n_epochs = 151
save_ratio = 25

n_batch = 32

num_features = x_train.shape[2]
sequence_length = 20
loss = 'binary_crossentropy'
opt_d = Adamax(learning_rate=0.0001)
opt_gan = Adamax(learning_rate=0.001)
last_epoch = 0


# Rolling windows of `sequence_length` consecutive 'Close' prices
my_array = np.array(s_and_p_data['Close'])
# Number of windows to build
num_rows = len(my_array) - sequence_length - 1
rows = [my_array[i:i + sequence_length] for i in range(num_rows)]
# One window per row
transposed_array = np.array(rows)

# Experiment name
exp_name = 'lstm'
# Creates the company and experiment folders as well when they are missing
models_dir = os.path.join(cwd, company, exp_name, 'models')
os.makedirs(models_dir, exist_ok=True)

final_generator_path = os.path.join(models_dir, f'{company}_generator.h5')

# A final generator means training already finished: just load it
if os.path.isfile(final_generator_path):
    generator = load_model(final_generator_path)
    generator.summary()
else:
    # Look for generator checkpoints, named '<company>_generator_<epoch>.h5',
    # and keep the highest epoch number found
    checkpoint_prefix = f'{company}_generator_'
    checkpoint_suffix = '.h5'
    for file_name in os.listdir(models_dir):
        if not (file_name.startswith(checkpoint_prefix) and file_name.endswith(checkpoint_suffix)):
            continue
        try:
            epoch = int(file_name[len(checkpoint_prefix):-len(checkpoint_suffix)])
        except ValueError:
            # Not a numbered checkpoint
            continue
        if epoch > last_epoch:
            last_epoch = epoch

    if last_epoch > 0:
        print('Loading the models with the highest epoch number')
        # Load the three checkpoints of that epoch. The generator used to be
        # loaded from whichever file happened to be listed last, which could
        # be an older epoch or not a generator at all.
        generator = load_model(os.path.join(models_dir, f'{company}_generator_{last_epoch}.h5'))
        discriminator = load_model(os.path.join(models_dir, f'{company}_discriminator{last_epoch}.h5'))
        # NOTE(review): the reloaded GAN presumably holds its own copies of the
        # generator and discriminator instead of sharing weights with the two
        # models loaded above - confirm before relying on resumed training.
        gan_model = load_model(os.path.join(models_dir, f'{company}_gan_{last_epoch}.h5'))
    else:
        # No usable checkpoint: start from scratch
        last_epoch = 0
        generator = define_generator(latent_dim, num_features, sequence_length)
        discriminator = define_discriminator(num_features, sequence_length, opt_d, loss)
        gan_model = define_gan(generator, discriminator, opt_gan, loss, latent_dim, sequence_length, num_features)
    # Train the GAN model
    train(generator, discriminator, gan_model, x_train, y_train, latent_dim, exp_name, company, n_epochs, n_batch, save_ratio, last_epoch = last_epoch)
    # Save the final models
    generator.save(final_generator_path)
    discriminator.save(os.path.join(models_dir, f'{company}_discriminator.h5'))
    gan_model.save(os.path.join(models_dir, f'{company}_gan.h5'))