In [None]:
import numpy as np
import pandas as pd

# Load the price data from the CSV file
df_prices = pd.read_csv("stock_prices.csv")

# Global hyperparameters (read as module-level names by the functions defined later)
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor applied to future rewards
epsilon = 0.1  # Exploration (probability of trying a random action)

# Fonction Q-Learning
def q_learning_prices(df, episodes=10, learning_rate=None, discount=None,
                      exploration=None, seed=None):
    """Train a tabular Q-learning agent on a price series.

    Every row of ``df`` is one state; states are visited in row order and the
    agent picks one of ``buy`` / ``sell`` / ``hold`` at each step. The reward
    is the next-row price change for ``buy``, its negation for ``sell`` and 0
    for ``hold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Price"`` column. Rows are read by position, so the
        index labels do not matter.
    episodes : int, default 10
        Number of full passes over the series.
    learning_rate, discount, exploration : float, optional
        Step size, discount factor and epsilon-greedy exploration probability.
        When omitted, the module-level ``alpha`` / ``gamma`` / ``epsilon``
        values are used.
    seed : int, optional
        Seeds a private random generator so a run can be reproduced. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray
        Q-table of shape ``(len(df), 3)`` with columns ordered buy, sell,
        hold. The last row is never updated and stays at zero.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Price"`` column.
    """
    # Explicit arguments win; otherwise fall back to the notebook-level settings.
    lr = alpha if learning_rate is None else learning_rate
    disc = gamma if discount is None else discount
    eps = epsilon if exploration is None else exploration

    # The np.random module and RandomState expose the same rand()/randint() API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    actions = ["buy", "sell", "hold"]

    # Convert once to a float array: avoids a pandas scalar lookup on every
    # step and keeps price differences in floating point.
    prices = df["Price"].to_numpy(dtype=float)
    n_states = len(prices)
    Q = np.zeros((n_states, len(actions)))

    def reward(action, price_today, price_tomorrow):
        if action == "buy":
            return price_tomorrow - price_today
        if action == "sell":
            return price_today - price_tomorrow
        return 0

    for _ in range(episodes):
        # The final row has no successor, so it is never the source of a transition.
        for state in range(n_states - 1):
            if rng.rand() < eps:  # explore
                action_index = rng.randint(len(actions))
            else:  # exploit (ties resolve to the lowest index)
                action_index = int(np.argmax(Q[state]))

            next_state = state + 1
            r = reward(actions[action_index], prices[state], prices[next_state])

            # Off-policy target: bootstrap from the best action in the next state.
            td_target = r + disc * np.max(Q[next_state])
            Q[state, action_index] += lr * (td_target - Q[state, action_index])

    return Q

# Run the algorithm on the loaded price data and print the resulting Q-table
Q_table_prices = q_learning_prices(df_prices)
print("Table Q pour les prix des actions :\n", Q_table_prices)


In [None]:
# 2. SARSA (Base : Prix des actions)

def sarsa_prices(df, episodes=10, learning_rate=None, discount=None,
                 exploration=None, seed=None):
    """Train a tabular SARSA agent on a price series.

    Every row of ``df`` is one state; states are visited in row order and the
    agent picks one of ``buy`` / ``sell`` / ``hold`` at each step. The reward
    is the next-row price change for ``buy``, its negation for ``sell`` and 0
    for ``hold``. The update bootstraps from the action actually selected for
    the next state (on-policy).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Price"`` column. Rows are read by position, so the
        frame may carry any index (dates, a filtered range, ...).
    episodes : int, default 10
        Number of full passes over the series.
    learning_rate, discount, exploration : float, optional
        Step size, discount factor and epsilon-greedy exploration probability.
        When omitted, the module-level ``alpha`` / ``gamma`` / ``epsilon``
        values are used.
    seed : int, optional
        Seeds a private random generator so a run can be reproduced. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray
        Q-table of shape ``(len(df), 3)`` with columns ordered buy, sell,
        hold. The last row is never updated and stays at zero.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Price"`` column.
    """
    # Explicit arguments win; otherwise fall back to the notebook-level settings.
    lr = alpha if learning_rate is None else learning_rate
    disc = gamma if discount is None else discount
    eps = epsilon if exploration is None else exploration

    # The np.random module and RandomState expose the same rand()/randint() API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    actions = ["buy", "sell", "hold"]

    # Positional float array: `df["Price"][i]` would look up the *label* i and
    # fail (or read the wrong row) whenever the index is not 0..n-1.
    prices = df["Price"].to_numpy(dtype=float)
    n_states = len(prices)
    Q = np.zeros((n_states, len(actions)))

    # With fewer than two rows there is no transition to learn from
    # (and Q[0] does not even exist for an empty frame).
    if n_states < 2:
        return Q

    def reward(action, price_today, price_tomorrow):
        if action == "buy":
            return price_tomorrow - price_today
        if action == "sell":
            return price_today - price_tomorrow
        return 0

    def choose_action(state):
        # Epsilon-greedy selection; greedy ties resolve to the lowest index.
        if rng.rand() < eps:
            return rng.randint(len(actions))
        return int(np.argmax(Q[state]))

    for _ in range(episodes):
        state = 0
        action_index = choose_action(state)

        while state < n_states - 1:
            next_state = state + 1
            r = reward(actions[action_index], prices[state], prices[next_state])

            # Pick the next action first: SARSA evaluates the action it will take.
            next_action_index = choose_action(next_state)

            td_target = r + disc * Q[next_state, next_action_index]
            Q[state, action_index] += lr * (td_target - Q[state, action_index])

            state = next_state
            action_index = next_action_index

    return Q

# Run SARSA on the loaded price data and print the resulting Q-table
Q_table_sarsa = sarsa_prices(df_prices)
print("Table Q pour SARSA :\n", Q_table_sarsa)


In [None]:
# 3. Deep Q-Learning (Base : Gestion de portefeuille)

def deep_q_learning_portfolio(df, gamma=0.9, epsilon=0.1, epochs=10,
                              batch_size=32, max_memory=1000):
    """Train a small DQN whose state is one row of asset values.

    The ``Date`` column is dropped if present, every remaining column is
    coerced to numeric (unparseable cells become 0) and each row becomes one
    state vector. There is one action per column; the reward for choosing
    action ``k`` is the value of column ``k`` in the next row.

    Parameters
    ----------
    df : pandas.DataFrame
        Portfolio table, one column per asset, optionally with a ``Date`` column.
    gamma : float, default 0.9
        Discount factor used in the Q-learning target.
    epsilon : float, default 0.1
        Probability of taking a random action instead of the greedy one.
    epochs : int, default 10
        Number of passes over the rows of ``df``.
    batch_size : int, default 32
        Number of transitions sampled per training step. Training starts once
        the replay memory holds more than ``batch_size`` transitions.
    max_memory : int, default 1000
        Maximum number of transitions kept in the replay memory.

    Returns
    -------
    tf.keras.Model
        The compiled (and, if enough transitions were collected, trained) network.

    Raises
    ------
    ValueError
        If no rows or no columns remain after cleaning.
    """
    # Drop the 'Date' column if it exists, then force everything to numeric.
    df = df.drop(columns=['Date'], errors='ignore')
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

    # `empty` also catches a frame that has rows but no usable columns.
    if df.empty:
        raise ValueError("Le DataFrame est vide après nettoyage. Vérifiez vos données.")

    # Imported here because the notebook's visible import cell does not bring
    # `random` or `tf` into scope; placed after validation so bad input fails
    # fast without loading TensorFlow.
    import random
    import tensorflow as tf

    features = df.to_numpy(dtype=float)
    num_states = features.shape[1]  # size of the state vector (one entry per column)
    num_actions = num_states  # one action per column

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(num_states,)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(num_actions, activation="linear")
    ])
    model.compile(optimizer="adam", loss="mse")

    memory = []  # replay memory of (state, action, reward, next_state)
    batch_rows = np.arange(batch_size)

    for _ in range(epochs):
        state = features[0]
        for t in range(1, len(features)):
            # Epsilon-greedy action selection.
            if np.random.rand() < epsilon:
                action = np.random.randint(0, num_actions)
            else:
                q_values = model.predict(state.reshape(1, -1), verbose=0)
                action = int(np.argmax(q_values))

            next_state = features[t]
            reward = next_state[action]

            memory.append((state, action, reward, next_state))
            if len(memory) > max_memory:
                memory.pop(0)  # discard the oldest transition

            state = next_state

            if len(memory) > batch_size:
                batch = random.sample(memory, batch_size)
                states, actions, rewards, next_states = zip(*batch)

                states = np.array(states)
                next_states = np.array(next_states)
                q_values = model.predict(states, verbose=0)
                q_next = model.predict(next_states, verbose=0)

                # Overwrite only the Q-value of the action that was taken with
                # its bootstrapped target; the other outputs keep their current
                # prediction and therefore contribute no error.
                targets = np.asarray(rewards) + gamma * q_next.max(axis=1)
                q_values[batch_rows, np.asarray(actions)] = targets

                model.fit(states, q_values, verbose=0)

    return model

# Run the DQN training on the portfolio data and print the returned model object.
# NOTE(review): `df_portfolio` is not assigned in any cell visible here — confirm it is loaded before this cell runs.
model_portfolio = deep_q_learning_portfolio(df_portfolio)
print("Modèle DQN entraîné pour la gestion de portefeuille.",model_portfolio)



In [None]:
# 4. Double Q-Learning (Base : Gestion de portefeuille)

def double_q_learning_portfolio(df, episodes=10, learning_rate=None, discount=None,
                                exploration=None, seed=None):
    """Train a tabular Double Q-learning agent on a portfolio table.

    Every row of ``df`` is one state, visited in row order. The ``Date``
    column is dropped if present and the first three remaining columns are the
    payoffs of the actions ``invest_A`` / ``invest_B`` / ``invest_C``: the
    reward for an action is the value of its column in the next row.

    Two tables are kept. On each step one of them is chosen with probability
    0.5 and updated towards ``r + discount * Q_other[s', argmax Q_self[s']]``,
    i.e. the table being updated selects the next action and the other table
    evaluates it.

    Parameters
    ----------
    df : pandas.DataFrame
        Portfolio table with at least three asset columns, optionally with a
        ``Date`` column.
    episodes : int, default 10
        Number of full passes over the rows.
    learning_rate, discount, exploration : float, optional
        Step size, discount factor and epsilon-greedy exploration probability.
        When omitted, the module-level ``alpha`` / ``gamma`` / ``epsilon``
        values are used.
    seed : int, optional
        Seeds a private random generator so a run can be reproduced. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Q1, Q2)``, each of shape ``(len(df), 3)``.

    Raises
    ------
    ValueError
        If ``df`` has at least two rows but fewer than three asset columns.
    """
    # Explicit arguments win; otherwise fall back to the notebook-level settings.
    lr = alpha if learning_rate is None else learning_rate
    disc = gamma if discount is None else discount
    eps = epsilon if exploration is None else exploration

    # The np.random module and RandomState expose the same rand()/randint() API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    actions = ["invest_A", "invest_B", "invest_C"]
    n_states = len(df)
    Q1 = np.zeros((n_states, len(actions)))
    Q2 = np.zeros((n_states, len(actions)))

    # Nothing to learn without at least one transition.
    if n_states < 2:
        return Q1, Q2

    # Select asset columns by dropping 'Date' rather than assuming it is
    # column 0, so the table works with or without a date column.
    assets = df.drop(columns=["Date"], errors="ignore").iloc[:, :len(actions)]
    if assets.shape[1] < len(actions):
        raise ValueError(
            "Le DataFrame doit contenir au moins 3 colonnes d'actifs (hors 'Date')."
        )
    payoffs = assets.to_numpy(dtype=float)

    for _ in range(episodes):
        for state in range(n_states - 1):
            # Behaviour policy: epsilon-greedy on the sum of both tables.
            if rng.rand() < eps:
                action_index = rng.randint(len(actions))
            else:
                action_index = int(np.argmax(Q1[state] + Q2[state]))

            next_state = state + 1
            reward = payoffs[next_state, action_index]

            # Decouple selection from evaluation: the updated table picks the
            # next action, the other table supplies its value.
            if rng.rand() < 0.5:
                best_next = int(np.argmax(Q1[next_state]))
                td_target = reward + disc * Q2[next_state, best_next]
                Q1[state, action_index] += lr * (td_target - Q1[state, action_index])
            else:
                best_next = int(np.argmax(Q2[next_state]))
                td_target = reward + disc * Q1[next_state, best_next]
                Q2[state, action_index] += lr * (td_target - Q2[state, action_index])

    return Q1, Q2

# Run Double Q-learning on the portfolio data and print both Q-tables.
# NOTE(review): `df_portfolio` is not assigned in any cell visible here — confirm it is loaded before this cell runs.
Q1, Q2 = double_q_learning_portfolio(df_portfolio)
print("Tables Q1 et Q2 pour Double Q-Learning :\n", Q1, Q2)


In [None]:
import matplotlib.pyplot as plt

# Plot, for each tabular agent, the best Q-value of every state (row-wise
# maximum of its Q-table) on a shared set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(Q_table_prices.max(axis=1), label="Q-learning (Prix d'actions)")
ax.plot(Q_table_sarsa.max(axis=1), label="SARSA (Prix d'actions)")
ax.set_title("Comparaison des algorithmes pour les prix d'actions")
ax.legend()
plt.show()
