# In [None]:
import numpy as np
import pandas as pd

# Q-learning function
def q_learning(num_episodes, alpha, gamma, epsilon, num_states, num_actions, rewards, q_table, max_steps=1):
    """Run tabular Q-learning with epsilon-greedy exploration.

    Each episode starts in a uniformly random state and performs
    ``max_steps`` updates. The environment modelled here has no state
    transitions (the next state is always the current one), so there is no
    natural terminal state; the step cap is what ends an episode.

    Args:
        num_episodes: Number of training episodes.
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Initial probability of taking a random action. It is
            multiplied by 0.99 after every episode (local to this call).
        num_states: Number of states; start states are drawn from
            ``[0, num_states)``.
        num_actions: Number of actions; random actions are drawn from
            ``[0, num_actions)``.
        rewards: Array indexed as ``rewards[state, action]``.
        q_table: Array indexed as ``q_table[state, action]``. It is updated
            in place.
        max_steps: Number of updates performed per episode. Defaults to 1,
            i.e. one decision per episode.

    Returns:
        The same ``q_table`` object that was passed in, after training.
    """
    for _ in range(num_episodes):
        state = np.random.randint(0, num_states)  # Start from a random state

        # A bounded loop replaces the former `while not done`, whose flag was
        # never set and therefore never let the first episode finish.
        for _ in range(max_steps):
            if np.random.rand() < epsilon:
                action = np.random.randint(0, num_actions)  # Explore
            else:
                action = np.argmax(q_table[state])  # Exploit learned knowledge

            new_state = state  # No transition is modelled in this example
            reward = rewards[state, action]

            # Standard Q-learning update towards the bootstrapped target.
            td_target = reward + gamma * np.max(q_table[new_state])
            q_table[state, action] = q_table[state, action] + alpha * (td_target - q_table[state, action])

            state = new_state

        epsilon = epsilon * 0.99  # Reduce exploration as training progresses

    return q_table

# Epsilon-greedy action selection function
def epsilon_greedy_action(q_values, epsilon):
    """Pick an action index from ``q_values`` using an epsilon-greedy rule.

    A uniform random draw is compared against ``epsilon``: when the draw is
    below it, a random index in ``[0, len(q_values))`` is returned; otherwise
    the index of the largest value is returned.
    """
    should_explore = np.random.rand() < epsilon
    if not should_explore:
        # Greedy choice: index of the highest Q-value.
        return np.argmax(q_values)
    # Exploratory choice: any valid index, uniformly at random.
    return np.random.randint(len(q_values))

# Main function to demonstrate the offer recommendation system
def main():
    """Demonstrate the offer recommendation system on a small sample dataset.

    Builds a state space from the unique (Amount, InterestRate, Tenor) values
    of a sample offers table, derives a reward matrix from the ``Decision``
    column, trains a Q-table with ``q_learning`` and then prints a
    recommendation for one customer state. When the recommendation is to
    decline, the reward for that choice is lowered and the model is retrained
    before trying again, up to a fixed number of attempts.
    """
    num_episodes = 1000
    alpha = 0.1  # Learning rate
    gamma = 0.9  # Discount factor
    epsilon = 0.5  # Initial exploration probability
    max_recommendations = 10  # Upper bound on decline/retrain rounds

    # Sample offer dataset (replace this with your actual dataset)
    df = pd.DataFrame({
        'Amount': [100, 200, 150, 250, 300],
        'InterestRate': [0.05, 0.06, 0.04, 0.07, 0.03],
        'Tenor': [12, 24, 36, 18, 12],
        'Decision': ['Accepted', 'Declined', 'Accepted', 'Declined', 'Accepted']
    })

    # Define the state space and action space based on the dataset
    unique_amounts = df['Amount'].unique()
    unique_interest_rates = df['InterestRate'].unique()
    unique_tenors = df['Tenor'].unique()
    n_rates = len(unique_interest_rates)
    n_tenors = len(unique_tenors)

    num_states = len(unique_amounts) * n_rates * n_tenors
    num_actions = 2  # Assuming 2 actions: Accept and Decline offers

    # Create a rewards matrix (num_states x num_actions)
    rewards = np.zeros((num_states, num_actions))

    # Assign rewards based on the "Decision" column in the dataset
    for _, row in df.iterrows():
        amount_idx = np.where(unique_amounts == row['Amount'])[0][0]
        interest_rate_idx = np.where(unique_interest_rates == row['InterestRate'])[0][0]
        tenor_idx = np.where(unique_tenors == row['Tenor'])[0][0]
        # Row-major flattening of (amount, interest rate, tenor) indices.
        state = (amount_idx * n_rates + interest_rate_idx) * n_tenors + tenor_idx

        if row['Decision'] == 'Accepted':
            rewards[state, 0] = 1.0  # Higher reward for an accepted offer
        else:
            rewards[state, 1] = -0.5  # Lower reward for a declined offer

    # Initialize Q-value table with zeros
    q_table = np.zeros((num_states, num_actions))

    # Train the Q-learning model
    q_table = q_learning(num_episodes, alpha, gamma, epsilon, num_states, num_actions, rewards, q_table)

    # Customer's state (replace 0 with the index of the customer's state based on their information)
    customer_state = 0

    # Recommend offers to the customer until one is accepted or the attempt
    # budget is exhausted.
    for _ in range(max_recommendations):
        recommended_action = np.argmax(q_table[customer_state])

        if recommended_action == 0:
            print("Recommended offer: Accept the offer.")
            # Invert the flattening above to recover the three feature indices.
            amount_idx, remainder = divmod(customer_state, n_rates * n_tenors)
            interest_rate_idx, tenor_idx = divmod(remainder, n_tenors)
            # Tenor is part of the state, so it is matched too.
            accepted_offer = df[
                (df['Amount'] == unique_amounts[amount_idx])
                & (df['InterestRate'] == unique_interest_rates[interest_rate_idx])
                & (df['Tenor'] == unique_tenors[tenor_idx])
            ]
            print("Offer details:")
            print(accepted_offer)
            break

        print("Recommended offer: Decline the offer.")
        # Penalise the current state-action pair, then retrain from the
        # existing Q-table so the next recommendation can differ. Without
        # retraining the Q-table never changes and the loop cannot progress.
        rewards[customer_state, recommended_action] = -1.0
        q_table = q_learning(num_episodes, alpha, gamma, epsilon, num_states, num_actions, rewards, q_table)
    else:
        print(f"No offer was accepted after {max_recommendations} recommendations.")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
