# Interest Rates Reinforcement Learning: Q-Learning

In [None]:
# Wildcard import first so the explicit imports below cannot be shadowed by it
from interest_rate_environment import *

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Short Data Analysis
As a first step, we explore the given data and take a brief look at the contents of the CSV file.

In [None]:
product_path = "../data/interest_rates_p1.csv"

# Read the csv file, parse the timestamp column into datetime objects and
# add a rolling average of the interest rate over a window of 10 rows
interest_rates = pd.read_csv(product_path).assign(
    timestamp=lambda df: pd.to_datetime(df["timestamp"]),
    interest_rate_avg=lambda df: df["interest_rate"].rolling(window=10, center=False).mean(),
)

# Preview the first 10 rows
display(interest_rates.head(10))

# Descriptive statistics of the raw series and of its rolling average
display(interest_rates["interest_rate"].describe())
display(interest_rates["interest_rate_avg"].describe())

### Visualize interest rates
In the next step we would like to visualize the given data to get an overview.

In [None]:
# Draw the raw interest rate together with its rolling average over time
interest_rates.plot(
    x="timestamp",
    y=["interest_rate", "interest_rate_avg"],
    legend=True,
    figsize=(20, 9),
    title="Interest Rate Development",
)
plt.tight_layout()
plt.show()

### Create environment for Q-Learning

Now we are ready to create an instance of our previously created environment and execute a few steps in it.

In [None]:
# Create an interest rate environment with the path of the csv file and optional start/end date.
# Only `end` is passed here; the test cell further down builds a second environment with
# start='2020-01-01', so this one is presumably restricted to the data before that date
# (TODO confirm how InterestRateEnv treats the boundary date).
env = InterestRateEnv(product_path, end = '2020-01-01')

# Display statistics from interest_rate and normalized_interest_rate_avg,
# two columns of the dataframe held by the environment
display(env.df_interest_rates['interest_rate'].describe())
display(env.df_interest_rates['normalized_interest_rate_avg'].describe())

In [None]:
# Show the action and observation space of the environment
print(f"Action Space: {env.action_space}")
print(f"Observation Space: {env.observation_space}")

# Start a new episode and show the observation it begins with
first_observation = env.reset()
print(f"First Observation: {first_observation}")

In [None]:
# Execute a few random actions and print observation, reward, done, info and position
for _ in range(2):
    env.render()

    # Draw a random action from the action space
    action = env.action_space.sample()
    print(f"Action: {action!s}")

    # Apply the action and report everything the environment returns
    observation, reward, done, info = env.step(action)
    print(f"Observation: {observation!s}")
    print(f"Reward: {reward!s}")
    print(f"Done: {done!s}")
    print(f"Info: {info!s}")
    print(f"Position: {env.current_position!s}")
    print()

## Helper methods

In [None]:
def get_model(env):
    """Return a zero-initialised Q-table for the given environment.

    The table has one row per observation (``env.observation_space.n``) and
    one column per action (``env.action_space.n``).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    return np.zeros(shape=(n_states, n_actions))

def plot_developments(statuses):
    """Plot the development of the portfolio value within one episode.

    ``statuses`` is a dataframe with the columns ``timestamp``, ``position``,
    ``value`` and ``cum_interest_rate``. The position is drawn against a
    secondary y-axis; the primary y-axis is limited to the range 0.5 to 1.2.
    """
    plotted_columns = ["position", "value", "cum_interest_rate"]
    axis = statuses.plot(
        x="timestamp",
        y=plotted_columns,
        secondary_y=['position'],
        legend=True,
        figsize=(16, 9),
        title="Performance of trained agent on evaluation data",
    )
    axis.set_ylim([0.5, 1.2])
    plt.tight_layout()
    plt.show()
    
def highlight_max(s):
    """Return one CSS string per entry of ``s``, marking every maximum.

    Entries equal to the maximum of ``s`` get a lightgreen background, all
    others an empty style. Intended for use with ``Styler.apply``.
    """
    peak = s.max()
    styles = []
    for value in s:
        styles.append('background: lightgreen' if value == peak else '')
    return styles

## Q-Learning
Now we are ready to start with the actual implementation of our Q-Learning algorithm.

In [None]:
# Initialise the Q-table and report the sizes it is built from
print(f"Action space: {env.action_space.n!s}")
print(f"Observation space: {env.observation_space.n!s}")
Q = get_model(env)
print(f"Q-table shape: {Q.shape!s}")

In [None]:
# Have a look how interest rates are converted to states and vice versa:
# first two sample interest rates are passed to get_state_for_interest_rate,
# then two sample states are passed to get_interest_rate_for_state
print(env.get_state_for_interest_rate(-0.97))
print(env.get_state_for_interest_rate(0.95))
print(env.get_interest_rate_for_state(1))
print(env.get_interest_rate_for_state(39))

In [None]:
# Training the agent

def train_q_learning(env, train_episodes=100):
    """Train a tabular Q-learning agent with decayed epsilon-greedy exploration.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``action_space.sample()``,
            ``maximum_episode_steps()`` and ``current_value``. The states it
            returns are used as row indices of the Q-table from ``get_model``.
        train_episodes: Number of episodes to train for.

    Returns:
        A tuple ``(Q, portfolio_values, epsilons)``: the learned Q-table, the
        value of ``env.current_value`` at the end of every episode, and the
        epsilon in effect after every episode.
    """
    # Setting the parameters for Q-Learning
    gamma = 0.7  # discount factor
    alpha = 0.7  # learning rate
    nr_steps = env.maximum_episode_steps()

    # max_epsilon, min_epsilon and decay are used for a decayed epsilon greedy approach
    max_epsilon = 1
    min_epsilon = 0.01
    decay = 0.01
    epsilon = max_epsilon

    # Keep track of portfolio value and epsilon in each episode
    epsilons = []
    portfolio_values = []

    Q = get_model(env)

    for episode in range(train_episodes):
        # Reset the environment at every episode
        state = env.reset()

        for _ in range(nr_steps):
            # Epsilon-greedy action selection: exploit the best known action
            # with probability (1 - epsilon), otherwise explore randomly
            if random.uniform(0, 1) > epsilon:
                action = int(np.argmax(Q[state, :]))
            else:
                action = env.action_space.sample()

            # Perform the action
            new_state, reward, done, _info = env.step(action)

            # Update the Q-table: move the old estimate towards the reward
            # plus the discounted best value of the next state
            td_target = reward + gamma * np.max(Q[new_state, :])
            Q[state, action] += alpha * (td_target - Q[state, action])

            # Update the state
            state = new_state

            # End the episode if done is true
            if done:
                break

        # Decayed epsilon greedy: cut down on exploration by shrinking epsilon
        # exponentially from max_epsilon towards min_epsilon over the episodes
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

        # Add the portfolio value to list of portfolio values
        portfolio_values.append(env.current_value)

        # Add epsilon to list of epsilons
        epsilons.append(epsilon)

        print("Finished episode {}".format(episode))

    # Average final portfolio value; NaN when no episode was run, so that
    # train_episodes == 0 does not raise a ZeroDivisionError
    if portfolio_values:
        training_score = sum(portfolio_values) / len(portfolio_values)
    else:
        training_score = float("nan")
    print("Training score: " + str(training_score))
    return Q, portfolio_values, epsilons

### Training phase

In [None]:
# Train for 200 episodes; train_q_learning returns the learned Q-table together
# with the per-episode portfolio values and epsilons (both plotted further down)
train_episodes = 200
Q, portfolio_values, epsilons = train_q_learning(env, train_episodes=train_episodes)

### Display learned Q-Table
Now we will have a look at the learned Q-values.

In [None]:
# Build a dataframe from the Q-table with one column per action
q_df = pd.DataFrame(Q, columns=['SHORT', 'NO_POSITION', 'LONG'])

# Prepend a column holding the interest rate that belongs to each state index
q_df.insert(
    0,
    'interest_rate',
    [env.get_interest_rate_for_state(state) for state in q_df.index],
)

# Highlight the highest Q-value of every row in lightgreen
q_df = q_df.style.apply(highlight_max, axis=1, subset=q_df.columns[1:4])
display(q_df)

# Print, for every row, the column index holding the maximum value
print(Q.argmax(axis=1))

### Visualize results and total portfolio values over all episodes

In [None]:
# Plot the portfolio value recorded at the end of every training episode;
# x holds the episode indices 0 .. train_episodes - 1
x = range(train_episodes)
plt.plot(x, portfolio_values)
plt.xlabel('Episode')
plt.ylabel('Training portfolio value')
plt.title('Portfolio values over all episodes') 
plt.show()

### Visualize the epsilons over all episodes

In [None]:
# Plot the epsilon recorded for every training episode; the x-axis is the
# position in the list, i.e. the episode index
plt.plot(epsilons)
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.title("Epsilon for episode")
plt.show()

### Testing the agent
Now we would like to test our reinforcement learning agent and therefore execute one episode where we always choose the "best" action, based on the learned Q-values.
Think about how to split training and test data. The data that was used for training should not be used for testing.
Options are to specify an end date for training and a start date for test, or to use interest rates of a different product.


In [None]:
def test_agent(env, Q):
    """Run one episode that always takes the greedy action of the Q-table.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)`` and ``current_value``.
        Q: Q-table indexed as ``Q[state, action]``.

    Returns:
        The list of ``info`` objects returned by every step of the episode.
    """
    infos = []

    # Start a fresh episode
    state = env.reset()

    while True:
        q_row = Q[state]

        if len(set(q_row)) > 1:
            # Use the "best" action based on the previously calculated q-table
            action = np.argmax(Q[state, :])
        else:
            # All Q-values of this state are equal: fall back to action 1
            # (the NO_POSITION column when the Q-table is displayed)
            action = 1

        # Execute one step and remember the info it reports
        state, reward, done, info = env.step(action)
        infos.append(info)

        # Stop as soon as the environment signals the end of the episode
        if done:
            break

    print("Final portfolio value: " + str(env.current_value))
    return infos

In [None]:
# Testing the agent: rebind env to a new environment created with start='2020-01-01'
# (the training environment was created with end='2020-01-01') and run one episode
# with the learned Q-table. NOTE(review): confirm that InterestRateEnv does not
# include the boundary date in both the training and the test data.
env = InterestRateEnv(product_path, start= '2020-01-01')
infos = test_agent(env, Q)


### Display portfolio development

In [None]:
# Create a data frame with all the statuses collected during the test episode
info_df = pd.DataFrame(infos)

# Cumulative product of (1 + interest_rate) over the episode
info_df["cum_interest_rate"] = info_df["interest_rate"].add(1).cumprod()

print(info_df.head(5))

# Plot the portfolio developments and the positions over the testing time
plot_developments(info_df)