In [None]:
from sqlalchemy import create_engine
import pandas as pd 
import numpy as np
import random
import math 

In [None]:
table_name = 'crypto'
engine = create_engine(f'postgresql://postgres:postgres@localhost:5432/{table_name}')

df_raw = pd.read_sql_query('select * from klines',con=engine)

In [None]:
df_5min = df_raw.query("period_type == '5min'").reset_index()

In [None]:
def get_day_of_week(df):
    """
    Returns the name of the day of the week for the given day number (0-6)
    """
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    df['local_numeric_day'] =  df['datetime'].apply(lambda x: (x.weekday()) % 7 + 1)
    df['local_day'] =  df['local_numeric_day'].apply(lambda x: days[x-1])
    return df 


def set_action(df, optimum_sell_rewards=15, optimum_buy_rewards=15):
    """
    Adds a new column called 'price_diff' to the given DataFrame,
    containing the difference between the current row's close price and
    the previous row's close price.
    """
    # Create a new column called 'prev_close' that contains the close price from the previous row
    df['prev_close'] = df['close'].shift(1)

    # Compute the difference between the current row's close price and the previous row's close price
    df['price_diff'] = df['close'] - df['prev_close']
    df['sell_rewards'] = df['price_diff'].shift(-1)
    df['buy_rewards'] = (df['price_diff'].shift(-1))*-1
    df['sell_cumulative_rewards'] = df['sell_rewards'].cumsum()
    df['buy_cumulative_rewards'] = df['buy_rewards'].cumsum()
    df['actions'] = -1 # default 0 = buy, 1 = sell, -1 = no action
    df.loc[df['buy_rewards'] >= 5, 'actions'] = 0
    df.loc[df['sell_rewards'] > 5 , 'actions'] = 1
    df.loc[df['actions'] == 1, 'one_time_reward'] = df['sell_rewards']
    df.loc[df['actions'] == 0, 'one_time_reward'] = df['buy_rewards']
    df.loc[df['actions'] == -1, 'one_time_reward'] = 0

    # Return the updated DataFrame
    return df


# normal distribution optimum bin
def get_optimal_normal_distribution_num_bins(df):
    """
    Estimates the optimal number of bins for the 'volume_trade' column
    of the given DataFrame using the Freedman-Diaconis rule, and returns
    the estimated number of bins.
    """
    # Compute the interquartile range of the 'volume_trade' column
    q1, q3 = np.percentile(df['volume_trade'], [25, 75])
    iqr = q3 - q1

    # Estimate the optimal bin width using the Freedman-Diaconis rule
    bin_width = 2 * iqr / np.cbrt(len(df))

    # Compute the estimated number of bins
    num_bins = int(np.ceil((df['volume_trade'].max() - df['volume_trade'].min()) / bin_width))

    # Return the estimated number of bins
    return num_bins


# power law optimum bin 
def get_optimal_pareto_distribution_num_bins(df):
    """
    Estimates the optimal number of bins for the 'volume_trade' column
    of the given DataFrame using the Sturges method for power law distributions,
    and returns the estimated number of bins.
    """
    # Compute the sample size and the maximum value of the 'volume_trade' column
    n = len(df['amount'])
    x_max = df['amount'].max()

    # Estimate the optimal number of bins using the Sturges method
    num_bins = int(np.ceil(np.log2(n) + np.log2(1 + x_max)))

    # Return the estimated number of bins
    return num_bins


def pareto_distribution_bins(df, num_bins):
    """Creates power law bins for the 'volume_trade' column of the given
    DataFrame using the qcut function, and returns the updated DataFrame.
    """
    # Compute the quantiles of the 'volume_trade' column using a power law distribution
    quantiles = pd.qcut(df['amount'], num_bins, labels=False, duplicates='drop')

    # Add a new column to the DataFrame with the bin labels
    df['volume_bins'] = quantiles

    # Return the updated DataFrame
    return df


def encode_time(df):
    """Encodes the time in the given DataFrame as a string representing the time
    in sequential order (hour-minute-second), and returns the updated DataFrame.
    """
    # Convert the 'time' column to a datetime object
    df['time'] = pd.to_datetime(df['datetime'])
    df['date'] = df['datetime'].dt.date

    # Extract the hour, minute, and second from the 'time' column
    df['hour'] = df['time'].dt.hour
    df['minute'] = df['time'].dt.minute
    df['second'] = df['time'].dt.second

    # Convert the hour, minute, and second to strings
    df['hour_str'] = df['hour'].astype(str).str.zfill(2)
    df['minute_str'] = df['minute'].astype(str).str.zfill(2)
    df['second_str'] = df['second'].astype(str).str.zfill(2)

    # Concatenate the hour, minute, and second strings into a single time string
    df['encoded_time'] = df['hour_str'] + '-' + df['minute_str'] + '-' + df['second_str']

    # Drop the original hour, minute, and second columns
    df = df.drop(['hour', 'minute', 'second', 'hour_str', 'minute_str', 'second_str'], axis=1)

    # Return the updated DataFrame with the encoded time string
    return df


In [None]:
df_5min = get_day_of_week(df_5min)
df_5min = set_action(df_5min)
df_5min = pareto_distribution_bins(df_5min, get_optimal_pareto_distribution_num_bins(df_5min))
df_5min = encode_time(df_5min)
df_5min.head()

In [None]:
df_5min.columns

In [None]:
df_5min[['date', 'local_day']].groupby("local_day").nunique()

In [None]:
df = df_5min.copy()

In [None]:
# Define the state space
num_days = df['local_numeric_day'].nunique()
num_times = df['time'].nunique()
num_volume_bins = df['volume_bins'].nunique()
state_space = np.zeros((num_days, num_times))
train_split = 0.8


# Define the action space
num_action = df['actions'].nunique()
action_space = np.zeros((num_volume_bins, num_action))


# Initialize the Q-values
global Q_star 


# Initialize learning rate, discount factor, exploration
alpha = 0.1
gamma = 0.9
epsilon = 0.1

In [None]:
# Define the reward function
def reward(price_rewards):
    return price_rewards


# Define a function to get the current state
def get_state(day, time, volume_bins, action):
    # concatenate scalar values into a numpy array
    state = np.array((time, day, volume_bins, action))
    print("Get state:", type(state), state.shape, state, state[0], state[1], state[2], state[3])
    return state


def choose_action(state):
    print("Choose action state:", type(state), state, state.shape)
    if random.uniform(0, 1) < epsilon:
        # Randomly choose an action
        action = np.random.choice((0, num_action-1))
    else:
        # Choose the action with highest Q value
        action = np.argmax(Q[state[0]][state[1]][state[2]])
    return action


# Generate the sample data
def generate_sample(df):

    cols = ['local_numeric_day', 'encoded_time', 'volume_bins', 'actions']
    df = df[cols]

    # Get states as tuple
    state_tuple = create_state_tuple(df)

    # Initiate Q table 
    global Q_star 
    Q_star = np.zeros((state_tuple))

    train_data = df[:math.floor(df.shape[0]*train_split)].to_numpy()
    test_data = df[train_data.shape[0]:].to_numpy()

    return train_data, test_data


def train_sarsa(train_data):
    # Define variables to track performance validation
    prev_avg_reward = -1000
    avg_reward = 0
    num_episodes = 0
    
    # Train for multiple episodes until convergence
    while abs(avg_reward - prev_avg_reward) > 0.001:
        prev_avg_reward = avg_reward
        total_reward = 0
        print("Start SARSA")
        
        # Loop through all training data
        for i in range(len(train_data)-1):
            # Get current state and action
            state = get_state(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3])
            # action = choose_action(state)
            
            # Get next state and reward
            # next_state = get_state(train_data[i+1][0], train_data[i+1][1], train_data[i+1][2], train_data[i+1][3])
            reward = train_data[i+1][-1] * 0.1
            
            # Choose next action based on epsilon-greedy policy
            # next_action = choose_action(next_state)
            
            # Update Q table
            td_error = reward + gamma * Q[i+1][-1][-1] - Q[i][-1][-1]
            Q[i][-1][-1] += alpha * td_error
        
            total_reward += reward
        
        # Calculate average reward for the current episode
        avg_reward = total_reward / len(train_data)
        num_episodes += 1
        print("Episode:", num_episodes, "Average Reward:", avg_reward)
    
    return Q


def create_state_tuple(df_train):
    '''
    Automatically initiate state tuple based on the number of unique
    values in each state (features) from the given dataframe to ease the initiation of Q table
    '''
    state_list = []

    state_list.append(df_train.shape[0])
    for col in df_train:
        state_list.append(df_train[col].nunique())

    state_tuple = tuple(state_list)

    return state_tuple

In [None]:
rewards = df['one_time_reward'].to_list()

In [None]:
import SARSA_one_state as SARSA

In [None]:
num_states = df['encoded_time'].nunique()
num_actions = df['actions'].nunique()
lr = 0.005
discount_factor = 0.1
epsilon = 0.5
num_episodes = 10000000
num_steps_per_episode = 10
total_rewards = 0
GAMMA = 0.9
print(num_states, num_actions)

sarsa_agent = SARSA.SARSAAgent(
    df,
    learning_rate=lr,
    discount_factor=discount_factor,
    epsilon=epsilon
)

global Q 
sarsa_agent.initialize_q_table(df)
Q = sarsa_agent.q_table
reward_table = sarsa_agent.reward_table

In [None]:
reward_table.shape

In [None]:
action_dict = {
	'buy': 0,
	'sell': 1,
	'no_action': 2
}


def policy(Q, sarsa_agent, state, epsilon = 0.1, verbose = False) -> (int, float): 
	best_action = None
	best_value = float('-inf')
	
	# update allowed actions everytime based on agent current holding unit 
	if sarsa_agent.isHolding == False: # indicate can buy/no action but cannot sell
		allowed_actions = ['buy', 'no_action']
	else:
		allowed_actions = ['sell', 'no_action']

	random.shuffle(allowed_actions)

	for action in allowed_actions:
		if verbose:
			print(f"Holding: {sarsa_agent.isHolding}")
			print(f'action: {action}')
			print(f'value: {Q[state][action_dict.get(action)]} vs best_value: {best_value}')
			print(f'new best action: {action}')
		if Q[state][action_dict.get(action)] > best_value:
			best_action = action_dict.get(action)
			best_value = Q[state][best_action] 
				
	
	r_var = random.random()
	if r_var < epsilon:
		if verbose:
			print(f'Choosing random action')
		best_action = action_dict.get(random.choice(allowed_actions))
		best_value = Q[state][best_action]
		
	if verbose:
		print(f'Final action: {best_action}\n')

	return best_action, best_value


# Update Q-value for a state-action pair based on observed rewards and estimated future Q-values
def update_q_value(state, action, reward, next_state, next_action, verbose=False):
    
    if verbose == True:
	    print(f"State: {state}, Action: {action}, Rewards: {reward}, Next_state: {next_state}, Next_action: {next_action}")
    # Check if the (state, action) pair exists in the Q-table
    # if (state, action) not in Q:
    #     Q[(state, action)] = 0.0
	
    # Compute the updated Q-value using the SARSA update equation
    current_q = Q[state][action_dict.get(action)]
    next_q = Q[next_state][action_dict.get(next_action)]
    new_q = current_q + lr * (reward + GAMMA * next_q - current_q)
    
    # Update the Q-value in the Q-table
    Q[state][action_dict.get(action)] = new_q

In [None]:
# Training loop
# for episode in range(num_episodes):
steps = []
rewards_list = []
steps_list = []

for episode in range(num_episodes):
    current_state = 0  # Starting state
    action, action_value = policy(Q, sarsa_agent, current_state, epsilon)
    # update upcoming allowed actions
    if action == 0:
        sarsa_agent.isHolding = True
    if action == 1:
        sarsa_agent.isHolding = False
    total_rewards = 0

    while (current_state != Q.shape[0] - 1):
        rewards = reward_table[current_state][action]
        total_rewards += rewards
        steps.append(action)
        next_state = current_state + 1 
        next_action, next_action_value = policy(Q, sarsa_agent, next_state, epsilon)
        update_q_value(current_state, action, rewards, next_state, next_action)

        # print(f'After update:, action: {action}, action_value : {action_value}, next_action: {next_action}, next_action_value: {next_action_value}\n')
        # update upcoming allowed actions
        if next_action == 0:
            sarsa_agent.isHolding = True
        if next_action == 1:
            sarsa_agent.isHolding = False

        current_state = next_state
        action = next_action 
        action_value = next_action_value

    rewards_list.append(total_rewards)
    steps_list.append(steps)

In [None]:
df_check = pd.DataFrame(rewards_list, columns=['rewards'])
df_check.describe()

In [None]:
max_value_index = rewards_list.index(max(rewards_list))
print(max_value_index)

In [None]:
steps_list[max_value_index]

In [None]:
Q

In [None]:
# Check if each row has different values
is_different = np.all(np.diff(Q, axis=1), axis=1)

# Display the result
print(is_different)

In [None]:
df.head()

***Perform price and volume aggregation for monthly daily basis reward table***

In [None]:
# the trade volumes here originally is "amount", total usd price is "vol"
# will rename after retrieved from the dataframe
desired_col = [ 
    'date', 
    'time', 
    'local_numeric_day', 
    'amount',
    'vol',
    'sell_rewards', 
    'buy_rewards', 
    'sell_cumulative_rewards', 
    'buy_cumulative_rewards',
    'actions',
    'volume_bins',
    'encoded_time'
]

renamed_col = [ 
    'date', 
    'time', 
    'local_numeric_day', 
    'trade_volumes',
    'trade_total_price',
    'sell_rewards', 
    'buy_rewards', 
    'sell_cumulative_rewards', 
    'buy_cumulative_rewards',
    'actions',
    'volume_bins',
    'encoded_time'
]


df_volumes = df[desired_col]
df_volumes.columns = renamed_col

In [None]:
from datetime import date 

def convert_to_first_day_of_month(df, date_column_name):
    # convert to datetime format
    starting_month_list = df[date_column_name].apply(lambda x: date(x.year, x.month, 1))
    return starting_month_list

def get_week_of_month(df, date_column_name) -> list:

    def compute_week_of_month(date_value):
        first_day = date(date_value.year, date_value.month, 1)
        offset = (date_value.weekday() + 1 - first_day.weekday()) % 7
        week_of_month = (date_value.day + offset - 1) // 7 + 1
        return week_of_month

    week_of_month_list = df[date_column_name].apply(lambda x: compute_week_of_month(x))

    return week_of_month_list

In [None]:
df_volumes['starting_month'] = convert_to_first_day_of_month(df, 'date')
df_volumes['week_of_month'] = get_week_of_month(df, 'date')

In [None]:
df_volumes.head()

**Start aggregating the trading data**

***Based on year, month, day, time to see the max, min, average, median, sd of the price and volumes***

In [None]:
volume_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'trade_volumes']
price_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'trade_volumes', 'trade_total_price']
reward_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'sell_rewards', 'buy_rewards', 'actions']

groupby_keys = ['week_of_month', 'local_numeric_day', 'encoded_time']

df_volume_stats = df_volumes[volume_cols].groupby(groupby_keys).describe().reset_index()

df_price = df_volumes[price_cols].groupby(groupby_keys).sum()
df_price['daily_average_trade_total_price'] = df_price['trade_total_price'] / df_price['trade_volumes']
df_price = df_price.drop(columns=['trade_volumes', 'trade_total_price'])
df_price_stats = df_price.groupby(groupby_keys).mean()

df_reward_stats = df_volumes[reward_cols[:-1]].groupby(groupby_keys).describe().reset_index()

In [None]:
df_volume_stats.head()

In [None]:
df_price_stats.head()

In [None]:
df_reward_stats.head()

In [None]:
df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']]\
.reset_index()\
.groupby(['week_of_month', 'local_numeric_day'])\
.nunique()

***Create nested reward table***

***Keep the df_price_stats and df_volume_stats for reference***

In [None]:
df_reward_stats.head()

***Start creating nested reward table from here***


In [None]:
df_sell_rewards = df_reward_stats['sell_rewards'][['mean']].copy()
df_buy_rewards = df_reward_stats['buy_rewards'][['mean']].copy()

In [None]:
df_sell_rewards.head()

In [None]:
sell_cols = ['sell_rewards']
buy_cols = ['buy_rewards']
df_sell_rewards.columns = sell_cols
df_buy_rewards.columns = buy_cols

In [None]:
df_sell_rewards.head()

In [None]:
df_buy_rewards.head()

In [None]:
#df_nested_rewards = pd.merge([df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']], df_sell_rewards, df_buy_rewards],).reset_index()
df_nested_rewards = pd.concat([df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']], df_sell_rewards, df_buy_rewards], axis=1)

# rename column to remove the tuple-like hierachy syntax for easier retrieve
rename_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'sell_rewards', 'buy_rewards']
df_nested_rewards.columns = rename_cols
df_nested_rewards.head()

In [None]:
# Create a custom aggregation function to fill in values based on conditions
def fill_values(column):
    if column[column > 0].empty:
        return None
    return column[column > 0].values[0]

# Pivot the DataFrame
df_pivoted_rewards = pd.pivot_table(df_nested_rewards, values=['sell_rewards', 'buy_rewards'], index=['week_of_month', 'local_numeric_day', 'encoded_time'],
                            aggfunc=fill_values).reset_index()
df_pivoted_rewards = df_pivoted_rewards.rename(columns={'sell_rewards': 'sell_action', 'buy_rewards': 'buy_action'})
df_pivoted_rewards.head()

In [None]:
# assign reverse action reward for NaN value 
df_pivoted_rewards['buy_action'] = df_pivoted_rewards['buy_action'].fillna(df_pivoted_rewards['sell_action']*-1)
df_pivoted_rewards['sell_action'] = df_pivoted_rewards['sell_action'].fillna(df_pivoted_rewards['buy_action']*-1)
df_pivoted_rewards.head()

In [None]:
# add in no action reward value 
df_pivoted_rewards['no_action'] = 0
df_pivoted_rewards.head()

In [None]:
# transform encoded time into scalar value for easier indexing
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df_pivoted_rewards['label_encoded_time'] = encoder.fit_transform(df_pivoted_rewards[['encoded_time']])
df_pivoted_rewards.head()

In [None]:
df_pivoted_rewards.nunique()

In [None]:
df_pivoted_rewards.head()

In [None]:
# get the unique value of each column for each state 
state_unique_counts = df_pivoted_rewards.nunique()

# initialize shape size
state_array_shape = tuple(state_unique_counts[:3])
# add 3 unique actions 
state_array_shape += (num_action ,)
print("State array shape:", state_array_shape)

# create the array with the initialized shape size 
state_array = np.zeros(state_array_shape)

# start padding reward value into each state respectively
# Iterate over the rows of the DataFrame
for index, row in df_pivoted_rewards.iterrows():
    week_index = row['week_of_month']-1
    day_index = row['local_numeric_day']-1
    time_index = row['label_encoded_time']-1
    value = [row['buy_action'], row['sell_action'], row['no_action']]
    state_array[week_index, day_index, time_index] = value

state_array[0]

In [None]:
# assign state array to be reward array for easier reference
reward_table = state_array.copy()

In [None]:
reward_table.shape

In [None]:
# Iterate over dimensions a, b, c sequentially
# 27/7/2023: tried np.diter but doesnt want to waste more time as there are more parameters to discover but knew that this will reduce the computation time complexity
# for now use a simple nested loops first
# in future when data is more then explore np.diter
for month in range(reward_table.shape[0]):
    for day in range(reward_table.shape[1]):
        for time in range(reward_table.shape[2]):
            state = (month, day, time)

In [136]:
# Start modifying the SARSA nested state iteration from here
# Training loop
# for episode in range(num_episodes):
steps = []
rewards_list = []
steps_list = []

Q = np.zeros(reward_table.shape)
lr = 0.005
discount_factor = 0.1
epsilon = 0.3
num_episodes = 10000000
num_steps_per_episode = 1
total_rewards = 0
GAMMA = 0.9

for episode in range(num_episodes):
        
    current_state = (0,0,0) # Starting state
    action, action_value = policy(Q, sarsa_agent, current_state, epsilon)
    # update upcoming allowed actions
    if action == 0:
        sarsa_agent.isHolding = True
    else:
        sarsa_agent.isHolding = False

    # initialize cumulative rewards
    total_rewards = 0

    # when current state has not iterate until the last row of Q table
    # while (current_state != (reward_table.shape[0],reward_table.shape[1],reward_table.shape[2])):

    for month in range(reward_table.shape[0]):
        for day in range(reward_table.shape[1]):
            for time in range(reward_table.shape[2]):

                # when iterating (0,0) start from (0,0,1) to (0,0,287) because (0,0,0) already initialized on top
                if current_state[0] == 0 and current_state[1] == 0:
                    current_state = [month, day, time + 1]
                else:
                    current_state = [month, day, time]

                current_state = tuple(current_state)
                # print("Current state:", current_state)

                rewards = reward_table[current_state][action]
                total_rewards += rewards

                steps.append(action)

                # if not last row of state then + 1 else move up 1 level then + 1
                if current_state[2] < reward_table.shape[2] - 1:
                    next_state = [current_state[0], current_state[1], current_state[2] + 1]
                    # print("In time:", current_state, next_state)
                elif reward_table.shape[2] - 1 == current_state[2] and current_state[1] < reward_table.shape[1] - 1:
                    next_state = [current_state[0], current_state[1] + 1, current_state[2]]
                    # print("In day:", current_state,  next_state)
                elif current_state[0] < reward_table.shape[0] - 1:
                    next_state = [current_state[0] + 1, current_state[1], current_state[2]]
                    # print("In month:", current_state, next_state)
                
                next_state = tuple(next_state)
                next_action, next_action_value = policy(Q, sarsa_agent, next_state, epsilon)

                # if current total rewards earn at least 20 usd
                if total_rewards >= 20:
                    total_rewards += 100
                    next_action_value = 100

                #print("End:", current_state, action, rewards, next_state, next_action)
                update_q_value(current_state, action, rewards, next_state, next_action)

                # print(f'After update:, action: {action}, action_value : {action_value}, next_action: {next_action}, next_action_value: {next_action_value}\n')
                # update upcoming allowed actions
                if next_action == 0:
                    sarsa_agent.isHolding = True
                if next_action == 1:
                    sarsa_agent.isHolding = False

                current_state = next_state
                action = next_action 
                action_value = next_action_value

    rewards_list.append(total_rewards)
    steps_list.append(steps)

In [137]:
df_check = pd.DataFrame(rewards_list, columns=['rewards'])
df_check.describe()


Unnamed: 0,rewards
count,100.0
mean,745496.0
std,350633.3
min,-265.6893
25%,715752.5
50%,916861.5
75%,973586.3
max,1003089.0


In [142]:
steps_list[67]

[2,
 0,
 2,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 0,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 2,
 2,
 0,
 1,
 0,
 1,
 0,
 1,
 2,
 0,
 1,
 0,
 2,
 1,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 0,
 1,
 2,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 2,
 2,
 2,
 2,
 0,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 0,
 1,
 0,
 2,
 1,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 2,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 2,
 0,
 2,
 2,
 1,
 0,
 1,
 2,
 2,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 0,
 1,
 2,
 0,
 2,


In [143]:
max_value_index = rewards_list.index(max(rewards_list))

print(max_value_index)
print("Step list:", steps_list[max_value_index])

# Check if each row has different values
is_different = np.all(np.diff(Q, axis=1), axis=1)

# Display the result
print(is_different)

67
Step list: [2, 0, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1, 0, 2, 1, 0, 2, 2, 1, 2, 2, 0, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1, 0, 2, 1, 2, 2, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 2, 1, 0, 1, 0, 1, 0, 2, 2, 2, 1, 2, 0, 1, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 1, 0, 1, 0, 1, 2, 2, 2, 2, 0, 1, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 1, 0, 2, 1, 0, 1, 0, 2, 1, 2, 0, 2, 2, 1, 0, 1, 2, 2, 2, 0, 1, 2, 0, 1, 0, 1, 2, 0, 2, 1, 0, 2, 2, 1, 2, 2, 0, 1, 2, 0, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 1, 2, 0, 1, 2, 0, 2, 2, 1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 2, 1, 0, 1, 2, 0, 1, 0, 2, 2, 2, 1, 0, 1, 2, 2, 2, 0, 1, 0, 1, 0, 2, 1, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 1, 2, 0