In [None]:
from sqlalchemy import create_engine
import pandas as pd 
import numpy as np
import random
import math 

In [None]:
table_name = 'crypto'
engine = create_engine(f'postgresql://postgres:postgres@localhost:5432/{table_name}')

df_raw = pd.read_sql_query('select * from klines',con=engine)

In [None]:
df_5min = df_raw.query("period_type == '5min'").reset_index()

In [None]:
def get_day_of_week(df):
    """
    Returns the name of the day of the week for the given day number (0-6)
    """
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    df['local_numeric_day'] =  df['datetime'].apply(lambda x: (x.weekday()) % 7 + 1)
    df['local_day'] =  df['local_numeric_day'].apply(lambda x: days[x-1])
    return df 


def set_action(df, optimum_sell_rewards=15, optimum_buy_rewards=15):
    """
    Adds a new column called 'price_diff' to the given DataFrame,
    containing the difference between the current row's close price and
    the previous row's close price.
    """
    # Create a new column called 'prev_close' that contains the close price from the previous row
    df['prev_close'] = df['close'].shift(1)

    # Compute the difference between the current row's close price and the previous row's close price
    df['price_diff'] = df['close'] - df['prev_close']
    df['sell_rewards'] = df['price_diff'].shift(-1)
    df['buy_rewards'] = (df['price_diff'].shift(-1))*-1
    df['sell_cumulative_rewards'] = df['sell_rewards'].cumsum()
    df['buy_cumulative_rewards'] = df['buy_rewards'].cumsum()
    df['actions'] = -1 # default 0 = buy, 1 = sell, -1 = no action
    df.loc[df['buy_rewards'] >= 5, 'actions'] = 0
    df.loc[df['sell_rewards'] > 5 , 'actions'] = 1
    df.loc[df['actions'] == 1, 'one_time_reward'] = df['sell_rewards']
    df.loc[df['actions'] == 0, 'one_time_reward'] = df['buy_rewards']
    df.loc[df['actions'] == -1, 'one_time_reward'] = 0

    # Return the updated DataFrame
    return df


# normal distribution optimum bin
def get_optimal_normal_distribution_num_bins(df):
    """
    Estimates the optimal number of bins for the 'volume_trade' column
    of the given DataFrame using the Freedman-Diaconis rule, and returns
    the estimated number of bins.
    """
    # Compute the interquartile range of the 'volume_trade' column
    q1, q3 = np.percentile(df['volume_trade'], [25, 75])
    iqr = q3 - q1

    # Estimate the optimal bin width using the Freedman-Diaconis rule
    bin_width = 2 * iqr / np.cbrt(len(df))

    # Compute the estimated number of bins
    num_bins = int(np.ceil((df['volume_trade'].max() - df['volume_trade'].min()) / bin_width))

    # Return the estimated number of bins
    return num_bins


# power law optimum bin 
def get_optimal_pareto_distribution_num_bins(df):
    """
    Estimates the optimal number of bins for the 'volume_trade' column
    of the given DataFrame using the Sturges method for power law distributions,
    and returns the estimated number of bins.
    """
    # Compute the sample size and the maximum value of the 'volume_trade' column
    n = len(df['amount'])
    x_max = df['amount'].max()

    # Estimate the optimal number of bins using the Sturges method
    num_bins = int(np.ceil(np.log2(n) + np.log2(1 + x_max)))

    # Return the estimated number of bins
    return num_bins


def pareto_distribution_bins(df, num_bins):
    """Creates power law bins for the 'volume_trade' column of the given
    DataFrame using the qcut function, and returns the updated DataFrame.
    """
    # Compute the quantiles of the 'volume_trade' column using a power law distribution
    quantiles = pd.qcut(df['amount'], num_bins, labels=False, duplicates='drop')

    # Add a new column to the DataFrame with the bin labels
    df['volume_bins'] = quantiles

    # Return the updated DataFrame
    return df


def encode_time(df):
    """Encodes the time in the given DataFrame as a string representing the time
    in sequential order (hour-minute-second), and returns the updated DataFrame.
    """
    # Convert the 'time' column to a datetime object
    df['time'] = pd.to_datetime(df['datetime'])
    df['date'] = df['datetime'].dt.date

    # Extract the hour, minute, and second from the 'time' column
    df['hour'] = df['time'].dt.hour
    df['minute'] = df['time'].dt.minute
    df['second'] = df['time'].dt.second

    # Convert the hour, minute, and second to strings
    df['hour_str'] = df['hour'].astype(str).str.zfill(2)
    df['minute_str'] = df['minute'].astype(str).str.zfill(2)
    df['second_str'] = df['second'].astype(str).str.zfill(2)

    # Concatenate the hour, minute, and second strings into a single time string
    df['encoded_time'] = df['hour_str'] + '-' + df['minute_str'] + '-' + df['second_str']

    # Drop the original hour, minute, and second columns
    df = df.drop(['hour', 'minute', 'second', 'hour_str', 'minute_str', 'second_str'], axis=1)

    # Return the updated DataFrame with the encoded time string
    return df


In [None]:
df_5min = get_day_of_week(df_5min)
df_5min = set_action(df_5min)
df_5min = pareto_distribution_bins(df_5min, get_optimal_pareto_distribution_num_bins(df_5min))
df_5min = encode_time(df_5min)
df_5min.head()

In [None]:
df_check = df_5min[['datetime', 'encoded_time', 'vol', 'amount', 'average_period_price']].drop_duplicates()
df_check.shape

In [None]:
df_5min.columns

In [None]:
df_check.head()

In [None]:
# Iterate over the rows of the DataFrame

price_table = np.zeros((5,7,288))

for index, row in df_5min.iterrows():
    week_index = row['week_of_month']-1
    day_index = row['local_numeric_day']-1
    time_index = row['label_encoded_time']-1
    value = [row['average_period_price']]
    price_table[week_index, day_index, time_index] = value

price_table

In [None]:
df_5min[['date', 'local_day']].groupby("local_day").nunique()

In [None]:
df = df_5min.copy()

In [None]:
# Define the state space
num_days = df['local_numeric_day'].nunique()
num_times = df['time'].nunique()
num_volume_bins = df['volume_bins'].nunique()
state_space = np.zeros((num_days, num_times))
train_split = 0.8


# Define the action space
num_action = df['actions'].nunique()
action_space = np.zeros((num_volume_bins, num_action))


# Initialize the Q-values
global Q_star 


# Initialize learning rate, discount factor, exploration
alpha = 0.1
gamma = 0.9
epsilon = 0.1

In [None]:
# Define the reward function
def reward(price_rewards):
    return price_rewards


# Define a function to get the current state
def get_state(day, time, volume_bins, action):
    # concatenate scalar values into a numpy array
    state = np.array((time, day, volume_bins, action))
    print("Get state:", type(state), state.shape, state, state[0], state[1], state[2], state[3])
    return state


def choose_action(state):
    print("Choose action state:", type(state), state, state.shape)
    if random.uniform(0, 1) < epsilon:
        # Randomly choose an action
        action = np.random.choice((0, num_action-1))
    else:
        # Choose the action with highest Q value
        action = np.argmax(Q[state[0]][state[1]][state[2]])
    return action


# Generate the sample data
def generate_sample(df):

    cols = ['local_numeric_day', 'encoded_time', 'volume_bins', 'actions']
    df = df[cols]

    # Get states as tuple
    state_tuple = create_state_tuple(df)

    # Initiate Q table 
    global Q_star 
    Q_star = np.zeros((state_tuple))

    train_data = df[:math.floor(df.shape[0]*train_split)].to_numpy()
    test_data = df[train_data.shape[0]:].to_numpy()

    return train_data, test_data


def train_sarsa(train_data):
    # Define variables to track performance validation
    prev_avg_reward = -1000
    avg_reward = 0
    num_episodes = 0
    
    # Train for multiple episodes until convergence
    while abs(avg_reward - prev_avg_reward) > 0.001:
        prev_avg_reward = avg_reward
        total_reward = 0
        print("Start SARSA")
        
        # Loop through all training data
        for i in range(len(train_data)-1):
            # Get current state and action
            state = get_state(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3])
            # action = choose_action(state)
            
            # Get next state and reward
            # next_state = get_state(train_data[i+1][0], train_data[i+1][1], train_data[i+1][2], train_data[i+1][3])
            reward = train_data[i+1][-1] * 0.1
            
            # Choose next action based on epsilon-greedy policy
            # next_action = choose_action(next_state)
            
            # Update Q table
            td_error = reward + gamma * Q[i+1][-1][-1] - Q[i][-1][-1]
            Q[i][-1][-1] += alpha * td_error
        
            total_reward += reward
        
        # Calculate average reward for the current episode
        avg_reward = total_reward / len(train_data)
        num_episodes += 1
        print("Episode:", num_episodes, "Average Reward:", avg_reward)
    
    return Q


def create_state_tuple(df_train):
    '''
    Automatically initiate state tuple based on the number of unique
    values in each state (features) from the given dataframe to ease the initiation of Q table
    '''
    state_list = []

    state_list.append(df_train.shape[0])
    for col in df_train:
        state_list.append(df_train[col].nunique())

    state_tuple = tuple(state_list)

    return state_tuple

In [None]:
rewards = df['one_time_reward'].to_list()

In [None]:
import project.SARSA_one_state as SARSA

In [None]:
action_dict = {
	'buy': 0,
	'sell': 1,
	'no_action': 2
}


def policy(Q, sarsa_agent, state, epsilon = 0.1, verbose = False) -> (int, float): 
	best_action = None
	best_value = float('-inf')
	
	# update allowed actions everytime based on agent current holding unit 
	if sarsa_agent.isHolding == False: # indicate can buy/no action but cannot sell
		allowed_actions = ['buy', 'no_action']
	else:
		allowed_actions = ['sell', 'no_action']

	random.shuffle(allowed_actions)

	for action in allowed_actions:
		if verbose:
			print(f"Holding: {sarsa_agent.isHolding}")
			print(f'action: {action}')
			print(f'value: {Q[state][action_dict.get(action)]} vs best_value: {best_value}')
			print(f'new best action: {action}')
		if Q[state][action_dict.get(action)] > best_value:
			best_action = action_dict.get(action)
			best_value = Q[state][best_action]
				
	r_var = random.random()
	if r_var < epsilon:
		if verbose:
			print(f'Choosing random action')
		best_action = action_dict.get(random.choice(allowed_actions))
		best_value = Q[state][best_action]
		
	if verbose:
		print(f'Final action: {best_action}\n')

	return best_action, best_value


# Update Q-value for a state-action pair based on observed rewards and estimated future Q-values
def update_q_value(state:tuple, action:int, rewards:list, rewards_value:float, next_state:tuple, next_action:int, verbose=False):

	if verbose == True:
		print(f"State: {state}, Action: {action}, Rewards: {rewards}, Next_state: {next_state}, Next_action: {next_action}")
		
	# Compute the updated Q-value using the SARSA update equation
	current_q = Q[state][action_dict.get(action)]

	# Additional reward if have been making profit of at least 20 usd
	if sum(rewards) >= 20: current_q += 100
	next_q = Q[next_state][action_dict.get(next_action)]
	new_q = current_q + lr * (rewards_value + GAMMA * next_q - current_q)
    
    # Update the Q-value in the Q-table
	Q[state][action_dict.get(action)] = new_q
	
    # Check if the (state, action) pair exists in the Q-table
    # if (state, action) not in Q:
    #     Q[(state, action)] = 0.0

In [None]:
# for episode in range(num_episodes):
steps = []
rewards_list = []
steps_list = []


num_episodes = 10000000
num_steps_per_episode = 10
total_rewards = 0
GAMMA = 0.9


for episode in range(num_episodes):
    current_state = 0  # Starting state
    action, action_value = policy(Q, sarsa_agent, current_state, epsilon，)
    # update upcoming allowed actions
    if action == 0:
        sarsa_agent.isHolding = True
    if action == 1:
        sarsa_agent.isHolding = False
    total_rewards = 0

    while (current_state != Q.shape[0] - 1):
        rewards = reward_table[current_state][action]
        total_rewards += rewards
        steps.append(action)
        next_state = current_state + 1 
        next_action, next_action_value = policy(Q, sarsa_agent, next_state, epsilon)
        update_q_value(current_state, action, rewards, next_state, next_action)

        # print(f'After update:, action: {action}, action_value : {action_value}, next_action: {next_action}, next_action_value: {next_action_value}\n')
        # update upcoming allowed actions
        if next_action == 0:
            sarsa_agent.isHolding = True
        if next_action == 1:
            sarsa_agent.isHolding = False

        current_state = next_state
        action = next_action 
        action_value = next_action_value

    rewards_list.append(total_rewards)
    steps_list.append(steps)

***Perform price and volume aggregation for monthly daily basis reward table***

In [None]:
# the trade volumes here originally is "amount", total usd price is "vol"
# will rename after retrieved from the dataframe
desired_col = [ 
    'date', 
    'time', 
    'local_numeric_day', 
    'amount',
    'vol',
    'sell_rewards', 
    'buy_rewards', 
    'sell_cumulative_rewards', 
    'buy_cumulative_rewards',
    'actions',
    'volume_bins',
    'encoded_time'
]

renamed_col = [ 
    'date', 
    'time', 
    'local_numeric_day', 
    'trade_volumes',
    'trade_total_price',
    'sell_rewards', 
    'buy_rewards', 
    'sell_cumulative_rewards', 
    'buy_cumulative_rewards',
    'actions',
    'volume_bins',
    'encoded_time'
]


df_volumes = df[desired_col]
df_volumes.columns = renamed_col

In [None]:
from datetime import date 

def convert_to_first_day_of_month(df, date_column_name):
    # convert to datetime format
    starting_month_list = df[date_column_name].apply(lambda x: date(x.year, x.month, 1))
    return starting_month_list

def get_week_of_month(df, date_column_name) -> list:

    def compute_week_of_month(date_value):
        first_day = date(date_value.year, date_value.month, 1)
        offset = (date_value.weekday() + 1 - first_day.weekday()) % 7
        week_of_month = (date_value.day + offset - 1) // 7 + 1
        return week_of_month

    week_of_month_list = df[date_column_name].apply(lambda x: compute_week_of_month(x))

    return week_of_month_list

In [None]:
df_volumes['starting_month'] = convert_to_first_day_of_month(df, 'date')
df_volumes['week_of_month'] = get_week_of_month(df, 'date')

In [None]:
df_volumes.head()

**Start aggregating the trading data**

***Based on year, month, day, time to see the max, min, average, median, sd of the price and volumes***

In [None]:
volume_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'trade_volumes']
price_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'trade_volumes', 'trade_total_price']
reward_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'sell_rewards', 'buy_rewards', 'actions']

groupby_keys = ['week_of_month', 'local_numeric_day', 'encoded_time']

df_volume_stats = df_volumes[volume_cols].groupby(groupby_keys).describe().reset_index()

df_price = df_volumes[price_cols].groupby(groupby_keys).sum()
df_price['daily_average_trade_total_price'] = df_price['trade_total_price'] / df_price['trade_volumes']
df_price = df_price.drop(columns=['trade_volumes', 'trade_total_price'])
df_price_stats = df_price.groupby(groupby_keys).mean()

df_reward_stats = df_volumes[reward_cols[:-1]].groupby(groupby_keys).describe().reset_index()

In [None]:
df_volume_stats.head()

In [None]:
df_price_stats.head()

In [None]:
df_reward_stats.head()

In [None]:
df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']]\
.reset_index()\
.groupby(['week_of_month', 'local_numeric_day'])\
.nunique()

***Create nested reward table***

***Keep the df_price_stats and df_volume_stats for reference***

In [None]:
df_reward_stats.head()

***Start creating nested reward table from here***


In [None]:
df_sell_rewards = df_reward_stats['sell_rewards'][['mean']].copy()
df_buy_rewards = df_reward_stats['buy_rewards'][['mean']].copy()

In [None]:
df_sell_rewards.head()

In [None]:
sell_cols = ['sell_rewards']
buy_cols = ['buy_rewards']
df_sell_rewards.columns = sell_cols
df_buy_rewards.columns = buy_cols

In [None]:
df_sell_rewards.head()

In [None]:
df_buy_rewards.head()

In [None]:
#df_nested_rewards = pd.merge([df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']], df_sell_rewards, df_buy_rewards],).reset_index()
df_nested_rewards = pd.concat([df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']], df_sell_rewards, df_buy_rewards], axis=1)

# rename column to remove the tuple-like hierachy syntax for easier retrieve
rename_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'sell_rewards', 'buy_rewards']
df_nested_rewards.columns = rename_cols
df_nested_rewards.head()

In [None]:
# Create a custom aggregation function to fill in values based on conditions
def fill_values(column):
    if column[column > 0].empty:
        return None
    return column[column > 0].values[0]

# Pivot the DataFrame
df_pivoted_rewards = pd.pivot_table(df_nested_rewards, values=['sell_rewards', 'buy_rewards'], index=['week_of_month', 'local_numeric_day', 'encoded_time'],
                            aggfunc=fill_values).reset_index()
df_pivoted_rewards = df_pivoted_rewards.rename(columns={'sell_rewards': 'sell_action', 'buy_rewards': 'buy_action'})
df_pivoted_rewards.head()

In [None]:
# assign reverse action reward for NaN value 
df_pivoted_rewards['buy_action'] = df_pivoted_rewards['buy_action'].fillna(df_pivoted_rewards['sell_action']*-1)
df_pivoted_rewards['sell_action'] = df_pivoted_rewards['sell_action'].fillna(df_pivoted_rewards['buy_action']*-1)
df_pivoted_rewards.head()

In [None]:
# add in no action reward value 
df_pivoted_rewards['no_action'] = 0
df_pivoted_rewards.head()

In [None]:
# transform encoded time into scalar value for easier indexing
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df_pivoted_rewards['label_encoded_time'] = encoder.fit_transform(df_pivoted_rewards[['encoded_time']])
df_pivoted_rewards.head()

In [None]:
df_pivoted_rewards.nunique()

In [None]:
df_pivoted_rewards.head()

In [None]:
# get the unique value of each column for each state 
state_unique_counts = df_pivoted_rewards.nunique()

# initialize shape size
state_array_shape = tuple(state_unique_counts[:3])
# add 3 unique actions 
state_array_shape += (num_action ,)
print("State array shape:", state_array_shape)

# create the array with the initialized shape size 
state_array = np.zeros(state_array_shape)

# start padding reward value into each state respectively
# Iterate over the rows of the DataFrame
for index, row in df_pivoted_rewards.iterrows():
    week_index = row['week_of_month']-1
    day_index = row['local_numeric_day']-1
    time_index = row['label_encoded_time']-1
    value = [row['buy_action'], row['sell_action'], row['no_action']]
    state_array[week_index, day_index, time_index] = value

state_array[0]

In [None]:
# assign state array to be reward array for easier reference
reward_table = state_array.copy()

In [None]:
reward_table.shape

In [None]:
# Iterate over dimensions a, b, c sequentially
# 27/6/2023: tried np.diter but doesnt want to waste more time as there are more parameters to discover but knew that this will reduce the computation time complexity
# for now use a simple nested loops first
# in future when data is more then explore np.diter
for month in range(reward_table.shape[0]):
    for day in range(reward_table.shape[1]):
        for time in range(reward_table.shape[2]):
            state = (month, day, time)

In [None]:
# Start modifying the SARSA nested state iteration from here
# Training loop
# for episode in range(num_episodes):

# Training loop
import project.SARSA_one_state as SARSA

num_states = df['encoded_time'].nunique()
num_actions = df['actions'].nunique()
lr = 0.005
discount_factor = 0.1
epsilon = 0.1
# print(num_states, num_actions)

sarsa_agent = SARSA.SARSAAgent(
    df,
    learning_rate=lr,
    discount_factor=discount_factor,
    epsilon=epsilon
)

global Q 
sarsa_agent.initialize_q_table(df)
Q = np.zeros(reward_table.shape)
print(Q.shape)

In [None]:
environments_list = []
total_rewards_list = []
rewards_list = []
steps_list = []
num_episodes = 10000
# num_steps_per_episode = 1000
GAMMA = 0.9
isVerbose = False

for episode in range(num_episodes):

    print("\nEpisode:", episode)

    # initialize cumulative rewards
    total_rewards = 0
    steps = []
    environments = []
    rewards = []

    current_state = (0,0,0) # Starting state
    action, action_value = policy(Q, sarsa_agent, current_state, epsilon, verbose=isVerbose)
    # update upcoming allowed actions
    if action == 0:
        sarsa_agent.isHolding = True
    else:
        sarsa_agent.isHolding = False

    rewards_value = reward_table[current_state][action]
    total_rewards += rewards_value

    steps.append(action)
    rewards.append(rewards_value)
    environments.append(current_state)

    # when current state has not iterate until the last row of Q table
    # while (current_state != (reward_table.shape[0],reward_table.shape[1],reward_table.shape[2])):

    for month in range(reward_table.shape[0]):
        for day in range(reward_table.shape[1]):
            for time in range(reward_table.shape[2]):

                # when iterating (0,0) start from (0,0,1) to (0,0,287) because (0,0,0) already initialized on top
                if current_state[0] == 0 and current_state[1] == 0:
                    current_state = [month, day, time + 1]
                else:
                    current_state = [month, day, time]

                current_state = tuple(current_state)
                # print("Current state:", current_state)
                # print("Current action:", action, action_value)

                rewards_value = reward_table[current_state][action]
                total_rewards += rewards_value

                # print("Total rewards:", total_rewards)

                steps.append(action)
                rewards.append(rewards_value)
                environments.append(current_state)

                # if not last row of state then + 1 else move up 1 level then + 1
                if current_state[2] < reward_table.shape[2] - 1:
                    next_state = [current_state[0], current_state[1], current_state[2] + 1]
                    # print("In time:", current_state, next_state)
                elif reward_table.shape[2] - 1 == current_state[2] and current_state[1] < reward_table.shape[1] - 1:
                    next_state = [current_state[0], current_state[1] + 1, current_state[2]]
                    # print("In day:", current_state,  next_state)
                elif current_state[0] < reward_table.shape[0] - 1:
                    next_state = [current_state[0] + 1, current_state[1], current_state[2]]
                    # print("In month:", current_state, next_state)
                
                next_state = tuple(next_state)
                next_action, next_action_value = policy(Q, sarsa_agent, next_state, epsilon, verbose=isVerbose)

                #print("End:", current_state, action, rewards, next_state, next_action)
                update_q_value(current_state, action, rewards, rewards_value, next_state, next_action, verbose=False)

                # print(f'After update:, action: {action}, action_value : {action_value}, next_action: {next_action}, next_action_value: {next_action_value}\n')
                # update upcoming allowed actions
                if next_action == 0:
                    sarsa_agent.isHolding = True
                if next_action == 1:
                    sarsa_agent.isHolding = False

                current_state = next_state
                action = next_action 
                action_value = next_action_value

    total_rewards_list.append(sum(rewards))
    rewards_list.append(rewards)
    steps_list.append(steps)
    environments_list.append(environments)

In [None]:
# import pickle
# with open('sarsa_crypto.pickle', 'wb') as file:
#     pickle.dump((Q, reward_table, rewards_list, steps_list), file)

In [None]:
df_check = pd.DataFrame(total_rewards_list, columns=['rewards'])
df_check.describe()

In [None]:
total_rewards_list

In [None]:
max_value_index = total_rewards_list.index(max(total_rewards_list))
worst_value_index = total_rewards_list.index(min(total_rewards_list))

print(max_value_index)
print(worst_value_index)
print(max(total_rewards_list))
print(min(total_rewards_list))
print("Max step list:", steps_list[max_value_index])
print("Bad step list:", steps_list[worst_value_index])

# Check if each row has different values
is_different = np.all(np.diff(Q, axis=1), axis=1)

# Display the result
#print(is_different)

In [None]:
rewards_list[worst_value_index]

In [None]:
rewards_list[max_value_index]

In [None]:
# df_environments = pd.DataFrame(environments_list)
# df_environments.T.head(20)

# df_steps = pd.DataFrame(steps_list)
# df_steps.T.head(20)

In [None]:
sum(rewards_list[max_value_index])

In [None]:
sum(rewards_list[worst_value_index])

In [None]:
# see the main action difference between max and worst step at what timestep
df_compare = pd.DataFrame(steps_list[max_value_index], columns=['max_steps_action'])
df_compare['worst_steps_action'] = steps_list[worst_value_index]
df_compare['is_same'] = df_compare['max_steps_action'].equals(df_compare['worst_steps_action'])
df_compare.head()

In [None]:
# get the first action where both are difference
df_compare[df_compare['is_same'] != True]


In [None]:
# Investigate why same action but giving different reward value

In [None]:
import pickle 

with open('sarsa_crypto.pickle', 'rb') as file:
    loaded_data = pickle.load(file)

print(loaded_data) 

In [None]:
sum(rewards_list[0])

In [None]:
total_rewards_list[0]

In [None]:
import joblib 

joblib.dump(Q, 'sarsa_crypto.joblib')
# [Q, total_rewards_list, rewards_list, steps_list]

# save rewards and steps list
with open('rewards.txt', 'w') as file:
    # Convert each element in the list to a string and write it to the file
    for item in rewards_list:
        file.write(str(item) + '\n')

with open('steps.txt', 'w') as file:
    # Convert each element in the list to a string and write it to the file
    for item in steps_list:
        file.write(str(item) + '\n')

In [None]:
import joblib 

def load_large_text_file(file_path):
    data_list = []
    with open(file_path, 'r') as file:
        for line in file:
            data_list.append(line.rstrip('\n'))
    return data_list

Q = joblib.load('sarsa_crypto.joblib')
rewards_list = load_large_text_file('rewards.txt')
steps_list = load_large_text_file('steps.txt')

In [None]:
Q.shape 

***Validate the trained SARSA performance***

In [None]:
# Load the trained SARSA model
model = joblib.load('sarsa_crypto.joblib')

In [None]:
df_5min.head()

In [None]:

# Load and preprocess your test data
test_data = load_test_data()  # Replace with your code to load test data
preprocessed_data = preprocess(test_data)  # Replace with your code to preprocess the data

# Evaluate model performance on test data
total_rewards = 0
num_episodes = len(preprocessed_data)

for episode in preprocessed_data:
    state = episode['state']
    done = False
    episode_reward = 0

    while not done:
        action = model.predict(state)
        next_state, reward, done = environment.step(action)
        episode_reward += reward
        state = next_state

    total_rewards += episode_reward

average_reward = total_rewards / num_episodes
print(f"Average reward on test data: {average_reward}")

In [None]:
class ParentSARSA:
    def __init__(self, verbose=True):
        self.learning_rate: Optional[float] = 0.005
        self.discount_factor: Optional[float] = 0.1
        self.epsilon: Optional[float] = 0.1
        self.gamma: Optional[float] = 0.9
        self.num_episodes: Optional[int] = 100
        self.data: Optional[pd.Dataframe] = None
        self.Q: Optional[np.array] = None
        self.reward_table: Optional[np.array] = None

        if verbose:
            print("Parent SARSA inherited!")
            print("Parent attributes:", self.__dict__)

In [None]:
sarsa = ParentSARSA()

In [None]:
import numpy as np 

hold_array = np.array([[10, 20]])

hold_array.shape

In [None]:
hold_array[0,1]

In [None]:
import random 

isHolding = random.random() < 0.5
isHolding

In [None]:
current_state = (0,0,0,0)
current_state = (1,) + current_state[1:]
current_state

In [None]:
for episode in range(2):

    print("\nEpisode:", episode)

    # initialize cumulative rewards
    total_rewards = 0
    steps = []
    environments = []
    rewards = []

    # Randomize the starting state
    if random.random() < 0.5: 
        current_state = (0,0,0,0)
    else:
        current_state = (1,0,0,0)

    # the recommended action will be choosen from the allowed actions listS
    action, action_value = policy(current_state)

    # get the reward from the action taken in current state
    rewards_value = Sarsa.reward_table[current_state][action]
    total_rewards += rewards_value

    # determine the next state based on the current state + action
    if action == 0:
        Sarsa.isHolding = True
        # current_state = (1,) + current_state[1:]
    else:
        Sarsa.isHolding = False
        # current_state = (0,) + current_state[1:]

    steps.append(action)
    rewards.append(rewards_value)
    environments.append(current_state)


In [None]:
for month in range(3):
    for day in range(2):
        for time in range(1):
            print(month, day, time)

In [None]:
a = [1,2,3,4]

a[3] + 1

In [None]:
a[1:] == [2,3,4]

In [None]:
a[-1]

In [None]:
a = (0,1,2,3)
a[1:] == [1,2,3]

In [None]:
a[1:]

In [None]:
a = (0,0,0,0)
current_state = list(a)
current_state[3] = 1
next_state = tuple(current_state)

next_state 

In [None]:
next_state[0]

In [None]:
2,5,7,288,3

In [None]:
for month in range(0, 5):
    for day in range(0, 7):
        for time in range(0, 288):
            print(month, day, time)

In [None]:
import sys

def log_print(*args, **kwargs):
    with open('print_log.txt', 'a') as f:
        print(*args, **kwargs, file=f)

# Redirect stdout to the log_print function
sys.stdout = log_print

# Your code here
print("Hello, this will be logged in print_log.txt")
print("You can add any print statements here")

# Restore stdout to the original stream
sys.stdout = sys.__stdout__

In [None]:
import os 

os.getcwd()

In [None]:
import joblib 
Q = joblib.load('sarsa_crypto.joblib')
Q

In [None]:
import pandas as pd 
import numpy as np 

df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

In [None]:
df_train.head(3)

In [None]:
import pandas as pd 
import numpy as np
from datetime import datetime

df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

def get_day_of_week(df) -> pd.DataFrame:
    """
    Returns the name of the day of the week for the given day number (0-6)
    """
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    df['local_numeric_day'] =  df['datetime'].apply(lambda x: (x.weekday()) % 7 + 1)
    df['local_day'] =  df['local_numeric_day'].apply(lambda x: days[x-1])
    return df 


def test_get_day_of_week(df_train, df_test):
    df_days = pd.DataFrame({'local_numeric_day': [1,2,3,4,5,6,7], 'local_day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']})
    cols = ['local_numeric_day', 'local_day']
    try:
        df_train = get_day_of_week(df_train).sort_values(by='local_numeric_day')[cols]
        df_test = get_day_of_week(df_test).sort_values(by='local_numeric_day')[cols]
        assert df_days == df_train == df_test
    except AttributeError: # str type 'datetime' column doesnt have datetime built-in function
        try:
            df_train['datetime'] = df_train['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            df_test['datetime'] = df_test['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            df_train = get_day_of_week(df_train).sort_values(by='local_numeric_day')[cols]
            df_test = get_day_of_week(df_test).sort_values(by='local_numeric_day')[cols]
            assert df_days == df_train == df_test
        except (ValueError, KeyError):
            raise ValueError("Invalid date format. The date value must be either in datetime format or a string in the format 'YYYY-MM-DD HH:MM:SS'.")                                               

In [None]:
test_get_day_of_week(df_train, df_test)

In [None]:
df_train = pd.read_csv("train_data.csv")
df_train['datetime'] = df_train['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df_train['local_numeric_day'] =  df_train['datetime'].apply(lambda x: (x.weekday()) % 7 + 1)
df_train.head(1)

In [None]:
import os 

os.getcwd()

In [None]:
import pandas as pd 
import numpy as np
from datetime import datetime
from pandas.testing import assert_frame_equal

print("Start testing")

class Preprocessor():
    def get_day_of_week(self, df) -> pd.DataFrame:
        """
        Returns the name of the day of the week for the given day number (0-6)
        """
        days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        df['local_numeric_day'] =  df['datetime'].apply(lambda x: (x.weekday()) % 7 + 1)
        df['local_day'] =  df['local_numeric_day'].apply(lambda x: days[x-1])
        return df 
    

    def set_action(df, optimum_sell_rewards=15, optimum_buy_rewards=15) -> pd.DataFrame:
        """
        Adds a new column called 'price_diff' to the given DataFrame,
        containing the difference between the current row's close price and
        the previous row's close price.
        """
        # Create a new column called 'prev_close' that contains the close price from the previous row
        df['prev_close'] = df['close'].shift(1)

        # Compute the difference between the current row's close price and the previous row's close price
        df['price_diff'] = df['close'] - df['prev_close']
        df['sell_rewards'] = df['price_diff'].shift(-1)
        df['buy_rewards'] = (df['price_diff'].shift(-1))*-1
        df['sell_cumulative_rewards'] = df['sell_rewards'].cumsum()
        df['buy_cumulative_rewards'] = df['buy_rewards'].cumsum()
        df['actions'] = -1 # default 0 = buy, 1 = sell, -1 = no action
        df.loc[df['buy_rewards'] >= 5, 'actions'] = 0
        df.loc[df['sell_rewards'] > 5 , 'actions'] = 1
        df.loc[df['actions'] == 1, 'one_time_reward'] = df['sell_rewards']
        df.loc[df['actions'] == 0, 'one_time_reward'] = df['buy_rewards']
        df.loc[df['actions'] == -1, 'one_time_reward'] = 0

        # Return the updated DataFrame
        return df


    # power law optimum bin 
    def get_optimal_pareto_distribution_num_bins(df) -> pd.DataFrame:
        """
        Estimates the optimal number of bins for the 'volume_trade' column
        of the given DataFrame using the Sturges method for power law distributions,
        and returns the estimated number of bins.
        """
        # Compute the sample size and the maximum value of the 'volume_trade' column
        n = len(df['amount'])
        x_max = df['amount'].max()

        # Estimate the optimal number of bins using the Sturges method
        num_bins = int(np.ceil(np.log2(n) + np.log2(1 + x_max)))

        # Return the estimated number of bins
        return num_bins


    def pareto_distribution_bins(df) -> pd.DataFrame:
        """Creates power law bins for the 'volume_trade' column of the given
        DataFrame using the qcut function, and returns the updated DataFrame.
        """

        # Compute the optimal number of bins for quantiles splitting
        # num_bins = get_optimal_pareto_distribution_num_bins(df)
        num_bins = 10

        # Compute the quantiles of the 'volume_trade' column using a power law distribution
        quantiles = pd.qcut(df['amount'], num_bins, labels=False, duplicates='drop')

        # Add a new column to the DataFrame with the bin labels
        df['volume_bins'] = quantiles

        # Return the updated DataFrame
        return df
    
    def encode_time(df) -> pd.DataFrame:
        """Encodes the time in the given DataFrame as a string representing the time
        in sequential order (hour-minute-second), and returns the updated DataFrame.
        """
        # Convert the 'time' column to a datetime object
        df['time'] = pd.to_datetime(df['datetime'])
        df['date'] = df['datetime'].dt.date

        # Extract the hour, minute, and second from the 'time' column
        df['hour'] = df['time'].dt.hour
        df['minute'] = df['time'].dt.minute
        df['second'] = df['time'].dt.second

        # Convert the hour, minute, and second to strings
        df['hour_str'] = df['hour'].astype(str).str.zfill(2)
        df['minute_str'] = df['minute'].astype(str).str.zfill(2)
        df['second_str'] = df['second'].astype(str).str.zfill(2)

        # Concatenate the hour, minute, and second strings into a single time string
        df['encoded_time'] = df['hour_str'] + '-' + df['minute_str'] + '-' + df['second_str']

        # Drop the original hour, minute, and second columns
        df = df.drop(['hour', 'minute', 'second', 'hour_str', 'minute_str', 'second_str'], axis=1)

        # Return the updated DataFrame with the encoded time string
        return df


    def convert_to_first_day_of_month(df) -> pd.DataFrame:
        # convert to datetime format
        date_column_name = 'date'
        assign_column_name = 'starting_month'
        starting_month_list = df[date_column_name].apply(lambda x: date(x.year, x.month, 1))
        df.loc[:, assign_column_name] = starting_month_list

        return df


    def get_week_of_month(df) -> pd.DataFrame:

        def compute_week_of_month(date_value):
            first_day = date(date_value.year, date_value.month, 1)
            offset = (date_value.weekday() + 1 - first_day.weekday()) % 7
            week_of_month = (date_value.day + offset - 1) // 7 + 1
            return week_of_month

        date_column_name = 'date'
        assign_column_name = 'week_of_month'
        week_of_month_list = df[date_column_name].apply(lambda x: compute_week_of_month(x))
        df.loc[:, assign_column_name] = week_of_month_list

        return df


    def get_daily_average_trade_total_price(df) -> pd.DataFrame:
        df = df[Constantor.PRICE_COLS].groupby(Constantor.GROUPBY_KEYS).sum()
        df['daily_average_trade_total_price'] = df['trade_total_price'] / df['trade_volumes']
        df = df.drop(columns=['trade_volumes', 'trade_total_price'])
        return df


    # Create a custom aggregation function to fill in values based on conditions
    def fill_values(column):
        if column[column > 0].empty:
            return None
        return column[column > 0].values[0]



def test_get_day_of_week():
    df_train = pd.read_csv("train_data.csv")
    df_test = pd.read_csv("test_data.csv")

    df_days = pd.DataFrame({'local_numeric_day': [1,2,3,4,5,6,7], 'local_day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']})
    cols = ['local_numeric_day', 'local_day']
    try:
        print("Trying")
        df_train = Preprocessor.get_day_of_week(df_train).sort_values(by='local_numeric_day')[cols]
        df_test = Preprocessor.get_day_of_week(df_test).sort_values(by='local_numeric_day')[cols]
        print("Try ok")
        assert_frame_equal(df_train, df_test)
        print("The DataFrames are equal.")
    except AttributeError: # str type 'datetime' column doesnt have datetime built-in function
        try:
            print("Caught attribute error")
            df_train['datetime'] = df_train['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            df_test['datetime'] = df_test['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            df_train = Preprocessor.get_day_of_week(df_train).sort_values(by='local_numeric_day')[cols]
            df_test = Preprocessor.get_day_of_week(df_test).sort_values(by='local_numeric_day')[cols]
            print("Exception ok")
            assert_frame_equal(df_train, df_test)
            print("The exception DataFrames are equal.")
        except Exception as e:
            print(f"Error message: {e}") 

Preprocessor = Preprocessor()

In [None]:
test_get_day_of_week()

In [None]:
import pandas as pd 
from datetime import datetime

df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")
cols = ['local_numeric_day', 'local_day']

df_train['datetime'] = df_train['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df_test['datetime'] = df_test['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df_train = Preprocessor.get_day_of_week(df_train).sort_values(by='local_numeric_day')[cols].drop_duplicates()
df_test = Preprocessor.get_day_of_week(df_test).sort_values(by='local_numeric_day')[cols].drop_duplicates()
print("Exception ok")

In [None]:
assert_frame_equal(df_train, df_test)
print("The DataFrames are equal.")

In [None]:
import os 
os.chdir('project')
os.getcwd()

In [None]:
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
df_train.head(3)

In [None]:
df_test.head(3)

In [None]:
df_test.columns

In [None]:
df_test.datetime.dtypes == object

In [None]:
df_test.date.nunique()

In [None]:
df_result = pd.read_csv('validation/result_20230808.csv')
df_result.head()

In [None]:
df_result.steps[0]

In [None]:
import pandas as pd

# Sample dictionary for the new row
new_row_dict = {'A': 10, 'B': 20, 'C': 30}

# Create an empty DataFrame
df = pd.DataFrame(columns=['A', 'B', 'C'])

# Add the new row to the DataFrame using .loc[]
df.loc[len(df)] = new_row_dict

print(df)

In [None]:
q_values = np.array([10, 15, 5])

In [None]:
q_values[[0,2]]

In [None]:
import joblib 
q = joblib.load('sarsa_crypto.joblib')
q.shape

In [None]:
state = (0,0,0,3)

q[state]

In [None]:
import numpy as np 

array = np.array([
    [1,2,3],
    [4,5,6],
    [7,8,9]
])

array

In [None]:
array.shape

In [None]:
import random 
random.choice(array)

In [None]:
def policy(state) -> tuple[int, float]: 

	global Sarsa

	r_var = random.random()
	if r_var < Sarsa.epsilon:
		print("random")
		best_action = np.where(Sarsa.Q[state] == random.choice(Sarsa.Q[state]))[0][0]
		best_value = Sarsa.Q[state][best_action]
	else:
		best_action = np.argmax(Sarsa.Q[state])
		best_value = Sarsa.Q[state][best_action]

	return best_action, best_value

In [None]:
class Sarsa: 
    def __init__(self):
        self.Q = None 
        self.epsilon = 0.5

Sarsa = Sarsa()

In [None]:
Sarsa.Q = np.random.rand(2,5,7,288,3)
Sarsa.Q.shape 

In [None]:
current_state = (0,0,0,0)

for i in range(10):
    action, q_value = policy(current_state)
    print(action, q_value)

In [None]:
Q = np.zeros((5,7,288,3))
Q.shape

In [None]:
price_table = Q[..., np.newaxis]
price_table

In [None]:
import os 
os.getcwd()

In [None]:
df_preprocessed = pd.read_csv("project-2/validation/preprocessed_data.csv")
df_reward_stats = pd.read_csv("project-2/validation/reward_stats_data.csv")

In [None]:
df_reward_stats.head()

In [None]:
def create_price_table(df, Q):
    # get the unique value of each column for each state 
    price_table = Q[..., np.newaxis]
    print("Price table dimension:", price_table.shape)

    encoder = LabelEncoder()
    df['label_encoded_time'] = encoder.fit_transform(df[['encoded_time']])
    df = df[['week_of_month', 'local_numeric_day', 'label_encoded_time', 'average_period_price']]\
        .groupby(['week_of_month', 'local_numeric_day', 'label_encoded_time'])\
        .mean().reset_index()
    
    print(f"\n{df.columns}\n")

    # Iterate over the rows of the DataFrame
    for index, row in df.iterrows():
        week_index = int(row['week_of_month']-1)
        day_index = int(row['local_numeric_day']-1)
        time_index = int(row['label_encoded_time']-1)
        value = row['average_period_price']
        print(week_index, day_index, time_index, value)
        price_table[week_index, day_index, time_index] = value

    return price_table


def pivot_rewards_table(df):
    df_sell_rewards = df_reward_stats['sell_rewards'][['mean']].copy()
    df_buy_rewards = df_reward_stats['buy_rewards'][['mean']].copy()
    df_sell_rewards.columns = ['sell_rewards']
    df_buy_rewards.columns = ['buy_rewards']
    df_nested_rewards = pd.concat([df_reward_stats[['week_of_month', 'local_numeric_day', 'encoded_time']], df_sell_rewards, df_buy_rewards], axis=1)

    # rename column to remove the tuple-like hierachy syntax for easier retrieve
    rename_cols = ['week_of_month', 'local_numeric_day', 'encoded_time', 'sell_rewards', 'buy_rewards']
    df_nested_rewards.columns = rename_cols

    df_pivoted_rewards = pd.pivot_table(df_nested_rewards, values=['sell_rewards', 'buy_rewards'], index=['week_of_month', 'local_numeric_day', 'encoded_time'],
                                aggfunc=fill_values).reset_index()
    df_pivoted_rewards = df_pivoted_rewards.rename(columns={'sell_rewards': 'sell_action', 'buy_rewards': 'buy_action'})

    # assign reverse action reward for NaN value 
    df_pivoted_rewards['buy_action'] = df_pivoted_rewards['buy_action'].fillna(df_pivoted_rewards['sell_action']*-1)
    df_pivoted_rewards['sell_action'] = df_pivoted_rewards['sell_action'].fillna(df_pivoted_rewards['buy_action']*-1)
    df_pivoted_rewards['no_action'] = 0

    encoder = LabelEncoder()
    df_pivoted_rewards['label_encoded_time'] = encoder.fit_transform(df_pivoted_rewards[['encoded_time']])

    # get the unique value of each column for each state 
    state_unique_counts = df_pivoted_rewards.nunique()

    # initialize shape size
    state_array_shape = tuple(state_unique_counts[:3])
    # add 3 unique actions 
    state_array_shape += (3 ,)
    print("State array shape:", state_array_shape)

    # create the array with the initialized shape size 
    state_array = np.zeros(state_array_shape)

    return df_pivoted_rewards, state_array


def create_reward_table(df_pivoted_rewards, state_array) -> tuple[np.array, np.array]:
    # start padding reward value into each state respectively
    # Iterate over the rows of the DataFrame
    for index, row in df_pivoted_rewards.iterrows():
        week_index = row['week_of_month']-1
        day_index = row['local_numeric_day']-1
        time_index = row['label_encoded_time']-1
        value = [row['buy_action'], row['sell_action'], row['no_action']]
        state_array[week_index, day_index, time_index] = value

    # 25/7/2023: Temporarily add in the two dimensional holding or not holding into the state.
    # In the future, all this categorical or continuous factor should be written in a scalable way instead of so ad hoc
    holding_array = np.array([[0,1]]).T
    reward_table  = holding_array[:, np.newaxis, np.newaxis, np.newaxis, :] + state_array

    # assign state array to be reward array for easier reference
    Q = np.zeros(reward_table.shape)

    if verbose:
        print("\nReward table generated!")
        print("Reward table size:", reward_table.shape)
        print("Reward table value peek:", reward_table[0,0,0,0,0])

        print("\nQ table initialized!")
        print("Q table size:", Q.shape)
        print("Q table value peek:", Q[0,0,0,0,0])

    return reward_table, Q

In [None]:
from sklearn.preprocessing import LabelEncoder 

df_result = create_price_table(df, Q)
df_result 

In [None]:
price_table = Q[..., np.newaxis]
price_table.shape 

In [None]:
df_pivoted_rewards, state_array = pivot_rewards_table(df_reward_stats)
print(state_array.shape)
df_pivoted_rewards.head()

In [None]:
_, Sarsa.Q = Preprocessor.create_reward_table(df_reward_stats)

In [None]:
df = df[['week_of_month', 'local_numeric_day', 'label_encoded_time', 'average_period_price']]\
    .groupby(['week_of_month', 'local_numeric_day', 'label_encoded_time'])\
    .mean().reset_index()

df.head()

In [None]:
price_table.shape 

In [None]:
# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    week_index = int(row['week_of_month']-1)
    day_index = int(row['local_numeric_day']-1)
    time_index = int(row['label_encoded_time']-1)
    value = row['average_period_price']
    print(week_index, day_index, time_index, value)
    #price_table[week_index, day_index, time_index] = value

In [None]:
from sklearn.preprocessing import LabelEncoder 

encoder  = LabelEncoder()

df['label_encoded_time'] = encoder.fit_transform(df[['encoded_time']])
df = df[['week_of_month', 'local_numeric_day', 'label_encoded_time', 'average_period_price']]\
    .groupby(['week_of_month', 'local_numeric_day', 'label_encoded_time'])\
    .mean().reset_index()

In [None]:
for index, row in df.iterrows():
    time_index = int(row['label_encoded_time']-1)
    print("Price:", time_index)

In [None]:
import os 

os.getcwd()

In [None]:
import joblib 

sarsa_model = joblib.load('project-2/sarsa_crypto.joblib')
sarsa_model

In [None]:
data = [10, 20, 15, 25, 30, 18, 12, 40, 22, 35]

# Sort the data in reverse and get the sorted indices
sorted_indices = [index for index, _ in sorted(enumerate(data), key=lambda x: x[1], reverse=True)]

print("Original Data:", data)
print("Sorted Indices:", sorted_indices)

In [23]:
len([])

0

In [90]:
import pandas as pd 


keep_top_n_steps = 2

class test:
    def __init__(self):
        self.df_steps=pd.DataFrame(columns=['Step'] + [f'Top_{i+1}' for i in range(keep_top_n_steps)]) 
        self.df_rewards=pd.DataFrame(columns=['Rewards'] + [f'Top_{i+1}' for i in range(keep_top_n_steps)])
        self.top_n_total_rewards = [0]
        self.keep_top_n_steps=10 if keep_top_n_steps is None else keep_top_n_steps
        self.keep_top_n_steps=[0]*self.keep_top_n_steps

    # Update and keep the top n values 
    def update_top_n_values(self, rewards:list[float], steps:list[int]):
        for index, _ in enumerate(self.top_n_total_rewards, start=1):            
            if sum(rewards) > self.top_n_total_rewards[-1]:
                print("Replacing")
                self.top_n_total_rewards.pop()
                self.top_n_total_rewards.append(sum(rewards))
                self.df_rewards.iloc[:, -1] = rewards
                self.df_steps.iloc[:, -1] = steps       

            self.top_n_total_rewards.sort(reverse=True)
            sorted_indices = [
                index for index, _ 
                in sorted(enumerate(self.top_n_total_rewards), key=lambda x: x[1], reverse=True)
            ]
            print("total rewards:", self.top_n_total_rewards)
            print("Sorted indices:", sorted_indices)

            # re-sort df_steps, df_rewards based on the sorted_indices 
            self.df_steps = self.df_steps.iloc[:, sorted_indices]
            self.df_rewards = self.df_rewards.iloc[:, sorted_indices]

In [91]:
tester = test()
tester.__dict__

{'df_steps': Empty DataFrame
 Columns: [Step, Top_1, Top_2]
 Index: [],
 'df_rewards': Empty DataFrame
 Columns: [Rewards, Top_1, Top_2]
 Index: [],
 'top_n_total_rewards': [0],
 'keep_top_n_steps': [0, 0]}

In [92]:
import random 

test_episode = 10
test_values = 5
for episode in range(test_episode):
    random_steps_values = [random.choice([-1, 0, 1]) for _ in range(test_values)]
    random_rewards_values = [random.choice([-1, 0, 1]) for _ in range(test_values)]
    print("random_steps_values", random_steps_values)
    print("random reward values", random_rewards_values, "\n")
    tester.update_top_n_values(random_rewards_values, random_steps_values)

random_steps_values [1, 1, 1, 1, 1]
random reward values [-1, 1, 0, -1, 1] 

total rewards: [0]
Sorted indices: [0]
random_steps_values [-1, 1, -1, -1, -1]
random reward values [0, 0, 0, -1, 1] 

total rewards: [0]
Sorted indices: [0]
random_steps_values [0, 0, -1, 0, 0]
random reward values [-1, 0, 1, -1, -1] 

total rewards: [0]
Sorted indices: [0]
random_steps_values [1, 0, 0, 0, 1]
random reward values [-1, 0, -1, -1, -1] 

total rewards: [0]
Sorted indices: [0]
random_steps_values [0, 1, 0, -1, -1]
random reward values [0, 1, -1, 0, 1] 

Replacing
total rewards: [1]
Sorted indices: [0]
random_steps_values [0, 0, 0, -1, -1]
random reward values [0, 0, 1, 0, -1] 

total rewards: [1]
Sorted indices: [0]
random_steps_values [-1, 1, 0, 0, 0]
random reward values [1, 1, 0, -1, 1] 

Replacing
total rewards: [2]
Sorted indices: [0]
random_steps_values [-1, 0, 0, 0, 1]
random reward values [-1, 0, -1, -1, -1] 

total rewards: [2]
Sorted indices: [0]
random_steps_values [1, 0, 1, 1, -1]
ran

In [22]:
tester.df_rewards

Unnamed: 0,Rewards,Top_1,Top_2


In [81]:
keep_top_n_steps

2

In [79]:
df_steps=pd.DataFrame({f'Step_Top_{i+1}': [0]*keep_top_n_steps for i in range(keep_top_n_steps)})

In [80]:
df_steps.head()

Unnamed: 0,Step_Top_1,Step_Top_2
0,0,0
1,0,0
