In [1]:
import math
import os
import random

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# NOTE: despite its name, `table_name` is interpolated into the URL as the
# PostgreSQL *database* name; the table actually queried is `klines`.
table_name = 'crypto'

# Allow the connection string (and therefore the credentials) to be supplied
# through the CRYPTO_DB_URL environment variable instead of living in the
# notebook. The local development URL is kept as the fallback so existing
# setups keep working unchanged.
default_db_url = f'postgresql://postgres:postgres@localhost:5432/{table_name}'
engine = create_engine(os.environ.get('CRYPTO_DB_URL', default_db_url))

# Load every kline row; filtering by period happens in a later cell.
df_raw = pd.read_sql_query('select * from klines', con=engine)

In [3]:
# Keep only the 5-minute candles. reset_index() (without drop=True) keeps the
# previous index as a column of the result.
is_5min = df_raw['period_type'] == '5min'
df_5min = df_raw[is_5min].reset_index()

In [4]:
def get_day_of_week(df):
    """
    Add weekday columns derived from the 'datetime' column.

    Two columns are written onto ``df`` in place:

    * ``local_numeric_day`` -- 1 (Monday) through 7 (Sunday)
    * ``local_day``         -- the English day name

    Rows whose 'datetime' is missing (NaT) get NaN in both columns instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed 'datetime' column.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object, with the two columns added.
    """
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    # .dt.weekday is already in 0 (Monday) .. 6 (Sunday); no modulo needed.
    weekday = df['datetime'].dt.weekday

    df['local_numeric_day'] = weekday + 1
    # na_action='ignore' lets NaT rows propagate as NaN instead of crashing.
    df['local_day'] = weekday.map(lambda d: days[int(d)], na_action='ignore')
    return df


def set_action(df, optimum_sell_rewards=15, optimum_buy_rewards=15, sell_threshold=10):
    """
    Derive per-row rewards and a buy/sell action label from the 'close' price.

    Columns written onto ``df`` (in place):

    * ``prev_close``  -- close of the previous row
    * ``price_diff``  -- close minus prev_close
    * ``sell_rewards`` -- the *next* row's price_diff
    * ``buy_rewards``  -- the negated sell_rewards
    * ``sell_cumulative_rewards`` / ``buy_cumulative_rewards`` -- running sums
    * ``actions`` -- 1 (sell) where sell_rewards > ``sell_threshold``, else 0 (buy)
    * ``one_time_reward`` -- sell_rewards on sell rows, buy_rewards on buy rows

    The first row has no previous close and the last row has no next
    price_diff, so the corresponding values are NaN there.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'close' column.
    optimum_sell_rewards, optimum_buy_rewards : int, optional
        Currently unused; kept so existing callers keep working.
    sell_threshold : float, optional
        A row is labelled "sell" when its sell_rewards is strictly greater
        than this value. Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object with the columns above added.
    """
    # Price change relative to the previous row.
    df['prev_close'] = df['close'].shift(1)
    df['price_diff'] = df['close'] - df['prev_close']

    # Reward of each side is defined by the following row's price change.
    df['sell_rewards'] = df['price_diff'].shift(-1)
    df['buy_rewards'] = df['sell_rewards'] * -1
    df['sell_cumulative_rewards'] = df['sell_rewards'].cumsum()
    df['buy_cumulative_rewards'] = df['buy_rewards'].cumsum()

    # Only two labels are produced: 0 = buy (default), 1 = sell.
    df['actions'] = 0
    df.loc[df['sell_rewards'] > sell_threshold, 'actions'] = 1

    # Reward of the labelled action for each row.
    df['one_time_reward'] = df['sell_rewards'].where(df['actions'] == 1, df['buy_rewards'])

    return df


# normal distribution optimum bin
def get_optimal_normal_distribution_num_bins(df):
    """
    Estimate a histogram bin count for the 'volume_trade' column using the
    Freedman-Diaconis rule (bin width = 2 * IQR / n^(1/3)).

    Degenerate inputs are handled instead of dividing by zero:

    * empty column, or all values equal -> 1 bin
    * zero (or undefined) interquartile range but non-zero data range ->
      fall back to the Sturges estimate ``ceil(log2(n)) + 1``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'volume_trade' column.

    Returns
    -------
    int
        The estimated number of bins (always >= 1).
    """
    values = df['volume_trade']
    n = len(values)
    if n == 0:
        return 1

    value_range = values.max() - values.min()
    # `not (x > 0)` also catches NaN, which would otherwise propagate.
    if not value_range > 0:
        return 1

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    if not iqr > 0:
        # Freedman-Diaconis is undefined for a zero IQR; use Sturges instead.
        return int(np.ceil(np.log2(n))) + 1

    bin_width = 2 * iqr / np.cbrt(n)
    return int(np.ceil(value_range / bin_width))


# power law optimum bin 
def get_optimal_pareto_distribution_num_bins(df):
    """
    Estimate a bin count for the 'amount' column.

    The estimate is ``ceil(log2(n) + log2(1 + max(amount)))`` where ``n`` is
    the number of rows, i.e. a Sturges-style term extended by the magnitude
    of the largest value.

    Returns the estimate as an int.
    """
    amounts = df['amount']
    sample_term = np.log2(len(amounts))
    magnitude_term = np.log2(1 + amounts.max())
    return int(np.ceil(sample_term + magnitude_term))

def pareto_distribution_bins(df, num_bins):
    """Bucket the 'amount' column into quantile bins.

    Uses ``pd.qcut`` with ``num_bins`` quantiles (duplicate bin edges are
    dropped) and stores the integer bin label of every row in a new
    'volume_bins' column on ``df``. Returns the same DataFrame.
    """
    df['volume_bins'] = pd.qcut(
        df['amount'],
        q=num_bins,
        labels=False,
        duplicates='drop',
    )
    return df


def encode_time(df):
    """Add time-of-day encoding columns derived from the 'datetime' column.

    Columns written onto ``df`` (in place):

    * ``time``         -- 'datetime' parsed with ``pd.to_datetime``
    * ``date``         -- the calendar date of each timestamp
    * ``encoded_time`` -- the time of day as a zero-padded 'HH-MM-SS' string

    Both 'date' and 'encoded_time' are computed from the parsed timestamps,
    so a 'datetime' column holding strings works as well. No temporary
    hour/minute/second columns are left behind on the input frame, and
    missing timestamps yield NaN rather than the string 'nan-nan-nan'.

    Returns the same DataFrame with the columns added.
    """
    timestamps = pd.to_datetime(df['datetime'])

    df['time'] = timestamps
    df['date'] = timestamps.dt.date
    # strftime produces the zero-padded hour-minute-second string directly.
    df['encoded_time'] = timestamps.dt.strftime('%H-%M-%S')

    return df


In [5]:
# Feature pipeline: weekday columns -> rewards/actions -> volume bins -> time encoding.
df_5min = (
    df_5min
    .pipe(get_day_of_week)
    .pipe(set_action)
    .pipe(lambda frame: pareto_distribution_bins(
        frame, get_optimal_pareto_distribution_num_bins(frame)))
    .pipe(encode_time)
)
df_5min.head()

Unnamed: 0,level_0,index,id,open,close,high,low,vol,amount,period_type,...,prev_close,price_diff,sell_rewards,buy_rewards,sell_cumulative_rewards,buy_cumulative_rewards,actions,one_time_reward,volume_bins,encoded_time
0,151145,0,1672502400,1202.89,1202.79,1203.14,1202.79,1304.147073,1.084,5min,...,,,-0.15,0.15,-0.15,0.15,0,0.15,0,00-00-00
1,151146,1,1672502700,1202.79,1202.64,1202.86,1202.6,19612.44539,16.306101,5min,...,1202.79,-0.15,-0.32,0.32,-0.47,0.47,0,0.32,6,00-05-00
2,151147,2,1672503000,1202.64,1202.32,1202.64,1202.21,27485.863219,22.856,5min,...,1202.64,-0.32,-0.32,0.32,-0.79,0.79,0,0.32,9,00-10-00
3,151148,3,1672503300,1202.2,1202.0,1202.2,1201.0,106878.352877,88.951166,5min,...,1202.32,-0.32,1.22,-1.22,0.43,-0.43,0,-1.22,22,00-15-00
4,151149,4,1672503600,1201.95,1203.22,1203.41,1201.95,102618.915554,85.315059,5min,...,1202.0,1.22,-0.2,0.2,0.23,-0.23,0,0.2,22,00-20-00


In [6]:
# Work on a copy so later cells do not modify the engineered 5-minute frame.
df = df_5min.copy()

In [7]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,level_0,index,id,open,close,high,low,vol,amount,period_type,...,prev_close,price_diff,sell_rewards,buy_rewards,sell_cumulative_rewards,buy_cumulative_rewards,actions,one_time_reward,volume_bins,encoded_time
0,151145,0,1672502400,1202.89,1202.79,1203.14,1202.79,1304.147073,1.084,5min,...,,,-0.15,0.15,-0.15,0.15,0,0.15,0,00-00-00
1,151146,1,1672502700,1202.79,1202.64,1202.86,1202.6,19612.44539,16.306101,5min,...,1202.79,-0.15,-0.32,0.32,-0.47,0.47,0,0.32,6,00-05-00
2,151147,2,1672503000,1202.64,1202.32,1202.64,1202.21,27485.863219,22.856,5min,...,1202.64,-0.32,-0.32,0.32,-0.79,0.79,0,0.32,9,00-10-00
3,151148,3,1672503300,1202.2,1202.0,1202.2,1201.0,106878.352877,88.951166,5min,...,1202.32,-0.32,1.22,-1.22,0.43,-0.43,0,-1.22,22,00-15-00
4,151149,4,1672503600,1201.95,1203.22,1203.41,1201.95,102618.915554,85.315059,5min,...,1202.0,1.22,-0.2,0.2,0.23,-0.23,0,0.2,22,00-20-00


In [None]:
# Define the state space
num_days = df['local_numeric_day'].nunique()
# Count time-of-day slots ('encoded_time'), not unique timestamps: the 'time'
# column holds full datetimes, whereas the state is a (weekday, time-of-day)
# pair.
num_times = df['encoded_time'].nunique()
num_volume_bins = df['volume_bins'].nunique()
state_space = np.zeros((num_days, num_times))
train_split = 0.8  # fraction of rows used for training


# Define the action space
num_action = df['actions'].nunique()
action_space = np.zeros((num_volume_bins, num_action))


# Q table placeholder; generate_sample() allocates the real array.
# (A module-level `global` statement has no effect, so bind the name instead.)
Q_star = None


# Initialize learning rate, discount factor, exploration
alpha = 0.1
gamma = 0.9
epsilon = 0.1

In [8]:
# Define the reward function
def reward(price_rewards):
    """Return the reward for a step; currently the identity on `price_rewards`."""
    # NOTE(review): later cells assign a variable named `reward`, which
    # rebinds this name at module level once they run.
    return price_rewards


# Define a function to get the current state
def get_state(day, time, volume_bins, action):
    """Pack the four scalars into one numpy array and return it.

    The array is ordered (time, day, volume_bins, action) -- note that time
    comes before day, unlike the parameter order. The array and its
    elements are printed for debugging.
    """
    components = [time, day, volume_bins, action]
    state = np.array(components)
    print("Get state:", type(state), state.shape, state, *state[:4])
    return state


def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` (module-level) a uniformly random action in
    ``0 .. num_action - 1`` is returned; otherwise the argmax of the
    module-level `Q` table entry addressed by the first three components of
    `state`.

    Parameters
    ----------
    state : numpy.ndarray
        State vector; its first three entries index into `Q`.

    Returns
    -------
    int-like
        The selected action index.
    """
    print("Choose action state:", type(state), state, state.shape)
    if random.uniform(0, 1) < epsilon:
        # Explore: sample over *all* actions. Passing the tuple
        # (0, num_action-1) would only ever pick one of those two endpoints.
        action = np.random.choice(num_action)
    else:
        # Exploit: choose the action with the highest Q value.
        # NOTE(review): reads the global `Q`; confirm it is the table
        # intended here (generate_sample allocates `Q_star`).
        action = np.argmax(Q[state[0]][state[1]][state[2]])
    return action


# Generate the sample data
def generate_sample(df):
    """Build the train/test arrays and allocate the global `Q_star` table.

    The frame is reduced to the state/action columns
    ('local_numeric_day', 'encoded_time', 'volume_bins', 'actions'),
    `Q_star` is zero-initialised with the shape returned by
    `create_state_tuple`, and the rows are split in order (no shuffling)
    according to the module-level `train_split` fraction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train_data`` (first ``floor(len * train_split)`` rows) and
        ``test_data`` (the remaining rows).
    """
    global Q_star

    # The action label column produced by set_action() is named 'actions'.
    cols = ['local_numeric_day', 'encoded_time', 'volume_bins', 'actions']
    features = df[cols]

    # One axis per entry of the state tuple.
    Q_star = np.zeros(create_state_tuple(features))

    split_at = math.floor(features.shape[0] * train_split)
    train_data = features[:split_at].to_numpy()
    test_data = features[split_at:].to_numpy()

    return train_data, test_data


def train_sarsa(train_data):
    """
    Sweep repeatedly over `train_data`, applying a temporal-difference update
    to the module-level `Q` table, until the average reward of two
    consecutive sweeps differs by at most 0.001.

    Per row i the reward is the last column of row i+1 scaled by 0.1, and the
    entry updated is Q[i][-1][-1] (bootstrapped from Q[i+1][-1][-1]) using
    the module-level `alpha` and `gamma`.

    Returns the `Q` table.

    NOTE(review): `Q` is not defined in this cell (generate_sample allocates
    `Q_star`) -- confirm which table is intended. The action-selection calls
    are commented out, so `state` is computed but never used.
    """
    # Define variables to track performance validation
    prev_avg_reward = -1000
    avg_reward = 0
    num_episodes = 0
    
    # Train for multiple episodes until convergence
    while abs(avg_reward - prev_avg_reward) > 0.001:
        prev_avg_reward = avg_reward
        total_reward = 0
        print("Start SARSA")
        
        # Loop through all training data
        for i in range(len(train_data)-1):
            # Get current state and action
            state = get_state(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3])
            # action = choose_action(state)
            
            # Get next state and reward
            # next_state = get_state(train_data[i+1][0], train_data[i+1][1], train_data[i+1][2], train_data[i+1][3])
            # This local `reward` shadows the module-level reward() function inside this function.
            reward = train_data[i+1][-1] * 0.1
            
            # Choose next action based on epsilon-greedy policy
            # next_action = choose_action(next_state)
            
            # Update Q table
            td_error = reward + gamma * Q[i+1][-1][-1] - Q[i][-1][-1]
            Q[i][-1][-1] += alpha * td_error
        
            total_reward += reward
        
        # Calculate average reward for the current episode
        # (the inner loop runs len(train_data)-1 times but the divisor is len(train_data))
        avg_reward = total_reward / len(train_data)
        num_episodes += 1
        print("Episode:", num_episodes, "Average Reward:", avg_reward)
    
    return Q


def create_state_tuple(df_train):
    '''
    Build the shape tuple used to allocate the Q table.

    The first entry is the number of rows of `df_train`; it is followed by
    the number of distinct values of each column, in column order.
    '''
    row_count = df_train.shape[0]
    cardinalities = (df_train[col].nunique() for col in df_train)
    return (row_count, *cardinalities)

In [None]:
# Split the frame into train/test arrays; this call also allocates the global Q_star table.
train_data, test_data = generate_sample(df)

In [None]:
# NOTE(review): none of the feature functions in this notebook create a
# 'rewards' column (they add 'sell_rewards', 'buy_rewards' and
# 'one_time_reward') -- confirm the column exists in the raw table, or pick
# the intended one.
rewards = df['rewards'].to_list()

In [None]:
# Sanity check: (rows, columns) of the training array.
train_data.shape

In [None]:
# Sanity check: shape of the Q table allocated by generate_sample().
Q_star.shape

In [30]:
import SARSA_one_state as SARSA

In [31]:
# Preview the frame that is handed to the SARSA agent below.
df.head()

Unnamed: 0,level_0,index,id,open,close,high,low,vol,amount,period_type,...,prev_close,price_diff,sell_rewards,buy_rewards,sell_cumulative_rewards,buy_cumulative_rewards,actions,one_time_reward,volume_bins,encoded_time
0,151145,0,1672502400,1202.89,1202.79,1203.14,1202.79,1304.147073,1.084,5min,...,,,-0.15,0.15,-0.15,0.15,0,0.15,0,00-00-00
1,151146,1,1672502700,1202.79,1202.64,1202.86,1202.6,19612.44539,16.306101,5min,...,1202.79,-0.15,-0.32,0.32,-0.47,0.47,0,0.32,6,00-05-00
2,151147,2,1672503000,1202.64,1202.32,1202.64,1202.21,27485.863219,22.856,5min,...,1202.64,-0.32,-0.32,0.32,-0.79,0.79,0,0.32,9,00-10-00
3,151148,3,1672503300,1202.2,1202.0,1202.2,1201.0,106878.352877,88.951166,5min,...,1202.32,-0.32,1.22,-1.22,0.43,-0.43,0,-1.22,22,00-15-00
4,151149,4,1672503600,1201.95,1203.22,1203.41,1201.95,102618.915554,85.315059,5min,...,1202.0,1.22,-0.2,0.2,0.23,-0.23,0,0.2,22,00-20-00


In [32]:
# Hyper-parameters for the agent and the training loop.
lr, discount_factor, epsilon = 0.1, 0.9, 0.1
num_episodes, num_steps_per_episode = 1000, 10

# Problem size: one state per time-of-day slot, one action per label.
num_states = df['encoded_time'].nunique()
num_actions = df['actions'].nunique()
print(num_states, num_actions)

# Build the agent, let it create its Q table from the frame, and expose that
# table under the module-level name Q used by the helper functions.
sarsa_agent = SARSA.SARSAAgent(df, learning_rate=lr, discount_factor=discount_factor, epsilon=epsilon)
sarsa_agent.initialize_q_table(df)
Q = sarsa_agent.q_table

288 2
[[-4.56947368e-01  4.56947368e-01]
 [-2.90000000e-01  2.90000000e-01]
 [-4.43368421e-01  4.43368421e-01]
 [-3.88210526e-01  3.88210526e-01]
 [-1.94631579e-01  1.94631579e-01]
 [-3.51789474e-01  3.51789474e-01]
 [-1.62105263e-01  1.62105263e-01]
 [ 6.05578947e-01 -6.05578947e-01]
 [ 2.74526316e-01 -2.74526316e-01]
 [-1.27052632e-01  1.27052632e-01]
 [-3.60105263e-01  3.60105263e-01]
 [ 2.85157895e-01 -2.85157895e-01]
 [ 4.25473684e-01 -4.25473684e-01]
 [ 2.66842105e-01 -2.66842105e-01]
 [-4.25263158e-01  4.25263158e-01]
 [ 2.50842105e-01 -2.50842105e-01]
 [ 3.16526316e-01 -3.16526316e-01]
 [ 1.03684211e-01 -1.03684211e-01]
 [ 6.40526316e-01 -6.40526316e-01]
 [-2.04947368e-01  2.04947368e-01]
 [-4.66315789e-02  4.66315789e-02]
 [ 1.68736842e-01 -1.68736842e-01]
 [ 2.86526316e-01 -2.86526316e-01]
 [-1.30947368e-01  1.30947368e-01]
 [-3.39473684e-01  3.39473684e-01]
 [ 6.16000000e-01 -6.16000000e-01]
 [ 3.85263158e-01 -3.85263158e-01]
 [ 3.10526316e-02 -3.10526316e-02]
 [-1.32947368e

In [33]:
# Function to get the reward for a given state-action pair
def get_reward(state, action):
    """Look up the value stored in the global Q table for (state, action) and return it as the reward."""
    return Q[state][action]

# Function to get the next state for a given state-action pair
def get_next_state(state, action):
    """Return the successor state: (state + action) wrapped around `num_states`.

    This is a simplified transition in which the next state depends only on
    the current state index and the action index.
    """
    return (state + action) % num_states

# After training, you can use the learned Q-table for decision-making
def make_decision(state):
    """Return the greedy action for `state`: the argmax of the agent's Q-table row."""
    q_row = sarsa_agent.q_table[state]
    return np.argmax(q_row)

In [34]:
# Single-step smoke test: ask the agent for an action in state 0 and look up
# its reward. Note that `reward` rebinds the name of the reward() function
# defined in an earlier cell.
state = 0  # Starting state
action = sarsa_agent.get_action(state)
reward = get_reward(state, action)
print(state, action, reward)

0 1 0.45694736842104633


In [None]:
# Training loop
# Run num_episodes episodes of num_steps_per_episode SARSA steps each, always
# starting from state 0.
# NOTE(review): get_reward() reads the same table that update_q_table()
# appears to modify (Q is bound to sarsa_agent.q_table) -- confirm this
# feedback is intended. Four lines are printed per step.
for episode in range(num_episodes):
    state = 0  # Starting state
    action = sarsa_agent.get_action(state)

    for _ in range(num_steps_per_episode):
        # Execute action and observe the reward and next state
        reward = get_reward(state, action)
        next_state = get_next_state(state, action)
        next_action = sarsa_agent.get_action(next_state)
        print("Action:", action)
        print("Reward:", reward)
        print("Next state:", next_state)
        print("Next action:", next_action)

        # Update the Q-table
        sarsa_agent.update_q_table(state, action, reward, next_state, next_action)

        # Move to the next state
        state = next_state
        action = next_action

In [None]:
import numpy as np
import random


# Define the action space (integer codes; three actions in total)
BUY = 0
SELL = 1
NO_ACTION = 2


# Define the Q table; stays None until initialize_q_table() is called.
# Running this cell rebinds any `Q` set by an earlier cell.
Q = None


# Define the learning parameters (these rebind alpha/gamma/epsilon from earlier cells)
alpha = 0.5  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate


# Initialize the Q table
def initialize_q_table(num_states, num_actions):
    """Rebind the global Q to a zero-filled (num_states, num_actions) array."""
    global Q
    table_shape = (num_states, num_actions)
    Q = np.zeros(table_shape)


def take_action(state, epsilon):
    """Select an action epsilon-greedily, then enforce buy/sell alternation.

    The position in the sequence is the state itself when `state` is a
    scalar (as produced by get_initial_state / get_next_state), or its last
    element when `state` is a sequence. Even positions allow only BUY, odd
    positions only SELL; a policy choice that violates this is replaced by
    NO_ACTION.

    Parameters
    ----------
    state : int or sequence
        Current state.
    epsilon : float
        Exploration rate forwarded to epsilon_greedy_policy.

    Returns
    -------
    int
        BUY, SELL or NO_ACTION.
    """
    # Apply the epsilon-greedy policy to select the action
    action = epsilon_greedy_policy(state, epsilon)

    # Scalar states cannot be subscripted; use them directly as the position.
    current_position = state if np.ndim(state) == 0 else state[-1]

    # Even positions are "buy" slots, odd positions are "sell" slots.
    is_buy = current_position % 2 == 0

    # Replace an action that breaks the alternating buy-sell sequence.
    if (is_buy and action == SELL) or (not is_buy and action == BUY):
        action = NO_ACTION

    return action


# Epsilon-greedy policy for action selection
def epsilon_greedy_policy(state, epsilon):
    """Return an action index for `state` using an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is drawn from the
    actions the global Q table actually has (its last axis), so the result
    is always a valid column index; otherwise the argmax of Q[state].
    """
    if random.random() < epsilon:
        # Explore: size the range from Q rather than hard-coding 3 actions.
        return random.randrange(Q.shape[-1])
    # Exploit: best known action for this state.
    return np.argmax(Q[state])

# Get the initial state
def get_initial_state():
    """Return the state every episode starts from (always 0)."""
    return 0

# Get the next state
def get_next_state(state, action):
    """Return the successor of `state`, which is simply state + 1.

    `action` is accepted but does not influence the transition. This
    definition replaces the earlier get_next_state once the cell runs.
    """
    return state + 1

# Get the reward for the current state and action
def get_reward(state, action):
    """Placeholder reward function: ignores its arguments and returns the global `reward`."""
    # Your code to define the reward function goes here
    # NOTE(review): `reward` is not defined in this function; it resolves to
    # whatever the module-level name `reward` is bound to when this is called.
    return reward

# Perform SARSA update
def update_sarsa(state, action, next_state, next_action, reward):
    """Apply one SARSA update to the global Q table, in place.

    Q[state, action] moves towards reward + gamma * Q[next_state, next_action]
    by a step of size alpha (alpha and gamma are module-level).
    """
    old_estimate = Q[state, action]
    td_error = reward + gamma * Q[next_state, next_action] - old_estimate
    Q[state, action] = old_estimate + alpha * td_error

# SARSA training
def train_sarsa(num_episodes):
    """Train the global Q table with SARSA for `num_episodes` episodes.

    Each episode starts at get_initial_state() and steps until either

    * its cumulative reward exceeds the best cumulative reward of any
      previous episode, or
    * the last row of the Q table is reached (stepping further would index
      past the table).

    The best cumulative reward starts at -infinity, so the first episode
    stops after a single step and sets the initial record.

    Parameters
    ----------
    num_episodes : int
        Number of episodes to run.

    Returns
    -------
    None
        The Q table is updated in place through update_sarsa().
    """
    last_state = len(Q) - 1
    # Must be bound before the loop: it is assigned below, which makes it a
    # local name, so reading it first would raise UnboundLocalError.
    max_cumulative_reward = float("-inf")

    for episode in range(num_episodes):
        state = get_initial_state()
        action = take_action(state, epsilon)
        cumulative_reward = 0  # Track the cumulative reward for the current episode
        done = state >= last_state

        while not done:
            next_state = get_next_state(state, action)
            next_action = take_action(next_state, epsilon)
            reward = get_reward(state, action)

            update_sarsa(state, action, next_state, next_action, reward)

            # SARSA is on-policy: the action chosen for the next state is
            # the one actually taken on the next step.
            state, action = next_state, next_action
            cumulative_reward += reward  # Accumulate the reward

            done = cumulative_reward > max_cumulative_reward or state >= last_state

        if cumulative_reward > max_cumulative_reward:
            max_cumulative_reward = cumulative_reward


# Test the trained SARSA model
def test_sarsa():
    """Walk greedily through the states using the trained global Q table.

    Starting from get_initial_state(), repeatedly moves to
    get_next_state(state, action) and picks the argmax action there, until
    the state equals MAX_STATE. Nothing is returned or recorded yet.

    NOTE(review): MAX_STATE is not defined in the visible code; calling this
    as-is would raise NameError at the termination check.
    """
    state = get_initial_state()
    action = np.argmax(Q[state])

    while True:
        next_state = get_next_state(state, action)
        next_action = np.argmax(Q[next_state])

        # Your code to perform actions based on the current state goes here

        state = next_state
        action = next_action

        # Check for termination
        if state == MAX_STATE:  # Replace MAX_STATE with your desired termination state
            break


# Main function
def main():
    """Allocate the global Q table; training and testing calls are still disabled."""
    num_states = 288  # Replace with the actual number of states
    # One column per defined action (BUY, SELL, NO_ACTION): take_action() can
    # return NO_ACTION, so a 2-column table would be indexed out of bounds.
    num_actions = NO_ACTION + 1

    # Initialize the Q table
    initialize_q_table(num_states, num_actions)

    # Train the SARSA model
    #train_sarsa(num_episodes=100)

    # Test the trained SARSA model
    #test_sarsa()

# if __name__ == "__main__":
#     main()

In [None]:
# Run the entry point; at present this only (re)allocates the global Q table.
main()