This notebook uses `q_learning_lib.py` to perform Q-learning on tic-tac-toe.

A range of hyperparameter values is tried across many different cases.

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import q_learning_lib as qlb
import json
import os

Assuming alpha = 0.1 and tau = e, let's record what happens in each of the 4 training cases (optimal, non-optimal, x, o).

In [66]:
# Training-length constants shared by the whole notebook: both are passed to
# qlb.perform_training, and their product is the denominator of the alpha/tau
# schedules built by make_alpha and make_tau.
NUMBER_OF_BATCHES = 10_000
BATCH_SIZE = 20

In [67]:
def make_alpha(rate_val):
    """Function factory for alpha schedules that shrink linearly with ``turn``.

    The returned callable maps ``turn`` to
    ``rate_val * (total - turn) / total`` with
    ``total = BATCH_SIZE * NUMBER_OF_BATCHES``, so it yields ``rate_val`` at
    turn 0 and 0 once ``turn`` equals ``total``.

    ``BATCH_SIZE`` and ``NUMBER_OF_BATCHES`` are module-level names that are
    read on every call of the schedule, not when the schedule is created.
    """
    def alpha(turn):
        total_turns = BATCH_SIZE * NUMBER_OF_BATCHES
        turns_left = total_turns - turn
        return rate_val * turns_left / total_turns

    return alpha

def make_tau(rate_val):
    """Function factory for tau schedules that grow linearly with ``turn``.

    The returned callable maps ``turn`` to ``1 + rate_val * turn / total``
    with ``total = NUMBER_OF_BATCHES * BATCH_SIZE``, so it starts at 1 for
    turn 0 and reaches ``1 + rate_val`` once ``turn`` equals ``total``.

    ``BATCH_SIZE`` and ``NUMBER_OF_BATCHES`` are module-level names that are
    read on every call of the schedule, not when the schedule is created.
    """
    def tau(turn):
        total_turns = NUMBER_OF_BATCHES * BATCH_SIZE
        growth = rate_val * turn / total_turns
        return 1 + growth

    return tau

def record_params(results, filename):
    """Write one training run's outcome table to ``filename`` as CSV.

    Parameters
    ----------
    results : 2-D sequence or array with three values per row
        The three columns are labelled ``'o_win'``, ``'draw'`` and ``'x_win'``
        in the output file.
    filename : str, path-like, or writable buffer
        Destination handed to ``DataFrame.to_csv``. When it is a path, any
        missing parent directories are created first.

    Notes
    -----
    The rows are written in reverse order (``results[::-1]``).
    NOTE(review): ``[::-1]`` flips the row order, not the column order.
    Confirm that reversing the rows is what is wanted here.
    """
    # Create the destination folder on demand: without this, to_csv raises
    # OSError for a missing directory and the finished run is lost.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(results[::-1], columns=['o_win', 'draw', 'x_win'])

    # Save to CSV (index=False prevents it from adding a row-number column)
    df.to_csv(filename, index=False)

In [68]:
# Hyperparameter grid for the sweep: 10 evenly spaced values on each axis.
alpha_rates = np.linspace(start=0.01, stop=0.5, num=10)
tau_values = np.linspace(start=1, stop=20, num=10)
tau_values

array([ 1.        ,  3.11111111,  5.22222222,  7.33333333,  9.44444444,
       11.55555556, 13.66666667, 15.77777778, 17.88888889, 20.        ])

In [69]:
def try_different_hyperparams(alpha_rates, tau_values, player, strategy):
    """Train once per (alpha rate, tau value) pair and save the outcomes.

    For every combination of ``alpha_rates`` and ``tau_values`` this calls
    ``qlb.perform_training`` with schedules built by ``make_alpha`` and
    ``make_tau``, divides the returned results by ``BATCH_SIZE * 200`` and
    writes them through ``record_params`` to
    ``parameter_results/results_<player>_<strategy>_opponent_alpha_<a>_tau_<t>.csv``.
    One summary row per combination is collected and, after the sweep, written
    to ``parameter_results/hyperparams.csv``.

    Parameters
    ----------
    alpha_rates : iterable of numbers
        Values passed to ``make_alpha``.
    tau_values : iterable of numbers
        Values passed to ``make_tau``.
    player, strategy
        Forwarded unchanged to ``qlb.perform_training`` and embedded in the
        per-run file names.

    Returns
    -------
    None
        Results are only written to disk and printed.

    Notes
    -----
    The summary path does not include ``player`` or ``strategy``, so each call
    overwrites the summary written by the previous call.
    """
    # Create the output folder before any training starts. Otherwise a missing
    # directory only surfaces at the first CSV write, after a full training run.
    os.makedirs("parameter_results", exist_ok=True)

    # NOTE(review): 200 is presumably the number of batches aggregated into each
    # row returned by perform_training - confirm against q_learning_lib.
    batches_per_record = 200

    df_list = []  # one dict per (alpha, tau) pair; turned into a DataFrame at the end

    for alpha_rate in alpha_rates:
        for tau_value in tau_values:

            # Train. The returned Q-table is not used by this sweep.
            # NOTE(review): the meaning of the positional True flag is defined
            # in q_learning_lib - confirm there.
            _q_table, results = qlb.perform_training(
                player, strategy, NUMBER_OF_BATCHES, BATCH_SIZE, True,
                make_alpha(alpha_rate), make_tau(tau_value),
            )

            results = results / (BATCH_SIZE * batches_per_record)

            # Record the full results table of this run.
            filename = f"parameter_results/results_{player}_{strategy}_opponent_alpha_{alpha_rate}_tau_{tau_value}.csv"
            record_params(results, filename)

            # Record the end-of-training metric against the hyperparameters:
            # the mean of column 1 over the last 5 rows.
            # NOTE(review): record_params labels column 1 'draw', so
            # 'final_winrate' appears to be a draw rate - confirm this is the
            # intended metric.
            end_winrate = np.mean(results[:, 1][-5:])
            new_results = {'alpha_rate': alpha_rate, 'tau_rate': tau_value, 'final_winrate': end_winrate}
            df_list.append(new_results)
            print(f"alpha rate = {alpha_rate}, tau rate = {tau_value}, end winrates = {end_winrate}")

    df = pd.DataFrame(df_list)
    df.to_csv("parameter_results/hyperparams.csv", index=False)

In [70]:
# Run the full alpha/tau sweep for player 'x' with the 'perfect' strategy
# (the per-run file names describe this as the "perfect_opponent" case).
try_different_hyperparams(alpha_rates, tau_values, player='x', strategy='perfect')

o win   draw    x win
200
80.58%   19.43%   0.00%
400
81.00%   19.00%   0.00%
600
81.25%   18.75%   0.00%
800
80.88%   19.12%   0.00%
1000
80.40%   19.60%   0.00%
1200
79.28%   20.73%   0.00%
1400
79.50%   20.50%   0.00%
1600
80.75%   19.25%   0.00%
1800
80.88%   19.12%   0.00%
2000
79.58%   20.43%   0.00%
2200
78.97%   21.02%   0.00%
2400
78.85%   21.15%   0.00%
2600
78.97%   21.02%   0.00%
2800
79.75%   20.25%   0.00%
3000
78.42%   21.57%   0.00%
3200
77.67%   22.32%   0.00%
3400
77.62%   22.38%   0.00%
3600
78.28%   21.73%   0.00%
3800
76.33%   23.68%   0.00%
4000
77.47%   22.52%   0.00%
4200
76.50%   23.50%   0.00%
4400
76.45%   23.55%   0.00%
4600
76.33%   23.68%   0.00%
4800
73.92%   26.07%   0.00%
5000
75.47%   24.52%   0.00%
5200
75.78%   24.23%   0.00%
5400
76.90%   23.10%   0.00%
5600
75.30%   24.70%   0.00%
5800
75.90%   24.10%   0.00%
6000
74.97%   25.02%   0.00%
6200
74.88%   25.12%   0.00%
6400
73.85%   26.15%   0.00%
6600
72.85%   27.15%   0.00%
6800
73.53%   26.48%   0.

In [73]:
# Load the sweep summary that try_different_hyperparams writes
# (columns: alpha_rate, tau_rate, final_winrate; one row per alpha/tau pair).
df = pd.read_csv("parameter_results/hyperparams.csv")
