### ロジスティック回帰により、flopでboardによってbetするべきかcheckするべきかの２値判別予測をします。
### situation: BU vs SB 3bet、SBのflop戦略

In [17]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [18]:
# Lookup tables keyed by the number of distinct suits / ranks on a flop.
# A board is a string such as 'Ts6s3c': rank, suit, rank, suit, rank, suit.
suit_dic = {1: 'monotone', 2: 'twotone', 3: 'rainbow'}
pair_dic = {1: 'trips', 2: 'one pair', 3: 'no pair'}


def dist_board_type(board):
    """Classify the flop as 'monotone', 'twotone' or 'rainbow'.

    The suit characters are read from positions 1, 3 and 5 of ``board`` and
    the number of distinct suits selects the label from ``suit_dic``.
    """
    suits = board[1:6:2]
    distinct_suits = len(set(suits))
    return suit_dic[distinct_suits]


def dist_board_pair(board):
    """Classify the flop as 'trips', 'one pair' or 'no pair'.

    The rank characters are read from positions 0, 2 and 4 of ``board`` and
    the number of distinct ranks selects the label from ``pair_dic``.
    """
    ranks = board[0:5:2]
    distinct_ranks = len(set(ranks))
    return pair_dic[distinct_ranks]

In [20]:
# Numeric value of every rank symbol (ace counts as 14).
card_num_dict = {
    'A': 14, 'K': 13, 'Q': 12, 'J': 11, 'T': 10,
    '9': 9, '8': 8, '7': 7, '6': 6, '5': 5, '4': 4, '3': 3, '2': 2,
}


def get_board_sum(board):
    """Return the sum of the three rank values of a flop such as 'Ts6s3c'."""
    # Ranks sit at the even positions 0, 2 and 4 of the board string.
    return sum(card_num_dict[board[pos]] for pos in (0, 2, 4))


def get_high_card(board):
    """Return the rank value of the highest card of a flop such as 'Ts6s3c'."""
    return max(card_num_dict[board[pos]] for pos in (0, 2, 4))

In [21]:
# Load the flop data set. The preview printed below shows the columns
# board, equity, EV, bet66%, bet33% and check.
df = pd.read_csv('184flops_3bet.csv')

In [22]:
df.head()

Unnamed: 0,board,equity,EV,bet66%,bet33%,check
0,AsQs3s,59.446,13.336,0.035,5.471,94.493
1,4s3s2s,52.193,11.321,0.118,9.239,90.643
2,6s4s3s,51.017,11.494,0.152,10.345,89.503
3,As7s5s,55.429,13.209,3.708,13.781,82.511
4,As3s2d,56.014,11.881,4.105,15.903,79.992


In [23]:
# The raw columns alone do not look sufficient to predict the check rate,
# so extra variables are derived from the board string (feature engineering).
feature_builders = [
    ('board_type', dist_board_type),
    ('board_sum', get_board_sum),
    ('board_pair', dist_board_pair),
    ('high_card', get_high_card),
]
for column_name, builder in feature_builders:
    df[column_name] = df['board'].apply(builder)

In [24]:
# The new columns have been added.
df.head()

Unnamed: 0,board,equity,EV,bet66%,bet33%,check,board_type,board_sum,board_pair,high_card
0,AsQs3s,59.446,13.336,0.035,5.471,94.493,monotone,29,no pair,14
1,4s3s2s,52.193,11.321,0.118,9.239,90.643,monotone,9,no pair,4
2,6s4s3s,51.017,11.494,0.152,10.345,89.503,monotone,13,no pair,6
3,As7s5s,55.429,13.209,3.708,13.781,82.511,monotone,26,no pair,14
4,As3s2d,56.014,11.881,4.105,15.903,79.992,twotone,19,no pair,14


In [25]:
def check_or_bet(x):
    """Turn a check frequency (in percent) into a binary bet/check label.

    Returns 1 ("bet") when the check frequency is at most 45, i.e. the board
    is bet at least 55% of the time, and 0 ("check") when the check frequency
    is at least 55. For values strictly between 45 and 55 it returns None, so
    those mixed boards end up as missing labels.
    """
    if x <= 45:
        return 1
    if x >= 55:
        return 0
    return None

In [26]:
# Label every board from its check frequency using check_or_bet
# (1 = bet, 0 = check, missing when the function returns None).
df['check_or_bet'] = df['check'].apply(check_or_bet)

In [27]:
df.shape

(184, 11)

In [28]:
# Remove the boards whose bet rate is strictly between 45% and 55%; these are
# the rows check_or_bet left without a label.
# Restricting dropna to the label column makes the filter do exactly that:
# a bare dropna() would also silently discard any row that happens to have a
# missing value in an unrelated column.
df.dropna(subset=['check_or_bet'], inplace=True)

In [29]:
df.shape

(167, 11)

In [30]:
df['board_type'].value_counts()

twotone     86
rainbow     69
monotone    12
Name: board_type, dtype: int64

In [31]:
# Side note: aggregate the mean of each column per board type.
# numeric_only=True restricts the aggregation to the numeric columns; df also
# holds string columns ('board', 'board_pair'), which recent pandas versions
# refuse to average instead of silently dropping them.
df.groupby('board_type').mean(numeric_only=True)

Unnamed: 0_level_0,equity,EV,bet66%,bet33%,check,board_sum,high_card,check_or_bet
board_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
monotone,56.044833,13.180917,3.944417,26.620333,69.435167,23.75,11.333333,0.083333
rainbow,58.402928,15.499696,47.073855,42.260478,10.665638,24.072464,11.391304,0.956522
twotone,57.760895,14.486453,34.96186,40.61264,24.425558,25.290698,11.639535,0.883721


In [32]:
# The mean board_sum is about 24, so compare boards at or below that value
# with boards at or above it.
# NOTE: boards with board_sum == 24 satisfy both this filter (<= 24) and the
# >= 24 filter used for the other half, so they are counted in both tables.
# numeric_only=True keeps the string columns ('board', 'board_pair') out of
# the mean, which recent pandas versions would otherwise reject.
df[df['board_sum']<=24].groupby('board_type').mean(numeric_only=True)

Unnamed: 0_level_0,equity,EV,bet66%,bet33%,check,board_sum,high_card,check_or_bet
board_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
monotone,54.663167,12.7125,2.4775,29.8675,67.655,18.5,9.0,0.166667
rainbow,56.231184,14.571842,53.512711,29.621263,16.866,19.815789,10.605263,0.921053
twotone,54.574909,13.157932,34.004659,33.828636,32.166727,20.068182,10.386364,0.772727


In [33]:
# Apparently, for every board type except monotone, a higher board_sum goes
# together with a higher bet rate.
# NOTE: boards with board_sum == 24 satisfy both this filter (>= 24) and the
# <= 24 filter used for the other half, so they are counted in both tables.
# numeric_only=True keeps the string columns ('board', 'board_pair') out of
# the mean, which recent pandas versions would otherwise reject.
df[df['board_sum']>=24].groupby('board_type').mean(numeric_only=True)

Unnamed: 0_level_0,equity,EV,bet66%,bet33%,check,board_sum,high_card,check_or_bet
board_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
monotone,57.350375,13.600875,4.756875,26.069625,69.1735,27.75,13.125,0.0
rainbow,60.706806,16.398444,37.861056,57.815889,4.323028,28.555556,12.472222,1.0
twotone,60.474653,15.629837,36.742755,46.313755,16.943592,29.795918,12.714286,1.0


In [34]:
# End of the digression. Turn board_type into dummy variables so it can be
# used in the logistic regression.
board_dummies = pd.get_dummies(df['board_type'])

In [35]:
# This is the so-called one-hot encoding; it is essential when a linear model
# has to handle categorical variables.
board_dummies.head()

Unnamed: 0,monotone,rainbow,twotone
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,0,1


In [36]:
# Given the other two columns, whether the remaining column is 1 or 0 is
# already determined, so one column ('monotone') is dropped to avoid
# introducing an artificial correlation between the variables.
board_dummies.drop('monotone',axis=1, inplace=True)

In [37]:
board_dummies.head()

Unnamed: 0,rainbow,twotone
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1


In [38]:
# Do the same for board_pair: one-hot encode it and drop the 'trips' column.
# NOTE(review): this relies on at least one 'trips' board being present in df,
# otherwise there is no 'trips' column to drop — confirm for other data sets.
pair_dummies = pd.get_dummies(df['board_pair']).drop('trips', axis=1)

In [39]:
pair_dummies.head()

Unnamed: 0,no pair,one pair
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [40]:
# Assemble the feature matrix: the numeric features first, followed by the
# board-type dummies and then the pair dummies.
numeric_features = df[['equity', 'board_sum', 'high_card']]
X = pd.concat([numeric_features, board_dummies, pair_dummies], axis=1)

In [41]:
# Explanatory variables used in this analysis.
X.head()

Unnamed: 0,equity,board_sum,high_card,rainbow,twotone,no pair,one pair
0,59.446,29,14,0,0,1,0
1,52.193,9,4,0,0,1,0
2,51.017,13,6,0,0,1,0
3,55.429,26,14,0,0,1,0
4,56.014,19,14,0,1,1,0


In [42]:
# Ground-truth labels (the check_or_bet column as an array).
Y = df.check_or_bet.values

In [43]:
Y

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [44]:
# Fitting and scoring on the same data gave an accuracy of 87.4%.
# NOTE: the label array printed above contains 24 zeros among its 167 entries,
# so always predicting 1 would already score about 85.6%; read the accuracy
# against that baseline.
log_model = LogisticRegression().fit(X, Y)
train_accuracy = log_model.score(X, Y)
train_accuracy

0.87425149700598803

In [45]:
# It is surprising that equity has almost no influence on check-or-bet.
# board_type appears to contribute far more to the decision than the other
# variables. Analysis results like this can be useful when building a poker
# strategy.
# NOTE(review): the features were passed to the model unscaled, so every
# coefficient is expressed per unit of its own feature; the magnitudes for
# equity / board_sum / high_card and for the 0/1 dummy columns are therefore
# not directly comparable.
coeff_df = pd.DataFrame([X.columns, log_model.coef_[0]]).T
coeff_df

Unnamed: 0,0,1
0,equity,-0.00661878
1,board_sum,0.0579908
2,high_card,0.159335
3,rainbow,2.25035
4,twotone,1.71738
5,no pair,-1.7199
6,one pair,0.586571


In [46]:
# The model may be overfitting, so evaluate it with 5-fold stratified
# cross-validation and report the mean score over the folds.
# NOTE: the splitter is created without shuffling, so the folds follow the
# row order of the data.
skf = StratifiedKFold(n_splits=5)
fold_scores = cross_val_score(log_model, X, Y, cv=skf)
print('Cross-validation score: {:.1%}'.format(fold_scores.mean()))

Cross-validation score: 86.2%
