#### The goal for this section is to pick out the independent features that most likely affect win rate (mainly from domain knowledge), then transforming their numerical values to categorical values. Finally, I will encode the categorical values so they are machine learning ready.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [2]:
player_info = pd.read_csv(r'result_files/nl_100_player_info.csv')
player_info.head()

Unnamed: 0.1,Unnamed: 0,hands_played,pots_won,amount_won,to_inv,net_win,per_hand,vpip_count,inv_count,vpip,% won,avg_p_size,avg_agg,per_hand_inv,avg_player_num,avg_post_num,p_score,f_score,t_score,r_score
0,HQg4dAYPUt6c2wXfx8Qp2A,26449,3723.0,55512.26,69091.57,-13579.31,-0.513415,4972,13211,0.187984,0.281811,14.910626,0.159628,-1.027879,5.418239,1.240274,0.150667,0.006276,0.001928,0.000756
1,igvhUfnIs8in/SlI6b2cug,19731,2727.0,46252.14,53338.8,-7086.66,-0.359164,5110,10677,0.258983,0.255409,16.960814,0.044042,-0.663731,5.335563,1.443566,0.027926,0.007552,0.006943,0.001622
2,kIbanshKysxtdDXKqLAWsA,16433,1685.0,23442.41,29165.71,-5723.3,-0.348281,2505,5810,0.152437,0.290017,13.912409,0.112761,-0.985077,8.128522,1.721171,0.10698,0.004503,0.000913,0.000365
3,yspTPyicC4Ere+tsoRHI1A,13480,1987.0,36132.37,44339.4,-8207.03,-0.60883,2950,6546,0.218843,0.303544,18.184383,0.166617,-1.253747,5.55816,1.434866,0.15319,0.009125,0.00319,0.001113
4,vETYfpoA+FhBercnDPJrRw,12862,1857.0,22457.02,31077.37,-8620.35,-0.670218,2588,6614,0.201213,0.280768,12.093172,0.147955,-1.303349,5.353133,1.334318,0.138237,0.006609,0.001944,0.001166


In [3]:
# trim down to only players who played more than 1000 hands
player_info_trim = player_info[player_info['hands_played'] > 1000]
player_info_trim.shape

(329, 20)

In [4]:
player_info_trim.columns

Index(['Unnamed: 0', 'hands_played', 'pots_won', 'amount_won', 'to_inv',
       'net_win', 'per_hand', 'vpip_count', 'inv_count', 'vpip', '% won',
       'avg_p_size', 'avg_agg', 'per_hand_inv', 'avg_player_num',
       'avg_post_num', 'p_score', 'f_score', 't_score', 'r_score'],
      dtype='object')

In [5]:
player_info_df = player_info_trim[['p_score','f_score','t_score','r_score', 'vpip', 'avg_p_size', 'avg_player_num', 'avg_post_num', 'per_hand', 'per_hand_inv']]
player_info_df

Unnamed: 0,p_score,f_score,t_score,r_score,vpip,avg_p_size,avg_player_num,avg_post_num,per_hand,per_hand_inv
0,0.150667,0.006276,0.001928,0.000756,0.187984,14.910626,5.418239,1.240274,-0.513415,-1.027879
1,0.027926,0.007552,0.006943,0.001622,0.258983,16.960814,5.335563,1.443566,-0.359164,-0.663731
2,0.106980,0.004503,0.000913,0.000365,0.152437,13.912409,8.128522,1.721171,-0.348281,-0.985077
3,0.153190,0.009125,0.003190,0.001113,0.218843,18.184383,5.558160,1.434866,-0.608830,-1.253747
4,0.138237,0.006609,0.001944,0.001166,0.201213,12.093172,5.353133,1.334318,-0.670218,-1.303349
...,...,...,...,...,...,...,...,...,...,...
324,0.154762,0.002976,0.000000,0.001984,0.583333,7.573333,2.914683,1.250992,-0.940635,-1.012991
325,0.083416,0.006951,0.008937,0.000993,0.315789,14.877037,8.439921,2.144985,-0.800695,-1.783850
326,0.025819,0.005958,0.000993,0.001986,0.202582,13.886905,8.403178,1.827210,0.148798,0.408283
327,0.059642,0.003976,0.000000,0.000994,0.224652,13.420192,8.372763,2.037773,-0.090507,-0.235881


Next, I want to seperate each feature into categories. I will be seperating them by the overall statistics for each columns. By the end of this analysis, I want every category of each feature make sense to a recreational poker player.

For p, f, t, r scores, I will divide each column into 2 categories: aggressive, passive. I will divide at 50th percentile.

In [6]:
p = player_info_df['p_score']
f = player_info_df['f_score']
t = player_info_df['t_score']
r = player_info_df['r_score']

In [7]:
# preflop action
p_agg = []
for i in p:
    if i >= p.quantile(.5):
        p_agg.append('aggressive')
    else:
        p_agg.append('passive')

In [8]:
# flop action
f_agg = []
for i in f:
    if i >= f.quantile(.5):
        f_agg.append('aggressive')
    else:
        f_agg.append('passive')

In [9]:
# turn action
t_agg = []
for i in t:
    if i >= t.quantile(.5):
        t_agg.append('aggressive')
    else:
        t_agg.append('passive')

In [12]:
# river action
r_agg = []
for i in r:
    if i >= r.quantile(.5):
        r_agg.append('aggressive')
    else:
        r_agg.append('passive')

For vpip, I will divide into 2 categories: loose, tight. Using some poker domain knowledge, I will set people who play less than 30% hands as tight.

In [13]:
v = player_info_df['vpip']
vpip_f = []
for i in v:
    if i > 0.3:
        vpip_f.append('loose')
    else:
        vpip_f.append('tight')

For average pot size, I will assign small, large, at 50th percentile.

In [14]:
a = player_info_df['avg_p_size']
pot_size = []
for i in a:
    if i >= a.quantile(.5):
        pot_size.append('large')
    else:
        pot_size.append('small')

For players in hand and players post flop, I will divide them into less and many

In [15]:
player = player_info_df['avg_player_num']
post = player_info_df['avg_post_num']
player_num = []
post_num = []
for i in player:
    if i >= player.quantile(.5):
        player_num.append('many')
    else:
        player_num.append('less')
for i in post:
    if i >= post.quantile(.5):
        post_num.append('many')
    else:
        post_num.append('less')

For the dependant values of this analysis, winning per hand and winning per hand involved, I will divide them into only 2 categories. Because these hands are played at tables with high rake, and the rake is usually 1-2 big blinds, I will classify players who have greater than negative 1 big blind ($1) winning per hand involved as "good". 

Winning per hand is a bit hard to classify because rake taken will differ depending on how many hands they are involved. But poker is a zero sum game if there's no rake, so I will simply classify the top 50 percent of players as "good".

In [16]:
win_per_hand = []
win_per_hand_involve = []
for i in player_info_df['per_hand']:
    if i >= player_info_df['per_hand'].quantile(.5):
        win_per_hand.append('good')
    else:
        win_per_hand.append('bad')
for i in player_info_df['per_hand_inv']:
    if i >= player_info_df['per_hand_inv'].quantile(.5):
        win_per_hand_involve.append('good')
    else:
        win_per_hand_involve.append('bad')

In [17]:
# make a new categorized data frame from the newly created lists
player_info_cat = pd.DataFrame([p_agg, f_agg, t_agg, r_agg, vpip_f, pot_size, post_num, win_per_hand, win_per_hand_involve]).T

In [18]:
player_info_cat.columns = ['p_agg', 'f_agg', 't_agg', 'r_agg', 'vpip', 'pot_size', 'post_num', 'win_per_hand', 'win_per_hand_involve']

In [19]:
player_info_cat

Unnamed: 0,p_agg,f_agg,t_agg,r_agg,vpip,pot_size,post_num,win_per_hand,win_per_hand_involve
0,aggressive,passive,passive,passive,tight,large,less,bad,bad
1,passive,aggressive,aggressive,aggressive,tight,large,less,good,good
2,passive,passive,passive,passive,tight,small,many,good,bad
3,aggressive,aggressive,passive,passive,tight,large,less,bad,bad
4,aggressive,passive,passive,passive,tight,small,less,bad,bad
...,...,...,...,...,...,...,...,...,...
324,aggressive,passive,passive,aggressive,loose,small,less,bad,bad
325,passive,passive,aggressive,passive,loose,large,many,bad,bad
326,passive,passive,passive,aggressive,tight,small,many,good,good
327,passive,passive,passive,passive,tight,small,many,good,good


Finally I will convert the dependent variables into 1s and 0s, and use one hot encoding for all the independent variables.

In [20]:
# convert dependent variables to 1s and 0s
player_info_cat.win_per_hand[player_info_cat.win_per_hand == 'good'] = 1
player_info_cat.win_per_hand_involve[player_info_cat.win_per_hand_involve == 'good'] = 1
player_info_cat.win_per_hand[player_info_cat.win_per_hand == 'bad'] = 0
player_info_cat.win_per_hand_involve[player_info_cat.win_per_hand_involve == 'bad'] = 0

In [21]:
player_info_cat

Unnamed: 0,p_agg,f_agg,t_agg,r_agg,vpip,pot_size,post_num,win_per_hand,win_per_hand_involve
0,aggressive,passive,passive,passive,tight,large,less,0,0
1,passive,aggressive,aggressive,aggressive,tight,large,less,1,1
2,passive,passive,passive,passive,tight,small,many,1,0
3,aggressive,aggressive,passive,passive,tight,large,less,0,0
4,aggressive,passive,passive,passive,tight,small,less,0,0
...,...,...,...,...,...,...,...,...,...
324,aggressive,passive,passive,aggressive,loose,small,less,0,0
325,passive,passive,aggressive,passive,loose,large,many,0,0
326,passive,passive,passive,aggressive,tight,small,many,1,1
327,passive,passive,passive,passive,tight,small,many,1,1


In [22]:
# use one-hot-encoding on all other columns
player_info_dummies = pd.get_dummies(player_info_cat, columns = ['p_agg', 'f_agg', 't_agg', 'r_agg', 'vpip', 'pot_size', 'post_num'])

In [23]:
player_info_dummies

Unnamed: 0,win_per_hand,win_per_hand_involve,p_agg_aggressive,p_agg_passive,f_agg_aggressive,f_agg_passive,t_agg_aggressive,t_agg_passive,r_agg_aggressive,r_agg_passive,vpip_loose,vpip_tight,pot_size_large,pot_size_small,post_num_less,post_num_many
0,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1,0
1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,1,0
2,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1
3,0,0,1,0,1,0,0,1,0,1,0,1,1,0,1,0
4,0,0,1,0,0,1,0,1,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,0,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0
325,0,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1
326,1,1,0,1,0,1,0,1,1,0,0,1,0,1,0,1
327,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1


Now the dataframe is ready to be modeled, export the data frame as csv.

In [24]:
player_info_dummies.to_csv('result_files/nl_100_dummies.csv')