In [156]:
import db_conn
import pandas as pd
import numpy as np
import copy
import collections
import statsmodels.formula.api as sm
import scipy.stats as st
from IPython.display import display

In [2]:
sql = """SELECT * FROM goal_records_rev"""
goal_records = db_conn.select_query(sql)
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'own_goal'])
goal_records_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,own_goal
0,2013-1-001,2,10,25,N
1,2013-1-001,3,25,10,N
2,2013-1-001,4,10,25,N
3,2013-1-001,5,25,10,N
4,2013-1-002,6,5,19,N


In [3]:
sql = """SELECT * FROM game_records"""
game_records = db_conn.select_query(sql)
game_records_pd = pd.DataFrame(game_records, columns=['game_id', 'home_team_id', 'away_team_id', 'home_team_score', 'away_team_score', 'winning_team'])
game_records_pd.head()

Unnamed: 0,game_id,home_team_id,away_team_id,home_team_score,away_team_score,winning_team
0,2013-1-001,10,25,2,2,0
1,2013-1-002,19,5,2,1,19
2,2013-1-003,21,23,0,1,23
3,2013-1-004,12,13,1,2,13
4,2013-1-005,20,2,0,0,0


## Score Lines

In [4]:
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'half_type', 'play_time'])
goal_trace_pd = pd.merge(goal_records_pd, game_records_pd[['game_id', 'home_team_id', 'away_team_id', 'winning_team']], how='left', left_on='match_id', right_on='game_id')
goal_trace_pd = pd.DataFrame(goal_trace_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range'])
goal_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,,,,,1,29,
1,2013-1-001,3,25,10,10,25,0,,,,,1,32,
2,2013-1-001,4,10,25,10,25,0,,,,,2,2,
3,2013-1-001,5,25,10,10,25,0,,,,,2,38,
4,2013-1-002,6,5,19,19,5,19,,,,,1,4,


#### 득점 순간 별 스코어 및 시간대 분류

In [251]:
home_trace_pd = copy.deepcopy(goal_trace_pd)
home_trace_pd.home_team_score = np.where(home_trace_pd.home_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.away_team_score = np.where(home_trace_pd.away_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.home_team_score = home_trace_pd.groupby(['match_id'])['home_team_score'].cumsum()
home_trace_pd.away_team_score = home_trace_pd.groupby(['match_id'])['away_team_score'].cumsum()
home_trace_pd.score_line = home_trace_pd.home_team_score.map(str) + ':' + home_trace_pd.away_team_score.map(str)
home_trace_pd.time_range = (home_trace_pd.play_time / 15).astype(int) + 1
home_trace_pd.time_range = np.where(home_trace_pd.time_range > 3, 3, home_trace_pd.time_range)
home_trace_pd.time_range = np.where(home_trace_pd.half_type == 2, home_trace_pd.time_range + 3, home_trace_pd.time_range)
home_trace_pd.location = 1
home_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,1,0,1,1:0,1,29,2
1,2013-1-001,3,25,10,10,25,0,1,1,1,1:1,1,32,3
2,2013-1-001,4,10,25,10,25,0,2,1,1,2:1,2,2,4
3,2013-1-001,5,25,10,10,25,0,2,2,1,2:2,2,38,6
4,2013-1-002,6,5,19,19,5,19,0,1,1,0:1,1,4,1


#### 경기 시작 시점 0:0 세팅

In [6]:
zero_game_pd = copy.deepcopy(game_records_pd)
zero_game_pd = pd.DataFrame(zero_game_pd, columns = ['game_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range'])

zero_game_pd[['id', 'score_team_id', 'lost_team', 'home_team_score', 'away_team_score', 'score_line', 'half_type', 'play_time', 'time_range', 'location']] = [0, 0, '0', 0, 0, '0:0', 1, 0, 1, 1]
zero_game_pd.columns = ['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range']
zero_game_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,0,0,0,10,25,0,0,0,1,0:0,1,0,1
1,2013-1-002,0,0,0,19,5,19,0,0,1,0:0,1,0,1
2,2013-1-003,0,0,0,21,23,23,0,0,1,0:0,1,0,1
3,2013-1-004,0,0,0,12,13,13,0,0,1,0:0,1,0,1
4,2013-1-005,0,0,0,20,2,0,0,0,1,0:0,1,0,1


In [249]:
full_home_trace_pd = home_trace_pd.append(zero_game_pd)
score_line_matrix = [{'match_id': i, 'time_range': r} for i in full_home_trace_pd['match_id'].tolist() for r in range(1,7)]
score_line_records_pd = pd.merge(pd.DataFrame(pd.DataFrame(score_line_matrix)).drop_duplicates(), full_home_trace_pd, how='left', on=['match_id', 'time_range'])

score_line_records_pd[['id', 'score_team_id', 'lost_team', 'half_type', 'play_time']] = score_line_records_pd[['id', 'score_team_id', 'lost_team', 'half_type', 'play_time']].fillna(0)
score_line_records_pd[['home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'score_line']] = score_line_records_pd[['home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'score_line']].fillna(method='ffill')
score_line_records_pd.location = 1

score_line_records_pd = pd.DataFrame(score_line_records_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range', 'winning_flag'])
score_line_records_pd.winning_flag = score_line_records_pd.home_team_id == score_line_records_pd.winning_team
score_line_records_pd = score_line_records_pd.groupby(['match_id', 'time_range']).max()
score_line_records_pd = score_line_records_pd.reset_index()
score_line_records_pd.head(30)

Unnamed: 0,match_id,time_range,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,winning_flag
0,2013-1-001,1,0.0,0.0,0,10.0,25.0,0.0,0.0,0.0,1,0:0,1.0,0.0,False
1,2013-1-001,2,2.0,10.0,25,10.0,25.0,0.0,1.0,0.0,1,1:0,1.0,29.0,False
2,2013-1-001,3,3.0,25.0,10,10.0,25.0,0.0,1.0,1.0,1,1:1,1.0,32.0,False
3,2013-1-001,4,4.0,10.0,25,10.0,25.0,0.0,2.0,1.0,1,2:1,2.0,2.0,False
4,2013-1-001,5,0.0,0.0,0,10.0,25.0,0.0,2.0,1.0,1,2:1,0.0,0.0,False
5,2013-1-001,6,5.0,25.0,10,10.0,25.0,0.0,2.0,2.0,1,2:2,2.0,38.0,False
6,2013-1-002,1,6.0,5.0,19,19.0,5.0,19.0,0.0,1.0,1,0:1,1.0,4.0,True
7,2013-1-002,2,0.0,0.0,0,19.0,5.0,19.0,0.0,0.0,1,0:0,0.0,0.0,True
8,2013-1-002,3,0.0,0.0,0,19.0,5.0,19.0,0.0,0.0,1,0:0,0.0,0.0,True
9,2013-1-002,4,0.0,0.0,0,19.0,5.0,19.0,0.0,0.0,1,0:0,0.0,0.0,True


#### 시간대/득점유형 별 통계
* score_rate: 해당 시간대에 해당 득점유형을 가진 경기 수
* winning_rate: 해당 시간대에 해당 득점유형을 가진 경기 중 승리한 경기 수

In [28]:
sl_1 = score_line_records_pd[['winning_team', 'score_line', 'time_range']].groupby(['score_line', 'time_range'])['winning_team'].count()
sl_2 = score_line_records_pd[['winning_flag', 'score_line', 'time_range']].groupby(['score_line', 'time_range'])['winning_flag'].sum()
score_line_list = pd.concat([sl_1, sl_2], axis=1).reset_index()

In [122]:
score_line_pd = pd.merge(pd.DataFrame(pd.DataFrame([{'score_line': sl, 'time_range': r} for sl in set(score_line_list.score_line) for r in range(1,7)])).drop_duplicates(), score_line_list, how='left', on=['score_line', 'time_range'])
score_line_pd = score_line_pd.sort_values(by=['score_line', 'time_range'])
score_line_pd[['winning_team', 'winning_flag']] = score_line_pd[['winning_team', 'winning_flag']].fillna(0)
score_line_pd = pd.DataFrame(score_line_pd, columns = ['score_line', 'time_range', 'winning_team', 'winning_flag', 'winning_rate_with_score_line', 'winning_rate'])
score_line_pd.columns = ['score_line', 'time_range', 'scored_team_count', 'winning_team_count', 'score_rate', 'winning_rate']
score_line_pd.score_rate = score_line_pd.scored_team_count / len(game_records_pd) # 2136
score_line_pd.winning_rate = score_line_pd.winning_team_count / score_line_pd.scored_team_count
score_line_pd.winning_rate = score_line_pd.winning_rate.fillna(0)
score_line_pd.head(12)

Unnamed: 0,score_line,time_range,scored_team_count,winning_team_count,score_rate,winning_rate
235,8:1,2,0.0,0.0,0.0,0.0
236,8:1,3,0.0,0.0,0.0,0.0
237,8:1,4,0.0,0.0,0.0,0.0
238,8:1,5,0.0,0.0,0.0,0.0
239,8:1,6,1.0,1.0,0.000468,1.0


In [101]:
score_line_by_winning_rate_tb = score_line_pd.pivot(index='score_line', columns='time_range', values='winning_rate')
score_line_by_winning_rate_tb.columns = ['range1', 'range2', 'range3', 'range4', 'range5', 'range6']

score_line_by_winning_rate_tb.describe()
# display(score_line_by_winning_rate_tb[((score_line_by_winning_rate_tb > 0.8) | (score_line_by_winning_rate_tb < 0.2))])

score_line_by_winning_count_tb = score_line_pd.pivot(index='score_line', columns='time_range', values='score_rate')
score_line_by_winning_count_tb.columns = ['range1', 'range2', 'range3', 'range4', 'range5', 'range6']

score_line_by_winning_count_tb.describe()
# display(score_line_by_winning_count_tb[((score_line_by_winning_count_tb > 0.8) | (score_line_by_winning_count_tb < 0.2))])

score_line_combine_pd = pd.concat([score_line_by_winning_rate_tb, score_line_by_winning_rate_tb.range6.apply(lambda x: '|'), score_line_by_winning_count_tb], axis=1) 
score_line_combine_pd.columns = ['range1_w', 'range2_w', 'range3_w', 'range4_w', 'range5_w', 'range6_w', '---', 'range1_s', 'range2_s', 'range3_s', 'range4_s', 'range5_s', 'range6_s']
score_line_combine_pd

Unnamed: 0_level_0,range1_w,range2_w,range3_w,range4_w,range5_w,range6_w,---,range1_s,range2_s,range3_s,range4_s,range5_s,range6_s
score_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0:0,0.384758,0.379896,0.364167,0.322535,0.248936,0.081301,|,0.755618,0.717228,0.480805,0.332397,0.220037,0.115169
0:1,0.162791,0.136905,0.123016,0.080645,0.052174,0.0,|,0.100655,0.078652,0.117978,0.116105,0.107678,0.087079
0:2,0.105263,0.057143,0.041667,0.045455,0.010309,0.0,|,0.008895,0.016386,0.033708,0.051498,0.045412,0.038858
0:3,0.0,0.0,0.0,0.0,0.0,0.0,|,0.0,0.000936,0.004213,0.009363,0.015449,0.018258
0:4,0.0,0.0,0.0,0.0,0.0,0.0,|,0.0,0.000468,0.000936,0.000936,0.002341,0.005618
0:5,0.0,0.0,0.0,0.0,0.0,0.0,|,0.0,0.0,0.000468,0.000468,0.0,0.000936
0:6,0.0,0.0,0.0,0.0,0.0,0.0,|,0.0,0.0,0.0,0.000468,0.000468,0.0
0:8,0.0,0.0,0.0,0.0,0.0,0.0,|,0.0,0.0,0.0,0.0,0.0,0.000468
1:0,0.590517,0.678218,0.706714,0.743056,0.828897,1.0,|,0.108614,0.094569,0.132491,0.134831,0.123127,0.098315
1:1,0.413793,0.27619,0.289474,0.307143,0.250794,0.0,|,0.013577,0.049157,0.106742,0.131086,0.147472,0.118914


#### 스코어라인을 기반으로 한 승률 예측
* Parameter: 현재 홈팀 스코어, 어웨이팀 스코어, 전/후반전 구분, 경기시간
* Return: 현재 스코어라인을 기반으로 다음 시간대에 발생 가능성이 가장 높은 스코어라인, 가장 높은 가능성의 스코어라인으로 진행 시 홈팀의 승률

In [245]:
def get_prob_by_score_line(home_score=0, away_score=0, half_type=1, play_time=0):
    time_range = 0 if half_type == 1 and play_time == 0 else ((half_type - 1) * 3) + (int(play_time / 15) + 1)
    time_range = time_range + 1 if time_range < 6 else time_range

    # 현재 스코어 이후에 가능한 스코어 확인
    available_score_line_pd = pd.DataFrame(score_line_by_winning_rate_tb.reset_index().score_line.str.split(':').tolist(), index = score_line_by_winning_rate_tb.index, columns=['home', 'away'])
    available_score_line_pd = available_score_line_pd.apply(pd.to_numeric)
    avail_future_list = available_score_line_pd[(available_score_line_pd.home >= home_score) & (available_score_line_pd.away >= away_score)].index.values

    most_avail_next_score = ''
    most_avail_next_score_prob = 0

    for sl in avail_future_list:
        temp = score_line_by_winning_count_tb.loc[str(sl), 'range%d' % (time_range)]
        if most_avail_next_score_prob < temp:
                most_avail_next_score = str(sl)
                most_avail_next_score_prob = temp

    if most_avail_next_score != '':   
#         result = 'Current score is %d:%d(range%d) and most available next score is %s(%f%%). If the score-line become predicted score, winning prob. is %f%%' % (home_score, away_score, time_range-1, most_avail_next_score, most_avail_next_score_prob * 100, score_line_by_winning_rate_tb.loc[most_avail_next_score, 'range%d' % (time_range)] * 100)
        result = (time_range, most_avail_next_score, most_avail_next_score_prob * 100, score_line_by_winning_rate_tb.loc[most_avail_next_score, 'range%d' % (time_range)] * 100)
    else:
#         result = 'Current score is %d:%d(range%d). Our database does not have this type of score-line. So we cannot predict available next score and prob.' % (home_score, away_score, time_range-1)
        result = (time_range, most_avail_next_score, -1 -1)

    return result

In [246]:
home_score = 0; away_score = 0; half_type = 2; play_time = 1

result = [home_score, away_score] + list(get_prob_by_score_line(home_score, away_score, half_type, play_time))
result[2] = result[2] - 1

if result[3] != '':
    print('Current score is %d:%d(range%d) and most available next score is %s(%f%%). If the score-line become predicted score, winning prob. is %f%%' % tuple(result))
else : 
    print('Current score is %d:%d(range%d). Our database does not have this type of score-line. So we cannot predict available next score and prob.' % tuple(result[0:3]))

Current score is 0:0(range4) and most available next score is 0:0(22.003745%). If the score-line become predicted score, winning prob. is 24.893617%


In [248]:
home_score = 0; away_score = 0; half_type = 1; play_time = 0;
while True:
    result = [home_score, away_score] + list(get_prob_by_score_line(home_score, away_score, half_type, play_time))
    print(result)
    
    home_score = int(result[3].split(':')[0])
    away_score = int(result[3].split(':')[0])
    play_time = play_time + 15
    
    if half_type == 1 and play_time == 15:
        play_time = 1
    elif play_time > 45:
        play_time = play_time - 45
        half_type = half_type + 1
    
    if half_type == 3:
        break

[0, 0, 1, '0:0', 75.56179775280899, 38.47583643122677]
[0, 0, 2, '0:0', 71.72284644194757, 37.989556135770236]
[0, 0, 3, '0:0', 48.080524344569284, 36.416747809152874]
[0, 0, 4, '0:0', 33.239700374531836, 32.25352112676057]
[0, 0, 5, '0:0', 22.00374531835206, 24.893617021276597]
[0, 0, 6, '1:1', 11.891385767790261, 0.0]
[1, 1, 6, '1:1', 11.891385767790261, 0.0]


## 득점 패턴

#### 모든 득점을 유형별로 구분
* TG(선취골), TG(동점골), LG(리드골), CG(추격골)
* 괄호안은 Home/Away

In [12]:
goal_seq_pd = copy.deepcopy(home_trace_pd)
goal_seq_pd.location = np.where(goal_seq_pd.home_team_id == goal_seq_pd.score_team_id, 1, 0)
goal_seq_pd = pd.DataFrame(goal_seq_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'winning_flag', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'goal_type'])

goal_seq_pd.winning_flag = np.where(goal_seq_pd.winning_team == 0, 0, np.where(goal_seq_pd.winning_team == goal_seq_pd.home_team_id, 1, -1))
goal_seq_pd.first_goal = (goal_seq_pd.home_team_score + goal_seq_pd.away_team_score == 1)
goal_seq_pd.tying_goal = (goal_seq_pd.home_team_score == goal_seq_pd.away_team_score)
goal_seq_pd.lead_goal = np.where(goal_seq_pd.location == 1, (goal_seq_pd.home_team_score > goal_seq_pd.away_team_score), (goal_seq_pd.home_team_score < goal_seq_pd.away_team_score))
goal_seq_pd.lead_goal = np.where(goal_seq_pd.lead_goal & goal_seq_pd.first_goal, False, goal_seq_pd.lead_goal)
goal_seq_pd.chase_goal = (goal_seq_pd.first_goal == False) & (goal_seq_pd.tying_goal == False) & (goal_seq_pd.lead_goal == False)

goal_seq_pd.goal_type = np.where(goal_seq_pd.first_goal, 'FG', np.where(goal_seq_pd.tying_goal, 'TG', np.where(goal_seq_pd.lead_goal, 'LG', np.where(goal_seq_pd.chase_goal, 'CG', 'None'))))
goal_seq_pd.goal_type = np.where(goal_seq_pd.location == 1, goal_seq_pd.goal_type + '(H)', goal_seq_pd.goal_type + '(A)')

# goal_seq_pd.winning_flag = np.where(goal_seq_pd.location == 0, goal_seq_pd.winning_flag * -1, goal_seq_pd.winning_flag)
goal_seq_pd = goal_seq_pd[['match_id', 'id', 'home_team_id', 'away_team_id', 'winning_flag', 'location', 'score_line', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'goal_type']]
goal_seq_pd.head()

Unnamed: 0,match_id,id,home_team_id,away_team_id,winning_flag,location,score_line,time_range,first_goal,tying_goal,lead_goal,chase_goal,goal_type
0,2013-1-001,2,10,25,0,1,1:0,2,True,False,False,False,FG(H)
1,2013-1-001,3,10,25,0,0,1:1,3,False,True,False,False,TG(A)
2,2013-1-001,4,10,25,0,1,2:1,4,False,False,True,False,LG(H)
3,2013-1-001,5,10,25,0,0,2:2,6,False,True,False,False,TG(A)
4,2013-1-002,6,19,5,1,0,0:1,1,True,False,False,False,FG(A)


#### 경기별 득점 패턴

In [13]:
goal_seq_list_pd = goal_seq_pd.groupby('match_id').agg({'goal_type': 'sum', 'winning_flag': 'min'})
goal_seq_list_pd.goal_type = goal_seq_list_pd.goal_type.str.split(')').apply(')-'.join)
goal_seq_list_pd.goal_type = goal_seq_list_pd.goal_type + np.where(goal_seq_list_pd.winning_flag == 0, 'D', np.where(goal_seq_list_pd.winning_flag == 1, 'W', 'L'))
goal_seq_list_pd = goal_seq_list_pd.drop('winning_flag', axis = 1)
goal_seq_list_pd.head()

Unnamed: 0_level_0,goal_type
match_id,Unnamed: 1_level_1
2013-1-001,FG(H)-TG(A)-LG(H)-TG(A)-D
2013-1-002,FG(A)-TG(H)-LG(H)-W
2013-1-003,FG(A)-L
2013-1-004,FG(A)-TG(H)-LG(A)-L
2013-1-006,FG(H)-LG(H)-CG(A)-TG(A)-D


#### 득점 패턴이 나타나는 빈도

In [14]:
goal_seq_type_pd = goal_seq_list_pd.reset_index().groupby('goal_type').agg({'match_id': 'count'}).sort_values(by=['match_id'], ascending=False).reset_index()
goal_seq_type_pd.columns = ['goal_type', 'match_count']
goal_seq_type_pd[goal_seq_type_pd.match_count > 10]

Unnamed: 0,goal_type,match_count
0,FG(H)-W,227
1,FG(A)-L,206
2,FG(A)-TG(H)-D,145
3,FG(H)-LG(H)-W,127
4,FG(H)-TG(A)-D,112
5,FG(A)-LG(A)-L,84
6,FG(H)-TG(A)-LG(H)-W,78
7,FG(A)-TG(H)-LG(H)-W,66
8,FG(A)-TG(H)-LG(A)-L,64
9,FG(H)-TG(A)-LG(A)-L,61


#### 득점 순서별 득점 유형

In [15]:
goal_seq_freq_pd = pd.DataFrame(goal_seq_list_pd.goal_type.str.split('-').tolist(), index=goal_seq_list_pd.index).apply(pd.value_counts).fillna(0)
goal_seq_freq_pd = goal_seq_freq_pd.drop(index=['W', 'D', 'L'])
goal_seq_freq_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
CG(A),0.0,0.0,124.0,32.0,44.0,11.0,11.0,1.0,1.0,0.0
CG(H),0.0,0.0,118.0,21.0,37.0,8.0,4.0,0.0,0.0,0.0
FG(A),901.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FG(H),1033.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LG(A),0.0,284.0,351.0,106.0,90.0,27.0,13.0,1.0,1.0,0.0
LG(H),0.0,392.0,440.0,153.0,97.0,25.0,13.0,7.0,1.0,0.0
TG(A),0.0,414.0,0.0,130.0,0.0,21.0,0.0,2.0,0.0,0.0
TG(H),0.0,411.0,0.0,119.0,0.0,14.0,0.0,2.0,0.0,0.0


#### 득점 패턴을 기반으로 한 승률 예측
* Parameter: 현재까지의 득점 패턴(FG(H)-TG(A)), Home/Away
* Return: 현재까지의 득점 패턴이 발생한 모든 경우의 수 중 Home 혹은 Away가 승리한 경기의 비율

In [16]:
def get_prob_by_goal_situation(current, location):
    flag = 'W' if location == 'H' else 'L'
    total = goal_seq_type_pd[goal_seq_type_pd.goal_type.str.startswith(current)].match_count.sum()
    winning_count = goal_seq_type_pd[goal_seq_type_pd.goal_type.str.startswith(current) & goal_seq_type_pd.goal_type.str.endswith(flag)].match_count.sum()
    prob = winning_count / total * 100
    return ('%f' % prob)

In [17]:
get_prob_by_goal_situation('FG(H)', 'H'), get_prob_by_goal_situation('FG(H)-TG(A)', 'H'), get_prob_by_goal_situation('FG(H)-TG(A)-LG(A)', 'H')

('69.215876', '32.850242', '7.042254')