In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import collections
import statsmodels.formula.api as sm
import scipy.stats as st
from IPython.display import display

In [2]:
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#### Score Line Table

In [3]:
sql = """SELECT * FROM score_line"""
score_line = db_conn.select_query(sql)
score_line_pd = pd.DataFrame(score_line, columns=['match_id', 'season_year', 'division', 'id', 'half_type', 'play_time', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'location', 'score_team_id', 'winning_flag', 'winning_team', 'score_player'])
score_line_pd.head()

Unnamed: 0,match_id,season_year,division,id,half_type,play_time,time_range,home_score,away_score,home_team_id,away_team_id,location,score_team_id,winning_flag,winning_team,score_player
0,2013-1-001,2013,1,0,1,0,1,0,0,10,25,,0,,0,0
1,2013-1-001,2013,1,2,1,29,2,1,0,10,25,,10,,0,1
2,2013-1-001,2013,1,3,1,32,3,1,1,10,25,,25,,0,2
3,2013-1-001,2013,1,4,2,2,4,2,1,10,25,,10,,0,3
4,2013-1-001,2013,1,0,2,16,5,2,1,10,25,,-1,,0,-1


#### Game Records Table

In [4]:
sql = """SELECT * FROM game_records"""
game_records = db_conn.select_query(sql)
game_records_pd = pd.DataFrame(game_records)
game_records_pd.head()

Unnamed: 0,away_team_id,away_team_score,division,game_date,game_id,game_stadium,game_time,home_team_id,home_team_score,season_year,winning_team
0,25,2,1,2013-03-02,2013-1-001,서울 월드컵,15:00,10,2,2013,0
1,5,1,1,2013-03-02,2013-1-002,울산 문수,14:45,19,2,2013,19
2,23,1,1,2013-03-02,2013-1-003,광양 전용,15:00,21,0,2013,23
3,13,2,1,2013-03-03,2013-1-004,탄천 종합,14:00,12,1,2013,13
4,2,0,1,2013-03-03,2013-1-005,인천 전용,14:00,20,0,2013,0


#### Data Preparation
* 득점 유형별로 분류

In [5]:
score_line_w_goal_type = pd.DataFrame(score_line_pd, columns=score_line_pd.columns.append(pd.Index(['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg'])))
score_line_w_goal_type = score_line_w_goal_type.fillna(False)

score_line_w_goal_type.g_fg = (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score == 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.home_team_id)
score_line_w_goal_type.l_fg = (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score == 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.away_team_id)

score_line_w_goal_type.g_tg = (score_line_w_goal_type.home_score == score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 0) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.home_team_id)
score_line_w_goal_type.l_tg = (score_line_w_goal_type.home_score == score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 0) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.away_team_id)

score_line_w_goal_type.g_lg = (score_line_w_goal_type.home_score > score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.home_team_id)
score_line_w_goal_type.l_cg = (score_line_w_goal_type.home_score > score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.away_team_id)

score_line_w_goal_type.g_cg = (score_line_w_goal_type.home_score < score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.home_team_id)
score_line_w_goal_type.l_lg = (score_line_w_goal_type.home_score < score_line_w_goal_type.away_score) & (score_line_w_goal_type.home_score + score_line_w_goal_type.away_score > 1) & (score_line_w_goal_type.score_team_id == score_line_w_goal_type.away_team_id)

score_line_w_goal_type.winning_flag = np.where(score_line_w_goal_type.winning_team == 0, 0, np.where(score_line_w_goal_type.home_team_id == score_line_w_goal_type.winning_team, 1, -1))

score_line_w_goal_type.head()

Unnamed: 0,match_id,season_year,division,id,half_type,play_time,time_range,home_score,away_score,home_team_id,away_team_id,location,score_team_id,winning_flag,winning_team,score_player,g_fg,g_tg,g_lg,g_cg,l_fg,l_tg,l_lg,l_cg
0,2013-1-001,2013,1,0,1,0,1,0,0,10,25,False,0,0,0,0,False,False,False,False,False,False,False,False
1,2013-1-001,2013,1,2,1,29,2,1,0,10,25,False,10,0,0,1,True,False,False,False,False,False,False,False
2,2013-1-001,2013,1,3,1,32,3,1,1,10,25,False,25,0,0,2,False,False,False,False,False,True,False,False
3,2013-1-001,2013,1,4,2,2,4,2,1,10,25,False,10,0,0,3,False,False,True,False,False,False,False,False
4,2013-1-001,2013,1,0,2,16,5,2,1,10,25,False,-1,0,0,-1,False,False,False,False,False,False,False,False


* 경기 별 득점 및 실점

In [6]:
home_goal_type_statics = pd.DataFrame(score_line_w_goal_type.groupby(['match_id']).agg({'g_fg': 'sum', 'g_tg': 'sum', 'g_lg': 'sum', 'g_cg': 'sum', 'l_fg': 'sum', 'l_tg': 'sum', 'l_lg': 'sum', 'l_cg': 'sum', 'winning_flag': 'max', 'home_score': 'max', 'away_score': 'max', 'score_player': 'max'}).reset_index(), columns=['match_id', 'location', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'winning_flag', 'home_score', 'away_score', 'score_player'])
home_goal_type_statics.location = 1
home_goal_type_statics[['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg']] = home_goal_type_statics[['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg']] * 1

away_goal_type_statics = copy.deepcopy(home_goal_type_statics)
away_goal_type_statics.columns = ['match_id', 'location', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'winning_flag', 'home_score', 'away_score', 'score_player']
away_goal_type_statics.location = 0
away_goal_type_statics.winning_flag = away_goal_type_statics.winning_flag * -1

total_goal_type_statics = pd.DataFrame(home_goal_type_statics.append(away_goal_type_statics).sort_values(['match_id', 'location']).reset_index(drop=True), columns=['match_id', 'location', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'winning_flag', 'home_score', 'away_score', 'score_player'])
total_goal_type_statics_back = copy.deepcopy(total_goal_type_statics)

total_goal_type_statics.head()

Unnamed: 0,match_id,location,g_fg,g_tg,g_lg,g_cg,l_fg,l_tg,l_lg,l_cg,winning_flag,home_score,away_score,score_player
0,2013-1-001,0,0,2.0,0.0,0.0,1,0.0,1.0,0.0,0,2,2,4
1,2013-1-001,1,1,0.0,1.0,0.0,0,2.0,0.0,0.0,0,2,2,4
2,2013-1-002,0,1,0.0,0.0,0.0,0,1.0,1.0,0.0,-1,2,1,7
3,2013-1-002,1,0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,2,1,7
4,2013-1-003,0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1,0,1,8


### 0. 홈 이점
* Home advantage를 고려하여 홈/어웨이에 따른 승률을 확인
* 해당 승률을 기본 베이스로 각 득점이 승률 변화에 미치는 영향도를 판단

In [7]:
home_base_win_probs = game_records_pd[game_records_pd.winning_team == game_records_pd.home_team_id].__len__() / game_records_pd.__len__()
away_base_win_probs = game_records_pd[game_records_pd.winning_team == game_records_pd.away_team_id].__len__() / game_records_pd.__len__()
base_draw_probs = game_records_pd[game_records_pd.winning_team == 0].__len__() / game_records_pd.__len__()

home_base_win_probs, away_base_win_probs, base_draw_probs

(0.3881086142322097, 0.32724719101123595, 0.2846441947565543)

* 홈인 경우 이길 확률 38.8%, 어웨이인 경우 이길 확률 32.7%

### 1. 선취골
#### 1-1. 득점

In [8]:
g_fg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'g_fg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'g_fg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'g_fg']), columns=['location', 'g_fg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'changed_probs'])
g_fg_statics.columns = ['location', 'g_fg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'changed_probs']
g_fg_statics.probs = g_fg_statics.game_count / g_fg_statics.total_game_count
g_fg_statics.changed_probs = (g_fg_statics.probs - np.where(g_fg_statics.location == 1, home_base_win_probs, away_base_win_probs)) / np.where(g_fg_statics.location == 1, home_base_win_probs, away_base_win_probs)

g_fg_statics[g_fg_statics.winning_flag == 1]
g_fg_statics.sort_values('g_fg')

Unnamed: 0,location,g_fg,winning_flag,game_count,total_game_count,probs,changed_probs
0,0,0,-1,715,1235,0.578947,0.769144
1,0,0,0,396,1235,0.320648,-0.020166
2,0,0,1,124,1235,0.100405,-0.693183
6,1,0,-1,575,1103,0.521306,0.343195
7,1,0,0,414,1103,0.37534,-0.0329
8,1,0,1,114,1103,0.103354,-0.733697
3,0,1,-1,114,901,0.126526,-0.613362
4,0,1,0,212,901,0.235294,-0.28099
5,0,1,1,575,901,0.63818,0.950146
9,1,1,-1,124,1033,0.120039,-0.690708


* Home: 선취골 득점 시 승리 확률 78% 증가
* Away: 선취골 득점 시 승리 확률 95% 증가

#### 1-2. 실점

In [9]:
l_fg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'l_fg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'l_fg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'l_fg']), columns=['location', 'l_fg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'changed_probs'])
l_fg_statics.columns = ['location', 'l_fg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'changed_probs']
l_fg_statics.probs = l_fg_statics.game_count / l_fg_statics.total_game_count
l_fg_statics.changed_probs = (l_fg_statics.probs - np.where(l_fg_statics.location == 1, home_base_win_probs, away_base_win_probs)) / np.where(l_fg_statics.location == 1, home_base_win_probs, away_base_win_probs)

l_fg_statics[l_fg_statics.winning_flag == 1]
l_fg_statics.sort_values('l_fg')

Unnamed: 0,location,l_fg,winning_flag,game_count,total_game_count,probs,changed_probs
0,0,0,-1,114,1103,0.103354,-0.68417
1,0,0,0,414,1103,0.37534,0.146962
2,0,0,1,575,1103,0.521306,0.593002
6,1,0,-1,124,1235,0.100405,-0.741297
7,1,0,0,396,1235,0.320648,-0.173819
8,1,0,1,715,1235,0.578947,0.491715
3,0,1,-1,715,1033,0.692159,1.115095
4,0,1,0,194,1033,0.187803,-0.426114
5,0,1,1,124,1033,0.120039,-0.633186
9,1,1,-1,575,901,0.63818,0.644333


* Home: 선취골 실점 시 승리 확률 67.4% 감소, 패배 확률 64.4% 증가
* Away: 선취골 실점 시 승리 확률 63.3% 감소, 패배 확률 111.5% 증가

#### 1-3. 결과
* 선취골 득점: 기준 기대 승점의 확률 변동폭 만큼 +
* 선취골 실점: 기준 기대 승점의 확률 변동폭 만큼 - 

### 2. 동점골
#### 2-1. 득점

In [10]:
total_goal_type_statics.g_tg = total_goal_type_statics.g_tg.apply(lambda x: 2.0 if x > 2 else x)
total_goal_type_statics.l_tg = total_goal_type_statics.l_tg.apply(lambda x: 2.0 if x > 2 else x)

In [11]:
g_tg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'g_tg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'g_tg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'g_tg']), columns = ['location', 'g_tg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
g_tg_statics.columns = ['location', 'g_tg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
g_tg_statics.probs = g_tg_statics.game_count / g_tg_statics.total_game_count
g_tg_statics.base_probs = g_tg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)

g_tg_statics.base_probs = g_tg_statics.apply(lambda x: g_tg_statics[(g_tg_statics.location == x.location) & (g_tg_statics.winning_flag == x.winning_flag) & (g_tg_statics.g_tg == x.g_tg - 1)].probs.values[0] if x.g_tg > 0 else x.base_probs, axis=1)

g_tg_statics.changed_probs = (g_tg_statics.probs - g_tg_statics.base_probs) / g_tg_statics.base_probs

g_tg_statics[g_tg_statics.g_tg > 0].sort_values(['g_tg', 'winning_flag'])

Unnamed: 0,location,g_tg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,150,471,0.318471,0.414966,-0.232536
12,1,1.0,-1,124,460,0.269565,0.344553,-0.217638
4,0,1.0,0,195,471,0.414013,0.237477,0.743382
13,1,1.0,0,217,460,0.471739,0.22399,1.10607
5,0,1.0,1,126,471,0.267516,0.347557,-0.230297
14,1,1.0,1,119,460,0.258696,0.431457,-0.400413
6,0,2.0,-1,8,48,0.166667,0.318471,-0.476667
15,1,2.0,-1,12,42,0.285714,0.269565,0.059908
7,0,2.0,0,29,48,0.604167,0.414013,0.459295
16,1,2.0,0,25,42,0.595238,0.471739,0.261795


In [12]:
l_tg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'l_tg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'l_tg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'l_tg']), columns = ['location', 'l_tg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
l_tg_statics.columns = ['location', 'l_tg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
l_tg_statics.probs = l_tg_statics.game_count / l_tg_statics.total_game_count
l_tg_statics.base_probs = l_tg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)
l_tg_statics.base_probs = l_tg_statics.apply(lambda x: l_tg_statics[(l_tg_statics.location == x.location) & (l_tg_statics.winning_flag == x.winning_flag) & (l_tg_statics.l_tg == x.l_tg - 1)].probs.values[0] if x.l_tg > 0 else 0, axis=1)

l_tg_statics.changed_probs = (l_tg_statics.probs - l_tg_statics.base_probs) / l_tg_statics.base_probs

l_tg_statics[l_tg_statics.l_tg > 0].sort_values(['l_tg', 'winning_flag'])

Unnamed: 0,location,l_tg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,119,460,0.258696,0.431457,-0.400413
12,1,1.0,-1,126,471,0.267516,0.347557,-0.230297
4,0,1.0,0,217,460,0.471739,0.22399,1.10607
13,1,1.0,0,195,471,0.414013,0.237477,0.743382
5,0,1.0,1,124,460,0.269565,0.344553,-0.217638
14,1,1.0,1,150,471,0.318471,0.414966,-0.232536
6,0,2.0,-1,5,42,0.119048,0.258696,-0.539816
15,1,2.0,-1,11,48,0.229167,0.267516,-0.143353
7,0,2.0,0,25,42,0.595238,0.471739,0.261795
16,1,2.0,0,29,48,0.604167,0.414013,0.459295


### 3. 리드골
#### 3-1. 득점

In [13]:
total_goal_type_statics.g_lg = total_goal_type_statics.g_lg.apply(lambda x: 2 if x > 2 else x)
total_goal_type_statics.l_lg = total_goal_type_statics.l_lg.apply(lambda x: 2 if x > 2 else x)

In [14]:
g_lg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'g_lg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'g_lg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'g_lg']), columns=['location', 'g_lg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
g_lg_statics.columns = ['location', 'g_lg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
g_lg_statics.probs = g_lg_statics.game_count / g_lg_statics.total_game_count
g_lg_statics.base_probs = g_lg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)
g_lg_statics.base_probs = g_lg_statics.apply(lambda x: g_lg_statics[(g_lg_statics.location == x.location) & (g_lg_statics.winning_flag == x.winning_flag) & (g_lg_statics.g_lg == x.g_lg - 1)].probs.values[0] if x.g_lg > 0 else 0, axis=1)

g_lg_statics.changed_probs = (g_lg_statics.probs - g_lg_statics.base_probs) / g_lg_statics.base_probs

g_lg_statics[g_lg_statics.g_lg > 0].sort_values(['g_lg', 'winning_flag'])

Unnamed: 0,location,g_lg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,22,381,0.057743,0.520699,-0.889105
12,1,1.0,-1,27,441,0.061224,0.471751,-0.870219
4,0,1.0,0,69,381,0.181102,0.346054,-0.476665
13,1,1.0,0,78,441,0.176871,0.367938,-0.519292
5,0,1.0,1,290,381,0.761155,0.133247,4.712356
14,1,1.0,1,336,441,0.761905,0.160311,3.752675
6,0,2.0,-1,2,209,0.009569,0.057743,-0.834276
15,1,2.0,-1,4,279,0.014337,0.061224,-0.76583
7,0,2.0,0,4,209,0.019139,0.181102,-0.894321
16,1,2.0,0,9,279,0.032258,0.176871,-0.817618


#### 3-2. 실점

In [15]:
l_lg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'l_lg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'l_lg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'l_lg']), columns=['location', 'l_lg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
l_lg_statics.columns = ['location', 'l_lg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
l_lg_statics.probs = l_lg_statics.game_count / l_lg_statics.total_game_count
l_lg_statics.base_probs = l_lg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)
l_lg_statics.base_probs = l_lg_statics.apply(lambda x: l_lg_statics[(l_lg_statics.location == x.location) & (l_lg_statics.winning_flag == x.winning_flag) & (l_lg_statics.l_lg == x.l_lg - 1)].probs.values[0] if x.l_lg > 0 else 0, axis=1)

l_lg_statics.changed_probs = (l_lg_statics.probs - l_lg_statics.base_probs) / l_lg_statics.base_probs

l_lg_statics[l_lg_statics.l_lg > 0].sort_values(['l_lg', 'winning_flag'])

Unnamed: 0,location,l_lg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,336,441,0.761905,0.160311,3.752675
12,1,1.0,-1,290,381,0.761155,0.133247,4.712356
4,0,1.0,0,78,441,0.176871,0.367938,-0.519292
13,1,1.0,0,69,381,0.181102,0.346054,-0.476665
5,0,1.0,1,27,441,0.061224,0.471751,-0.870219
14,1,1.0,1,22,381,0.057743,0.520699,-0.889105
6,0,2.0,-1,266,279,0.953405,0.761905,0.251344
15,1,2.0,-1,203,209,0.971292,0.761155,0.276077
7,0,2.0,0,9,279,0.032258,0.176871,-0.817618
16,1,2.0,0,4,209,0.019139,0.181102,-0.894321


### 4. 추격골
#### 4-1. 득점

In [16]:
total_goal_type_statics.g_cg = total_goal_type_statics.g_cg.apply(lambda x: 2 if x > 2 else x)
total_goal_type_statics.l_cg = total_goal_type_statics.l_cg.apply(lambda x: 2 if x > 2 else x)

In [17]:
g_cg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'g_cg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'g_cg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'g_cg']), columns=['location', 'g_cg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
g_cg_statics.columns = ['location', 'g_cg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
g_cg_statics.probs = g_cg_statics.game_count / g_cg_statics.total_game_count
g_cg_statics.base_probs = g_cg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)
g_cg_statics.base_probs = g_cg_statics.apply(lambda x: g_cg_statics[(g_cg_statics.location == x.location) & (g_cg_statics.winning_flag == x.winning_flag) & (g_cg_statics.g_cg == x.g_cg - 1)].probs.values[0] if x.g_cg > 0 else 0, axis=1)

g_cg_statics.changed_probs = (g_cg_statics.probs - g_cg_statics.base_probs) / g_cg_statics.base_probs

g_cg_statics[g_cg_statics.g_cg > 0].sort_values(['g_cg', 'winning_flag'])

Unnamed: 0,location,g_cg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,121,166,0.728916,0.351518,1.073621
11,1,1.0,-1,124,154,0.805195,0.284478,1.830425
4,0,1.0,0,34,166,0.204819,0.29439,-0.304259
12,1,1.0,0,21,154,0.136364,0.298728,-0.543519
5,0,1.0,1,11,166,0.066265,0.354092,-0.812859
13,1,1.0,1,9,154,0.058442,0.416794,-0.859783
6,0,2.0,-1,25,27,0.925926,0.728916,0.270279
14,1,2.0,-1,16,17,0.941176,0.805195,0.16888
7,0,2.0,0,2,27,0.074074,0.204819,-0.638344
15,1,2.0,1,1,17,0.058824,0.058442,0.006536


#### 4-2. 실점

In [18]:
l_cg_statics = pd.DataFrame(pd.merge(total_goal_type_statics.groupby(['location', 'l_cg', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), total_goal_type_statics.groupby(['location', 'l_cg']).agg({'match_id': 'count'}).reset_index(), how='left', on=['location', 'l_cg']), columns=['location', 'l_cg', 'winning_flag', 'match_id_x', 'match_id_y', 'probs', 'base_probs', 'changed_probs'])
l_cg_statics.columns = ['location', 'l_cg', 'winning_flag', 'game_count', 'total_game_count', 'probs', 'base_probs', 'changed_probs']
l_cg_statics.probs = l_cg_statics.game_count / l_cg_statics.total_game_count
l_cg_statics.base_probs = l_cg_statics.apply(lambda x: home_base_win_probs if (x.location == 1 and x.winning_flag == 1) or (x.location == 0 and x.winning_flag == -1) == 1 else away_base_win_probs if (x.location == 1 and x.winning_flag == -1) or (x.location == 0 and x.winning_flag == 1) else base_draw_probs, axis=1)
l_cg_statics.base_probs = l_cg_statics.apply(lambda x: l_cg_statics[(l_cg_statics.location == x.location) & (l_cg_statics.winning_flag == x.winning_flag) & (l_cg_statics.l_cg == x.l_cg - 1)].probs.values[0] if x.l_cg > 0 else 0, axis=1)

l_cg_statics.changed_probs = (l_cg_statics.probs - l_cg_statics.base_probs) / l_cg_statics.base_probs

l_cg_statics[l_cg_statics.l_cg > 0].sort_values(['l_cg', 'winning_flag'])

Unnamed: 0,location,l_cg,winning_flag,game_count,total_game_count,probs,base_probs,changed_probs
3,0,1.0,-1,9,154,0.058442,0.416794,-0.859783
11,1,1.0,-1,11,166,0.066265,0.354092,-0.812859
4,0,1.0,0,21,154,0.136364,0.298728,-0.543519
12,1,1.0,0,34,166,0.204819,0.29439,-0.304259
5,0,1.0,1,124,154,0.805195,0.284478,1.830425
13,1,1.0,1,121,166,0.728916,0.351518,1.073621
6,0,2.0,-1,1,17,0.058824,0.058442,0.006536
14,1,2.0,0,2,27,0.074074,0.204819,-0.638344
7,0,2.0,1,16,17,0.941176,0.805195,0.16888
15,1,2.0,1,25,27,0.925926,0.728916,0.270279


### Point

In [19]:
def get_expected_value(goal_record):
        
    if goal_record.g_fg:
        prob_list = g_fg_statics[(g_fg_statics.location == goal_record.location) & (g_fg_statics.g_fg == goal_record.g_fg)]
    elif goal_record.g_tg > 0:
        prob_list = g_tg_statics[(g_tg_statics.location == goal_record.location) & (g_tg_statics.g_tg == goal_record.g_tg if goal_record.g_tg < 3 else g_tg_statics.g_tg == 2)]
    elif goal_record.g_lg > 0:
        prob_list = g_lg_statics[(g_lg_statics.location == goal_record.location) & (g_lg_statics.g_lg == goal_record.g_lg if goal_record.g_lg < 3 else g_lg_statics.g_lg == 2)]
    elif goal_record.g_cg > 0:
        prob_list = g_cg_statics[(g_cg_statics.location == goal_record.location) & (g_cg_statics.g_cg == goal_record.g_cg if goal_record.g_cg < 3 else g_cg_statics.g_cg == 2)]
#         return float('NaN')
    elif goal_record.l_fg:
        prob_list = l_fg_statics[(l_fg_statics.location == goal_record.location) & (l_fg_statics.l_fg == goal_record.l_fg)]
    elif goal_record.l_tg > 0:
        prob_list = l_tg_statics[(l_tg_statics.location == goal_record.location) & (l_tg_statics.l_tg == goal_record.l_tg if goal_record.l_tg < 3 else l_tg_statics.l_tg == 2)]
    elif goal_record.l_lg > 0:
        prob_list = l_lg_statics[(l_lg_statics.location == goal_record.location) & (l_lg_statics.l_lg == goal_record.l_lg if goal_record.l_lg < 3 else l_lg_statics.l_lg == 2)]
    elif goal_record.l_cg > 0:
        prob_list = l_cg_statics[(l_cg_statics.location == goal_record.location) & (l_cg_statics.l_cg == goal_record.l_cg if goal_record.l_cg < 3 else l_cg_statics.l_cg == 2)]
#         return float('NaN')
    elif goal_record.home_score == 0 and goal_record.away_score == 0:
        return (home_base_win_probs * 3 + base_draw_probs) if goal_record.location == 1 else (away_base_win_probs * 3 + base_draw_probs)
    else:
        return float('NaN')
    
    return prob_list.apply(lambda x: 3 * x.probs if x.winning_flag == 1 else x.probs if x.winning_flag == 0 else 0, axis=1).sum()
#     return prob_list.apply(lambda x: 3 * x.probs if x.winning_flag == 1 else 0, axis=1).sum()

In [20]:
# home
home_goal_point_pd = pd.DataFrame(score_line_w_goal_type, columns=['match_id', 'season_year', 'division', 'id', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'location', 'winning_flag', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'expected_value', 'goal_point'])
home_goal_point_pd.location = 1

# away
away_goal_point_pd = pd.DataFrame(score_line_w_goal_type, columns=['match_id', 'season_year', 'division', 'id', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'location', 'winning_flag', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'expected_value', 'goal_point'])
away_goal_point_pd.columns = ['match_id', 'season_year', 'division', 'id', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'location', 'winning_flag', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'expected_value', 'goal_point']
away_goal_point_pd.location = 0
away_goal_point_pd.winning_flag = away_goal_point_pd.winning_flag * -1

goal_point_pd = home_goal_point_pd.append(away_goal_point_pd).sort_values(['match_id', 'location', 'time_range'])
goal_point_pd[['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg']] = goal_point_pd[['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg']] * goal_point_pd.groupby(['match_id', 'location'])['g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg'].cumsum()

goal_point_pd.expected_value = goal_point_pd.apply(get_expected_value, axis=1)
goal_point_pd.expected_value = goal_point_pd.expected_value.fillna(method='ffill')
goal_point_pd.goal_point = goal_point_pd.groupby(['match_id', 'location'])['expected_value'].shift(1).fillna(home_base_win_probs * 3 + base_draw_probs)
goal_point_pd.goal_point = (goal_point_pd.expected_value - goal_point_pd.goal_point) / goal_point_pd.goal_point
goal_point_pd = pd.DataFrame(goal_point_pd, columns=['match_id', 'season_year', 'division', 'id', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'location', 'winning_flag', 'g_fg', 'g_tg', 'g_lg', 'g_cg', 'l_fg', 'l_tg', 'l_lg', 'l_cg', 'expected_value', 'goal_point'])

  .format(op=op_str, alt_op=unsupported[op_str]))


In [21]:
# 과연 추격골을 이대로 둬도 되는가....
# goal_point_pd[goal_point_pd.match_id == '2013-1-007']

In [22]:
total_point_pd = pd.DataFrame(goal_point_pd[(goal_point_pd.goal_point > 0) & (goal_point_pd.location == 1)].groupby(['match_id', 'season_year', 'division']).agg({'home_team_id': 'min', 'goal_point': 'sum'}).append(goal_point_pd[(goal_point_pd.goal_point > 0) & (goal_point_pd.location == 0)].groupby(['match_id', 'season_year', 'division']).agg({'away_team_id': 'min', 'goal_point': 'sum'})).reset_index().fillna(1), columns=['match_id', 'season_year', 'division', 'team_id', 'goal_point', 'home_team_id', 'away_team_id'])
total_point_pd.team_id = total_point_pd.home_team_id * total_point_pd.away_team_id
total_point_pd = total_point_pd.groupby(['season_year', 'division', 'team_id']).agg({'home_team_id': 'count', 'goal_point': 'sum'}).reset_index()
total_point_pd = pd.DataFrame(total_point_pd, columns=['season_year', 'division', 'team_id', 'home_team_id', 'goal_point', 'avg_point', 'rank'])
total_point_pd.columns=['season_year', 'division', 'team_id', 'game_count', 'goal_point', 'avg_point', 'rank']
total_point_pd.head()

Unnamed: 0,season_year,division,team_id,game_count,goal_point,avg_point,rank
0,2013,1,1.0,22,29.843673,,
1,2013,1,2.0,26,39.454568,,
2,2013,1,5.0,22,33.643273,,
3,2013,1,6.0,28,40.040445,,
4,2013,1,7.0,24,27.805219,,


In [23]:
total_point_pd.avg_point = total_point_pd.goal_point / total_point_pd.game_count
total_point_pd[['rank']] = total_point_pd.groupby(['season_year', 'division']).avg_point.rank(method='min', ascending=True)
# total_point_pd[(total_point_pd.season_year == 2017) & (total_point_pd.division == 1)].sort_values('rank')

In [24]:
total_point_pd = pd.DataFrame(goal_point_pd[(goal_point_pd.goal_point > 0) & (goal_point_pd.location == 1)].groupby(['match_id', 'season_year', 'division']).agg({'home_team_id': 'min', 'goal_point': 'sum'}).append(goal_point_pd[(goal_point_pd.goal_point > 0) & (goal_point_pd.location == 0)].groupby(['match_id', 'season_year', 'division']).agg({'away_team_id': 'min', 'goal_point': 'sum'})).reset_index().fillna(1), columns=['match_id', 'season_year', 'division', 'team_id', 'goal_point', 'home_team_id', 'away_team_id'])
total_point_pd.team_id = total_point_pd.home_team_id * total_point_pd.away_team_id
total_point_pd = total_point_pd.groupby(['season_year', 'division', 'team_id']).agg({'home_team_id': 'count', 'goal_point': 'sum'}).reset_index()
total_point_pd = pd.DataFrame(total_point_pd, columns=['season_year', 'division', 'team_id', 'home_team_id', 'goal_point', 'avg_point', 'rank'])
total_point_pd.columns=['season_year', 'division', 'team_id', 'game_count', 'goal_point', 'avg_point', 'rank']
total_point_pd.head()

Unnamed: 0,season_year,division,team_id,game_count,goal_point,avg_point,rank
0,2013,1,1.0,22,29.843673,,
1,2013,1,2.0,26,39.454568,,
2,2013,1,5.0,22,33.643273,,
3,2013,1,6.0,28,40.040445,,
4,2013,1,7.0,24,27.805219,,


In [None]:
sql = """SELECT * FROM season_ranking"""
season_ranking = db_conn.select_query(sql)

season_ranking_pd = pd.DataFrame(season_ranking, columns=['year', 'division', 'rank', 'team_id', 'points', 'scored_goal_count', 'losed_goal_count', 'match_count', 'win_count', 'draw_count', 'lose_count'])
season_ranking_pd.columns=['year', 'division', 'ranking', 'team_id', 'points', 'scored_goal_count', 'losed_goal_count', 'match_count', 'win_count', 'draw_count', 'lose_count']
season_ranking_pd[(season_ranking_pd.year == 2017) & (season_ranking_pd.division == 1)].sort_values('points', ascending=False)

In [26]:
total_point_pd.avg_point = total_point_pd.goal_point / total_point_pd.game_count
total_point_pd[['rank']] = total_point_pd.groupby(['season_year', 'division']).avg_point.rank(method='min', ascending=False)
total_point_pd[(total_point_pd.season_year == 2017) & (total_point_pd.division == 1)].sort_values('rank')

temp = pd.DataFrame(total_point_pd, columns=['season_year', 'division', 'team_id', 'game_count', 'goal_point', 'avg_point', 'rank', 'real_point', 'real_rank', 'diff_rank', 'scored_goal_count', 'losed_goal_count', 'win_count', 'draw_count', 'lose_count'])
temp = temp[temp.division == 1]

temp.real_point = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].points.values[0], axis = 1)
temp.real_rank = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].ranking.values[0], axis = 1)

temp.scored_goal_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].scored_goal_count.values[0], axis = 1)
temp.losed_goal_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].losed_goal_count.values[0], axis = 1)
temp.win_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].win_count.values[0], axis = 1)
temp.draw_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].draw_count.values[0], axis = 1)
temp.lose_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.season_year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].lose_count.values[0], axis = 1)

temp.diff_rank = temp.real_rank - temp['rank']

temp[temp.season_year == 2017].sort_values('rank')[['team_id', 'game_count', 'rank', 'real_rank', 'diff_rank', 'avg_point', 'real_point', 'scored_goal_count', 'losed_goal_count', 'win_count', 'draw_count', 'lose_count']]

Unnamed: 0,team_id,game_count,rank,real_rank,diff_rank,avg_point,real_point,scored_goal_count,losed_goal_count,win_count,draw_count,lose_count
99,5.0,29,1.0,8,7.0,1.979486,47,50,52,11,14,13
104,20.0,23,2.0,9,7.0,1.961434,39,32,53,7,18,13
97,1.0,31,3.0,6,3.0,1.81519,49,59,65,13,10,15
108,25.0,28,4.0,7,3.0,1.720031,52,64,60,15,7,16
101,10.0,29,5.0,5,0.0,1.668918,61,56,42,16,13,9
100,9.0,24,6.0,11,5.0,1.646956,35,41,66,8,11,19
98,4.0,22,7.0,12,5.0,1.421464,30,33,61,6,12,20
102,13.0,30,8.0,3,-5.0,1.411892,64,63,41,17,13,8
105,21.0,27,9.0,10,1.0,1.251084,35,53,69,8,11,19
107,23.0,29,10.0,2,-8.0,1.203536,66,60,37,19,9,10


In [27]:
temp[['avg_point', 'real_point', 'scored_goal_count', 'losed_goal_count', 'win_count', 'draw_count', 'lose_count']].corr()

Unnamed: 0,avg_point,real_point,scored_goal_count,losed_goal_count,win_count,draw_count,lose_count
avg_point,1.0,-0.210284,0.143936,0.502663,-0.161431,-0.177656,0.280864
real_point,-0.210284,1.0,0.763135,-0.740571,0.985632,-0.2744,-0.911542
scored_goal_count,0.143936,0.763135,1.0,-0.227521,0.798888,-0.403556,-0.631769
losed_goal_count,0.502663,-0.740571,-0.227521,1.0,-0.682819,-0.050322,0.773591
win_count,-0.161431,0.985632,0.798888,-0.682819,1.0,-0.416297,-0.843326
draw_count,-0.177656,-0.2744,-0.403556,-0.050322,-0.416297,1.0,-0.137549
lose_count,0.280864,-0.911542,-0.631769,0.773591,-0.843326,-0.137549,1.0


In [32]:
total_point_pd.head()
total_point_pd.to_pickle('./case_2_result.pkl')