# 리드골 및 추격골과 경기 결과간의 상관관계

## 기본 컨셉
* 리드골 및 추격골의 여부/횟수가 경기 결과에 영향을 줄 것이다

## 사용 데이터
* 2013~2017년 득점 기록

In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import statsmodels.formula.api as sm
import scipy.stats as st

In [2]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
plt.style.use('dark_background')

##### Game Records

In [3]:
sql = """SELECT * FROM game_records"""
game_records = db_conn.select_query(sql)
game_records_pd = pd.DataFrame(game_records, columns=['game_id', 'home_team_id', 'away_team_id', 'home_team_score', 'away_team_score', 'winning_team'])
game_records_pd.head()

Unnamed: 0,game_id,home_team_id,away_team_id,home_team_score,away_team_score,winning_team
0,2013-1-001,10,25,2,2,0
1,2013-1-002,19,5,2,1,19
2,2013-1-003,21,23,0,1,23
3,2013-1-004,12,13,1,2,13
4,2013-1-005,20,2,0,0,0


##### Goal Records

In [4]:
sql = """SELECT * FROM goal_records_rev"""
goal_records = db_conn.select_query(sql)
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'own_goal'])
goal_records_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,own_goal
0,2013-1-001,2,10,25,N
1,2013-1-001,3,25,10,N
2,2013-1-001,4,10,25,N
3,2013-1-001,5,25,10,N
4,2013-1-002,6,5,19,N


In [5]:
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'half_type', 'play_time'])
goal_trace_pd = pd.merge(goal_records_pd, game_records_pd[['game_id', 'home_team_id', 'away_team_id', 'winning_team']], how='left', left_on='match_id', right_on='game_id')
goal_trace_pd = pd.DataFrame(goal_trace_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range'])
goal_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,,,,,1,29,
1,2013-1-001,3,25,10,10,25,0,,,,,1,32,
2,2013-1-001,4,10,25,10,25,0,,,,,2,2,
3,2013-1-001,5,25,10,10,25,0,,,,,2,38,
4,2013-1-002,6,5,19,19,5,19,,,,,1,4,


## 데이터 전처리

##### Home Team 기준 Goal Records 전처리

In [6]:
home_trace_pd = copy.deepcopy(goal_trace_pd)
home_trace_pd.home_team_score = np.where(home_trace_pd.home_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.away_team_score = np.where(home_trace_pd.away_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.home_team_score = home_trace_pd.groupby(['match_id'])['home_team_score'].cumsum()
home_trace_pd.away_team_score = home_trace_pd.groupby(['match_id'])['away_team_score'].cumsum()
home_trace_pd.score_line = home_trace_pd.home_team_score.map(str) + ':' + home_trace_pd.away_team_score.map(str)
home_trace_pd.time_range = (home_trace_pd.play_time / 15).astype(int) + 1
home_trace_pd.time_range = np.where(home_trace_pd.time_range > 3, 3, home_trace_pd.time_range)
home_trace_pd.time_range = np.where(home_trace_pd.half_type == 2, home_trace_pd.time_range + 3, home_trace_pd.time_range)
home_trace_pd.location = np.where(home_trace_pd.home_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,1,0,1,1:0,1,29,2
1,2013-1-001,3,25,10,10,25,0,1,1,0,1:1,1,32,3
2,2013-1-001,4,10,25,10,25,0,2,1,1,2:1,2,2,4
3,2013-1-001,5,25,10,10,25,0,2,2,0,2:2,2,38,6
4,2013-1-002,6,5,19,19,5,19,0,1,0,0:1,1,4,1


##### 골 유형 구분
* 선제골(First Goal): 점수가 1:0 / 0:1 인 경우
* 동점골(Tying Goal): 양 팀의 점수가 동일한 경우
* 리드골(Lead Goal): 득점한 팀의 점수가 상대적으로 높은 경우
* 추격골(Chase Goal): 득점한 팀의 점수가 상대적으로 낮은 경우

In [7]:
goal_type_pd = pd.DataFrame(home_trace_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'winning_flag', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal'])
goal_type_pd.winning_flag = np.where(goal_type_pd.winning_team == 0, 0, np.where(goal_type_pd.winning_team == goal_type_pd.home_team_id, 1, -1))
goal_type_pd.first_goal = (goal_type_pd.home_team_score + goal_type_pd.away_team_score == 1)
goal_type_pd.tying_goal = (goal_type_pd.home_team_score == goal_type_pd.away_team_score)
goal_type_pd.lead_goal = np.where(goal_type_pd.location == 1, (goal_type_pd.home_team_score > goal_type_pd.away_team_score), (goal_type_pd.home_team_score < goal_type_pd.away_team_score))
goal_type_pd.lead_goal = np.where(goal_type_pd.lead_goal & goal_type_pd.first_goal, False, goal_type_pd.lead_goal)
goal_type_pd.chase_goal = (goal_type_pd.first_goal == False) & (goal_type_pd.tying_goal == False) & (goal_type_pd.lead_goal == False)
goal_type_pd.winning_flag = np.where(goal_type_pd.location == 0, goal_type_pd.winning_flag * -1, goal_type_pd.winning_flag)
t = goal_type_pd[['match_id', 'id', 'winning_flag', 'location', 'score_line', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal']]
t.head()

Unnamed: 0,match_id,id,winning_flag,location,score_line,time_range,first_goal,tying_goal,lead_goal,chase_goal
0,2013-1-001,2,0,1,1:0,2,True,False,False,False
1,2013-1-001,3,0,0,1:1,3,False,True,False,False
2,2013-1-001,4,0,1,2:1,4,False,False,True,False
3,2013-1-001,5,0,0,2:2,6,False,True,False,False
4,2013-1-002,6,-1,0,0:1,1,True,False,False,False


##### Home/Away Lead Goal, Chase Goal 통합 (횟수)

In [8]:
t_home = t[t.location == 1].groupby(['match_id']).agg({'winning_flag' : 'min', 'lead_goal' : 'sum', 'chase_goal' : 'sum', 'location' : 'min'})
t_away = t[t.location == 0].groupby(['match_id']).agg({'winning_flag' : 'min', 'lead_goal' : 'sum', 'chase_goal' : 'sum', 'location' : 'min'})
t_home = t_home.reset_index()
t_away = t_away.reset_index()
t_away.winning_flag = t_away.winning_flag * -1
t_total = t_home.append(t_away, ignore_index=True)
t_total = t_total.sort_values(by=['match_id'])

match_pd = pd.DataFrame(game_records_pd[game_records_pd.home_team_score + game_records_pd.away_team_score != 0], columns=['game_id', 'location', 'winning_team', 'home_team_id', 'winning_flag'])

match_pd = pd.concat([match_pd, match_pd])
match_pd.iloc[0:1934, 1] = 1
match_pd.iloc[1934:, 1] = 0
match_pd.winning_flag = np.where(match_pd.winning_team == 0, 0, np.where(match_pd.winning_team == match_pd.home_team_id, 1, -1))
match_pd.winning_flag = np.where(match_pd.location == 0, match_pd.winning_flag * -1, match_pd.winning_flag)
match_pd = match_pd.sort_values(by='game_id')

t_total = pd.merge(match_pd, t_total, how='left', left_on=['game_id', 'location'], right_on=['match_id', 'location'])
t_total = t_total.drop(['match_id', 'winning_team', 'home_team_id', 'winning_flag_y'], axis=1)
t_total = t_total.fillna(0)
t_total.columns = ['match_id', 'location', 'winning_flag', 'lead_goal', 'chase_goal']
t_total.head()

Unnamed: 0,match_id,location,winning_flag,lead_goal,chase_goal
0,2013-1-001,1.0,0,1.0,0.0
1,2013-1-001,0.0,0,0.0,0.0
2,2013-1-002,1.0,1,1.0,0.0
3,2013-1-002,0.0,-1,0.0,0.0
4,2013-1-003,1.0,-1,0.0,0.0


## 통계치 분석

### 리드골 횟수와 경기 결과 Cross Table

In [9]:
lead_pd = pd.DataFrame(t_total.groupby(['winning_flag', 'lead_goal'])['lead_goal'].count())
lead_pd.columns = ['values']
lead_pd = lead_pd.reset_index()
lead_pd = lead_pd.pivot(index='lead_goal', columns='winning_flag', values='values')
lead_pd = lead_pd.fillna(0)
lead_pd

winning_flag,-1,0,1
lead_goal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1473.0,652.0,433.0
1.0,49.0,147.0,626.0
2.0,6.0,12.0,316.0
3.0,0.0,1.0,118.0
4.0,0.0,0.0,26.0
5.0,0.0,0.0,6.0
6.0,0.0,0.0,1.0
7.0,0.0,0.0,2.0


#### 카이제곱 검정

* 리드골 횟수 4회 이상 및 패배의 기대빈도수가 5이하(카이제곱 검정의 신뢰성 하락 이슈)
* 리드골 횟수를 0, 1, 2, 3회이상으로 경기결과를 승리여부로 변경하여 검정 진행

In [10]:
lead_prob_pd = pd.DataFrame(lead_pd, columns = [-1, 0, 1, 'total', 'win_prob', 'no_win_prob'], index = [0,1,2,3,4,5,6,7, 'total'])
lead_prob_pd.columns = ['lose', 'draw', 'win', 'total', 'win_prob', 'no_win_prob']
lead_prob_pd.loc['total'] = pd.Series([sum(lead_prob_pd[0:8].win), sum(lead_prob_pd[0:8].draw), sum(lead_prob_pd[0:8].lose), sum(lead_prob_pd[0:8].total)], index = ['win', 'draw', 'lose', 'total'])
lead_prob_pd.loc[3] = pd.Series([sum(lead_prob_pd[3:7].win), sum(lead_prob_pd[3:7].draw), sum(lead_prob_pd[3:7].lose), sum(lead_prob_pd[3:7].total)], index = ['win', 'draw', 'lose', 'total'])
lead_prob_pd.total = lead_prob_pd.win + lead_prob_pd.draw + lead_prob_pd.lose
lead_prob_pd = lead_prob_pd.drop([4,5,6,7])
lead_prob_pd.index = ['0', '1', '2', '3 over', 'total']
lead_prob_pd.lose = lead_prob_pd.lose + lead_prob_pd.draw
lead_prob_pd = lead_prob_pd.drop(['draw'], axis=1)
lead_prob_pd.columns = ['no_win', 'win', 'total', 'win_prob', 'no_win_prob']
lead_prob_pd.win_prob = lead_prob_pd.win / lead_prob_pd.total
lead_prob_pd.no_win_prob = lead_prob_pd.no_win / lead_prob_pd.total

lead_prob_pd

Unnamed: 0,no_win,win,total,win_prob,no_win_prob
0,2125.0,433.0,2558.0,0.169273,0.830727
1,196.0,626.0,822.0,0.761557,0.238443
2,18.0,316.0,334.0,0.946108,0.053892
3 over,1.0,151.0,152.0,0.993421,0.006579
total,2340.0,1528.0,3868.0,0.395036,0.604964


* 리드골 0회
    * 0:0으로 경기가 끝나거나(draw), 상대방에게 계속 뒤쳐진 상황으로 경기 종료(lose)
    * 선제골만으로 경기 승리(1:0 win)

In [11]:
print(lead_prob_pd.iloc[0:4][['no_win', 'win']].values)
result = st.chi2_contingency(lead_prob_pd.iloc[0:4][['no_win', 'win']].values)

result

[[2.125e+03 4.330e+02]
 [1.960e+02 6.260e+02]
 [1.800e+01 3.160e+02]
 [1.000e+00 1.510e+02]]


(1660.2387783590818, 0.0, 3, array([[1548.29798241, 1009.70201759],
        [ 497.53750647,  324.46249353],
        [ 202.1624418 ,  131.8375582 ],
        [  92.00206932,   59.99793068]]))

* $x^2$ 1660.24 p-value 0.001 미만, 자유도 3
* 리드골 여부에 따른 경기결과의 차이 존재
* 리드골 득점 시 승리 비율이 16% -> 76% -> 94% -> 99%로 상승

#### 리드골로 인해 발생한 득실차이와 경기 결과

In [12]:
point_total = pd.DataFrame(t, columns = ['match_id', 'winning_flag', 'location', 'score_line', 'lead_goal', 'lead_point'])
point_total.lead_point = point_total.score_line.apply(lambda x: int(x.split(':')[0]) - int(x.split(':')[1]))
point_total.lead_point = np.where(point_total.lead_goal, np.where(point_total.location == 1, point_total.lead_point, point_total.lead_point * -1), 0)
point_total[point_total.lead_goal].head()


Unnamed: 0,match_id,winning_flag,location,score_line,lead_goal,lead_point
2,2013-1-001,0,1,2:1,True,1
6,2013-1-002,1,1,2:1,True,1
10,2013-1-004,1,0,1:2,True,1
12,2013-1-006,0,1,2:0,True,2
16,2013-1-007,1,0,0:2,True,2


In [13]:
lead_point_pd = point_total.groupby(['match_id', 'location']).agg({'winning_flag': 'min', 'lead_goal': 'sum', 'lead_point': 'max'})
lead_point_pd = lead_point_pd.reset_index()

match_pd = pd.DataFrame(game_records_pd[game_records_pd.home_team_score + game_records_pd.away_team_score != 0], columns=['game_id', 'location', 'winning_team', 'home_team_id', 'winning_flag'])

match_pd = pd.concat([match_pd, match_pd])
match_pd.iloc[0:1934, 1] = 1
match_pd.iloc[1934:, 1] = 0
match_pd.winning_flag = np.where(match_pd.winning_team == 0, 0, np.where(match_pd.winning_team == match_pd.home_team_id, 1, -1))
match_pd.winning_flag = np.where(match_pd.location == 0, match_pd.winning_flag * -1, match_pd.winning_flag)
match_pd = match_pd.sort_values(by='game_id')

lead_point_pd = pd.merge(match_pd, lead_point_pd, how='left', left_on=['game_id', 'location'], right_on=['match_id', 'location'])
lead_point_pd = lead_point_pd.drop(['match_id', 'winning_team', 'home_team_id', 'winning_flag_y'], axis=1)
lead_point_pd = lead_point_pd.fillna(0)
lead_point_pd.columns = ['match_id', 'location', 'winning_flag', 'lead_goal', 'lead_point']

lead_point_pd = pd.DataFrame(lead_point_pd.groupby(['winning_flag', 'lead_point'])['lead_point'].count())
lead_point_pd.columns = ['values']
lead_point_pd = lead_point_pd.reset_index()
lead_point_pd = lead_point_pd.pivot(index='lead_point', columns='winning_flag', values='values')
lead_point_pd = lead_point_pd.fillna(0)

lead_point_pd

winning_flag,-1,0,1
lead_point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1473.0,652.0,433.0
1.0,34.0,103.0,348.0
2.0,21.0,56.0,489.0
3.0,0.0,1.0,180.0
4.0,0.0,0.0,58.0
5.0,0.0,0.0,14.0
6.0,0.0,0.0,3.0
7.0,0.0,0.0,2.0
8.0,0.0,0.0,1.0


In [14]:
lead_point_prob_pd = pd.DataFrame(lead_point_pd, columns = [-1, 0, 1, 'total', 'win_prob', 'draw_prob', 'lose_prob'], index = [0,1,2,3,4,5,6,7,8, 'total'])
lead_point_prob_pd.columns = ['lose', 'draw', 'win', 'total',  'win_prob', 'draw_prob', 'lose_prob']
lead_point_prob_pd.loc['total'] = pd.Series([sum(lead_point_prob_pd[0:9].win), sum(lead_point_prob_pd[0:9].draw), sum(lead_point_prob_pd[0:9].lose), sum(lead_point_prob_pd[0:9].total)], index = ['win', 'draw', 'lose', 'total'])
lead_point_prob_pd.loc[3] = pd.Series([sum(lead_point_prob_pd[3:9].win), sum(lead_point_prob_pd[3:9].draw), sum(lead_point_prob_pd[3:9].lose), sum(lead_point_prob_pd[3:9].total)], index = ['win', 'draw', 'lose', 'total'])
lead_point_prob_pd.total = lead_point_prob_pd.win + lead_point_prob_pd.draw + lead_point_prob_pd.lose
lead_point_prob_pd = lead_point_prob_pd.drop([4,5,6,7,8])
lead_point_prob_pd.index = ['0', '1', '2', '3 over', 'total']
lead_point_prob_pd.win_prob = lead_point_prob_pd.win / lead_point_prob_pd.total
lead_point_prob_pd.draw_prob = lead_point_prob_pd.draw / lead_point_prob_pd.total
lead_point_prob_pd.lose_prob = lead_point_prob_pd.lose / lead_point_prob_pd.total
lead_point_prob_pd = lead_point_prob_pd[['win', 'draw', 'lose', 'total',  'win_prob', 'draw_prob', 'lose_prob']]

lead_point_prob_pd

Unnamed: 0,win,draw,lose,total,win_prob,draw_prob,lose_prob
0,433.0,652.0,1473.0,2558.0,0.169273,0.254887,0.575841
1,348.0,103.0,34.0,485.0,0.717526,0.212371,0.070103
2,489.0,56.0,21.0,566.0,0.863958,0.09894,0.037102
3 over,258.0,1.0,0.0,259.0,0.996139,0.003861,0.0
total,1528.0,812.0,1528.0,3868.0,0.395036,0.209928,0.395036


In [15]:
print(lead_point_prob_pd.iloc[0:4][['win', 'draw', 'lose']].values)
result = st.chi2_contingency(lead_point_prob_pd.iloc[0:4][['win', 'draw', 'lose']].values)

result

[[4.330e+02 6.520e+02 1.473e+03]
 [3.480e+02 1.030e+02 3.400e+01]
 [4.890e+02 5.600e+01 2.100e+01]
 [2.580e+02 1.000e+00 0.000e+00]]


(1747.1040545354579,
 0.0,
 6,
 array([[1010.50258532,  536.99482937, 1010.50258532],
        [ 191.59255429,  101.81489142,  191.59255429],
        [ 223.59048604,  118.81902792,  223.59048604],
        [ 102.31437435,   54.37125129,  102.31437435]]))

In [16]:
st.chi2_contingency(lead_point_prob_pd.iloc[0:4][['win','lose']].values)

(1513.034679327855, 0.0, 3, array([[953., 953.],
        [191., 191.],
        [255., 255.],
        [129., 129.]]))

In [17]:
st.chi2_contingency(lead_point_prob_pd.iloc[0:4][['win', 'draw']].values)

(614.8066102267474,
 6.215196625809069e-133,
 3,
 array([[708.4957265 , 376.5042735 ],
        [294.4991453 , 156.5008547 ],
        [355.88034188, 189.11965812],
        [169.12478632,  89.87521368]]))

In [18]:
st.chi2_contingency(lead_point_prob_pd.iloc[0:4][['draw', 'lose']].values)

(165.24397603038648,
 1.3531132627809514e-35,
 3,
 array([[7.37393162e+02, 1.38760684e+03],
        [4.75401709e+01, 8.94598291e+01],
        [2.67196581e+01, 5.02803419e+01],
        [3.47008547e-01, 6.52991453e-01]]))

* $x^2$ 1747.10, p-value 0.001 미만, 자유도 6
* 리드골로 인한 최대 점수 차이에 따른 경기결과의 차이 존재
* 리드골 득점 시 승리 비율이 16% -> 71% -> 86% -> 99%로 상승

### 추격골 횟수와 경기 결과 Cross Table

In [19]:
chase_pd = pd.DataFrame(t_total.groupby(['winning_flag', 'chase_goal'])['chase_goal'].count())
chase_pd.columns = ['values']
chase_pd = chase_pd.reset_index()
chase_pd = chase_pd.pivot(index='chase_goal', columns='winning_flag', values='values')
chase_pd = chase_pd.fillna(0)
chase_pd

winning_flag,-1,0,1
chase_goal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1242.0,755.0,1507.0
1.0,245.0,55.0,20.0
2.0,37.0,2.0,1.0
3.0,4.0,0.0,0.0


In [20]:
chase_prob_pd = pd.DataFrame(chase_pd, columns = [-1, 0, 1, 'total', 'win_prob', 'no_win_prob'], index = [0,1,2,3, 'total'])
chase_prob_pd.columns = ['lose', 'draw', 'win', 'total', 'win_prob', 'no_win_prob']
chase_prob_pd.loc['total'] = pd.Series([sum(chase_prob_pd[0:4].win), sum(chase_prob_pd[0:4].draw), sum(chase_prob_pd[0:4].lose), sum(chase_prob_pd[0:4].total)], index = ['win', 'draw', 'lose', 'totla'])
chase_prob_pd.loc[2] = pd.Series([sum(chase_prob_pd[2:4].win), sum(chase_prob_pd[2:4].draw), sum(chase_prob_pd[2:4].lose), sum(chase_prob_pd[2:4].total)], index = ['win', 'draw', 'lose', 'totla'])
chase_prob_pd.total = chase_prob_pd.win + chase_prob_pd.draw + chase_prob_pd.lose
chase_prob_pd = chase_prob_pd.drop([3])
chase_prob_pd.index = ['0', '1', '2 over', 'total']
chase_prob_pd.win = chase_prob_pd.win + chase_prob_pd.draw
chase_prob_pd = chase_prob_pd.drop(['draw'], axis=1)
chase_prob_pd.columns = ['lose', 'no_lose', 'total', 'lose_prob', 'no_lose_prob']
chase_prob_pd.lose_prob = chase_prob_pd.lose / chase_prob_pd.total
chase_prob_pd.no_lose_prob = chase_prob_pd.no_lose / chase_prob_pd.total

chase_prob_pd

Unnamed: 0,lose,no_lose,total,lose_prob,no_lose_prob
0,1242.0,2262.0,3504.0,0.354452,0.645548
1,245.0,75.0,320.0,0.765625,0.234375
2 over,41.0,3.0,44.0,0.931818,0.068182
total,1528.0,2340.0,3868.0,0.395036,0.604964


In [21]:
print(chase_prob_pd.iloc[0:3][['no_lose', 'lose']].values)
result = st.chi2_contingency(chase_prob_pd.iloc[0:3][['no_lose', 'lose']].values)

result

[[2262. 1242.]
 [  75.  245.]
 [   3.   41.]]


(261.0935589794434,
 2.0149025076721943e-57,
 2,
 array([[2119.79317477, 1384.20682523],
        [ 193.58841779,  126.41158221],
        [  26.61840745,   17.38159255]]))

* $x^2$ 261.09, p-value 0.001 미만, 자유도 2
* 추격골에 따른 경기결과의 차이 존재
* 추격골 득점 시 패배 확률이 35% -> 76% -> 93%로 증가

In [22]:
chase_prob_pd = pd.DataFrame(chase_pd, columns = [-1, 0, 1, 'total', 'win_prob', 'draw_prob', 'lose_prob'], index = [0,1,2,3, 'total'])
chase_prob_pd.columns = ['lose', 'draw', 'win', 'total', 'win_prob', 'draw_prob', 'lose_prob']
chase_prob_pd.loc['total'] = pd.Series([sum(chase_prob_pd[0:4].win), sum(chase_prob_pd[0:4].draw), sum(chase_prob_pd[0:4].lose), sum(chase_prob_pd[0:4].total)], index = ['win', 'draw', 'lose', 'total'])
chase_prob_pd.loc[2] = pd.Series([sum(chase_prob_pd[2:4].win), sum(chase_prob_pd[2:4].draw), sum(chase_prob_pd[2:4].lose), sum(chase_prob_pd[2:4].total)], index = ['win', 'draw', 'lose', 'total'])
chase_prob_pd.total = chase_prob_pd.win + chase_prob_pd.draw + chase_prob_pd.lose
chase_prob_pd = chase_prob_pd.drop([3])
chase_prob_pd.index = ['0', '1', '2 over', 'total']
chase_prob_pd.lose_prob = chase_prob_pd.lose / chase_prob_pd.total
chase_prob_pd.win_prob = chase_prob_pd.win / chase_prob_pd.total
chase_prob_pd.draw_prob = chase_prob_pd.draw / chase_prob_pd.total

chase_prob_pd

Unnamed: 0,lose,draw,win,total,win_prob,draw_prob,lose_prob
0,1242.0,755.0,1507.0,3504.0,0.43008,0.215468,0.354452
1,245.0,55.0,20.0,320.0,0.0625,0.171875,0.765625
2 over,41.0,2.0,1.0,44.0,0.022727,0.045455,0.931818
total,1528.0,812.0,1528.0,3868.0,0.395036,0.209928,0.395036


In [23]:
print(chase_prob_pd.iloc[0:3][['win', 'draw', 'lose']].values)
result = st.chi2_contingency(chase_prob_pd.iloc[0:3][['win', 'draw', 'lose']].values)

result

[[1.507e+03 7.550e+02 1.242e+03]
 [2.000e+01 5.500e+01 2.450e+02]
 [1.000e+00 2.000e+00 4.100e+01]]


(282.2495961414189,
 7.293715159033215e-60,
 4,
 array([[1384.20682523,  735.58634953, 1384.20682523],
        [ 126.41158221,   67.17683557,  126.41158221],
        [  17.38159255,    9.23681489,   17.38159255]]))

* $x^2$ 282.25, p-value 0.001 미만, 자유도 4
* 추격골에 따른 경기결과의 차이 존재
* 추격골 득점 시 패배 확률이 35% -> 76% -> 93%로 증가
* 추격골 득점 시 승리 확률이 43% -> 6% -> 2%로 급감

## 리드골과 추격골에 대한 분석 결과
* 리드골이 발생한다는 것은 이기고 있다는 것, 추격골이 발생한다는 것은 지고 있다는 것을 의미
* 해당 골이 발생하는 상황적 의미가 강력하게 작용
* 즉, 리드골/추격골이 발생하는 상황을 뒤집을 가능성이 낮음

## 참고자료

In [None]:
# fig = plt.figure()
# # plt.scatter(t2.lead_goal, t2.winning_flag)
# ax = Axes3D(fig)
# ax.scatter(t2.lead_goal, t2.chase_goal, t2.winning_flag)

* Clustering 시도

In [None]:
# goal_data_pd = pd.DataFrame(db_conn.select_query("""SELECT * FROM goal_records_rev"""), columns=['match_id', 'id', 'half_type', 'play_time', 'goal_type', 'score_type', 'score_team_id'])
# goal_data_pd = pd.merge(goal_data_pd, game_records_pd, how='left', left_on='match_id', right_on='game_id')
# goal_data_pd = goal_data_pd[['match_id', 'id', 'half_type', 'play_time', 'goal_type', 'score_type', 'score_team_id', 'home_team_id', 'winning_team']]
# goal_data_pd = pd.DataFrame(goal_data_pd, columns=['id', 'half_type', 'play_time', 'time', 'goal_type', 'score_type', 'score_team_id', 'home_team_id', 'gain_loss', 'winning_team', 'winning_flag'])

In [None]:
# goal_data_pd.time = (goal_data_pd.half_type - 1) * 45 + goal_data_pd.play_time
# goal_data_pd.gain_loss = np.where(goal_data_pd.score_team_id == goal_data_pd.home_team_id, 1, 0)
# goal_data_pd.winning_flag = np.where(goal_data_pd.home_team_id == goal_data_pd.winning_team, 1, 0)
# goal_data_pd = goal_data_pd.drop(['half_type', 'play_time', 'score_team_id', 'home_team_id'], axis=1)
# goal_data_pd.head()

In [None]:
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(goal_data_pd.goal_type, goal_data_pd.score_type, goal_data_pd.time)

In [None]:
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(goal_data_pd.score_type, goal_data_pd.goal_type, goal_data_pd.time)

In [None]:
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(goal_data_pd.time, goal_data_pd.goal_type, goal_data_pd.score_type)

### 로지스틱 회귀분석

In [136]:
logit_data = pd.DataFrame(point_total, columns=['match_id', 'location', 'lead_point', 'winning_flag'])
logit_data.match_id = logit_data.match_id.str.split('-').str.get(0)
logit_data.match_id = logit_data.match_id.apply(pd.to_numeric)
logit_data.columns = ['year', 'location', 'lead_point', 'victory']
logit_data.victory = np.where(logit_data.victory == 1, 1, 0)

train_set = logit_data[logit_data.year < 2017]
test_set = logit_data[logit_data.year == 2017]

logit_data = logit_data[logit_data.victory == 0].append(logit_data[logit_data.victory == 1].sample(len(logit_data[logit_data.victory == 0])))

logit_data.head()

Unnamed: 0,year,location,lead_point,victory
0,2013,1,0,0
1,2013,0,0,0
2,2013,1,1,0
3,2013,0,0,0
4,2013,0,0,0


In [137]:
from patsy import dmatrices
from sklearn.cross_validation import train_test_split

# 2017년을 test set으로 하는 sampling
# y_train, x_train = dmatrices('victory ~ lead_point', train_set, return_type='dataframe')
# y_test, x_test = dmatrices('victory ~ lead_point', test_set, return_type='dataframe')
# y_train = np.ravel(y_train)
# y_test = np.ravel(y_test)

# Random sampling
y, X = dmatrices('victory ~ lead_point', logit_data, return_type='dataframe')
y = np.ravel(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(x_train, y_train)

# 로지스틱 회귀식의 절편과 alpha 값
list(zip(x_train.columns, np.transpose(model.coef_)))

[('Intercept', array([-0.29509432])), ('lead_point', array([1.29419705]))]

In [138]:
# Training Set 에서의 정확도
model.score(x_train, y_train)

0.7004156769596199

In [139]:
model.sparsify()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [140]:
# Test Set을 이용하여 예측 수행
from sklearn import metrics

predicted = model.predict(x_test)
predicted_probs = model.predict_proba(x_test)[:, 1]

# Test Set 에서의 정확도
metrics.accuracy_score(y_test, predicted), metrics.roc_auc_score(y_test, predicted_probs)

(0.7232779097387173, 0.7324565364642132)

In [141]:
confusion_mt = metrics.confusion_matrix(y_test, predicted)
tn, fp, fn, tp = confusion_mt.ravel()
tn, fp, fn, tp, confusion_mt
pd.DataFrame([{'Real Positive': tn, 'Real Negative': fp}, {'Real Positive': fn, 'Real Negative': tn}], index=['Predicted Positive', 'Predicted Negative'])

Unnamed: 0,Real Negative,Real Positive
Predicted Positive,45,385
Predicted Negative,385,188


In [142]:
# F-measure 수행
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        0.0       0.67      0.90      0.77       430
        1.0       0.83      0.54      0.66       412

avg / total       0.75      0.72      0.71       842



* 로지

In [143]:
logit_data = pd.DataFrame(point_total, columns=['match_id', 'location', 'lead_point', 'chase_goal', 'winning_flag'])
logit_data = pd.merge(point_total, t_total[['match_id', 'location', 'lead_goal',  'chase_goal']], how='left', on = ['match_id', 'location'])
logit_data.match_id = logit_data.match_id.str.split('-').str.get(0)
logit_data.match_id = logit_data.match_id.apply(pd.to_numeric)
logit_data = logit_data.drop(['score_line', 'lead_goal_x'], axis=1)
logit_data.columns = ['year', 'victory', 'location', 'lead_point', 'lead_goal', 'chase_goal']
logit_data.victory = np.where(logit_data.victory == 1, 1, 0)

train_set = logit_data[logit_data.year < 2017]
test_set = logit_data[logit_data.year == 2017]

logit_data = logit_data[logit_data.victory == 0].append(logit_data[logit_data.victory == 1].sample(len(logit_data[logit_data.victory == 0])))

logit_data.head()


Unnamed: 0,year,victory,location,lead_point,lead_goal,chase_goal
0,2013,0,1,0,1.0,0.0
1,2013,0,0,0,0.0,0.0
2,2013,0,1,1,1.0,0.0
3,2013,0,0,0,0.0,0.0
4,2013,0,0,0,0.0,0.0


In [162]:
# 2017년을 test set으로 하는 sampling
# y_train, x_train = dmatrices('victory ~ lead_point', train_set, return_type='dataframe')
# y_test, x_test = dmatrices('victory ~ lead_point', test_set, return_type='dataframe')
# y_train = np.ravel(y_train)
# y_test = np.ravel(y_test)

# Random sampling
y, X = dmatrices('victory ~ lead_goal + chase_goal', logit_data, return_type='dataframe')
y = np.ravel(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(x_train, y_train)

# 로지스틱 회귀식의 절편과 alpha 값
list(zip(x_train.columns, np.transpose(model.coef_)))

[('Intercept', array([-0.71939426])),
 ('lead_goal', array([2.0650072])),
 ('chase_goal', array([-1.58197429]))]

In [163]:
# Training Set 에서의 정확도
model.score(x_train, y_train)

0.8191805225653207

In [164]:
model.sparsify()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [165]:
# Test Set을 이용하여 예측 수행
from sklearn import metrics

predicted = model.predict(x_test)
predicted_probs = model.predict_proba(x_test)[:, 1]

# Test Set 에서의 정확도
metrics.accuracy_score(y_test, predicted), metrics.roc_auc_score(y_test, predicted_probs)

(0.8147268408551069, 0.8827387672160759)

In [166]:
confusion_mt = metrics.confusion_matrix(y_test, predicted)
tn, fp, fn, tp = confusion_mt.ravel()
tn, fp, fn, tp, confusion_mt
pd.DataFrame([{'Real Positive': tn, 'Real Negative': fp}, {'Real Positive': fn, 'Real Negative': tn}], index=['Predicted Positive', 'Predicted Negative'])

Unnamed: 0,Real Negative,Real Positive
Predicted Positive,91,339
Predicted Negative,339,65


In [167]:
# F-measure 수행
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        0.0       0.84      0.79      0.81       430
        1.0       0.79      0.84      0.82       412

avg / total       0.82      0.81      0.81       842



#### 역전에 성공한 경우?? 엎치락 뒤치락??

In [70]:
lead_chase_total = pd.DataFrame(t_total.groupby(['match_id', 'location']).agg({'lead_goal': 'sum', 'chase_goal': 'sum', 'winning_flag': 'min'}), columns=['lead_goal', 'chase_goal', 'winning_flag', 'lead_chase_flag', 'lead_chase_diff'])
lead_chase_total = lead_chase_total.reset_index()
lead_chase_total.lead_chase_flag = np.where(lead_chase_total.lead_goal > 0, 1, 0)
lead_chase_total.lead_chase_flag = np.where(lead_chase_total.chase_goal > 0, lead_chase_total.lead_chase_flag + 1, lead_chase_total.lead_chase_flag)
lead_chase_total.lead_chase_diff = lead_chase_total.lead_goal - lead_chase_total.chase_goal
lead_chase_total.describe()

Unnamed: 0,location,lead_goal,chase_goal,winning_flag,lead_chase_flag,lead_chase_diff
count,3868.0,3868.0,3868.0,3868.0,3868.0,3868.0
mean,0.5,0.517322,0.106515,0.0,0.432782,0.410807
std,0.500065,0.865068,0.349411,0.888975,0.509928,0.980839
min,0.0,0.0,0.0,-1.0,0.0,-3.0
25%,0.0,0.0,0.0,-1.0,0.0,0.0
50%,0.5,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,1.0,1.0,1.0
max,1.0,7.0,3.0,1.0,2.0,7.0


In [73]:
lead_chase_total.__len__(), lead_chase_total[lead_chase_total.lead_chase_flag == 2].__len__()

(3868, 28)

In [74]:
lead_chase_total[lead_chase_total.lead_chase_flag == 2]

Unnamed: 0,match_id,location,lead_goal,chase_goal,winning_flag,lead_chase_flag,lead_chase_diff
111,2013-1-063,1.0,1.0,1.0,1,2,0.0
163,2013-1-090,1.0,2.0,1.0,0,2,1.0
170,2013-1-095,0.0,2.0,1.0,-1,2,1.0
586,2013-2-054,0.0,1.0,1.0,1,2,0.0
1100,2014-1-195,0.0,1.0,1.0,1,2,0.0
1175,2014-2-006,1.0,1.0,1.0,1,2,0.0
1240,2014-2-042,0.0,1.0,1.0,1,2,0.0
1281,2014-2-063,1.0,1.0,2.0,1,2,-1.0
1412,2014-2-139,0.0,2.0,1.0,1,2,1.0
1463,2014-2-166,1.0,1.0,1.0,1,2,0.0
