# 시간대별 득실차에 따른 경기 결과

## 기본 컨셉
* 시간대별로 발생한 득점과 실점의 차이에 따른 경기 결과를 회귀분석
* 가설: 시간대에 따라 득실차가 경기 결과에 미치는 영향이 다를 것이다(특정 시간대의 득실차가 클수록 경기에서 승리할 가능성이 높을 것이다)

## 사용 데이터
* 2013~2016년 득점 기록을 기반으로 경기의 시간대별 득실차(득점 - 실점)를 추출
* 경기의 주체는 Home 팀으로 가정

## 데이터 추출

In [1]:
import db_conn
import numpy as np
import pandas as pd

##### team_info Table

In [2]:
# Query text: fetch every row of the `team_info` table.
sql = "SELECT * FROM `team_info`"

In [3]:
# Run the query through the project's db_conn helper and keep the returned rows.
# (The result was previously bound twice: `team_info = team_info = ...`.)
team_info = db_conn.select_query(sql)

In [4]:
# Fields to pull out of every fetched row, in output column order.
columns = ['team_id', 'team_name', 'team_nick']

# One list per fetched row, ordered like `columns`
# (each row is indexed by column name).
data_source = [[record[field] for field in columns] for record in team_info]

# Positional 0..n-1 row labels.
rows = range(len(team_info))

# Fetched rows -> DataFrame
team_info_pd = pd.DataFrame(data_source, columns=columns, index=rows)

len(team_info_pd)

25

##### game_records Table

In [5]:
# Query text: fetch every row of the `game_records` table.
sql = "SELECT * FROM `game_records`"

In [6]:
# Run the query through the project's db_conn helper and keep the returned rows.
game_records = db_conn.select_query(sql)

In [7]:
# Fields to pull out of every fetched row, in output column order.
columns = ['game_id', 'home_team_id', 'away_team_id', 'home_team_score', 'away_team_score', 'winning_team', 'season_year', 'division']

# One list per fetched row, ordered like `columns`
# (each row is indexed by column name).
data_source = [[record[field] for field in columns] for record in game_records]

# Positional 0..n-1 row labels.
rows = range(len(game_records))

# Fetched rows -> DataFrame
game_records_pd = pd.DataFrame(data_source, columns=columns, index=rows)

# Discard the columns this analysis does not use; game_id, home_team_id and
# winning_team remain.
unused_columns = ['away_team_id', 'home_team_score', 'away_team_score', 'season_year', 'division']
game_records_pd = game_records_pd.drop(columns=unused_columns)
len(game_records_pd)

2136

##### goal_records Table

In [8]:
# Query text: fetch the `goal_records` rows whose year is before 2017.
sql = "SELECT * FROM `goal_records` WHERE year < 2017"

In [9]:
# Run the query through the project's db_conn helper and keep the returned rows.
goal_records = db_conn.select_query(sql)

In [10]:
# Fields to pull out of every fetched row, in output column order.
columns = ['id', 'year', 'division', 'match_date', 'match_id', 'half_type', 'play_time', 'goal_type', 'own_goal', 'penalty_kick', 'score_team', 'score_player', 'score_position', 'score_type', 'score_direction', 'touch_count', 'assist', 'assist_player', 'assist_position', 'assist_type', 'lost_team', 'lost_gk', 'remark']

# One list per fetched row, ordered like `columns`
# (each row is indexed by column name).
data_source = [[record[field] for field in columns] for record in goal_records]

# Positional 0..n-1 row labels.
rows = range(len(goal_records))

# Fetched rows -> DataFrame
goal_records_pd = pd.DataFrame(data_source, columns=columns, index=rows)

# Discard the columns this analysis does not use; year, division, match_id,
# half_type, play_time and score_team remain.
unused_columns = ['id', 'match_date', 'goal_type', 'own_goal', 'penalty_kick', 'score_player', 'score_position', 'score_type', 'score_direction', 'touch_count', 'assist', 'assist_player', 'assist_position', 'assist_type', 'lost_team', 'lost_gk', 'remark']
goal_records_pd = goal_records_pd.drop(columns=unused_columns)

# Keep only seasons before 2017 (the SQL WHERE clause applies the same filter).
before_2017 = goal_records_pd.year < 2017
goal_records_pd = goal_records_pd[before_2017]
len(goal_records_pd)

4374

## 데이터 전처리

### TEAM_INFO
* 팀명을 기준으로 Team ID를 추출하기 위해 사용

In [11]:
# Re-key the team table by team_id; the column moves into the index.
team_info_pd = team_info_pd.set_index('team_id')


def get_team_id(x):
    """Return, as an int, the team_id of the first row whose team_name equals x."""
    same_name = team_info_pd.team_name == x
    return int(team_info_pd[same_name].index.values[0])


team_info_pd.head()

Unnamed: 0_level_0,team_name,team_nick
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,강원,강원FC
2,경남,경남FC
3,고양,고양 자이크로
4,광주,광주FC
5,대구,대구FC


### GAME_RECORDS
* 대상 경기의 Home 팀을 확인하기 위해 사용

In [12]:
# Re-key the game table by game_id; the column moves into the index.
game_records_pd = game_records_pd.set_index('game_id')


def get_home_team_id(x):
    """Return, as an int, the home_team_id stored for game_id x."""
    game_row = game_records_pd.loc[x]
    return int(game_row.home_team_id)


game_records_pd.head()

Unnamed: 0_level_0,home_team_id,winning_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-1-001,10,0
2013-1-002,19,19
2013-1-003,21,23
2013-1-004,12,13
2013-1-005,20,0


### GOAL RECORDS
* 전체 득점을 경기, 시간대별로 Grouping 작업
* 각 시간대의 득실차(득점 - 실점)는 이후 회귀분석의 독립변수로 사용
* 해당 경기의 결과(승/무/패)는 각각 1, 0, -1로 치환하여 회귀분석의 종속변수로 사용

In [13]:
# Add derived variables: game id, scoring/home team ids and the time bucket
# of each goal. The result keeps exactly four columns, in this order:
# match_id, score_team, goal_range, home_team.
divide_criteria = 15  # width of one time bucket, in minutes

# Work on a private copy so the column assignments below never write into a
# slice of another frame.
goal_records_pd = goal_records_pd.copy()

## Game ID: "<year>-<division>-<match number zero-padded to 3 digits>",
## the same key format used by the game_records index.
goal_records_pd['year'] = goal_records_pd['year'].astype(str)
goal_records_pd['division'] = goal_records_pd['division'].astype(str)
goal_records_pd['match_id'] = (
    goal_records_pd['year'] + "-" + goal_records_pd['division'] + "-"
    + goal_records_pd['match_id'].astype(str).str.zfill(3)
)

## Team IDs: the scoring team is looked up by name, the home team by game id.
goal_records_pd['score_team'] = goal_records_pd['score_team'].apply(get_team_id)
goal_records_pd['home_team'] = goal_records_pd['match_id'].apply(get_home_team_id)

## Goal time
# Vectorised conversion instead of a per-element .apply(pd.to_numeric).
goal_records_pd['half_type'] = pd.to_numeric(goal_records_pd['half_type'])
goal_records_pd['play_time'] = pd.to_numeric(goal_records_pd['play_time'])
# NOTE(review): assumes half_type is 1 or 2 and play_time counts minutes from
# the start of that half -- confirm against the goal_records schema.
goal_time = (goal_records_pd['half_type'] - 1) * 45 + goal_records_pd['play_time']
# Truncating division: bucket k covers minutes [15k, 15k + 15).
goal_records_pd['goal_range'] = (goal_time / divide_criteria).astype(int)

# Keep only the columns the later cells use.
goal_records_pd = goal_records_pd[['match_id', 'score_team', 'goal_range', 'home_team']]

goal_records_pd.head()

Unnamed: 0,match_id,score_team,goal_range,home_team
0,2013-1-001,10,1,10
1,2013-1-001,25,2,10
2,2013-1-001,10,3,10
3,2013-1-001,25,5,10
4,2013-1-002,5,0,19


In [14]:
# Per-match goal difference (home goals minus away goals) in each 15-minute
# bucket, plus the match outcome from the home team's point of view.
# NOTE: the table is built from goal records only, so a match with no recorded
# goal has no row here.
n_ranges = 6  # range_0 .. range_5 cover the regular 90 minutes
range_columns = ['range_%d' % i for i in range(n_ranges)]

signed_goals = goal_records_pd[['match_id', 'goal_range']].copy()

## Goals in any later bucket (minute 90 onwards) are counted in the last
## bucket. Clipping up front keeps the column layout fixed no matter which
## buckets actually occur in the data.
signed_goals['goal_range'] = signed_goals['goal_range'].clip(upper=n_ranges - 1)

## Home goals count +1, away goals count -1.
scored_by_home = (goal_records_pd['score_team'] == goal_records_pd['home_team']).values
signed_goals['goal_count'] = np.where(scored_by_home, 1, -1)

## Goal difference per match and bucket; buckets without goals become 0.
time_goal_diff_pd = (
    signed_goals.groupby(['match_id', 'goal_range'])['goal_count'].sum()
    .unstack(fill_value=0)
    .reindex(columns=range(n_ranges), fill_value=0)
    .astype(float)
)
time_goal_diff_pd.columns = range_columns

## Outcome for the home team: sign of the total goal difference
## (1 = win, 0 = draw, -1 = loss). Assigned as a whole column rather than
## through chained indexing, which is not guaranteed to write to the frame.
time_goal_diff_pd['victory'] = np.sign(time_goal_diff_pd[range_columns].sum(axis=1))

time_goal_diff_pd.head()

Unnamed: 0_level_0,range_0,range_1,range_2,range_3,range_4,range_5,victory
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-1-001,0.0,1.0,-1.0,1.0,0.0,-1.0,0.0
2013-1-002,-1.0,0.0,0.0,0.0,0.0,2.0,1.0
2013-1-003,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0
2013-1-004,-1.0,1.0,0.0,0.0,-1.0,0.0,-1.0
2013-1-006,1.0,0.0,0.0,0.0,-1.0,0.0,0.0


In [23]:
# Goals scored per side and 15-minute bucket: one row per (match, side),
# labelled "<match_id>-1" for the home side and "<match_id>-0" for the away
# side, plus that side's outcome (1 = win, 0 = draw, -1 = loss).
# NOTE: the table is built from goal records only, so a side that scored no
# goal in a match has no row here.
n_ranges = 6  # range_0 .. range_5 cover the regular 90 minutes
range_columns = ['range_%d' % i for i in range(n_ranges)]

side_goals = goal_records_pd[['match_id', 'goal_range']].copy()

## Goals in any later bucket (minute 90 onwards) are counted in the last
## bucket, which keeps the column layout fixed.
side_goals['goal_range'] = side_goals['goal_range'].clip(upper=n_ranges - 1)

## '1' = goal scored by the home team, '0' = goal scored by the away team.
scored_by_home = (goal_records_pd['score_team'] == goal_records_pd['home_team']).values
side_goals['home_flag'] = np.where(scored_by_home, '1', '0')

## Goal counts per (match, side) and bucket; buckets without goals become 0.
time_goal_pd = (
    side_goals.groupby(['match_id', 'home_flag', 'goal_range']).size()
    .unstack(fill_value=0)
    .reindex(columns=range(n_ranges), fill_value=0)
    .astype(float)
)
time_goal_pd.columns = range_columns

## Outcome per side, taken from game_records (winning_team == 0 is a draw).
## The match id and side come from separate index levels, so nothing depends
## on the character positions inside a concatenated label.
match_ids = time_goal_pd.index.get_level_values('match_id')
is_home_side = time_goal_pd.index.get_level_values('home_flag') == '1'
games = game_records_pd.loc[list(match_ids)]
winner = games['winning_team'].values
home_result = np.where(winner == 0, 0,
                       np.where(winner == games['home_team_id'].values, 1, -1))
# The away side's outcome is the mirror image of the home side's.
time_goal_pd['victory'] = np.where(is_home_side, home_result, -home_result).astype(float)

## Flatten the (match_id, home_flag) index into "<match_id>-<flag>" labels.
time_goal_pd.index = pd.Index(
    ['%s-%s' % (match_id, flag) for match_id, flag in time_goal_pd.index],
    name='match_id',
)

time_goal_pd.head()

Unnamed: 0,0
range_0,481.0
range_1,580.0
range_2,603.0
range_3,814.0
range_4,754.0
range_5,1142.0
victory,640.0


In [26]:
# Column totals of time_goal_pd next to its row count, and their ratio.
a = time_goal_pd.sum().to_frame(name='cnt')
a['total'] = len(time_goal_pd)
a['probs'] = a['cnt'] / a['total']
a

Unnamed: 0,cnt,total,probs
range_0,481.0,2474,0.194422
range_1,580.0,2474,0.234438
range_2,603.0,2474,0.243735
range_3,814.0,2474,0.329022
range_4,754.0,2474,0.30477
range_5,1142.0,2474,0.461601
victory,640.0,2474,0.25869


## 회귀분석 수행

In [16]:
import statsmodels.formula.api as sm

### 대상 데이터
* 위에서 전처리된 경기별 시간대별 득실차(득점 - 실점)와 대상 경기의 결과(승/무/패)

In [17]:
# Display-only deep copy of the goal-difference table with Korean headers;
# the frame used for modelling keeps its original labels.
display_labels = ['~15분', '~30분', '~45분', '~60분', '~75분', '~90분', '승/무/패(1/0/-1)']
print_pd = time_goal_diff_pd.copy(deep=True)
print_pd.index.name = '경기 ID'
print_pd.columns = display_labels
print_pd.head()

Unnamed: 0_level_0,~15분,~30분,~45분,~60분,~75분,~90분,승/무/패(1/0/-1)
경기 ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-1-001,0.0,1.0,-1.0,1.0,0.0,-1.0,0.0
2013-1-002,-1.0,0.0,0.0,0.0,0.0,2.0,1.0
2013-1-003,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0
2013-1-004,-1.0,1.0,0.0,0.0,-1.0,0.0,-1.0
2013-1-006,1.0,0.0,0.0,0.0,-1.0,0.0,0.0


### 회귀분석 모형
$$Y = a_1 x_1 + a_2 x_2 + a_3 x_3 + a_4 x_4 + a_5 x_5 + a_6 x_6 + b$$

* $x_1$ ~ $x_6$은 각각의 시간대별 득실차를 의미
* $Y$는 경기 결과 (승/무/패)

#### 사용 모델
* 선형회귀분석 중 최소자승법(Ordinary Least Squares, OLS) 사용
* OLS는 오차항들의 자승(제곱)의 합이 가장 작은 회귀선을 도출하는 방식

In [18]:
# Ordinary least squares: regress the home-side outcome on the six
# per-bucket goal differences.
predictors = ['range_0', 'range_1', 'range_2', 'range_3', 'range_4', 'range_5']
formula = 'victory ~ ' + ' + '.join(predictors)
model = sm.ols(formula=formula, data=time_goal_diff_pd)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,victory,R-squared:,0.761
Model:,OLS,Adj. R-squared:,0.76
Method:,Least Squares,F-statistic:,824.1
Date:,"Sat, 14 Apr 2018",Prob (F-statistic):,0.0
Time:,16:31:42,Log-Likelihood:,-911.53
No. Observations:,1557,AIC:,1837.0
Df Residuals:,1550,BIC:,1875.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0168,0.011,-1.508,0.132,-0.039,0.005
range_0,0.4546,0.020,23.125,0.000,0.416,0.493
range_1,0.4705,0.018,25.930,0.000,0.435,0.506
range_2,0.4502,0.018,25.007,0.000,0.415,0.485
range_3,0.4675,0.015,31.485,0.000,0.438,0.497
range_4,0.4757,0.016,30.246,0.000,0.445,0.507
range_5,0.5408,0.013,41.082,0.000,0.515,0.567

0,1,2,3
Omnibus:,66.981,Durbin-Watson:,2.068
Prob(Omnibus):,0.0,Jarque-Bera (JB):,186.915
Skew:,-0.147,Prob(JB):,2.58e-41
Kurtosis:,4.672,Cond. No.,1.82


### 회귀분석 결과의 해석

#### [주요 결과치]

##### R-squared / Adj.R-squared (R제곱, 설명률, 설명력)
 * R제곱과 수정된 R제곱의 값은 0.761, 0.760
 * 일반적으로 학계에서는 0.6, 실무에서는 0.4 이상인 경우 의미가 있는 것으로 간주 (관련 Reference 확인 필요)
 
 
##### coef(회귀계수)
 * 회귀계수는 각각의 독립변수(X1~X6)가 한 단위 변할 때 종속변수(Y)가 얼마나 변하는지를 보여주는 척도(일종의 weight)
 * 전체적으로 시간대에 따라 큰 차이는 보이지 않으나 마지막 시간대(75분~90분)의 영향력이 상대적으로 크게 나옴
 * 전반전보다는 후반전에서의 영향력이 더 큰 것으로 나타남
 * 유의확률(P>|t|)의 경우에도 매우 작게 나와 통계적으로 해당 모형의 독립변수가 유의미한 것으로 판단됨


##### Prob (F-statistic) (유의확률)
 * 유의확률은 귀무가설(모든 회귀계수가 0이다, 즉 모형이 설명력을 갖지 않는다)이 참이라고 가정할 때, 관측된 값 이상으로 극단적인 F 통계량이 나올 확률을 의미
 * 일반적으로 유의수준 0.01(신뢰수준 99%), 0.05(95%), 0.1(90%) 등을 기준으로 사용
 * 본 모형의 경우 유의확률이 0.01보다 훨씬 작게 나왔기 때문에 99% 신뢰수준에서 귀무가설을 기각할 수 있으며, 모형이 충분히 유의미하다고 판단 가능


#### [기타 결과치]

##### Durbin-Watson
 * 잔차의 독립성을 검증하는 지표 (0~4 사이의 값)
 * 2에 가까울 수록 잔차의 독립성이 유지된다고 판단
 
##### Skew(왜도)
 * 정규분포를 기준으로 전체 Data의 치우침을 판단
 * 0에 가까울수록 좌우대칭의 분포를 보여줌
 
##### Kurtosis(첨도)
 * 정규분포를 기준으로 분포가 뾰족한 정도를 판단
 * statsmodels의 summary는 초과첨도(첨도 - 3)가 아닌 첨도 값을 그대로 출력하므로, 3에 가까울수록 정규분포와 비슷하며, 3보다 큰 경우 뾰족한 모양, 3보다 작은 경우 둥근 모양을 보여줌

In [19]:
# Display-only deep copy of the per-side goal table with Korean headers;
# the frame used for modelling keeps its original labels.
display_labels = ['~15분', '~30분', '~45분', '~60분', '~75분', '~90분', '승/무/패(1/0/-1)']
print_pd = time_goal_pd.copy(deep=True)
print_pd.index.name = '경기 ID'
print_pd.columns = display_labels
print_pd.head()

Unnamed: 0_level_0,~15분,~30분,~45분,~60분,~75분,~90분,승/무/패(1/0/-1)
경기 ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-1-001-0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2013-1-001-1,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2013-1-002-0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0
2013-1-002-1,0.0,0.0,0.0,0.0,0.0,2.0,1.0
2013-1-003-0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [20]:
# Ordinary least squares: regress each side's outcome on the goals that side
# scored in each of the six time buckets.
predictors = ['range_0', 'range_1', 'range_2', 'range_3', 'range_4', 'range_5']
formula = 'victory ~ ' + ' + '.join(predictors)
model = sm.ols(formula=formula, data=time_goal_pd)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,victory,R-squared:,0.181
Model:,OLS,Adj. R-squared:,0.179
Method:,Least Squares,F-statistic:,90.98
Date:,"Sat, 14 Apr 2018",Prob (F-statistic):,2.1e-103
Time:,16:31:43,Log-Likelihood:,-2777.4
No. Observations:,2474,AIC:,5569.0
Df Residuals:,2467,BIC:,5609.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.3842,0.032,-12.126,0.000,-0.446,-0.322
range_0,0.3155,0.036,8.884,0.000,0.246,0.385
range_1,0.3507,0.033,10.706,0.000,0.286,0.415
range_2,0.3063,0.033,9.301,0.000,0.242,0.371
range_3,0.3589,0.028,13.002,0.000,0.305,0.413
range_4,0.3391,0.030,11.418,0.000,0.281,0.397
range_5,0.4403,0.025,17.451,0.000,0.391,0.490

0,1,2,3
Omnibus:,620.885,Durbin-Watson:,2.349
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.477
Skew:,-0.336,Prob(JB):,1.28e-33
Kurtosis:,1.992,Cond. No.,4.19


결론: 두 모형 모두 마지막 시간대(75분~90분)의 계수가 가장 크게 나타나므로, 이 시간대의 득점(득실차)이 경기 결과에 상대적으로 가장 큰 영향을 미치는 것으로 보인다.
다만 아직 잠정적인 해석이며, 시간대 간 계수 차이가 크지 않으므로 추가 검증이 필요하다.
