In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import statsmodels.formula.api as sm
import scipy.stats as st

In [2]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
plt.style.use('dark_background')

##### Game Records

In [3]:
sql = """SELECT * FROM game_records"""
game_records = db_conn.select_query(sql)
game_records_pd = pd.DataFrame(game_records, columns=['game_id', 'home_team_id', 'away_team_id', 'home_team_score', 'away_team_score', 'winning_team'])
game_records_pd.head()

Unnamed: 0,game_id,home_team_id,away_team_id,home_team_score,away_team_score,winning_team
0,2013-1-001,10,25,2,2,0
1,2013-1-002,19,5,2,1,19
2,2013-1-003,21,23,0,1,23
3,2013-1-004,12,13,1,2,13
4,2013-1-005,20,2,0,0,0


##### Goal Records

In [4]:
sql = """SELECT * FROM goal_records_rev"""
goal_records = db_conn.select_query(sql)
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'own_goal'])
goal_records_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,own_goal
0,2013-1-001,2,10,25,N
1,2013-1-001,3,25,10,N
2,2013-1-001,4,10,25,N
3,2013-1-001,5,25,10,N
4,2013-1-002,6,5,19,N


In [5]:
goal_records_pd = pd.DataFrame(goal_records, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'half_type', 'play_time'])
goal_trace_pd = pd.merge(goal_records_pd, game_records_pd[['game_id', 'home_team_id', 'away_team_id', 'winning_team']], how='left', left_on='match_id', right_on='game_id')
goal_trace_pd = pd.DataFrame(goal_trace_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range'])
goal_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,,,,,1,29,
1,2013-1-001,3,25,10,10,25,0,,,,,1,32,
2,2013-1-001,4,10,25,10,25,0,,,,,2,2,
3,2013-1-001,5,25,10,10,25,0,,,,,2,38,
4,2013-1-002,6,5,19,19,5,19,,,,,1,4,


## 데이터 전처리

##### Home Team 기준 Goal Records 전처리

In [6]:
home_trace_pd = copy.deepcopy(goal_trace_pd)
home_trace_pd.home_team_score = np.where(home_trace_pd.home_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.away_team_score = np.where(home_trace_pd.away_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.home_team_score = home_trace_pd.groupby(['match_id'])['home_team_score'].cumsum()
home_trace_pd.away_team_score = home_trace_pd.groupby(['match_id'])['away_team_score'].cumsum()
home_trace_pd.score_line = home_trace_pd.home_team_score.map(str) + ':' + home_trace_pd.away_team_score.map(str)
home_trace_pd.time_range = (home_trace_pd.play_time / 15).astype(int) + 1
home_trace_pd.time_range = np.where(home_trace_pd.time_range > 3, 3, home_trace_pd.time_range)
home_trace_pd.time_range = np.where(home_trace_pd.half_type == 2, home_trace_pd.time_range + 3, home_trace_pd.time_range)
home_trace_pd.location = np.where(home_trace_pd.home_team_id == home_trace_pd.score_team_id, 1, 0)
home_trace_pd.head()

Unnamed: 0,match_id,id,score_team_id,lost_team,home_team_id,away_team_id,winning_team,home_team_score,away_team_score,location,score_line,half_type,play_time,time_range
0,2013-1-001,2,10,25,10,25,0,1,0,1,1:0,1,29,2
1,2013-1-001,3,25,10,10,25,0,1,1,0,1:1,1,32,3
2,2013-1-001,4,10,25,10,25,0,2,1,1,2:1,2,2,4
3,2013-1-001,5,25,10,10,25,0,2,2,0,2:2,2,38,6
4,2013-1-002,6,5,19,19,5,19,0,1,0,0:1,1,4,1


##### 골 유형 구분
* 선제골(First Goal): 점수가 1:0 / 0:1 인 경우
* 동점골(Tying Goal): 양 팀의 점수가 동일한 경우
* 리드골(Lead Goal): 득점한 팀의 점수가 상대적으로 높은 경우
* 추격골(Chase Goal): 득점한 팀의 점수가 상대적으로 낮은 경우

In [7]:
goal_type_pd = pd.DataFrame(home_trace_pd, columns=['match_id', 'id', 'score_team_id', 'lost_team', 'home_team_id', 'away_team_id', 'winning_team', 'winning_flag', 'home_team_score', 'away_team_score', 'location', 'score_line', 'half_type', 'play_time', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal'])
goal_type_pd.winning_flag = np.where(goal_type_pd.winning_team == 0, 0, np.where(goal_type_pd.winning_team == goal_type_pd.home_team_id, 1, -1))
goal_type_pd.first_goal = (goal_type_pd.home_team_score + goal_type_pd.away_team_score == 1)
goal_type_pd.tying_goal = (goal_type_pd.home_team_score == goal_type_pd.away_team_score)
goal_type_pd.lead_goal = np.where(goal_type_pd.location == 1, (goal_type_pd.home_team_score > goal_type_pd.away_team_score), (goal_type_pd.home_team_score < goal_type_pd.away_team_score))
goal_type_pd.lead_goal = np.where(goal_type_pd.lead_goal & goal_type_pd.first_goal, False, goal_type_pd.lead_goal)
goal_type_pd.chase_goal = (goal_type_pd.first_goal == False) & (goal_type_pd.tying_goal == False) & (goal_type_pd.lead_goal == False)
goal_type_pd.winning_flag = np.where(goal_type_pd.location == 0, goal_type_pd.winning_flag * -1, goal_type_pd.winning_flag)
t = goal_type_pd[['match_id', 'id', 'home_team_id', 'away_team_id', 'winning_flag', 'location', 'score_line', 'time_range', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal']]
t.head()

Unnamed: 0,match_id,id,home_team_id,away_team_id,winning_flag,location,score_line,time_range,first_goal,tying_goal,lead_goal,chase_goal
0,2013-1-001,2,10,25,0,1,1:0,2,True,False,False,False
1,2013-1-001,3,10,25,0,0,1:1,3,False,True,False,False
2,2013-1-001,4,10,25,0,1,2:1,4,False,False,True,False
3,2013-1-001,5,10,25,0,0,2:2,6,False,True,False,False
4,2013-1-002,6,19,5,-1,0,0:1,1,True,False,False,False


##### Home/Away Lead Goal, Chase Goal 통합 (횟수)

In [8]:
total_t = pd.DataFrame(t, columns = ['match_id', 'id', 'home_team_id', 'away_team_id', 'winning_flag', 'location', 'score_line', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'lead_point', 'TGP'])

total_t.winning_flag = np.where(total_t.location == 0, total_t.winning_flag * -1, total_t.winning_flag)
total_t.lead_point = total_t.score_line.apply(lambda x: int(x.split(':')[0]) - int(x.split(':')[1]))
total_t.lead_point = np.where(total_t.location == 1, total_t.lead_point, total_t.lead_point * -1)

total_t.head()

Unnamed: 0,match_id,id,home_team_id,away_team_id,winning_flag,location,score_line,first_goal,tying_goal,lead_goal,chase_goal,lead_point,TGP
0,2013-1-001,2,10,25,0,1,1:0,True,False,False,False,1,
1,2013-1-001,3,10,25,0,0,1:1,False,True,False,False,0,
2,2013-1-001,4,10,25,0,1,2:1,False,False,True,False,1,
3,2013-1-001,5,10,25,0,0,2:2,False,True,False,False,0,
4,2013-1-002,6,19,5,1,0,0:1,True,False,False,False,1,


##### TGP 산출
* Lead Goal의 경우 가장 큰 값만 반영하도록 로직 구현

In [9]:
tgp_pd = pd.DataFrame(total_t, columns = ['match_id', 'id', 'home_team_id', 'away_team_id', 'winning_flag', 'location', 'target_team_id', 'score_line', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'lead_point', 'FG', 'FTG', 'OTG', 'FLG', 'SLG', 'OLG', 'TLG', 'TGP'])

tgp_pd.target_team_id = np.where(tgp_pd.location == 1, tgp_pd.home_team_id, tgp_pd.away_team_id)

tgp_pd.FG = 2 * tgp_pd.first_goal

tgp_pd.FTG = tgp_pd.groupby(['match_id'])['tying_goal'].cumsum()
tgp_pd.OTG = tgp_pd.groupby(['match_id'])['tying_goal'].cumsum()
tgp_pd.FTG = tgp_pd.FTG * tgp_pd.tying_goal
tgp_pd.OTG = tgp_pd.OTG * tgp_pd.tying_goal
tgp_pd.FTG = np.where(tgp_pd.FTG == 1, tgp_pd.tying_goal * 0.44, 0)
tgp_pd.OTG = np.where(tgp_pd.OTG >= 2, tgp_pd.tying_goal * 0.6, 0)

tgp_pd.FLG = np.where((tgp_pd.lead_point == 1) & (tgp_pd.lead_goal), 2.13, 0)
tgp_pd.SLG = np.where((tgp_pd.lead_point == 2) & (tgp_pd.lead_goal), 2.58, 0)
tgp_pd.OLG = np.where((tgp_pd.lead_point == 3) & (tgp_pd.lead_goal), 2.97, 0)
tgp_pd.OLG = np.where((tgp_pd.lead_point > 3) & (tgp_pd.lead_goal), 3, 0)

tgp_pd.TLG = tgp_pd.FLG + tgp_pd.SLG + tgp_pd.OLG
temp = tgp_pd.groupby(['match_id', 'target_team_id']).agg({'TLG': 'max'})
tgp_pd.TLG = tgp_pd.apply(lambda x: temp.loc[x.match_id, x.target_team_id], axis=1)
tgp_pd.TLG = np.where((tgp_pd.FLG + tgp_pd.SLG + tgp_pd.OLG) == tgp_pd.TLG, tgp_pd.TLG, 0)

tgp_pd[['FG', 'FTG', 'OTG', 'FLG', 'SLG', 'OLG', 'TLG']]
tgp_pd.TGP = tgp_pd.FG + tgp_pd.FTG + tgp_pd.OTG + tgp_pd.TLG

tgp_pd.head()

Unnamed: 0,match_id,id,home_team_id,away_team_id,winning_flag,location,target_team_id,score_line,first_goal,tying_goal,...,chase_goal,lead_point,FG,FTG,OTG,FLG,SLG,OLG,TLG,TGP
0,2013-1-001,2,10,25,0,1,10,1:0,True,False,...,False,1,2,0.0,0.0,0.0,0.0,0,0.0,2.0
1,2013-1-001,3,10,25,0,0,25,1:1,False,True,...,False,0,0,0.44,0.0,0.0,0.0,0,0.0,0.44
2,2013-1-001,4,10,25,0,1,10,2:1,False,False,...,False,1,0,0.0,0.0,2.13,0.0,0,2.13,2.13
3,2013-1-001,5,10,25,0,0,25,2:2,False,True,...,False,0,0,0.0,0.6,0.0,0.0,0,0.0,0.6
4,2013-1-002,6,19,5,1,0,5,0:1,True,False,...,False,1,2,0.0,0.0,0.0,0.0,0,0.0,2.0


##### AGPt 산출

In [10]:
tgp_result = tgp_pd[['match_id', 'location', 'target_team_id', 'FG', 'FTG', 'OTG', 'TLG', 'TGP']].groupby(['match_id', 'location']).agg({'TGP': 'sum', 'target_team_id': 'max'})
tgp_result = tgp_result.reset_index()

tgp_result = pd.DataFrame(tgp_result, columns = ['match_id', 'year', 'division', 'location', 'TGP', 'target_team_id'])

tgp_result.year = tgp_result.match_id.str.split('-').str.get(0)
tgp_result.division = tgp_result.match_id.str.split('-').str.get(1)
tgp_result.year = tgp_result.year.apply(pd.to_numeric)
tgp_result.division = tgp_result.division.apply(pd.to_numeric)

tgp_result = tgp_result.groupby(['year', 'division', 'target_team_id']).agg({'TGP': 'sum', 'match_id': 'count'})
tgp_result = tgp_result.reset_index()
tgp_result = pd.DataFrame(tgp_result, columns = ['year', 'division', 'target_team_id', 'TGP', 'match_id', 'AGPt'])
tgp_result.AGPt = tgp_result.TGP / tgp_result.match_id
tgp_result.columns = ['year', 'division', 'team_id', 'TGP', 'match_count', 'AGPt']

tgp_result.describe()

Unnamed: 0,year,division,team_id,TGP,match_count,AGPt
count,121.0,121.0,121.0,121.0,121.0,121.0
mean,2015.008264,1.561983,12.347107,66.025785,25.446281,2.533216
std,1.417132,0.631038,7.478537,26.251594,7.7674,0.593487
min,2013.0,1.0,1.0,0.44,1.0,0.44
25%,2014.0,1.0,6.0,51.95,24.0,2.17381
50%,2015.0,1.0,12.0,69.01,27.0,2.573462
75%,2016.0,2.0,19.0,83.59,30.0,2.828929
max,2017.0,3.0,25.0,113.33,39.0,5.0


##### 랭킹 산출
* TGP, AGPt에 따른 랭킹 산출
* 실제 시즌 랭킹과 비교

In [11]:
sql = """SELECT * FROM season_ranking"""
season_ranking = db_conn.select_query(sql)

season_ranking_pd = pd.DataFrame(season_ranking, columns=['year', 'division', 'rank', 'team_id', 'points', 'scored_goal_count', 'losed_goal_count', 'match_count', 'win_count', 'draw_count', 'lose_count'])
# season_ranking_pd['rank'] = season_ranking_pd.groupby(['year', 'division'])['scored_goal_count'].rank(method='min', ascending=False)

season_ranking_pd[season_ranking_pd.year == 2017]
pd.DataFrame(season_ranking).head()

Unnamed: 0,division,draw_count,goal_diff,lose_count,losed_goal_count,match_count,points,rank,scored_goal_count,team_id,win_count,year
0,1,11,25,6,38,38,74,1,63,25,21,2013
1,1,7,26,9,37,38,73,2,63,19,22,2013
2,1,9,12,11,49,38,63,3,61,22,18,2013
3,1,11,13,10,46,38,62,4,59,10,17,2013
4,1,8,7,15,43,38,53,5,50,13,15,2013


In [24]:
tgp_rank = pd.DataFrame(tgp_result, columns=['year', 'division', 'team_id', 'match_count', 'TGP', 'AGPt', 'AGPt_rank', 'TGP_rank'])
tgp_rank.AGPt = tgp_rank.TGP / 38
tgp_rank.AGPt_rank = tgp_rank.groupby(['year', 'division'])['AGPt'].rank(ascending=False)
tgp_rank.TGP_rank = tgp_rank.groupby(['year', 'division'])['TGP'].rank(ascending=False)
tgp_rank.head()

tgp_rank[(tgp_rank.year == 2017) & (tgp_rank.division == 1)]

Unnamed: 0,year,division,team_id,match_count,TGP,AGPt,AGPt_rank,TGP_rank
97,2017,1,1,32,83.95,2.209211,4.0,4.0
98,2017,1,4,23,42.23,1.111316,11.0,11.0
99,2017,1,5,29,70.61,1.858158,8.0,8.0
100,2017,1,9,25,51.26,1.348947,10.0,10.0
101,2017,1,10,30,79.1,2.081579,6.0,6.0
102,2017,1,13,31,90.94,2.393158,3.0,3.0
103,2017,1,19,28,68.09,1.791842,9.0,9.0
104,2017,1,20,23,38.11,1.002895,12.0,12.0
105,2017,1,21,28,74.88,1.970526,7.0,7.0
106,2017,1,22,32,113.33,2.982368,1.0,1.0


* 실제 승점 기준 랭킹과 비교

In [13]:
# season_ranking_pd = pd.DataFrame(season_ranking, columns=['year', 'division', 'rank', 'team_id', 'points', 'scored_goal_count', 'losed_goal_count', 'match_count'])

tgp_rank_pd = pd.merge(tgp_rank, season_ranking_pd, how='left', on=['year', 'division', 'team_id'])
tgp_rank_pd = pd.DataFrame(tgp_rank_pd, columns = ['year', 'division', 'team_id', 'match_count_x', 'match_count_y', 'AGPt_rank', 'rank', 'rank_diff', 'AGPt', 'points'])
tgp_rank_pd = tgp_rank_pd[tgp_rank_pd['rank'].notna()]
tgp_rank_pd.rank = tgp_rank_pd['rank'].apply(pd.to_numeric)
tgp_rank_pd.AGPt_rank = tgp_rank_pd.AGPt_rank.apply(pd.to_numeric)
tgp_rank_pd.rank_diff = abs(tgp_rank_pd.rank - tgp_rank_pd.AGPt_rank)
display(tgp_rank_pd.groupby(['year', 'division']).agg({'rank_diff': ['sum', 'mean']}))
# display(tgp_rank_pd[tgp_rank_pd.year == 2017].sort_values('points', ascending=False))
display(tgp_rank_pd[tgp_rank_pd.rank_diff > 2].__len__())

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_diff,rank_diff
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
year,division,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,22.0,1.571429
2013,2,4.0,0.571429
2013,3,0.0,0.0
2014,1,18.0,1.5
2014,2,15.0,1.666667
2014,3,0.0,0.0
2015,1,14.0,1.166667
2015,2,14.0,1.4
2015,3,0.0,0.0
2016,1,28.0,2.333333


25

* 실제 득점 기준 랭킹과 비교

In [14]:
season_goal_ranking_pd = copy.deepcopy(season_ranking_pd)
season_goal_ranking_pd['rank'] = season_goal_ranking_pd.groupby(['year', 'division'])['scored_goal_count'].rank(method='min', ascending=False)

tgp_goal_rank_pd = pd.merge(tgp_rank, season_goal_ranking_pd, how='left', on=['year', 'division', 'team_id'])
tgp_goal_rank_pd = pd.DataFrame(tgp_goal_rank_pd, columns = ['year', 'division', 'team_id', 'match_count_x', 'match_count_y', 'AGPt_rank', 'rank', 'rank_diff', 'AGPt', 'points'])
tgp_goal_rank_pd = tgp_goal_rank_pd[tgp_goal_rank_pd['rank'].notna()]
tgp_goal_rank_pd.rank = tgp_goal_rank_pd['rank'].apply(pd.to_numeric)
tgp_goal_rank_pd.AGPt_rank = tgp_goal_rank_pd.AGPt_rank.apply(pd.to_numeric)
tgp_goal_rank_pd.rank_diff = abs((tgp_goal_rank_pd.rank - tgp_goal_rank_pd.AGPt_rank))
display(tgp_goal_rank_pd.groupby(['year', 'division']).agg({'rank_diff': ['sum', 'mean']}))
# display(tgp_goal_rank_pd[tgp_goal_rank_pd.year==2016].sort_values('rank'))
display(tgp_goal_rank_pd[tgp_goal_rank_pd.rank_diff > 2].__len__())

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_diff,rank_diff
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
year,division,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,14.0,1.0
2013,2,6.0,0.857143
2013,3,0.0,0.0
2014,1,11.0,0.916667
2014,2,10.0,1.111111
2014,3,0.0,0.0
2015,1,9.0,0.75
2015,2,6.0,0.6
2015,3,0.0,0.0
2016,1,19.0,1.583333


15

* 실제 득점을 내지 못한 경기들이 누락됨에 따라 경기수에 차이가 발생
    * AGPt 산출 시 TGP를 실제 경기 수로 나누는 것으로 해결
* 실제 랭킹과 비교시 차이가 많이 남
    * AGPt는 득점과 관련된 점수이므로 시즌별 득점 수 랭킹과 비교
    * 득점 랭킹과 비교 시 오차가 많이 축소
    * 실점과 관련한 추가 분석이 필요하다고 판단됨

In [15]:
temp = pd.DataFrame(tgp_rank_pd, columns=['year', 'division', 'team_id', 'match_count_x', 'AGPt', 'real_point', 'scored_goal_count', 'losed_goal_count', 'win_count', 'draw_count', 'lose_count'])
temp = temp[temp.division == 1]

temp.real_point = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].points.values[0], axis = 1)
temp.scored_goal_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].scored_goal_count.values[0], axis = 1)
temp.losed_goal_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].losed_goal_count.values[0], axis = 1)
temp.win_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].win_count.values[0], axis = 1)
temp.draw_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].draw_count.values[0], axis = 1)
temp.lose_count = temp.apply(lambda x: season_ranking_pd[(season_ranking_pd.year == x.year) & (season_ranking_pd.division == x.division) & (season_ranking_pd.team_id == x.team_id)].lose_count.values[0], axis = 1)
temp.to_pickle('../pickles/temp.pkl')
temp.head()

Unnamed: 0,year,division,team_id,match_count_x,AGPt,real_point,scored_goal_count,losed_goal_count,win_count,draw_count,lose_count
0,2013,1,1,23,1.461316,36,37,64,8,12,18
1,2013,1,2,27,1.425,37,42,55,8,13,17
2,2013,1,5,22,1.273421,32,36,57,6,14,18
3,2013,1,6,29,1.251842,32,39,68,7,11,20
4,2013,1,7,24,1.757632,52,43,41,14,10,14


In [16]:
temp[['AGPt', 'real_point', 'scored_goal_count', 'losed_goal_count', 'win_count', 'draw_count', 'lose_count']].corr()

Unnamed: 0,AGPt,real_point,scored_goal_count,losed_goal_count,win_count,draw_count,lose_count
AGPt,1.0,0.850791,0.943876,-0.408976,0.874041,-0.366781,-0.735375
real_point,0.850791,1.0,0.763135,-0.740571,0.985632,-0.2744,-0.911542
scored_goal_count,0.943876,0.763135,1.0,-0.227521,0.798888,-0.403556,-0.631769
losed_goal_count,-0.408976,-0.740571,-0.227521,1.0,-0.682819,-0.050322,0.773591
win_count,0.874041,0.985632,0.798888,-0.682819,1.0,-0.416297,-0.843326
draw_count,-0.366781,-0.2744,-0.403556,-0.050322,-0.416297,1.0,-0.137549
lose_count,-0.735375,-0.911542,-0.631769,0.773591,-0.843326,-0.137549,1.0


In [17]:
sm.ols(formula = 'points ~ AGPt', data=pd.DataFrame(tgp_rank_pd)).fit().summary()

0,1,2,3
Dep. Variable:,points,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,656.4
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,2.38e-49
Time:,09:45:34,Log-Likelihood:,-397.05
No. Observations:,117,AIC:,798.1
Df Residuals:,115,BIC:,803.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.9028,1.814,2.152,0.034,0.310,7.496
AGPt,25.0060,0.976,25.620,0.000,23.073,26.939

0,1,2,3
Omnibus:,0.698,Durbin-Watson:,1.942
Prob(Omnibus):,0.705,Jarque-Bera (JB):,0.304
Skew:,-0.015,Prob(JB):,0.859
Kurtosis:,3.248,Cond. No.,6.31


In [18]:
tgp_rank_w_lr_pd = pd.DataFrame(tgp_rank_pd, columns = ['year', 'division', 'team_id', 'match_count_x', 'match_count_y', 'AGPt_rank', 'rank', 'rank_diff', 'AGPt', 'AGPt_lr', 'points'])
tgp_rank_w_lr_pd.columns = ['year', 'division', 'team_id', 'match_count_x', 'match_count_y', 'AGPt_rank', 'real_rank', 'rank_diff', 'AGPt', 'AGPt_lr', 'points']
tgp_rank_w_lr_pd.AGPt_lr = 9.5313 + (22.6777 * tgp_rank_w_lr_pd.AGPt)
tgp_rank_w_lr_pd.AGPt_rank = tgp_rank_w_lr_pd.groupby(['year', 'division'])['AGPt_lr'].rank(ascending=False)
tgp_rank_w_lr_pd.rank_diff = (tgp_rank_w_lr_pd.AGPt_rank - tgp_rank_w_lr_pd.real_rank) ** 2

display(tgp_rank_w_lr_pd.groupby(['year', 'division']).agg({'rank_diff': 'sum'}))
display(tgp_rank_pd.groupby(['year', 'division']).agg({'rank_diff': 'sum'}))

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_diff
year,division,Unnamed: 2_level_1
2013,1,50.0
2013,2,14.0
2013,3,0.0
2014,1,38.0
2014,2,39.0
2014,3,0.0
2015,1,34.0
2015,2,33.0
2015,3,0.0
2016,1,90.0


Unnamed: 0_level_0,Unnamed: 1_level_0,rank_diff
year,division,Unnamed: 2_level_1
2013,1,22.0
2013,2,4.0
2013,3,0.0
2014,1,18.0
2014,2,15.0
2014,3,0.0
2015,1,14.0
2015,2,14.0
2015,3,0.0
2016,1,28.0


In [19]:
from sklearn import metrics

predicted = tgp_rank_pd[(tgp_rank_pd['rank'].notna())]

In [20]:
# pd.merge(tgp_rank, season_ranking_pd, how='left', on=['year', 'division', 'team_id'])
logit_data = tgp_pd.groupby(['match_id', 'target_team_id']).agg({'winning_flag': 'max', 'first_goal': 'sum', 'tying_goal': 'sum', 'lead_goal': 'sum', 'chase_goal': 'sum', 'lead_point': 'max'})
logit_data = logit_data.reset_index()
logit_data = pd.DataFrame(logit_data, columns=['match_id', 'year', 'division', 'target_team_id', 'winning_flag', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'lead_point'])

logit_data.year = logit_data.match_id.str.split('-').str.get(0)
logit_data.division = logit_data.match_id.str.split('-').str.get(1)
logit_data.year = logit_data.year.apply(pd.to_numeric)
logit_data.division = logit_data.division.apply(pd.to_numeric)
logit_data = logit_data.groupby(['year', 'division', 'target_team_id']).agg({'first_goal': 'sum', 'tying_goal': 'sum', 'lead_goal': 'sum', 'chase_goal': 'sum', 'lead_point': 'max', 'match_id': 'count'})
logit_data = logit_data.reset_index()

logit_data = pd.merge(logit_data, season_ranking_pd, how='left', left_on=['year', 'division', 'target_team_id'], right_on=['year', 'division', 'team_id'])

train_set = logit_data[logit_data.year < 2017]
test_set = logit_data[logit_data.year == 2017]

logit_data.head()

Unnamed: 0,year,division,target_team_id,first_goal,tying_goal,lead_goal,chase_goal,lead_point,match_id,rank,team_id,points,scored_goal_count,losed_goal_count,match_count,win_count,draw_count,lose_count
0,2013,1,1,14.0,7.0,12.0,4.0,3,23,12.0,1.0,36.0,37.0,64.0,38.0,8.0,12.0,18.0
1,2013,1,2,13.0,9.0,13.0,7.0,6,27,11.0,2.0,37.0,42.0,55.0,38.0,8.0,13.0,17.0
2,2013,1,5,10.0,11.0,13.0,2.0,3,22,13.0,5.0,32.0,36.0,57.0,38.0,6.0,14.0,18.0
3,2013,1,6,14.0,12.0,7.0,6.0,2,29,14.0,6.0,32.0,39.0,68.0,38.0,7.0,11.0,20.0
4,2013,1,7,18.0,7.0,15.0,3.0,5,24,6.0,7.0,52.0,43.0,41.0,38.0,14.0,10.0,14.0


In [21]:
result = sm.ols(formula = 'points ~ first_goal + tying_goal + lead_goal + chase_goal', data=logit_data).fit()
result.summary()

0,1,2,3
Dep. Variable:,points,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.889
Method:,Least Squares,F-statistic:,232.4
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,2.97e-53
Time:,09:45:34,Log-Likelihood:,-377.94
No. Observations:,117,AIC:,765.9
Df Residuals:,112,BIC:,779.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.9197,1.975,-0.466,0.642,-4.832,2.993
first_goal,2.2793,0.155,14.744,0.000,1.973,2.586
tying_goal,0.7091,0.171,4.138,0.000,0.370,1.049
lead_goal,0.4201,0.104,4.055,0.000,0.215,0.625
chase_goal,-0.5205,0.285,-1.828,0.070,-1.085,0.044

0,1,2,3
Omnibus:,4.224,Durbin-Watson:,2.034
Prob(Omnibus):,0.121,Jarque-Bera (JB):,3.948
Skew:,-0.284,Prob(JB):,0.139
Kurtosis:,3.697,Cond. No.,91.7


In [22]:
from patsy import dmatrices
from sklearn.cross_validation import train_test_split

# 2017년을 test set으로 하는 sampling
y_train, x_train = dmatrices('points ~ first_goal + tying_goal + lead_goal + chase_goal', train_set, return_type='dataframe')
y_test, x_test = dmatrices('points ~ first_goal + tying_goal + lead_goal + chase_goal', test_set, return_type='dataframe')
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Random sampling
# y, X = dmatrices('victory ~ tying_goal', logit_data, return_type='dataframe')
# y = np.ravel(y)
# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model = model.fit(x_train, y_train)

pd.DataFrame(list(zip(x_train.columns, model.coef_)), columns=['features', 'Coef']), model.intercept_



(     features      Coef
 0   Intercept  0.000000
 1  first_goal  2.252564
 2  tying_goal  0.710092
 3   lead_goal  0.429573
 4  chase_goal -0.531033, -0.3505729410210989)

In [23]:
predicted = result.predict(x_test)

np.mean((y_test - predicted) ** 2)
y_test, predicted

(array([49., 30., 47., 35., 61., 64., 62., 39., 35., 75., 66., 52., 79.,
        29., 68., 52., 35., 53., 45., 54., 33., 39.,  0.,  0.]),
 97     54.504141
 98     39.929755
 99     51.085910
 100    34.239400
 101    57.346814
 102    66.516519
 103    54.650026
 104    30.221013
 105    52.079110
 106    77.308947
 107    65.907830
 108    50.034930
 109    68.963576
 110    32.936303
 111    66.658639
 112    55.840370
 113    39.369886
 114    47.081016
 115    39.970042
 116    59.339600
 117    39.093121
 118    41.143262
 119     1.359581
 120     1.359581
 dtype: float64)