In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import collections
import statsmodels.formula.api as sm
import scipy.stats as st
import matplotlib.pyplot  as plt
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as sklearnPCA
import seaborn as sns

plt.style.use('dark_background')
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#### Load Season Ranking Data

In [2]:
sql = """SELECT * FROM season_ranking"""
season_ranking = db_conn.select_query(sql)
season_ranking_pd = pd.DataFrame(season_ranking, columns = ['year', 'division', 'team_id', 'match_count', 'rank'])
season_ranking_pd.columns = ['season_year', 'division', 'team_id', 'match_count', 'real_rank']
season_ranking_pd.head()

Unnamed: 0,season_year,division,team_id,match_count,real_rank
0,2013,1,25,38,1
1,2013,1,19,38,2
2,2013,1,22,38,3
3,2013,1,10,38,4
4,2013,1,13,38,5


#### Load Score Line Data

In [3]:
sql = """SELECT * FROM score_line"""
score_line = db_conn.select_query(sql)
score_line_pd = pd.DataFrame(score_line, columns=['match_id', 'season_year', 'division', 'id', 'half_type', 'play_time', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'score_team_id', 'winning_team', 'score_player'])
score_line_pd = score_line_pd[score_line_pd.id > 0]
score_line_pd.head()

Unnamed: 0,match_id,season_year,division,id,half_type,play_time,time_range,home_score,away_score,home_team_id,away_team_id,score_team_id,winning_team,score_player
1,2013-1-001,2013,1,2,1,29,2,1,0,10,25,10,0,1
2,2013-1-001,2013,1,3,1,32,3,1,1,10,25,25,0,2
3,2013-1-001,2013,1,4,2,2,4,2,1,10,25,10,0,3
5,2013-1-001,2013,1,5,2,38,6,2,2,10,25,25,0,4
7,2013-1-002,2013,1,6,1,4,1,0,1,19,5,5,19,5


#### Load Team Info Data

In [4]:
sql = """SELECT * FROM team_info"""
team_info = db_conn.select_query(sql)
team_info_pd = pd.DataFrame(team_info)
team_info_pd.head()

Unnamed: 0,team_id,team_name,team_nick
0,1,강원,강원FC
1,2,경남,경남FC
2,3,고양,고양 자이크로
3,4,광주,광주FC
4,5,대구,대구FC


## CASE 1. 득점 유형을 4가지로 구분
### 득점 유형 구분
* 선제골(First Goal): 점수가 1:0 / 0:1 인 경우
* 동점골(Tying Goal): 양 팀의 점수가 동일한 경우
* 리드골(Lead Goal): 득점한 팀의 점수가 상대적으로 높은 경우
* 추격골(Chase Goal): 득점한 팀의 점수가 상대적으로 낮은 경우
### 득점 가중치
* 기존 2~4에서 산출한 승률을 기반으로 산출
* 선제골: 2
* 동점골: 0.44 / 0.6
* 리드골: 2.13 / 2.58 / 2.97 or 3
* 추격골: 0

In [5]:
tgp_pd = pd.DataFrame(score_line_pd, columns = ['match_id', 'season_year', 'division', 'id', 'home_team_id', 'away_team_id', 'winning_team', 'winning_flag', 'location', 'score_team_id', 'home_score', 'away_score', 'first_goal', 'tying_goal', 'lead_goal', 'chase_goal', 'lead_point', 'FG', 'FTG', 'OTG', 'FLG', 'SLG', 'OLG', 'TLG', 'TGP'])

tgp_pd.winning_flag = np.where(tgp_pd.winning_team == 0, 0, np.where(tgp_pd.winning_team == tgp_pd.home_team_id, 1, -1))
tgp_pd.location = np.where(tgp_pd.score_team_id == tgp_pd.home_team_id, 1, 0)

tgp_pd.first_goal = (tgp_pd.home_score + tgp_pd.away_score == 1)
tgp_pd.tying_goal = (tgp_pd.home_score == tgp_pd.away_score)
tgp_pd.lead_goal = np.where(tgp_pd.location == 1, (tgp_pd.home_score > tgp_pd.away_score), (tgp_pd.home_score < tgp_pd.away_score))
tgp_pd.lead_goal = np.where(tgp_pd.lead_goal & tgp_pd.first_goal, False, tgp_pd.lead_goal)
tgp_pd.chase_goal = (tgp_pd.first_goal == False) & (tgp_pd.tying_goal == False) & (tgp_pd.lead_goal == False)
tgp_pd.lead_point = np.where(tgp_pd.location == 1, tgp_pd.home_score - tgp_pd.away_score, tgp_pd.away_score - tgp_pd.home_score)

tgp_pd.score_team_id = np.where(tgp_pd.location == 1, tgp_pd.home_team_id, tgp_pd.away_team_id)

tgp_pd.FG = 2 * tgp_pd.first_goal

tgp_pd.FTG = tgp_pd.groupby(['match_id'])['tying_goal'].cumsum()
tgp_pd.OTG = tgp_pd.groupby(['match_id'])['tying_goal'].cumsum()
tgp_pd.FTG = tgp_pd.FTG * tgp_pd.tying_goal
tgp_pd.OTG = tgp_pd.OTG * tgp_pd.tying_goal
tgp_pd.FTG = np.where(tgp_pd.FTG == 1, tgp_pd.tying_goal * 0.44, 0)
tgp_pd.OTG = np.where(tgp_pd.OTG >= 2, tgp_pd.tying_goal * 0.6, 0)

tgp_pd.FLG = np.where((tgp_pd.lead_point == 1) & (tgp_pd.lead_goal), 2.13, 0)
tgp_pd.SLG = np.where((tgp_pd.lead_point == 2) & (tgp_pd.lead_goal), 2.58, 0)
tgp_pd.OLG = np.where((tgp_pd.lead_point == 3) & (tgp_pd.lead_goal), 2.97, 0)
tgp_pd.OLG = np.where((tgp_pd.lead_point > 3) & (tgp_pd.lead_goal), 3, 0)

tgp_pd.TLG = tgp_pd.FLG + tgp_pd.SLG + tgp_pd.OLG
tgp_pd.TGP = tgp_pd.FG + tgp_pd.FTG + tgp_pd.OTG + tgp_pd.TLG

tgp_pd.head()

Unnamed: 0,match_id,season_year,division,id,home_team_id,away_team_id,winning_team,winning_flag,location,score_team_id,home_score,away_score,first_goal,tying_goal,lead_goal,chase_goal,lead_point,FG,FTG,OTG,FLG,SLG,OLG,TLG,TGP
1,2013-1-001,2013,1,2,10,25,0,0,1,10,1,0,True,False,False,False,1,2,0.0,0.0,0.0,0.0,0,0.0,2.0
2,2013-1-001,2013,1,3,10,25,0,0,0,25,1,1,False,True,False,False,0,0,0.44,0.0,0.0,0.0,0,0.0,0.44
3,2013-1-001,2013,1,4,10,25,0,0,1,10,2,1,False,False,True,False,1,0,0.0,0.0,2.13,0.0,0,2.13,2.13
5,2013-1-001,2013,1,5,10,25,0,0,0,25,2,2,False,True,False,False,0,0,0.0,0.6,0.0,0.0,0,0.0,0.6
7,2013-1-002,2013,1,6,19,5,19,1,0,5,0,1,True,False,False,False,1,2,0.0,0.0,0.0,0.0,0,0.0,2.0


In [6]:
case_1_result_pd = pd.DataFrame(tgp_pd.groupby(['season_year', 'division', 'score_team_id']).agg({'FG': 'sum', 'FTG': 'sum', 'OTG': 'sum', 'FLG': 'sum', 'SLG': 'sum', 'OLG': 'sum', 'TLG': 'sum', 'TGP': 'sum'}).reset_index(), columns=['season_year', 'division', 'score_team_id', 'FG', 'FTG', 'OTG', 'FLG', 'SLG', 'OLG', 'TLG', 'TGP', 'AGPt', 'case_1_rank'])
case_1_result_pd = pd.merge(case_1_result_pd, season_ranking_pd[['season_year', 'division', 'team_id', 'match_count']], how='left', left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id'])

In [7]:
case_1_result_pd.AGPt = case_1_result_pd.TGP / case_1_result_pd.match_count
case_1_result_pd.case_1_rank = case_1_result_pd.groupby(['season_year', 'division'])['AGPt'].rank(ascending=False)
case_1_result_pd = case_1_result_pd.drop('team_id', axis=1)
case_1_result_pd.columns = ['season_year', 'division', 'score_team_id', 'FG', 'FTG', 'OTG', 'FLG', 'SLG', 'OLG', 'TLG', 'case_1_TGP', 'case_1_AGPt', 'case_1_rank', 'match_count']
pd.merge(case_1_result_pd[(case_1_result_pd.season_year == 2017) & (case_1_result_pd.division == 1)], season_ranking_pd, left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id'])

Unnamed: 0,season_year,division,score_team_id,FG,FTG,OTG,FLG,SLG,OLG,TLG,case_1_TGP,case_1_AGPt,case_1_rank,match_count_x,team_id,match_count_y,real_rank
0,2017,1,1,36,4.84,1.2,25.56,18.06,3,46.62,88.66,2.333158,5.0,38.0,1,38,6
1,2017,1,4,26,3.96,2.4,6.39,7.74,0,14.13,46.49,1.223421,11.0,38.0,4,38,12
2,2017,1,5,36,4.4,0.6,8.52,23.22,0,31.74,72.74,1.914211,8.0,38.0,5,38,8
3,2017,1,9,20,5.28,1.2,19.17,7.74,0,26.91,53.39,1.405,10.0,38.0,9,38,11
4,2017,1,10,38,3.96,1.2,21.3,15.48,6,42.78,85.94,2.261579,6.0,38.0,10,38,5
5,2017,1,13,44,3.08,1.8,19.17,33.54,0,52.71,101.59,2.673421,3.0,38.0,13,38,3
6,2017,1,19,38,3.96,0.0,14.91,15.48,0,30.39,72.35,1.903947,9.0,38.0,19,38,4
7,2017,1,20,22,3.96,0.6,6.39,5.16,0,11.55,38.11,1.002895,12.0,38.0,20,38,9
8,2017,1,21,34,3.08,1.8,12.78,20.64,12,45.42,84.3,2.218421,7.0,38.0,21,38,10
9,2017,1,22,50,2.64,0.6,12.78,51.6,15,79.38,132.62,3.49,1.0,38.0,22,38,1


## CASE 2. 득점 유형을 5가지로 구분하고, 득점과 실점을 구분
* 별도 파일 참조

In [8]:
import pickle
case_2_result_pd = pickle.load(open('./case_2_result.pkl', mode='rb'))
case_2_result_pd.columns = ['season_year', 'division', 'team_id', 'game_count', 'goal_point', 'avg_point', 'case_2_rank']
case_2_result_pd = pd.merge(case_2_result_pd, season_ranking_pd[['season_year', 'division', 'team_id', 'match_count']], how='left', on=['season_year', 'division', 'team_id'])
case_2_result_pd.avg_point = case_2_result_pd.goal_point / case_2_result_pd.match_count
case_2_result_pd.case_2_rank = case_2_result_pd.groupby(['season_year', 'division'])['avg_point'].rank(ascending=False)
case_2_result_pd.columns = ['season_year', 'division', 'score_team_id', 'game_count', 'case_2_TGP', 'case_2_AGPt', 'case_2_rank', 'match_count']
pd.merge(case_2_result_pd[(case_2_result_pd.season_year == 2017) & (case_2_result_pd.division == 1)], season_ranking_pd, left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id'])

Unnamed: 0,season_year,division,score_team_id,game_count,case_2_TGP,case_2_AGPt,case_2_rank,match_count_x,team_id,match_count_y,real_rank
0,2017,1,1,31,56.270895,1.480813,2.0,38.0,1,38,6
1,2017,1,4,22,31.272217,0.822953,11.0,38.0,4,38,12
2,2017,1,5,29,57.405094,1.51066,1.0,38.0,5,38,8
3,2017,1,9,24,39.52694,1.040183,7.0,38.0,9,38,11
4,2017,1,10,29,48.398626,1.273648,3.0,38.0,10,38,5
5,2017,1,13,30,42.356752,1.114651,6.0,38.0,13,38,3
6,2017,1,19,28,29.01185,0.76347,12.0,38.0,19,38,4
7,2017,1,20,23,45.112972,1.187183,5.0,38.0,20,38,9
8,2017,1,21,27,33.779276,0.888928,10.0,38.0,21,38,10
9,2017,1,22,32,33.876113,0.891477,9.0,38.0,22,38,1


## CASE 3. 득점 유형을 3가지로 구분

#### 목적
* 리드골(LG)의 영향력이 다른 득점 유형에 비해 크게 작용
* 리드골의 영향력을 최소화하기 위해 역전골(동점 상황을 깨는 득점)로 대체
* 추격골(CG)의 경우 CASE 1에서 가중치가 0으로 산출됨에 따라 아예 제외

#### 득점 유형
1. 선제골
2. 동점골: 양 팀간의 득점차가 사라지고 다시 균형 상황을 만드는 득점
3. 역전골: 양 팀간의 균형 상황을 깨는 득점

#### 가중치
1. 선제골: CASE 1과 동일
2. 돔점골: CASE 1과 동일
3. 역전골: 별도 산출

In [9]:
goal_type_pd = pd.DataFrame(score_line_pd, columns=['match_id', 'season_year', 'division', 'location', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'score_team_id', 'winning_team', 'score_player', 'g_fg', 'g_tg', 'g_og', 'g_lg', 'g_cg', 'FGP', 'FTG', 'STG', 'FOG', 'SOG', 'TGP', 'winning_flag'])
goal_type_pd.location = np.where(goal_type_pd.home_team_id == goal_type_pd.score_team_id, 1, 0)
goal_type_pd.g_fg = np.where((goal_type_pd.home_score > 0) & (goal_type_pd.home_score + goal_type_pd.away_score == 1), True, False)
goal_type_pd.g_tg = np.where((goal_type_pd.home_score == goal_type_pd.away_score), True, False)
goal_type_pd.g_og = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score - goal_type_pd.away_score == 1, True, False), np.where(goal_type_pd.away_score - goal_type_pd.home_score == 1, True, False))
goal_type_pd.g_og = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_og)
goal_type_pd.g_lg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, goal_type_pd.home_score > goal_type_pd.away_score, goal_type_pd.away_score > goal_type_pd.home_score)
goal_type_pd.g_lg = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_lg)
goal_type_pd.g_lg = np.where(goal_type_pd.g_og, False, goal_type_pd.g_lg)
goal_type_pd.g_cg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score < goal_type_pd.away_score, True, False), np.where(goal_type_pd.away_score < goal_type_pd.home_score, True, False))
goal_type_pd.winning_flag = np.where(goal_type_pd.winning_team == 0, 0, np.where(goal_type_pd.score_team_id == goal_type_pd.winning_team, 1, -1))

goal_type_pd.head()

Unnamed: 0,match_id,season_year,division,location,home_score,away_score,home_team_id,away_team_id,score_team_id,winning_team,score_player,g_fg,g_tg,g_og,g_lg,g_cg,FGP,FTG,STG,FOG,SOG,TGP,winning_flag
1,2013-1-001,2013,1,1,1,0,10,25,10,0,1,True,False,False,False,False,,,,,,,0
2,2013-1-001,2013,1,0,1,1,10,25,25,0,2,False,True,False,False,False,,,,,,,0
3,2013-1-001,2013,1,1,2,1,10,25,10,0,3,False,False,True,False,False,,,,,,,0
5,2013-1-001,2013,1,0,2,2,10,25,25,0,4,False,True,False,False,False,,,,,,,0
7,2013-1-002,2013,1,0,0,1,19,5,5,19,5,False,False,True,False,False,,,,,,,-1


In [10]:
over_goal_statics_pd = goal_type_pd.groupby(['match_id', 'location']).agg({'g_fg': 'sum', 'g_tg': 'sum', 'g_og': 'sum', 'g_lg': 'sum', 'g_cg': 'sum', 'winning_flag': 'max'}).reset_index()
over_goal_statics_pd.g_og = np.where(over_goal_statics_pd.g_og > 2, 2, over_goal_statics_pd.g_og)
# over_goal_statics_pd.winning_flag = np.where(over_goal_statics_pd.location == 1, over_goal_statics_pd.winning_flag * -1, over_goal_statics_pd.winning_flag)

over_goal_statics_pd = pd.DataFrame(over_goal_statics_pd.groupby(['g_og', 'winning_flag']).agg({'match_id': 'count'}).reset_index(), columns=['g_og', 'winning_flag', 'match_id', 'total', 'probs'])
over_goal_statics_pd.columns = ['g_og', 'winning_flag', 'match_count', 'total_count', 'probs']
over_goal_statics_pd.total_count = over_goal_statics_pd.apply(lambda x: over_goal_statics_pd[over_goal_statics_pd.g_og == x.g_og].match_count.sum(), axis=1)
over_goal_statics_pd.probs = over_goal_statics_pd.match_count / over_goal_statics_pd.total_count
display(over_goal_statics_pd.pivot(index='g_og', columns='winning_flag'))
# display(over_goal_statics_pd)

Unnamed: 0_level_0,match_count,match_count,match_count,total_count,total_count,total_count,probs,probs,probs
winning_flag,-1,0,1,-1,0,1,-1,0,1
g_og,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0.0,592,513,566,1671,1671,1671,0.354279,0.307002,0.338719
1.0,141,272,809,1222,1222,1222,0.115385,0.222586,0.662029
2.0,6,27,153,186,186,186,0.032258,0.145161,0.822581


In [11]:
point_by_match_pd = pd.DataFrame(goal_type_pd.groupby(['match_id', 'location']).agg({'score_team_id': 'max', 'winning_flag': 'max', 'g_fg': 'sum', 'g_tg': 'sum', 'g_og': 'sum'}).reset_index(), columns = ['match_id', 'season_year', 'division', 'location', 'score_team_id', 'winning_flag', 'g_fg', 'g_tg', 'g_og', 'FGP', 'FTG', 'STG', 'FOG', 'SOG', 'TGP'])

point_by_match_pd.season_year = point_by_match_pd.match_id.str.split('-').str.get(0)
point_by_match_pd.season_year = point_by_match_pd.season_year.apply(pd.to_numeric)
point_by_match_pd.division = point_by_match_pd.match_id.str.split('-').str.get(1)
point_by_match_pd.division = point_by_match_pd.division.apply(pd.to_numeric)
point_by_match_pd.FGP = point_by_match_pd.g_fg * 2
point_by_match_pd.FTG = np.where(point_by_match_pd.g_tg > 0, 0.44, 0)
point_by_match_pd.STG = np.where(point_by_match_pd.g_tg > 1, 0.6, 0)
# point_by_match_pd.FOG = np.where(point_by_match_pd.g_og > 0, 0.7, 0)
# point_by_match_pd.SOG = np.where(point_by_match_pd.g_og > 1, 0.88, 0)
point_by_match_pd.FOG = np.where(point_by_match_pd.g_og > 0, 0.66*3, 0)
point_by_match_pd.SOG = np.where(point_by_match_pd.g_og > 1, 0.82*3, 0)
point_by_match_pd.TGP = point_by_match_pd.FGP + point_by_match_pd.FTG + point_by_match_pd.STG + point_by_match_pd.FOG + point_by_match_pd.SOG
point_by_match_pd.head()

Unnamed: 0,match_id,season_year,division,location,score_team_id,winning_flag,g_fg,g_tg,g_og,FGP,FTG,STG,FOG,SOG,TGP
0,2013-1-001,2013,1,0,25,0,False,2.0,0.0,0,0.44,0.6,0.0,0.0,1.04
1,2013-1-001,2013,1,1,10,0,True,0.0,1.0,2,0.0,0.0,1.98,0.0,3.98
2,2013-1-002,2013,1,0,5,-1,False,0.0,1.0,0,0.0,0.0,1.98,0.0,1.98
3,2013-1-002,2013,1,1,19,1,False,1.0,1.0,0,0.44,0.0,1.98,0.0,2.42
4,2013-1-003,2013,1,0,23,1,False,0.0,1.0,0,0.0,0.0,1.98,0.0,1.98


In [12]:
case_3_result_pd = pd.DataFrame(point_by_match_pd.groupby(['season_year', 'division', 'score_team_id']).agg({'match_id': 'count', 'FGP': 'sum', 'FTG': 'sum', 'STG': 'sum', 'FOG': 'sum', 'SOG': 'sum', 'TGP': 'sum'}).reset_index(), columns=['season_year', 'division', 'score_team_id', 'FGP', 'FTG', 'STG', 'FOG', 'SOG', 'TGP', 'AGPt', 'case_3_rank'])
case_3_result_pd = pd.merge(case_3_result_pd, season_ranking_pd[['season_year', 'division', 'match_count']], how='left', on=['season_year', 'division']).drop_duplicates()
case_3_result_pd.AGPt = case_3_result_pd.TGP / case_3_result_pd.match_count
case_3_result_pd.case_3_rank = case_3_result_pd.groupby(['season_year', 'division'])['AGPt'].rank(ascending=False)
case_3_result_pd.columns = ['season_year', 'division', 'score_team_id', 'FGP', 'FTG', 'STG', 'FOG', 'SOG', 'case_3_TGP', 'case_3_AGPt', 'case_3_rank', 'match_count']
pd.merge(case_3_result_pd[(case_3_result_pd.season_year == 2017) & (case_3_result_pd.division == 1)], season_ranking_pd, left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id']).sort_values(['FOG', 'SOG'], ascending=False)

Unnamed: 0,season_year,division,score_team_id,FGP,FTG,STG,FOG,SOG,case_3_TGP,case_3_AGPt,case_3_rank,match_count_x,team_id,match_count_y,real_rank
5,2017,1,13,18,3.96,0.6,37.62,7.38,67.56,1.777895,1.0,38,13,38,3
9,2017,1,22,24,2.64,0.6,35.64,2.46,65.34,1.719474,3.0,38,22,38,1
0,2017,1,1,18,5.28,0.6,31.68,9.84,65.4,1.721053,2.0,38,1,38,6
4,2017,1,10,18,3.96,1.2,31.68,9.84,64.68,1.702105,4.0,38,10,38,5
11,2017,1,25,16,6.6,0.6,29.7,2.46,55.36,1.456842,7.0,38,25,38,7
6,2017,1,19,20,3.96,0.0,27.72,4.92,56.6,1.489474,6.0,38,19,38,4
10,2017,1,23,28,3.52,0.6,25.74,0.0,57.86,1.522632,5.0,38,23,38,2
8,2017,1,21,22,3.96,0.6,21.78,2.46,50.8,1.336842,8.0,38,21,38,10
2,2017,1,5,22,4.4,0.6,19.8,2.46,49.26,1.296316,9.0,38,5,38,8
3,2017,1,9,14,5.72,0.6,17.82,7.38,45.52,1.197895,10.0,38,9,38,11


## CASE 4. 득점 유형을 3가지로 구분하고, 득점시점에 따른 가중치 부여
* 득점 유형 구분은 CASE 3의 3가지 득점 유형과 동일한 기준
* 단, 득점에 따른 가중치를 시간대(15분단위)에 따라 별도로 계산하여 부여

In [13]:
def calc_points(goal):
    return (goal.g_fg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_fg_probs.values[0] * 3) +\
    (goal.g_tg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 0)].g_tg_probs.values[0] * 1) +\
    (goal.g_og * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_og_probs.values[0] * 3) +\
    (goal.g_lg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_lg_probs.values[0] * 3)

In [14]:
def get_team_id(x):
    if x.location == 1:
        return score_line_pd[score_line_pd.match_id == x.match_id].home_team_id.unique()[0]
    else:
        return score_line_pd[score_line_pd.match_id == x.match_id].away_team_id.unique()[0]

In [15]:
goal_type_pd = pd.DataFrame(score_line_pd, columns=['match_id', 'season_year', 'division', 'location', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'score_team_id', 'winning_team', 'score_player', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag'])
goal_type_pd.location = np.where(goal_type_pd.home_team_id == goal_type_pd.score_team_id, 1, 0)
goal_type_pd.g_fg = np.where((goal_type_pd.home_score > 0) & (goal_type_pd.home_score + goal_type_pd.away_score == 1), True, False)
goal_type_pd.g_tg = np.where((goal_type_pd.home_score == goal_type_pd.away_score), True, False)
goal_type_pd.g_og = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score - goal_type_pd.away_score == 1, True, False), np.where(goal_type_pd.away_score - goal_type_pd.home_score == 1, True, False))
goal_type_pd.g_og = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_og)
goal_type_pd.g_lg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, goal_type_pd.home_score > goal_type_pd.away_score, goal_type_pd.away_score > goal_type_pd.home_score)
goal_type_pd.g_lg = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_lg)
goal_type_pd.g_lg = np.where(goal_type_pd.g_og, False, goal_type_pd.g_lg)
goal_type_pd.g_cg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score < goal_type_pd.away_score, True, False), np.where(goal_type_pd.away_score < goal_type_pd.home_score, True, False))

goal_type_pd.g_stg = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_tg.cumsum()
goal_type_pd.g_stg = np.where(goal_type_pd.g_stg > 1, True, False)
goal_type_pd.g_tg = np.where(goal_type_pd.g_stg, False, goal_type_pd.g_tg)
goal_type_pd.g_sog = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_og.cumsum()
goal_type_pd.g_sog = np.where(goal_type_pd.g_sog > 1, True, False)
goal_type_pd.g_og = np.where(goal_type_pd.g_sog, False, goal_type_pd.g_og)
goal_type_pd.g_slg = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_lg.cumsum()
goal_type_pd.g_olg = np.where(goal_type_pd.g_slg > 2, True, False)
goal_type_pd.g_slg = np.where((goal_type_pd.g_slg > 1) & (goal_type_pd.g_olg == False), True, False)
goal_type_pd.g_lg = np.where((goal_type_pd.g_slg) | (goal_type_pd.g_olg), False, goal_type_pd.g_lg)
goal_type_pd.winning_flag = np.where(goal_type_pd.winning_team == 0, 0, np.where(goal_type_pd.home_team_id == goal_type_pd.winning_team, 1, 0))

goal_type_pd.head()

Unnamed: 0,match_id,season_year,division,location,time_range,home_score,away_score,home_team_id,away_team_id,score_team_id,winning_team,score_player,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag
1,2013-1-001,2013,1,1,2,1,0,10,25,10,0,1,True,False,False,False,False,False,False,False,False,0
2,2013-1-001,2013,1,0,3,1,1,10,25,25,0,2,False,True,False,False,False,False,False,False,False,0
3,2013-1-001,2013,1,1,4,2,1,10,25,10,0,3,False,False,False,True,False,False,False,False,False,0
5,2013-1-001,2013,1,0,6,2,2,10,25,25,0,4,False,True,False,False,False,False,False,False,False,0
7,2013-1-002,2013,1,0,1,0,1,19,5,5,19,5,False,False,False,True,False,False,False,False,False,1


In [16]:
goal_count_by_type_pd = goal_type_pd.groupby(['location', 'winning_flag', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg'])['match_id'].nunique().reset_index()

goal_count_by_type_pd.g_fg = goal_count_by_type_pd.g_fg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_tg = goal_count_by_type_pd.g_tg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_stg = goal_count_by_type_pd.g_stg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_og = goal_count_by_type_pd.g_og * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_sog = goal_count_by_type_pd.g_sog * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_lg = goal_count_by_type_pd.g_lg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_slg = goal_count_by_type_pd.g_slg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_olg = goal_count_by_type_pd.g_olg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_cg = goal_count_by_type_pd.g_cg * goal_count_by_type_pd.match_id

goal_count_by_type_pd = pd.DataFrame(goal_count_by_type_pd.groupby(['location', 'winning_flag', 'time_range']).agg({'g_fg': 'sum', 'g_tg': 'sum', 'g_stg': 'sum', 'g_og': 'sum', 'g_sog': 'sum', 'g_lg': 'sum', 'g_slg': 'sum', 'g_olg': 'sum', 'g_cg': 'sum'}).reset_index(), columns=['location', 'winning_flag', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'total', 'g_fg_probs', 'g_tg_probs', 'g_stg_probs', 'g_og_probs', 'g_sog_probs', 'g_lg_probs', 'g_slg_probs', 'g_olg_probs', 'g_cg_probs'])

goal_count_by_type_pd.total = goal_count_by_type_pd.g_fg + goal_count_by_type_pd.g_tg + goal_count_by_type_pd.g_stg + goal_count_by_type_pd.g_og + goal_count_by_type_pd.g_sog + goal_count_by_type_pd.g_lg + goal_count_by_type_pd.g_slg + goal_count_by_type_pd.g_olg + goal_count_by_type_pd.g_cg
goal_count_by_type_pd.head(6)

Unnamed: 0,location,winning_flag,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,total,g_fg_probs,g_tg_probs,g_stg_probs,g_og_probs,g_sog_probs,g_lg_probs,g_slg_probs,g_olg_probs,g_cg_probs
0,0,0,1,0,8,0,207,0,17,0,0,0,232,,,,,,,,,
1,0,0,2,0,49,0,175,0,39,2,0,2,267,,,,,,,,,
2,0,0,3,0,59,0,189,1,71,5,0,8,333,,,,,,,,,
3,0,0,4,0,66,0,151,1,99,9,0,11,337,,,,,,,,,
4,0,0,5,0,84,2,143,2,90,4,0,14,339,,,,,,,,,
5,0,0,6,0,130,3,244,3,160,21,1,13,575,,,,,,,,,


In [17]:
# location, time_range에서 특정 type의 goal이 발생했을 때 각 경기결과가 나올 확률(조건부확률)

goal_count_by_type_pd.g_fg_probs = goal_count_by_type_pd.apply(lambda x: x.g_fg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_fg.sum(), axis=1)
goal_count_by_type_pd.g_tg_probs = goal_count_by_type_pd.apply(lambda x: x.g_tg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_tg.sum(), axis=1)
goal_count_by_type_pd.g_stg_probs = goal_count_by_type_pd.apply(lambda x: x.g_stg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_stg.sum(), axis=1)
goal_count_by_type_pd.g_og_probs = goal_count_by_type_pd.apply(lambda x: x.g_og / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_og.sum(), axis=1)
goal_count_by_type_pd.g_sog_probs = goal_count_by_type_pd.apply(lambda x: x.g_sog / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_sog.sum(), axis=1)
goal_count_by_type_pd.g_lg_probs = goal_count_by_type_pd.apply(lambda x: x.g_lg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_lg.sum(), axis=1)
goal_count_by_type_pd.g_slg_probs = goal_count_by_type_pd.apply(lambda x: x.g_slg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_slg.sum(), axis=1)
goal_count_by_type_pd.g_olg_probs = goal_count_by_type_pd.apply(lambda x: x.g_olg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_olg.sum(), axis=1)
goal_count_by_type_pd.g_cg_probs = goal_count_by_type_pd.apply(lambda x: x.g_cg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_cg.sum(), axis=1)
goal_count_by_type_pd = goal_count_by_type_pd.fillna(0)

goal_count_by_type_pd[(goal_count_by_type_pd.location == 0) & (goal_count_by_type_pd.winning_flag == 1)]
goal_count_by_type_pd.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  """
  import sys
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,location,winning_flag,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,total,g_fg_probs,g_tg_probs,g_stg_probs,g_og_probs,g_sog_probs,g_lg_probs,g_slg_probs,g_olg_probs,g_cg_probs
0,0,0,1,0,8,0,207,0,17,0,0,0,232,0.0,0.571429,0.0,0.824701,0.0,0.894737,0.0,0.0,0.0
1,0,0,2,0,49,0,175,0,39,2,0,2,267,0.0,0.710145,0.0,0.875,0.0,0.95122,1.0,0.0,0.25
2,0,0,3,0,59,0,189,1,71,5,0,8,333,0.0,0.567308,0.0,0.9,0.5,0.959459,1.0,0.0,0.275862
3,0,0,4,0,66,0,151,1,99,9,0,11,337,0.0,0.647059,0.0,0.872832,1.0,0.970588,1.0,0.0,0.289474
4,0,0,5,0,84,2,143,2,90,4,0,14,339,0.0,0.717949,1.0,0.922581,1.0,0.989011,1.0,0.0,0.304348


In [54]:
%%time

goal_points_by_type_pd = pd.DataFrame(goal_type_pd, columns=['match_id', 'location', 'time_range', 'score_player', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag', 'points'])
goal_points_by_type_pd.points = goal_points_by_type_pd.apply(lambda x: calc_points(x), axis=1)
display(goal_points_by_type_pd.head())

Unnamed: 0,match_id,location,time_range,score_player,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag,points
1,2013-1-001,1,2,1,True,False,False,False,False,False,False,False,False,0,1.950617
2,2013-1-001,0,3,2,False,True,False,False,False,False,False,False,False,0,0.567308
3,2013-1-001,1,4,3,False,False,False,True,False,False,False,False,False,0,1.875
5,2013-1-001,0,6,4,False,True,False,False,False,False,False,False,False,0,0.833333
7,2013-1-002,0,1,5,False,False,False,True,False,False,False,False,False,1,0.525896


CPU times: user 59.9 s, sys: 129 ms, total: 1min
Wall time: 1min


In [55]:
goal_points_pd = pd.DataFrame(goal_points_by_type_pd, columns= ['year', 'division', 'match_id', 'location', 'team_id', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag', 'points'])
goal_points_pd.year = goal_points_pd.match_id.str.split('-').str.get(0)
goal_points_pd.division = goal_points_pd.match_id.str.split('-').str.get(1)
goal_points_pd.team_id = goal_points_pd.apply(lambda x: get_team_id(x), axis=1)
goal_points_pd.head()

Unnamed: 0,year,division,match_id,location,team_id,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag,points
1,2013,1,2013-1-001,1,10,2,True,False,False,False,False,False,False,False,False,0,1.950617
2,2013,1,2013-1-001,0,25,3,False,True,False,False,False,False,False,False,False,0,0.567308
3,2013,1,2013-1-001,1,10,4,False,False,False,True,False,False,False,False,False,0,1.875
5,2013,1,2013-1-001,0,25,6,False,True,False,False,False,False,False,False,False,0,0.833333
7,2013,1,2013-1-002,0,5,1,False,False,False,True,False,False,False,False,False,1,0.525896


In [56]:
case_4_result_pd = pd.DataFrame(goal_points_pd.groupby(['year', 'division', 'team_id']).agg({'points': 'sum'}).reset_index(), columns=['year', 'division', 'team_id', 'points'])
case_4_result_pd.columns = ['season_year', 'division', 'team_id', 'points']
case_4_result_pd.season_year = case_4_result_pd.season_year.astype('int64')
case_4_result_pd.division = case_4_result_pd.division.astype('int64')
case_4_result_pd = pd.DataFrame(case_4_result_pd, columns = ['season_year', 'division', 'team_id', 'points', 'point_rank', 'case_4_AGPt'])
case_4_result_pd = pd.merge(case_4_result_pd, season_ranking_pd[['season_year', 'division', 'team_id', 'match_count']], how='left', on=['season_year', 'division', 'team_id'])
case_4_result_pd.case_4_AGPt = case_4_result_pd.points / case_4_result_pd.match_count
case_4_result_pd.point_rank = case_4_result_pd.groupby(['season_year', 'division'])['case_4_AGPt'].rank(ascending=False)
case_4_result_pd.columns = ['season_year', 'division', 'score_team_id', 'case_4_TGP', 'case_4_rank', 'case_4_AGPt', 'match_point']

pd.merge(case_4_result_pd[(case_4_result_pd.season_year == 2017) & (case_4_result_pd.division == 1)], season_ranking_pd, left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id'])

Unnamed: 0,season_year,division,score_team_id,case_4_TGP,case_4_rank,case_4_AGPt,match_point,team_id,match_count,real_rank
0,2017,1,1,55.688414,8.0,1.465485,38.0,1,38,6
1,2017,1,4,37.29332,11.0,0.981403,38.0,4,38,12
2,2017,1,5,59.874612,6.0,1.575648,38.0,5,38,8
3,2017,1,9,37.391364,10.0,0.983983,38.0,9,38,11
4,2017,1,10,56.336465,7.0,1.482539,38.0,10,38,5
5,2017,1,13,60.560837,5.0,1.593706,38.0,13,38,3
6,2017,1,19,50.997942,9.0,1.342051,38.0,19,38,4
7,2017,1,20,31.496549,12.0,0.828857,38.0,20,38,9
8,2017,1,21,67.835269,3.0,1.785139,38.0,21,38,10
9,2017,1,22,88.870515,1.0,2.338698,38.0,22,38,1


## 9. 종합

In [21]:
total_result_pd = pd.DataFrame(pd.merge(case_1_result_pd, case_2_result_pd, on=['season_year', 'division', 'score_team_id']), columns = ['season_year', 'division', 'score_team_id', 'case_1_TGP', 'case_1_AGPt', 'case_2_TGP', 'case_2_AGPt', 'case_1_rank', 'case_2_rank'])

total_result_pd = pd.DataFrame(pd.merge(total_result_pd, case_3_result_pd, on=['season_year', 'division', 'score_team_id']), columns = ['season_year', 'division', 'score_team_id', 'case_1_TGP', 'case_1_AGPt', 'case_2_TGP', 'case_2_AGPt', 'case_3_TGP', 'case_3_AGPt', 'case_1_rank', 'case_2_rank', 'case_3_rank'])

total_result_pd = pd.DataFrame(pd.merge(total_result_pd, case_4_result_pd, on=['season_year', 'division', 'score_team_id']), columns = ['season_year', 'division', 'score_team_id', 'case_1_TGP', 'case_1_AGPt', 'case_2_TGP', 'case_2_AGPt', 'case_3_TGP', 'case_3_AGPt', 'case_4_TGP', 'case_4_AGPt', 'case_1_rank', 'case_2_rank', 'case_3_rank', 'case_4_rank'])

total_result_pd = pd.DataFrame(pd.merge(total_result_pd, season_ranking_pd, left_on=['season_year', 'division', 'score_team_id'], right_on=['season_year', 'division', 'team_id']), columns = ['season_year', 'division', 'score_team_id', 'case_1_TGP', 'case_1_AGPt', 'case_2_TGP', 'case_2_AGPt', 'case_3_TGP', 'case_3_AGPt', 'case_4_TGP', 'case_4_AGPt', 'case_1_rank', 'case_2_rank', 'case_3_rank', 'case_4_rank', 'real_rank'])

total_result_pd = pd.DataFrame(pd.merge(total_result_pd, team_info_pd[['team_id', 'team_name']], left_on=['score_team_id'], right_on=['team_id']))

In [22]:
total_result_pd = pd.DataFrame(total_result_pd, columns = ['season_year', 'division', 'score_team_id', 'team_name', 'case_1_TGP', 'case_1_AGPt', 'case_2_TGP', 'case_2_AGPt', 'case_3_TGP', 'case_3_AGPt', 'case_4_TGP', 'case_4_AGPt', 'case_1_rank', 'case_2_rank', 'case_3_rank', 'case_4_rank', 'real_rank', 'case_1_diff', 'case_2_diff', 'case_3_diff', 'case_4_diff'])
total_result_pd.case_1_diff = abs(total_result_pd.case_1_rank - total_result_pd.real_rank)
total_result_pd.case_2_diff = abs(total_result_pd.case_2_rank - total_result_pd.real_rank)
total_result_pd.case_3_diff = abs(total_result_pd.case_3_rank - total_result_pd.real_rank)
total_result_pd.case_4_diff = abs(total_result_pd.case_4_rank - total_result_pd.real_rank)

total_result_pd[(total_result_pd.season_year == 2017) & (total_result_pd.division == 1)]

Unnamed: 0,season_year,division,score_team_id,team_name,case_1_TGP,case_1_AGPt,case_2_TGP,case_2_AGPt,case_3_TGP,case_3_AGPt,case_4_TGP,case_4_AGPt,case_1_rank,case_2_rank,case_3_rank,case_4_rank,real_rank,case_1_diff,case_2_diff,case_3_diff,case_4_diff
12,2017,1,1,강원,88.66,2.333158,56.270895,1.480813,65.4,1.721053,55.688414,1.465485,5.0,2.0,2.0,8.0,6,1.0,4.0,4.0,2.0
35,2017,1,5,대구,72.74,1.914211,57.405094,1.51066,49.26,1.296316,59.874612,1.575648,8.0,1.0,9.0,6.0,8,0.0,7.0,1.0,2.0
61,2017,1,10,서울,85.94,2.261579,48.398626,1.273648,64.68,1.702105,56.336465,1.482539,6.0,3.0,4.0,7.0,5,1.0,2.0,1.0,2.0
74,2017,1,13,수원,101.59,2.673421,42.356752,1.114651,67.56,1.777895,60.560837,1.593706,3.0,6.0,1.0,5.0,3,0.0,3.0,2.0,2.0
79,2017,1,19,울산,72.35,1.903947,29.01185,0.76347,56.6,1.489474,50.997942,1.342051,9.0,12.0,6.0,9.0,4,5.0,8.0,2.0,5.0
84,2017,1,20,인천,38.11,1.002895,45.112972,1.187183,32.74,0.861579,31.496549,0.828857,12.0,5.0,12.0,12.0,9,3.0,4.0,3.0,3.0
89,2017,1,21,전남,84.3,2.218421,33.779276,0.888928,50.8,1.336842,67.835269,1.785139,7.0,10.0,8.0,3.0,10,3.0,0.0,2.0,7.0
94,2017,1,22,전북,132.62,3.49,33.876113,0.891477,65.34,1.719474,88.870515,2.338698,1.0,9.0,3.0,1.0,1,0.0,8.0,2.0,0.0
99,2017,1,23,제주,103.21,2.716053,34.902543,0.918488,57.86,1.522632,81.377712,2.141519,2.0,8.0,5.0,2.0,2,0.0,6.0,3.0,0.0
104,2017,1,25,포항,91.62,2.411053,48.160875,1.267391,55.36,1.456842,64.337498,1.693092,4.0,4.0,7.0,4.0,7,3.0,3.0,0.0,3.0


In [23]:
total_result_group_pd = pd.DataFrame(total_result_pd.groupby(['season_year', 'division']).agg({'case_1_diff': 'sum', 'case_2_diff': 'sum', 'case_3_diff': 'sum', 'case_4_diff': 'sum'}), columns=['case_1_diff', 'case_2_diff', 'case_3_diff', 'case_4_diff', 'case_1_outlier_cnt', 'case_2_outlier_cnt', 'case_3_outlier_cnt', 'case_4_outlier_cnt'])
total_result_group_pd.case_1_outlier_cnt = total_result_group_pd.apply(lambda x: total_result_pd[(total_result_pd.season_year == x.name[0]) & (total_result_pd.division == x.name[1]) & (total_result_pd.case_1_diff > 2)].__len__(), axis=1)
total_result_group_pd.case_2_outlier_cnt = total_result_group_pd.apply(lambda x: total_result_pd[(total_result_pd.season_year == x.name[0]) & (total_result_pd.division == x.name[1]) & (total_result_pd.case_2_diff > 2)].__len__(), axis=1)
total_result_group_pd.case_3_outlier_cnt = total_result_group_pd.apply(lambda x: total_result_pd[(total_result_pd.season_year == x.name[0]) & (total_result_pd.division == x.name[1]) & (total_result_pd.case_3_diff > 2)].__len__(), axis=1)
total_result_group_pd.case_4_outlier_cnt = total_result_group_pd.apply(lambda x: total_result_pd[(total_result_pd.season_year == x.name[0]) & (total_result_pd.division == x.name[1]) & (total_result_pd.case_4_diff > 2)].__len__(), axis=1)
total_result_group_pd

Unnamed: 0_level_0,Unnamed: 1_level_0,case_1_diff,case_2_diff,case_3_diff,case_4_diff,case_1_outlier_cnt,case_2_outlier_cnt,case_3_outlier_cnt,case_4_outlier_cnt
season_year,division,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013,1,24.0,44.0,26.0,20.0,4,8,5,4
2013,2,8.0,20.0,10.0,12.0,1,4,1,1
2013,3,0.0,2.0,1.0,0.0,0,0,0,0
2014,1,18.0,56.0,22.0,22.0,3,10,4,3
2014,2,51.0,75.0,280.0,51.0,6,15,21,9
2014,3,0.0,0.0,0.0,0.0,0,0,0,0
2015,1,16.0,40.0,18.0,16.0,2,8,3,2
2015,2,33.0,75.0,315.0,51.0,3,15,25,12
2015,3,0.0,0.0,0.0,0.0,0,0,0,0
2016,1,26.0,40.0,30.0,30.0,6,8,5,4


### 총평
* CASE 2,3의 경우 실제 랭킹과는 큰 차이가 존재
    * CASE 2의 경우 리드골의 영향력이 너무 커서 리드골이 발생한 팀의 랭킹이 크게 상승하는 현상을 발견
    * CASE 3의 경우 역시 역전골의 영향력이 커서 역전골 발생한 팀의 랭킹이 크게 상승하는 현상을 발견
* CASE 1,4의 경우 상대적으로 정확도는 높음
    * 차이가 발생하는 경우는
        1. 득점을 거의 내지 않고 이기는 경우(1:0 승 등)
        2. 득점을 많이 내고 지는 경우(4:5 패 등)

### 추가 분석 필요 항목

1. 각 CASE 별 랭킹 자체가 얼마나 설득력이 있는 랭킹인지 여부 확인
 * 실제 랭킹은 단순 승점 기준이기때문에 이게 경기력이라는 변수를 얼마나 표현할 수 있는지에 대해서는 전문가의 판단이 필요
2. 득점 기여도에 큰 영향을 조눈 '리드골', '역전골'에 대한 가중치의 적정성 판단 필요
3. '저득점 승리'와 '고득점 패배'와 경기력에 대한 판단 필요

가중치를 승리의 경우 3점, 무승부인 경우 1점의 승점 범위 안에서 부여하는 경우에도 랭킹이 차이가 나는지?


In [57]:
goal_point_norm_pd = pd.DataFrame(goal_points_by_type_pd, columns = list(goal_points_by_type_pd.columns) + ['total_points', 'norm_points'])
goal_point_sum_pd = goal_points_by_type_pd.groupby(['match_id', 'location'])['points'].sum()
goal_point_norm_pd.total_points = goal_point_norm_pd.apply(lambda x: goal_point_sum_pd[x.match_id][x.location], axis=1)
goal_point_norm_pd.winning_flag = np.where(goal_point_norm_pd.winning_flag == 0, 1, np.where(goal_point_norm_pd.winning_flag == 1, 3, 0))
goal_point_norm_pd.norm_points = (goal_point_norm_pd.points / goal_point_norm_pd.total_points) * goal_point_norm_pd.winning_flag
goal_point_norm_pd.head()

Unnamed: 0,match_id,location,time_range,score_player,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag,points,total_points,norm_points
1,2013-1-001,1,2,1,True,False,False,False,False,False,False,False,False,1,1.950617,3.825617,0.509883
2,2013-1-001,0,3,2,False,True,False,False,False,False,False,False,False,1,0.567308,1.400641,0.405034
3,2013-1-001,1,4,3,False,False,False,True,False,False,False,False,False,1,1.875,3.825617,0.490117
5,2013-1-001,0,6,4,False,True,False,False,False,False,False,False,False,1,0.833333,1.400641,0.594966
7,2013-1-002,0,1,5,False,False,False,True,False,False,False,False,False,3,0.525896,0.525896,3.0


In [89]:
sql = """SELECT * FROM player_info"""
player_info = db_conn.select_query(sql)
player_info_pd = pd.DataFrame(player_info)
player_info_pd.head()

player_info_pd[player_info_pd.player_id == 2]

Unnamed: 0,player_id,player_name,player_position
1,2,신진호,


In [96]:
goal_point_norm_by_player_pd = goal_point_norm_pd[(goal_point_norm_pd.match_id.str.split('-').str.get(0) == '2017') & (goal_point_norm_pd.match_id.str.split('-').str.get(1) == '1')].fillna(0).groupby('score_player').agg({'points': 'sum', 'norm_points': 'sum'}).sort_values('norm_points', ascending=False).reset_index()
goal_point_norm_by_player_pd['player_name'] = goal_point_norm_by_player_pd.score_player.apply(lambda x: player_info_pd[player_info_pd.player_id == int(x)].player_name.values[0])
goal_point_norm_by_player_pd.head(20)

Unnamed: 0,score_player,points,norm_points,player_name
0,396,28.610376,18.526174,조나탄
1,185,18.770425,16.959511,양동현
2,1,17.983695,16.553801,데얀
3,706,17.905652,13.311462,에반드로
4,709,10.867832,13.208674,디에고
5,704,16.730423,12.932224,마그노
6,702,14.350865,12.306723,페체신
7,472,12.124931,12.017946,오르샤
8,7,17.088825,11.890773,김신욱
9,737,10.597749,11.594395,주니오


|      | season_year | division | score_player | player_name | match_id | g_fg | g_tg | g_lg | g_cg | total_goal | expected_value | goal_point |
| ---- | ----------- | -------- | ------------ | ----------- | -------- | ---- | ---- | ---- | ---- | ---------- | -------------- | ---------- |
| 1342 | 2017        | 1        | 396          | 조나탄      | 16       | 8.0  | 3.0  | 11.0 | 0.0  | 22.0       | 42.04          | 2.627500   |
| 1288 | 2017        | 1        | 1            | 데얀        | 15       | 7.0  | 2.0  | 8.0  | 2.0  | 19.0       | 33.21          | 2.214000   |
| 1388 | 2017        | 1        | 613          | 자일        | 12       | 6.0  | 1.0  | 9.0  | 0.0  | 16.0       | 32.06          | 2.671667   |
| 1303 | 2017        | 1        | 185          | 양동현      | 17       | 6.0  | 5.0  | 6.0  | 2.0  | 19.0       | 26.98          | 1.587059   |
| 1350 | 2017        | 1        | 449          | 에두        | 11       | 3.0  | 2.0  | 8.0  | 0.0  | 13.0       | 24.37          | 2.215455   |
| 1390 | 2017        | 1        | 616          | 룰리냐      | 13       | 4.0  | 3.0  | 7.0  | 3.0  | 17.0       | 24.23          | 1.863846   |
| 1405 | 2017        | 1        | 704          | 마그노      | 12       | 6.0  | 2.0  | 5.0  | 0.0  | 13.0       | 23.53          | 1.960833   |
| 1311 | 2017        | 1        | 227          | 주민규      | 11       | 4.0  | 5.0  | 5.0  | 3.0  | 17.0       | 21.01          | 1.910000   |
| 1439 | 2017        | 1        | 737          | 주니오      | 8        | 6.0  | 2.0  | 3.0  | 1.0  | 12.0       | 19.72          | 2.465000   |
| 1401 | 2017        | 1        | 7            | 김신욱      | 10       | 2.0  | 1.0  | 7.0  | 0.0  | 10.0       | 19.35          | 1.935000   |

|      | season_year | division | score_player | player_name | match_id | g_fg | g_tg | g_lg | g_cg | total_goal | expected_value | goal_point |
| ---- | ----------- | -------- | ------------ | ----------- | -------- | ---- | ---- | ---- | ---- | ---------- | -------------- | ---------- |
| 1342 | 2017        | 1        | 396          | 조나탄      | 16       | 8.0  | 3.0  | 11.0 | 0.0  | 22.0       | 42.04          | 2.627500   |
| 1288 | 2017        | 1        | 1            | 데얀        | 15       | 7.0  | 2.0  | 8.0  | 2.0  | 19.0       | 33.21          | 2.214000   |
| 1303 | 2017        | 1        | 185          | 양동현      | 17       | 6.0  | 5.0  | 6.0  | 2.0  | 19.0       | 26.98          | 1.587059   |
| 1311 | 2017        | 1        | 227          | 주민규      | 11       | 4.0  | 5.0  | 5.0  | 3.0  | 17.0       | 21.01          | 1.910000   |
| 1390 | 2017        | 1        | 616          | 룰리냐      | 13       | 4.0  | 3.0  | 7.0  | 3.0  | 17.0       | 24.23          | 1.863846   |
| 1388 | 2017        | 1        | 613          | 자일        | 12       | 6.0  | 1.0  | 9.0  | 0.0  | 16.0       | 32.06          | 2.671667   |
| 1350 | 2017        | 1        | 449          | 에두        | 11       | 3.0  | 2.0  | 8.0  | 0.0  | 13.0       | 24.37          | 2.215455   |
| 1405 | 2017        | 1        | 704          | 마그노      | 12       | 6.0  | 2.0  | 5.0  | 0.0  | 13.0       | 23.53          | 1.960833   |
| 1410 | 2017        | 1        | 709          | 디에고      | 13       | 2.0  | 5.0  | 4.0  | 2.0  | 13.0       | 14.72          | 1.132308   |
| 1439 | 2017        | 1        | 737          | 주니오      | 8        | 6.0  | 2.0  | 3.0  | 1.0  | 12.0       | 19.72          | 2.465000   |