In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import collections
import statsmodels.formula.api as sm
import scipy.stats as st
import matplotlib.pyplot  as plt
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as sklearnPCA
import seaborn as sns

plt.style.use('dark_background')
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
sql = """SELECT * FROM season_ranking"""
season_ranking = db_conn.select_query(sql)
season_ranking_pd = pd.DataFrame(season_ranking, columns = ['year', 'division', 'team_id', 'rank'])
season_ranking_pd.columns = ['season_year', 'division', 'team_id', 'real_rank']
season_ranking_pd.head()

Unnamed: 0,season_year,division,team_id,real_rank
0,2013,1,25,1
1,2013,1,19,2
2,2013,1,22,3
3,2013,1,10,4
4,2013,1,13,5


In [3]:
sql = """SELECT * FROM score_line"""
score_line = db_conn.select_query(sql)
score_line_pd = pd.DataFrame(score_line, columns=['match_id', 'season_year', 'division', 'id', 'half_type', 'play_time', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'score_team_id', 'winning_team', 'score_player'])
score_line_pd = score_line_pd[score_line_pd.id > 0]
score_line_pd.head()

Unnamed: 0,match_id,season_year,division,id,half_type,play_time,time_range,home_score,away_score,home_team_id,away_team_id,score_team_id,winning_team,score_player
1,2013-1-001,2013,1,2,1,29,2,1,0,10,25,10,0,1
2,2013-1-001,2013,1,3,1,32,3,1,1,10,25,25,0,2
3,2013-1-001,2013,1,4,2,2,4,2,1,10,25,10,0,3
5,2013-1-001,2013,1,5,2,38,6,2,2,10,25,25,0,4
7,2013-1-002,2013,1,6,1,4,1,0,1,19,5,5,19,5


In [4]:
goal_type_pd = pd.DataFrame(score_line_pd, columns=['match_id', 'season_year', 'division', 'location', 'time_range', 'home_score', 'away_score', 'home_team_id', 'away_team_id', 'score_team_id', 'winning_team', 'score_player', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag'])
goal_type_pd.location = np.where(goal_type_pd.home_team_id == goal_type_pd.score_team_id, 1, 0)
goal_type_pd.g_fg = np.where((goal_type_pd.home_score > 0) & (goal_type_pd.home_score + goal_type_pd.away_score == 1), True, False)
goal_type_pd.g_tg = np.where((goal_type_pd.home_score == goal_type_pd.away_score), True, False)
goal_type_pd.g_og = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score - goal_type_pd.away_score == 1, True, False), np.where(goal_type_pd.away_score - goal_type_pd.home_score == 1, True, False))
goal_type_pd.g_og = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_og)
goal_type_pd.g_lg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, goal_type_pd.home_score > goal_type_pd.away_score, goal_type_pd.away_score > goal_type_pd.home_score)
goal_type_pd.g_lg = np.where(goal_type_pd.g_fg, False, goal_type_pd.g_lg)
goal_type_pd.g_lg = np.where(goal_type_pd.g_og, False, goal_type_pd.g_lg)
goal_type_pd.g_cg = np.where(goal_type_pd.score_team_id == goal_type_pd.home_team_id, np.where(goal_type_pd.home_score < goal_type_pd.away_score, True, False), np.where(goal_type_pd.away_score < goal_type_pd.home_score, True, False))

goal_type_pd.g_stg = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_tg.cumsum()
goal_type_pd.g_stg = np.where(goal_type_pd.g_stg > 1, True, False)
goal_type_pd.g_tg = np.where(goal_type_pd.g_stg, False, goal_type_pd.g_tg)
goal_type_pd.g_sog = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_og.cumsum()
goal_type_pd.g_sog = np.where(goal_type_pd.g_sog > 1, True, False)
goal_type_pd.g_og = np.where(goal_type_pd.g_sog, False, goal_type_pd.g_og)
goal_type_pd.g_slg = goal_type_pd.groupby(['match_id', 'location', 'time_range']).g_lg.cumsum()
goal_type_pd.g_olg = np.where(goal_type_pd.g_slg > 2, True, False)
goal_type_pd.g_slg = np.where((goal_type_pd.g_slg > 1) & (goal_type_pd.g_olg == False), True, False)
goal_type_pd.g_lg = np.where((goal_type_pd.g_slg) | (goal_type_pd.g_olg), False, goal_type_pd.g_lg)
goal_type_pd.winning_flag = np.where(goal_type_pd.winning_team == 0, 0, np.where(goal_type_pd.home_team_id == goal_type_pd.winning_team, 1, 0))

goal_type_pd.head()

Unnamed: 0,match_id,season_year,division,location,time_range,home_score,away_score,home_team_id,away_team_id,score_team_id,winning_team,score_player,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag
1,2013-1-001,2013,1,1,2,1,0,10,25,10,0,1,True,False,False,False,False,False,False,False,False,0
2,2013-1-001,2013,1,0,3,1,1,10,25,25,0,2,False,True,False,False,False,False,False,False,False,0
3,2013-1-001,2013,1,1,4,2,1,10,25,10,0,3,False,False,False,True,False,False,False,False,False,0
5,2013-1-001,2013,1,0,6,2,2,10,25,25,0,4,False,True,False,False,False,False,False,False,False,0
7,2013-1-002,2013,1,0,1,0,1,19,5,5,19,5,False,False,False,True,False,False,False,False,False,1


In [5]:
goal_count_by_type_pd = goal_type_pd.groupby(['location', 'winning_flag', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg'])['match_id'].nunique().reset_index()

goal_count_by_type_pd.g_fg = goal_count_by_type_pd.g_fg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_tg = goal_count_by_type_pd.g_tg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_stg = goal_count_by_type_pd.g_stg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_og = goal_count_by_type_pd.g_og * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_sog = goal_count_by_type_pd.g_sog * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_lg = goal_count_by_type_pd.g_lg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_slg = goal_count_by_type_pd.g_slg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_olg = goal_count_by_type_pd.g_olg * goal_count_by_type_pd.match_id
goal_count_by_type_pd.g_cg = goal_count_by_type_pd.g_cg * goal_count_by_type_pd.match_id

goal_count_by_type_pd = pd.DataFrame(goal_count_by_type_pd.groupby(['location', 'winning_flag', 'time_range']).agg({'g_fg': 'sum', 'g_tg': 'sum', 'g_stg': 'sum', 'g_og': 'sum', 'g_sog': 'sum', 'g_lg': 'sum', 'g_slg': 'sum', 'g_olg': 'sum', 'g_cg': 'sum'}).reset_index(), columns=['location', 'winning_flag', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'total', 'g_fg_probs', 'g_tg_probs', 'g_stg_probs', 'g_og_probs', 'g_sog_probs', 'g_lg_probs', 'g_slg_probs', 'g_olg_probs', 'g_cg_probs'])

goal_count_by_type_pd.total = goal_count_by_type_pd.g_fg + goal_count_by_type_pd.g_tg + goal_count_by_type_pd.g_stg + goal_count_by_type_pd.g_og + goal_count_by_type_pd.g_sog + goal_count_by_type_pd.g_lg + goal_count_by_type_pd.g_slg + goal_count_by_type_pd.g_olg + goal_count_by_type_pd.g_cg
goal_count_by_type_pd.head(6)

Unnamed: 0,location,winning_flag,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,total,g_fg_probs,g_tg_probs,g_stg_probs,g_og_probs,g_sog_probs,g_lg_probs,g_slg_probs,g_olg_probs,g_cg_probs
0,0,0,1,0,8,0,207,0,17,0,0,0,232,,,,,,,,,
1,0,0,2,0,49,0,175,0,39,2,0,2,267,,,,,,,,,
2,0,0,3,0,59,0,189,1,71,5,0,8,333,,,,,,,,,
3,0,0,4,0,66,0,151,1,99,9,0,11,337,,,,,,,,,
4,0,0,5,0,84,2,143,2,90,4,0,14,339,,,,,,,,,
5,0,0,6,0,130,3,244,3,160,21,1,13,575,,,,,,,,,


In [6]:
# location, time_range에서 특정 type의 goal이 발생했을 때 각 경기결과가 나올 확률(조건부확률)

goal_count_by_type_pd.g_fg_probs = goal_count_by_type_pd.apply(lambda x: x.g_fg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_fg.sum(), axis=1)
goal_count_by_type_pd.g_tg_probs = goal_count_by_type_pd.apply(lambda x: x.g_tg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_tg.sum(), axis=1)
goal_count_by_type_pd.g_stg_probs = goal_count_by_type_pd.apply(lambda x: x.g_stg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_stg.sum(), axis=1)
goal_count_by_type_pd.g_og_probs = goal_count_by_type_pd.apply(lambda x: x.g_og / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_og.sum(), axis=1)
goal_count_by_type_pd.g_sog_probs = goal_count_by_type_pd.apply(lambda x: x.g_sog / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_sog.sum(), axis=1)
goal_count_by_type_pd.g_lg_probs = goal_count_by_type_pd.apply(lambda x: x.g_lg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_lg.sum(), axis=1)
goal_count_by_type_pd.g_slg_probs = goal_count_by_type_pd.apply(lambda x: x.g_slg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_slg.sum(), axis=1)
goal_count_by_type_pd.g_olg_probs = goal_count_by_type_pd.apply(lambda x: x.g_olg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_olg.sum(), axis=1)
goal_count_by_type_pd.g_cg_probs = goal_count_by_type_pd.apply(lambda x: x.g_cg / goal_count_by_type_pd[(goal_count_by_type_pd.location == x.location) & (goal_count_by_type_pd.time_range == x.time_range)].g_cg.sum(), axis=1)
goal_count_by_type_pd = goal_count_by_type_pd.fillna(0)

goal_count_by_type_pd[(goal_count_by_type_pd.location == 0) & (goal_count_by_type_pd.winning_flag == 1)]
goal_count_by_type_pd.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  """
  import sys
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,location,winning_flag,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,total,g_fg_probs,g_tg_probs,g_stg_probs,g_og_probs,g_sog_probs,g_lg_probs,g_slg_probs,g_olg_probs,g_cg_probs
0,0,0,1,0,8,0,207,0,17,0,0,0,232,0.0,0.571429,0.0,0.824701,0.0,0.894737,0.0,0.0,0.0
1,0,0,2,0,49,0,175,0,39,2,0,2,267,0.0,0.710145,0.0,0.875,0.0,0.95122,1.0,0.0,0.25
2,0,0,3,0,59,0,189,1,71,5,0,8,333,0.0,0.567308,0.0,0.9,0.5,0.959459,1.0,0.0,0.275862
3,0,0,4,0,66,0,151,1,99,9,0,11,337,0.0,0.647059,0.0,0.872832,1.0,0.970588,1.0,0.0,0.289474
4,0,0,5,0,84,2,143,2,90,4,0,14,339,0.0,0.717949,1.0,0.922581,1.0,0.989011,1.0,0.0,0.304348


In [7]:
def calc_points(goal):
    return (goal.g_fg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_fg_probs.values[0] * 3) +\
    (goal.g_tg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 0)].g_tg_probs.values[0] * 1) +\
    (goal.g_og * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_og_probs.values[0] * 3) +\
    (goal.g_lg * goal_count_by_type_pd[(goal_count_by_type_pd.location == goal.location) & (goal_count_by_type_pd.time_range == goal.time_range) & (goal_count_by_type_pd.winning_flag == 1)].g_lg_probs.values[0] * 3)

In [8]:
%%time

goal_points_by_type_pd = pd.DataFrame(goal_type_pd, columns=['match_id', 'location', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag', 'points'])
goal_points_by_type_pd.points = goal_points_by_type_pd.apply(lambda x: calc_points(x), axis=1)
display(goal_points_by_type_pd.head())

Unnamed: 0,match_id,location,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag,points
1,2013-1-001,1,2,True,False,False,False,False,False,False,False,False,0,1.950617
2,2013-1-001,0,3,False,True,False,False,False,False,False,False,False,0,0.567308
3,2013-1-001,1,4,False,False,False,True,False,False,False,False,False,0,1.875
5,2013-1-001,0,6,False,True,False,False,False,False,False,False,False,0,0.833333
7,2013-1-002,0,1,False,False,False,True,False,False,False,False,False,1,0.525896


CPU times: user 58.7 s, sys: 135 ms, total: 58.9 s
Wall time: 59 s


In [9]:
def get_team_id(x):
    if x.location == 1:
        return score_line_pd[score_line_pd.match_id == x.match_id].home_team_id.unique()[0]
    else:
        return score_line_pd[score_line_pd.match_id == x.match_id].away_team_id.unique()[0]

In [10]:
goal_points_pd = pd.DataFrame(goal_points_by_type_pd, columns= ['year', 'division', 'match_id', 'location', 'team_id', 'time_range', 'g_fg', 'g_tg', 'g_stg', 'g_og', 'g_sog', 'g_lg', 'g_slg', 'g_olg', 'g_cg', 'winning_flag', 'points'])
goal_points_pd.year = goal_points_pd.match_id.str.split('-').str.get(0)
goal_points_pd.division = goal_points_pd.match_id.str.split('-').str.get(1)
goal_points_pd.team_id = goal_points_pd.apply(lambda x: get_team_id(x), axis=1)
goal_points_pd.head()

Unnamed: 0,year,division,match_id,location,team_id,time_range,g_fg,g_tg,g_stg,g_og,g_sog,g_lg,g_slg,g_olg,g_cg,winning_flag,points
1,2013,1,2013-1-001,1,10,2,True,False,False,False,False,False,False,False,False,0,1.950617
2,2013,1,2013-1-001,0,25,3,False,True,False,False,False,False,False,False,False,0,0.567308
3,2013,1,2013-1-001,1,10,4,False,False,False,True,False,False,False,False,False,0,1.875
5,2013,1,2013-1-001,0,25,6,False,True,False,False,False,False,False,False,False,0,0.833333
7,2013,1,2013-1-002,0,5,1,False,False,False,True,False,False,False,False,False,1,0.525896


In [11]:
goal_points_rank_pd = pd.DataFrame(goal_points_pd.groupby(['year', 'division', 'team_id']).agg({'points': 'sum', 'match_id': 'unique'}).reset_index(), columns=['year', 'division', 'team_id', 'points', 'match_id'])
goal_points_rank_pd.columns = ['year', 'division', 'team_id', 'points', 'match_count']
goal_points_rank_pd.match_count = goal_points_rank_pd.match_count.apply(lambda x: x.__len__())
goal_points_rank_pd = pd.DataFrame(goal_points_rank_pd, columns = ['year', 'division', 'team_id', 'points', 'match_count', 'point_rank', 'rank_diff'])
goal_points_rank_pd.point_rank = goal_points_rank_pd.groupby(['year', 'division'])['points'].rank(ascending=False)
goal_points_rank_pd.year = goal_points_rank_pd.year.astype('int64')
goal_points_rank_pd.division = goal_points_rank_pd.division.astype('int64')
goal_points_rank_pd.columns = ['season_year', 'division', 'team_id', 'points', 'match_count', 'point_rank', 'rank_diff']
goal_points_rank_pd = goal_points_rank_pd.merge(season_ranking_pd, how='left', on=['season_year', 'division', 'team_id'])
goal_points_rank_pd.rank_diff = abs(goal_points_rank_pd.real_rank - goal_points_rank_pd.point_rank)
goal_points_rank_pd.head()

Unnamed: 0,season_year,division,team_id,points,match_count,point_rank,rank_diff,real_rank
0,2013,1,1,39.567785,23,12.0,0.0,12.0
1,2013,1,2,51.994787,27,8.0,3.0,11.0
2,2013,1,5,32.692089,22,14.0,1.0,13.0
3,2013,1,6,35.396941,29,13.0,1.0,14.0
4,2013,1,7,57.019619,24,7.0,1.0,6.0


In [12]:
goal_points_rank_pd.groupby(['season_year', 'division']).agg({'rank_diff': ['sum', 'mean']})

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_diff,rank_diff
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
season_year,division,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,20.0,1.428571
2013,2,12.0,1.5
2013,3,0.0,0.0
2014,1,22.0,1.833333
2014,2,14.0,1.555556
2014,3,0.0,0.0
2015,1,16.0,1.333333
2015,2,17.0,1.7
2015,3,0.0,0.0
2016,1,30.0,2.5


In [22]:
display(goal_points_rank_pd[(goal_points_rank_pd.season_year == 2017) & (goal_points_rank_pd.division == 1)].sort_values('team_id'))
display(goal_points_rank_pd[goal_points_rank_pd.rank_diff > 2].__len__())

Unnamed: 0,season_year,division,team_id,points,match_count,point_rank,rank_diff,real_rank
97,2017,1,1,55.688414,32,8.0,2.0,6.0
98,2017,1,4,37.29332,23,11.0,1.0,12.0
99,2017,1,5,59.874612,29,6.0,2.0,8.0
100,2017,1,9,37.391364,25,10.0,1.0,11.0
101,2017,1,10,56.336465,30,7.0,2.0,5.0
102,2017,1,13,60.560837,31,5.0,2.0,3.0
103,2017,1,19,50.997942,28,9.0,5.0,4.0
104,2017,1,20,31.496549,23,12.0,3.0,9.0
105,2017,1,21,67.835269,28,3.0,7.0,10.0
106,2017,1,22,88.870515,32,1.0,0.0,1.0


27

 

| 5 Type + Real Rank | 25       |      |          | 5 Type + Goal Rank | 15       |      |          | 3 Type + Real Rank | 24       |      |          | 3 Type + Time Range | 27       |      |          |
| ------------------ | -------- | ---- | -------- | ------------------ | -------- | ---- | -------- | ------------------ | -------- | ---- | -------- | ------------------- | -------- | ---- | -------- |
| year               | division | sum  | mean     | year               | division | sum  | mean     | year               | division | sum  | mean     | year                | division | sum  | mean     |
| 2013               | 1        | 22   | 1.571429 | 2013               | 1        | 14   | 1        | 2013               | 1        | 20   | 1.428571 | 2013                | 1        | 20   | 1.428571 |
|                    | 2        | 4    | 0.571429 |                    | 2        | 6    | 0.857143 |                    | 2        | 5    | 0.714286 |                     | 2        | 12   | 1.5      |
|                    | 3        | 0    | 0        |                    | 3        | 0    | 0        |                    | 3        | 1    | 0.5      |                     | 3        | 0    | 0        |
| 2014               | 1        | 18   | 1.5      | 2014               | 1        | 11   | 0.916667 | 2014               | 1        | 22   | 1.833333 | 2014                | 1        | 22   | 1.833333 |
|                    | 2        | 15   | 1.666667 |                    | 2        | 10   | 1.111111 |                    | 2        | 11   | 1.222222 |                     | 2        | 14   | 1.555556 |
|                    | 3        | 0    | 0        |                    | 3        | 0    | 0        |                    | 3        | 0    | 0        |                     | 3        | 0    | 0        |
| 2015               | 1        | 14   | 1.166667 | 2015               | 1        | 9    | 0.75     | 2015               | 1        | 18   | 1.5      | 2015                | 1        | 16   | 1.333333 |
|                    | 2        | 14   | 1.4      |                    | 2        | 6    | 0.6      |                    | 2        | 17   | 1.7      |                     | 2        | 17   | 1.7      |
|                    | 3        | 0    | 0        |                    | 3        | 0    | 0        |                    | 3        | 0    | 0        |                     | 3        | 0    | 0        |
| 2016               | 1        | 28   | 2.333333 | 2016               | 1        | 19   | 1.583333 | 2016               | 1        | 36   | 3        | 2016                | 1        | 30   | 2.5      |
|                    | 2        | 19   | 1.9      |                    | 2        | 11   | 1.1      |                    | 2        | 17   | 1.7      |                     | 2        | 24   | 2.4      |
|                    | 3        | 1    | 0.5      |                    | 3        | 1    | 0.5      |                    | 3        | 1    | 0.5      |                     | 3        | 1    | 0.5      |
| 2017               | 1        | 18   | 1.5      | 2017               | 1        | 6    | 0.5      | 2017               | 1        | 18   | 1.5      | 2017                | 1        | 28   | 2.333333 |
|                    | 2        | 12   | 1.2      |                    | 2        | 19   | 1.9      |                    | 2        | 8    | 0.8      |                     | 2        | 14   | 1.4      |
|                    | 3        | 1    | 0.5      |                    | 3        | 1    | 0.5      |                    | 3        | 1    | 0.5      |                     | 3        | 0    | 0        |

In [21]:
goal_point_diff_pd = pd.DataFrame(goal_points_pd.groupby(['year', 'division', 'match_id', 'location']).agg({'points': 'sum', 'winning_flag': 'max'}).reset_index(), columns=['year', 'division', 'match_id', 'location', 'points', 'winning_flag', 'home_points', 'away_points', 'points_diff'])


# goal_point_diff_pd.home_points = goal_point_diff_pd.apply(lambda x: goal_point_diff_pd[(goal_point_diff_pd.match_id == x.match_id) & (goal_point_diff_pd.location == 1)].points, axis=1)
# goal_point_diff_pd.away_points = goal_point_diff_pd.apply(lambda x: goal_point_diff_pd[(goal_point_diff_pd.match_id == x.match_id) & (goal_point_diff_pd.location == 0)].points, axis=1)
# goal_point_diff_pd.points_diff  = goal_point_diff_pd.apply(lambda x: np.where(x.location == 1, x.home_points - x.away_points, x.away_points - x.home_points), axis=1)
goal_point_diff_pd.head()

Unnamed: 0,year,division,match_id,location,points,winning_flag,home_points,away_points,points_diff
0,2013,1,2013-1-001,0,1.400641,0,,,
1,2013,1,2013-1-001,1,3.825617,0,,,
2,2013,1,2013-1-002,0,0.525896,1,,,
3,2013,1,2013-1-002,1,3.53262,1,,,
4,2013,1,2013-1-003,0,0.375,0,,,
