In [1]:
import db_conn
import pandas as pd
import numpy as np
import copy
import collections
import statsmodels.formula.api as sm
import scipy.stats as st
import matplotlib.pyplot  as plt
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as sklearnPCA
import seaborn as sns

plt.style.use('dark_background')
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
sql = """SELECT * FROM score_line"""
score_line = db_conn.select_query(sql)
score_line_pd = pd.DataFrame(score_line)
score_line_pd.head()

Unnamed: 0,assist,assist_player,assist_position,assist_type,away_score,away_team_id,division,goal_type,half_type,home_score,home_team_id,id,lost_gk,lost_team,match_id,own_goal,penalty_kick,play_time,score_direction,score_player,score_position,score_team_id,score_type,season_year,time_range,touch_count,winning_team
0,N,-1.0,-1,-1,0,25,1,-1,1,0,10,0,-1,-1,2013-1-001,N,N,0,-1,0,-1,0,-1,2013,1,-1,0
1,Y,57.0,1,1,0,25,1,0,1,1,10,2,913,25,2013-1-001,N,N,29,3,1,19,10,3,2013,2,-1,0
2,N,,0,0,1,25,1,0,1,1,10,3,605,10,2013-1-001,N,N,32,2,2,12,25,1,2013,3,-1,0
3,Y,57.0,24,2,1,25,1,0,2,2,10,4,913,25,2013-1-001,N,N,2,1,3,18,10,1,2013,4,-1,0
4,-1,-1.0,-1,-1,1,25,1,-1,2,2,10,0,-1,-1,2013-1-001,-1,-1,16,-1,-1,-1,-1,-1,2013,5,-1,0


In [3]:
sql = """SELECT * FROM game_records"""
game_records = db_conn.select_query(sql)
game_records_pd = pd.DataFrame(game_records)
game_records_pd.head()

Unnamed: 0,away_team_id,away_team_score,division,game_date,game_id,game_stadium,game_time,home_team_id,home_team_score,season_year,winning_team
0,25,2,1,2013-03-02,2013-1-001,서울 월드컵,15:00,10,2,2013,0
1,5,1,1,2013-03-02,2013-1-002,울산 문수,14:45,19,2,2013,19
2,23,1,1,2013-03-02,2013-1-003,광양 전용,15:00,21,0,2013,23
3,13,2,1,2013-03-03,2013-1-004,탄천 종합,14:00,12,1,2013,13
4,2,0,1,2013-03-03,2013-1-005,인천 전용,14:00,20,0,2013,0


In [4]:
first_goal_pd = score_line_pd[(score_line_pd.home_score + score_line_pd.away_score == 1) & (score_line_pd.score_team_id > -1)]
first_goal_pd.groupby(['half_type', 'play_time']).agg({'match_id': ['unique', 'count']}).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,match_id,match_id
Unnamed: 0_level_1,Unnamed: 1_level_1,unique,count
half_type,play_time,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,"[2013-1-104, 2013-1-179, 2013-1-207, 2013-1-21...",18
1,1,"[2013-1-126, 2013-1-174, 2013-1-235, 2013-2-04...",24
1,2,"[2013-1-006, 2013-1-098, 2013-1-123, 2013-1-19...",35
1,3,"[2013-1-007, 2013-1-083, 2013-1-117, 2013-1-14...",34
1,4,"[2013-1-002, 2013-1-049, 2013-1-095, 2013-1-11...",36


## 0. 시간대 관련
* 시간대는 전후반 매 10분단위로 구분
* 전반 0-9 / 10-19 / 20-29 / 30-39 / 40-전반종료
* 후반 0-9 / 10-19 / 20-29 / 30-39 / 40-후반종료

## 1. 10분, 10-20분, 20분이후 선제골이 나온 경기의 수
* 데이터 전처리

In [5]:
first_goal_time_range_pd = pd.DataFrame(first_goal_pd.groupby(['half_type', 'play_time']).agg({'match_id': ['unique', 'count']}).reset_index())
first_goal_time_range_pd.columns = ['half_type', 'play_time', 'match_id_list', 'match_count']
first_goal_time_range_pd = pd.DataFrame(first_goal_time_range_pd, columns = ['half_type', 'play_time', 'match_id_list', 'match_count', 'time_range'])
first_goal_time_range_pd.time_range = (first_goal_time_range_pd.play_time / 10).astype('int64')
first_goal_time_range_pd.time_range = np.where(first_goal_time_range_pd.time_range >= 5, 4, first_goal_time_range_pd.time_range)
first_goal_time_range_pd.time_range = np.where(first_goal_time_range_pd.half_type == 1, first_goal_time_range_pd.time_range + 1, first_goal_time_range_pd.time_range + 6)

first_goal_time_range_pd.head()

Unnamed: 0,half_type,play_time,match_id_list,match_count,time_range
0,1,0,"[2013-1-104, 2013-1-179, 2013-1-207, 2013-1-21...",18,1
1,1,1,"[2013-1-126, 2013-1-174, 2013-1-235, 2013-2-04...",24,1
2,1,2,"[2013-1-006, 2013-1-098, 2013-1-123, 2013-1-19...",35,1
3,1,3,"[2013-1-007, 2013-1-083, 2013-1-117, 2013-1-14...",34,1
4,1,4,"[2013-1-002, 2013-1-049, 2013-1-095, 2013-1-11...",36,1


* 결과

In [6]:
first_goal_count_pd = pd.DataFrame(first_goal_time_range_pd.groupby('time_range').agg({'match_count': 'sum'}).reset_index(), columns = ['time_range', 'match_count', 'probs'])
first_goal_count_pd.columns = ['time_range', 'first_goal_count', 'probs']
first_goal_count_pd.probs = first_goal_count_pd.first_goal_count / game_records_pd.__len__()
first_goal_count_pd

Unnamed: 0,time_range,first_goal_count,probs
0,1,333,0.155899
1,2,343,0.160581
2,3,279,0.130618
3,4,240,0.11236
4,5,133,0.062266
5,6,171,0.080056
6,7,142,0.066479
7,8,110,0.051498
8,9,114,0.053371
9,10,69,0.032303


## 2. 10분, 10-20분, 20분이후 선제골이 나온 경기들의 평균 총 득점
* 데이터 전처리

In [7]:
def get_total_goal(x):
    total_goal = 0
    for idx in x:
        gr = game_records_pd[game_records_pd.game_id == idx]
        total_goal = total_goal + (gr.home_team_score + gr.away_team_score).values[0]
    return total_goal

In [8]:
first_goal_match_total_goal_pd = pd.DataFrame(first_goal_time_range_pd[['time_range', 'match_id_list', 'match_count']], columns = ['time_range', 'match_id_list', 'match_count', 'total_goal', 'avg_goal'])

first_goal_match_total_goal_pd.total_goal = first_goal_match_total_goal_pd.match_id_list.apply(lambda x: get_total_goal(x))
first_goal_match_total_goal_pd.avg_goal = first_goal_match_total_goal_pd.total_goal / first_goal_match_total_goal_pd.match_count

first_goal_match_total_goal_pd.head()

Unnamed: 0,time_range,match_id_list,match_count,total_goal,avg_goal
0,1,"[2013-1-104, 2013-1-179, 2013-1-207, 2013-1-21...",18,66,3.666667
1,1,"[2013-1-126, 2013-1-174, 2013-1-235, 2013-2-04...",24,95,3.958333
2,1,"[2013-1-006, 2013-1-098, 2013-1-123, 2013-1-19...",35,118,3.371429
3,1,"[2013-1-007, 2013-1-083, 2013-1-117, 2013-1-14...",34,119,3.5
4,1,"[2013-1-002, 2013-1-049, 2013-1-095, 2013-1-11...",36,153,4.25


* 결과

In [9]:
first_goal_match_total_goal_pd.groupby('time_range').agg({'avg_goal': ['mean', 'max', 'min']})

Unnamed: 0_level_0,avg_goal,avg_goal,avg_goal
Unnamed: 0_level_1,mean,max,min
time_range,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,3.652268,4.25,3.020408
2,3.356559,3.708333,2.717949
3,3.314033,3.791667,2.966667
4,2.984644,3.428571,2.666667
5,2.389644,3.0,1.8
6,2.42719,3.166667,1.833333
7,2.175425,2.466667,1.846154
8,1.602764,1.888889,1.0
9,1.404627,1.6875,1.181818
10,1.111242,1.363636,1.0


## 3. 시간대별 선제골을 넣은 팀이 우승한 비율

In [10]:
first_goal_win_pd = first_goal_pd[first_goal_pd.score_team_id == first_goal_pd.winning_team]

first_goal_win_time_range_pd = pd.DataFrame(first_goal_win_pd.groupby(['half_type', 'play_time']).agg({'match_id': 'count'}).reset_index())
first_goal_win_time_range_pd.columns = ['half_type', 'play_time', 'match_count']
first_goal_win_time_range_pd = pd.DataFrame(first_goal_win_time_range_pd, columns = ['half_type', 'play_time', 'match_count', 'time_range'])
first_goal_win_time_range_pd.time_range = (first_goal_win_time_range_pd.play_time / 10).astype('int64')
first_goal_win_time_range_pd.time_range = np.where(first_goal_win_time_range_pd.time_range >= 5, 4, first_goal_win_time_range_pd.time_range)
first_goal_win_time_range_pd.time_range = np.where(first_goal_win_time_range_pd.half_type == 1, first_goal_win_time_range_pd.time_range + 1, first_goal_win_time_range_pd.time_range + 6)

first_goal_win_time_range_pd.head()

Unnamed: 0,half_type,play_time,match_count,time_range
0,1,0,12,1
1,1,1,15,1
2,1,2,15,1
3,1,3,23,1
4,1,4,25,1


In [11]:
first_goal_win_count_pd = pd.DataFrame(first_goal_win_time_range_pd.groupby('time_range').agg({'match_count': 'sum'}).reset_index(), columns = ['time_range', 'match_count', 'probs'])
first_goal_win_count_pd.columns = ['time_range', 'first_goal_win_count', 'probs']
first_goal_win_count_pd.probs = first_goal_win_count_pd.first_goal_win_count / game_records_pd.__len__()
first_goal_win_count_pd

Unnamed: 0,time_range,first_goal_win_count,probs
0,1,202,0.094569
1,2,212,0.099251
2,3,178,0.083333
3,4,152,0.071161
4,5,93,0.043539
5,6,117,0.054775
6,7,94,0.044007
7,8,87,0.04073
8,9,92,0.043071
9,10,63,0.029494


## 4. 시간대별로 선제골 득점 시 승리한 확률(조건부 확률)

In [12]:
first_goal_win_probs_pd = first_goal_win_count_pd.merge(first_goal_count_pd, how='left', on=['time_range'])
first_goal_win_probs_pd = pd.DataFrame(first_goal_win_probs_pd, columns = ['time_range', 'first_goal_win_count', 'probs_x', 'first_goal_count', 'probs_y', 'win_goal_probs'])
first_goal_win_probs_pd.columns = ['time_range', 'first_goal_win_count', 'first_goal_win_probs', 'first_goal_count', 'first_goal_probs', 'win_goal_probs']
first_goal_win_probs_pd.win_goal_probs = first_goal_win_probs_pd.first_goal_win_probs / first_goal_win_probs_pd.first_goal_probs
first_goal_win_probs_pd

Unnamed: 0,time_range,first_goal_win_count,first_goal_win_probs,first_goal_count,first_goal_probs,win_goal_probs
0,1,202,0.094569,333,0.155899,0.606607
1,2,212,0.099251,343,0.160581,0.618076
2,3,178,0.083333,279,0.130618,0.637993
3,4,152,0.071161,240,0.11236,0.633333
4,5,93,0.043539,133,0.062266,0.699248
5,6,117,0.054775,171,0.080056,0.684211
6,7,94,0.044007,142,0.066479,0.661972
7,8,87,0.04073,110,0.051498,0.790909
8,9,92,0.043071,114,0.053371,0.807018
9,10,63,0.029494,69,0.032303,0.913043
