# Пайплайн <a id='top'></a>

## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from data_transformer import DataTransformer
from models import BoostingModel
from scorer import MoneyScorer

## Load data

In [2]:
raw_train_data = pd.read_csv('data/england.csv')
raw_test_data = pd.read_csv('data/future.csv')

## Preprocess data

In [3]:
def base_data_preprocess(data):
    
    preprocessed_data = data.copy()
    
    preprocessed_data = preprocessed_data.rename(columns={'away_tem':'away_team'})
    preprocessed_data = preprocessed_data.fillna(1.01)
    preprocessed_data.date = pd.to_datetime(preprocessed_data.date, format='%d.%m.%Y')
    preprocessed_data.date = preprocessed_data.date.values.astype(np.int64) // 10 ** 9
    preprocessed_data = preprocessed_data.sort_values(by='date')
    preprocessed_data = preprocessed_data.drop(columns=['link', 'country', 'league_level', 'time'])
    preprocessed_data.loc[preprocessed_data.league == 'premier-league', 'league'] = 'premier-league-2021-2022'
    
    return preprocessed_data

In [4]:
train_data = base_data_preprocess(raw_train_data)

test_data = train_data.tail(20)

In [5]:
train_data.head()

Unnamed: 0,date,day_of_week,season,league,home_team,away_team,home_win_rate,draw_rate,away_win_rate,home_double_chance_rate,away_double_chance_rate,no_draw_rate,total_over_1_rate,total_under_1_rate,total_over_15_rate,total_under_15_rate,total_over_2_rate,total_under_2_rate,total_over_25_rate,total_under_25_rate,total_over_3_rate,total_under_3_rate,total_over_35_rate,total_under_35_rate,both_team_to_score_yes,both_team_to_score_no,home_scored,away_scored
217,1474675200,Saturday,2016,premier-league-2016-2017,Middlesbrough,Tottenham,4.1,3.5,2.08,0.0,0.0,0.0,1.07,7.7,1.33,3.34,1.56,2.5,2.2,1.8,3.1,1.39,3.76,1.27,1.86,1.94,1,2
276,1474675200,Saturday,2016,premier-league-2016-2017,Stoke,West Brom,2.33,3.34,3.52,1.34,1.67,1.37,1.08,7.3,1.35,3.24,1.59,2.42,2.26,1.76,3.22,1.37,3.9,1.26,1.79,2.02,1,1
34,1474675200,Saturday,2016,premier-league-2016-2017,Manchester Utd,Leicester,1.69,4.0,5.9,1.17,2.3,1.29,1.03,11.5,1.22,4.3,1.35,3.34,1.85,2.13,2.36,1.62,2.93,1.41,1.65,2.24,4,1
6,1474675200,Saturday,2016,premier-league-2016-2017,Swansea,Manchester City,9.5,5.8,1.36,3.44,1.08,1.17,1.03,12.5,1.13,6.0,1.19,5.0,1.54,2.77,1.77,2.1,2.24,1.65,1.8,2.0,1,3
190,1474675200,Saturday,2016,premier-league-2016-2017,Bournemouth,Everton,3.54,3.5,2.25,1.72,1.33,1.34,1.05,9.1,1.26,3.84,1.43,2.94,1.98,1.98,2.6,1.52,3.22,1.35,1.68,2.19,1,0


In [6]:
train_data.shape

(1740, 28)

In [7]:
numeric_features = list(train_data.select_dtypes(include=['int', 'float']).columns)
numeric_features.remove('season')
numeric_features

['date',
 'home_win_rate',
 'draw_rate',
 'away_win_rate',
 'home_double_chance_rate',
 'away_double_chance_rate',
 'no_draw_rate',
 'total_over_1_rate',
 'total_under_1_rate',
 'total_over_15_rate',
 'total_under_15_rate',
 'total_over_2_rate',
 'total_under_2_rate',
 'total_over_25_rate',
 'total_under_25_rate',
 'total_over_3_rate',
 'total_under_3_rate',
 'total_over_35_rate',
 'total_under_35_rate',
 'both_team_to_score_yes',
 'both_team_to_score_no',
 'home_scored',
 'away_scored']

In [8]:
date_features = ['date']

In [9]:
train_data['season'] = train_data.season.astype('object')
test_data['season'] = test_data.season.astype('object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['season'] = test_data.season.astype('object')


In [10]:
categorical_features = ['day_of_week', 'season', 'league']
categorical_features

['day_of_week', 'season', 'league']

<a id='transformer'></a>

In [11]:
transformer_context = {'train':train_data,
                       'test':test_data,
                       'cat_features':categorical_features,
                       'num_features':numeric_features
                      }

transformer = DataTransformer(transformer_context)

In [12]:
train, val, test, decode_teams, teams_labels = transformer.run_logic() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.test_data['target'] = self.test_data.apply(_set_target, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.test_data['home_team'] = self.test_data['home_team'].map(self.teams_labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.test_data['away_team'] = self.test_data['away_team

In [13]:
test.head()

Unnamed: 0,date,day_of_week,season,league,home_team,away_team,home_win_rate,draw_rate,away_win_rate,home_double_chance_rate,away_double_chance_rate,no_draw_rate,total_over_1_rate,total_under_1_rate,total_over_15_rate,total_under_15_rate,total_over_2_rate,total_under_2_rate,total_over_25_rate,total_under_25_rate,total_over_3_rate,total_under_3_rate,total_over_35_rate,total_under_35_rate,both_team_to_score_yes,both_team_to_score_no,target,home_current_points,home_current_lose_streak,home_current_win_streak,away_current_points,away_current_lose_streak,away_current_win_streak
1720,1645660800,4,6,5,0,30,1.68,3.77,5.77,1.15,2.28,1.3,1.09,6.94,1.38,3.07,1.63,2.34,2.19,1.72,3.07,1.37,3.86,1.26,2.07,1.76,3,41,0,6,31,0,2
1721,1645747200,0,6,5,22,20,1.5,4.73,6.53,1.13,2.7,1.23,1.04,11.12,1.22,4.3,1.33,3.27,1.72,2.2,2.19,1.71,2.73,1.47,1.79,2.05,3,24,0,1,10,6,0
1722,1645833600,2,6,5,13,26,3.89,3.77,1.96,1.92,1.27,1.3,1.04,10.02,1.21,4.33,1.3,3.5,1.68,2.25,2.13,1.77,2.64,1.5,1.59,2.35,0,17,3,0,32,1,0
1723,1645833600,2,6,5,8,5,1.91,3.4,4.63,1.22,1.94,1.36,1.11,6.29,1.42,2.9,1.73,2.19,2.27,1.65,3.31,1.32,4.1,1.23,2.04,1.79,1,24,0,1,17,0,2
1724,1645833600,2,6,5,3,19,2.42,3.13,3.33,1.36,1.62,1.39,1.13,5.77,1.46,2.76,1.81,2.09,2.42,1.59,3.61,1.28,4.54,1.2,1.99,1.82,0,21,1,0,14,0,0


In [83]:
train.target.value_counts()

3    736
0    539
1    395
Name: target, dtype: int64

In [85]:
print(f"Class_0 rate {train.target.value_counts()[0] / train.shape[0]}")
print(f"Class_1 rate {train.target.value_counts()[1] / train.shape[0]}")
print(f"Class_3 rate {train.target.value_counts()[3] / train.shape[0]}")

Class_0 rate 0.32275449101796405
Class_1 rate 0.23652694610778444
Class_3 rate 0.4407185628742515


In [81]:
corr_df = train.corr()
corr_df.style.background_gradient(cmap='Reds')

Unnamed: 0,date,day_of_week,season,league,home_team,away_team,home_win_rate,draw_rate,away_win_rate,home_double_chance_rate,away_double_chance_rate,no_draw_rate,total_over_1_rate,total_under_1_rate,total_over_15_rate,total_under_15_rate,total_over_2_rate,total_under_2_rate,total_over_25_rate,total_under_25_rate,total_over_3_rate,total_under_3_rate,total_over_35_rate,total_under_35_rate,both_team_to_score_yes,both_team_to_score_no,target,home_current_points,home_current_lose_streak,home_current_win_streak,away_current_points,away_current_lose_streak,away_current_win_streak
date,1.0,0.021807,0.976963,0.990095,-0.051177,-0.068097,0.002039,-0.03302,-0.07273,0.057228,-0.034338,0.128376,0.311701,0.186069,0.024991,-0.02879,0.059618,-0.076708,-0.053482,0.014382,-0.034103,0.011082,-0.026005,0.013408,-0.152982,0.085289,-0.064155,0.05909,-0.023058,0.007125,0.062206,0.027668,-0.019888
day_of_week,0.021807,1.0,0.052371,-0.011002,0.005621,-0.003619,0.05736,0.064887,0.020731,0.059579,0.026353,-0.026181,-0.008785,0.054871,-0.06504,0.046097,-0.052118,0.059982,-0.067642,0.047503,-0.057442,0.054878,-0.06238,0.046792,0.029567,-0.013656,-0.01943,0.224781,0.002831,0.008978,0.235894,-0.042706,0.069876
season,0.976963,0.052371,1.0,0.951845,-0.047521,-0.065409,0.007752,-0.033145,-0.073475,0.058262,-0.036834,0.120898,0.300549,0.173452,0.035205,-0.041827,0.070993,-0.08314,-0.040266,0.001058,-0.020393,0.001663,-0.012395,0.00028,-0.133184,0.069578,-0.062796,0.164226,-0.021152,0.009289,0.167202,0.020368,-0.012758
league,0.990095,-0.011002,0.951845,1.0,-0.052761,-0.06933,-0.001034,-0.039793,-0.073365,0.053794,-0.036833,0.132974,0.322371,0.186183,0.02932,-0.034136,0.061979,-0.085875,-0.048299,0.008469,-0.030713,0.002764,-0.022203,0.008055,-0.154582,0.082923,-0.064394,-0.048233,-0.030104,0.000365,-0.046233,0.024546,-0.02635
home_team,-0.051177,0.005621,-0.047521,-0.052761,1.0,-0.030865,0.016482,-0.027856,-0.024609,0.017747,-0.028142,0.019176,0.024135,-0.025028,0.044072,-0.056048,0.027786,-0.049072,0.05551,-0.057149,0.040646,-0.058309,0.049235,-0.059337,0.041976,-0.043998,-0.023362,-0.022245,-0.015507,0.002917,-0.003712,-0.040629,0.038215
away_team,-0.068097,-0.003619,-0.065409,-0.06933,-0.030865,1.0,-0.015631,0.011271,0.02214,-0.026413,0.021592,-0.03264,0.006168,-0.030451,0.019596,-0.022815,0.028116,-0.015523,0.03582,-0.025613,0.040859,-0.015691,0.039007,-0.028549,0.051678,-0.083238,0.005688,0.001636,0.019045,0.05451,-0.034505,0.027589,-0.015635
home_win_rate,0.002039,0.05736,0.007752,-0.001034,0.016482,-0.015631,1.0,0.101069,-0.461062,0.951987,-0.468402,-0.096392,-0.02812,0.092797,-0.125123,0.077863,-0.112835,0.071304,-0.125748,0.086422,-0.118008,0.07431,-0.122545,0.076055,0.002869,-0.014161,-0.359606,-0.175854,0.103949,-0.163237,0.332505,-0.184473,0.359937
draw_rate,-0.03302,0.064887,-0.033145,-0.039793,-0.027856,0.011271,0.101069,1.0,0.80811,0.063773,0.807196,-0.449266,-0.372388,0.269764,-0.575183,0.818299,-0.563346,0.816592,-0.654019,0.830328,-0.608906,0.802104,-0.633281,0.809816,0.384922,-0.278718,0.171654,0.277145,-0.150551,0.28148,0.014799,0.01677,0.045889
away_win_rate,-0.07273,0.020731,-0.073475,-0.073365,-0.024609,0.02214,-0.461062,0.80811,1.0,-0.484672,0.9819,-0.326597,-0.277304,0.139036,-0.350197,0.580643,-0.347238,0.5937,-0.405374,0.5859,-0.371648,0.574097,-0.387046,0.576316,0.481389,-0.3803,0.348965,0.316285,-0.186679,0.325823,-0.19844,0.126053,-0.172184
home_double_chance_rate,0.057228,0.059579,0.058262,0.053794,0.017747,-0.026413,0.951987,0.063773,-0.484672,1.0,-0.458367,0.112218,-0.006418,0.116601,-0.129084,0.07616,-0.115166,0.066386,-0.13846,0.087523,-0.129561,0.076936,-0.136041,0.077048,-0.059628,0.048136,-0.347195,-0.164914,0.099051,-0.160879,0.332846,-0.176568,0.360117


In [82]:
corr_df[['target']].style.background_gradient(cmap='Reds')

Unnamed: 0,target
date,-0.064155
day_of_week,-0.01943
season,-0.062796
league,-0.064394
home_team,-0.023362
away_team,0.005688
home_win_rate,-0.359606
draw_rate,0.171654
away_win_rate,0.348965
home_double_chance_rate,-0.347195


# Проверить:

- мультиколлинеарность
- forward/backward selection
- 


# Переход к [модели](#model)

## Generating some features and testing hypotheses

# Оставим только "не-коэффициенты" и декодируем названия команд 

In [46]:
non_coef_features = ['date',
                     'time',
                     'day_of_week',
                     'league',
                     'home_team',
                     'away_team',
                     'home_scored',
                     'away_scored',
                     'target'                    
                    ]

train_with_names = train[non_coef_features].copy()

train_with_names.home_team = train_with_names.home_team.map(decode_teams)
train_with_names.away_team = train_with_names.away_team.map(decode_teams)

train_with_names.head()

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target
217,2016-09-24,14:00,Saturday,premier-league-2016-2017,Middlesbrough,Tottenham,1,2,0
276,2016-09-24,14:00,Saturday,premier-league-2016-2017,Stoke,West Brom,1,1,1
34,2016-09-24,11:30,Saturday,premier-league-2016-2017,Manchester Utd,Leicester,4,1,3
6,2016-09-24,14:00,Saturday,premier-league-2016-2017,Swansea,Manchester City,1,3,0
190,2016-09-24,14:00,Saturday,premier-league-2016-2017,Bournemouth,Everton,1,0,3


# ПЕРЕДЕЛАТЬ! Считаем сезонные :
- средние (пропущенные и забитые)
- общие (количество очков, забитые, пропущенные)

In [42]:
# home_team_season_data = train_with_names.groupby(['home_team', 'league'], as_index=False) \
#                         .agg(home_scored_by_season = ('home_scored','sum'), 
#                             home_missed_by_season = ('away_scored','sum'),
# #                             total_points = ('target', 'sum'),
#                             home_mean_scored_season = ('home_scored', 'mean'),
#                             home_mean_missed_season = ('away_scored', 'mean')                                 
#                             )

# away_team_season_data = train_with_names.groupby(['away_team', 'league'], as_index=False) \
#                         .agg(away_scored_by_season = ('away_scored','sum'), 
#                             away_missed_by_season = ('home_scored','sum'),
# #                                  total_points = ('target', 'sum'),
#                             away_mean_scored_season = ('away_scored', 'mean'),
#                             away_mean_missed_season = ('home_scored', 'mean')                                 
#                             )

# away_team_season_data

Unnamed: 0,away_team,league,away_scored_by_season,away_missed_by_season,away_mean_scored_season,away_mean_missed_season
0,Arsenal,premier-league-2016-2017,28,17,2.153846,1.307692
1,Arsenal,premier-league-2017-2018,18,27,1.125000,1.687500
2,Arsenal,premier-league-2018-2019,31,34,1.722222,1.888889
3,Arsenal,premier-league-2019-2020,13,16,1.000000,1.230769
4,Arsenal,premier-league-2020-2021,25,15,1.470588,0.882353
...,...,...,...,...,...,...
115,West Ham,premier-league-2021-2022,14,11,1.400000,1.100000
116,Wolves,premier-league-2018-2019,17,19,1.062500,1.187500
117,Wolves,premier-league-2019-2020,21,21,1.312500,1.312500
118,Wolves,premier-league-2020-2021,14,27,0.777778,1.500000


In [45]:
# train_with_names = train_with_names.merge(home_team_season_data, how='left', on=['home_team', 'league'])
# train_with_names = train_with_names.merge(away_team_season_data, how='left', on=['away_team', 'league'])

# train_with_names.head()

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target,home_scored_by_season,home_missed_by_season,home_mean_scored_season,home_mean_missed_season,away_scored_by_season,away_missed_by_season,away_mean_scored_season,away_mean_missed_season
0,2016-09-24,14:00,Saturday,premier-league-2016-2017,Middlesbrough,Tottenham,1,2,0,15,20,0.9375,1.25,34,14,2.125,0.875
1,2016-09-24,14:00,Saturday,premier-league-2016-2017,Stoke,West Brom,1,1,1,19,14,1.357143,1.0,13,23,0.866667,1.533333
2,2016-09-24,11:30,Saturday,premier-league-2016-2017,Manchester Utd,Leicester,4,1,3,20,9,1.333333,0.6,12,28,0.8,1.866667
3,2016-09-24,14:00,Saturday,premier-league-2016-2017,Swansea,Manchester City,1,3,0,20,28,1.333333,1.866667,33,20,2.2,1.333333
4,2016-09-24,14:00,Saturday,premier-league-2016-2017,Bournemouth,Everton,1,0,3,28,22,2.0,1.571429,13,24,0.866667,1.6


# ПЕРЕДЕЛАТЬ! Личные встречи за все время забитые/пропущенные

In [47]:
# personal_battles = train_with_names.groupby(['home_team', 'away_team'], as_index=False) \
#                         .agg(total_home_scored = ('home_scored', 'sum'),
#                             total_home_missed = ('away_scored', 'sum'),
                            
#                             )

# personal_battles

# Подсчет текущих очков и вин/луз стриков по сезонам

In [48]:
query = '((home_team == @team) | (away_team == @team)) & (league == @season)'

def calculate_win_streak(actual_win_streak: int, match_result: int) -> int:
    
    new_win_streak = actual_win_streak
    
    if match_result == 3:
        
        new_win_streak += 1
    
    else:
        
        new_win_streak = 0
        
    return new_win_streak

def calculate_lose_streak(actual_lose_streak: int, match_result: int) -> int:
    
    new_lose_streak = actual_lose_streak
    
    if match_result == 0:
        
        new_lose_streak += 1
    
    else:
        
        new_lose_streak = 0
        
    return new_lose_streak


train_with_current_points = train_with_names.copy()

for season in train_with_current_points.league.unique():
    
    for team in train_with_current_points.home_team.unique():    

        current_points = 0
        current_win_streak = 0
        current_lose_streak = 0

        team_season_data = train_with_current_points.query(query)

        for idx in team_season_data.index:
            
#             if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                 print(f"Match of {team_season_data.loc[idx, 'date']}\n"
#                          f"Home team: {team_season_data.loc[idx, 'home_team']}, "
#                          f"away team: {team_season_data.loc[idx, 'away_team']}"
#                          )

            if team_season_data.loc[idx, 'home_team'] == team:

                train_with_current_points.loc[idx, 'home_current_points'] = current_points

                current_points += team_season_data.loc[idx, 'target']

#                 if season == 'premier-league-2021-2022' and team == 'Arsenal':
    
#                     print(f"result is {team_season_data.loc[idx, 'target']} points for home team {team_season_data.loc[idx, 'home_team']}\n"
#                          f"Match score {team_season_data.loc[idx, 'home_scored']}:{team_season_data.loc[idx, 'away_scored']}")

                train_with_current_points.loc[idx, 'home_current_lose_streak'] = current_lose_streak

                train_with_current_points.loc[idx, 'home_current_win_streak'] = current_win_streak

                current_lose_streak = calculate_lose_streak(current_lose_streak, team_season_data.loc[idx, 'target'])

                current_win_streak = calculate_win_streak(current_win_streak, team_season_data.loc[idx, 'target'])

            else:

                train_with_current_points.loc[idx, 'away_current_points'] = current_points

                home = team_season_data.loc[idx, 'home_scored']
                away = team_season_data.loc[idx, 'away_scored']

                away_match_score = 3 if home < away else 1 if home == away else 0

                current_points += away_match_score
                
#                 if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                     print(f"result is {away_match_score} points for away team {team_season_data.loc[idx, 'away_team']}\n"
#                          f"Match score {home}:{away}")

                train_with_current_points.loc[idx, 'away_current_lose_streak'] = current_lose_streak

                train_with_current_points.loc[idx, 'away_current_win_streak'] = current_win_streak

                current_lose_streak = calculate_lose_streak(current_lose_streak, away_match_score)

                current_win_streak = calculate_win_streak(current_win_streak, away_match_score)
            
#             if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                 print('______________________________________________________________ \n')

train_with_current_points.home_current_points = train_with_current_points.home_current_points.astype(int)
train_with_current_points.away_current_points = train_with_current_points.away_current_points.astype(int)
train_with_current_points.away_current_win_streak = train_with_current_points.away_current_win_streak.astype(int)
train_with_current_points.away_current_lose_streak = train_with_current_points.away_current_lose_streak.astype(int)
train_with_current_points.home_current_win_streak = train_with_current_points.home_current_win_streak.astype(int)
train_with_current_points.home_current_lose_streak = train_with_current_points.home_current_lose_streak.astype(int)

In [49]:
train_with_current_points[(train_with_current_points.home_team == 'Manchester City')|(train_with_current_points.away_team == 'Manchester City')].tail(15)

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target,home_current_points,home_current_lose_streak,home_current_win_streak,away_current_points,away_current_lose_streak,away_current_win_streak
1698,2021-10-03,15:30,Sunday,premier-league-2021-2022,Liverpool,Manchester City,2,2,1,14,0,0,13,0,1
1702,2021-10-16,14:00,Saturday,premier-league-2021-2022,Manchester City,Burnley,2,0,3,14,0,0,3,0,0
1552,2021-10-23,16:30,Saturday,premier-league-2021-2022,Brighton,Manchester City,1,4,0,15,0,0,17,0,1
1550,2021-10-30,14:00,Saturday,premier-league-2021-2022,Manchester City,Crystal Palace,0,2,0,20,0,2,9,0,0
1557,2021-11-06,12:30,Saturday,premier-league-2021-2022,Manchester Utd,Manchester City,0,2,0,17,0,1,20,1,0
1631,2021-11-21,14:00,Sunday,premier-league-2021-2022,Manchester City,Everton,3,0,3,23,0,1,15,0,0
1647,2021-11-28,14:00,Sunday,premier-league-2021-2022,Manchester City,West Ham,2,1,3,26,0,2,23,1,0
1562,2021-12-01,20:15,Wednesday,premier-league-2021-2022,Aston Villa,Manchester City,1,2,0,16,0,2,29,0,3
1520,2021-12-04,17:30,Saturday,premier-league-2021-2022,Watford,Manchester City,1,3,0,13,2,0,32,0,4
1619,2021-12-11,12:30,Saturday,premier-league-2021-2022,Manchester City,Wolves,1,0,3,35,0,5,21,1,0


In [51]:
train_with_names = train_with_names.merge(train_with_current_points, how='left')
train_with_names.sample(5)

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target,home_current_points,home_current_lose_streak,home_current_win_streak,away_current_points,away_current_lose_streak,away_current_win_streak
1265,2020-11-08,16:30,Sunday,premier-league-2020-2021,Manchester City,Liverpool,1,1,1,11,0,1,16,0,2
167,2017-01-22,16:30,Sunday,premier-league-2016-2017,Chelsea,Hull,2,0,3,42,0,1,9,0,1
1316,2020-12-17,18:00,Thursday,premier-league-2020-2021,Aston Villa,Burnley,0,0,1,18,0,1,9,0,1
1521,2021-08-14,14:00,Saturday,premier-league-2021-2022,Leicester,Wolves,1,0,3,0,0,0,0,0,0
1365,2021-01-17,14:00,Sunday,premier-league-2020-2021,Sheffield Utd,Tottenham,1,3,0,5,0,1,30,0,0


# <a id='model'></a> Model  
[go to transformer](#transformer)

[go to top](#top)

In [14]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

In [15]:
categorical_features.append('home_team')
categorical_features.append('away_team')
categorical_features

['day_of_week', 'season', 'league', 'home_team', 'away_team']

In [16]:
cat_features = list(categorical_features)

model_data = {
    'train':train,
    'val':val,
    'test':test,
    'target':'target',
    'cat_features':cat_features
}

In [17]:
X_test = test.drop(columns=['target'])
y_test = test.target

In [87]:
model_params = {'n_estimators':1000,
                'learning_rate':0.01,
                'loss_function':'MultiClass',
                'task_type':'GPU',
                'verbose':250
                }

model = BoostingModel(params=model_params, data=model_data)

model.fit()
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)

0:	learn: 1.0963448	test: 1.0948318	best: 1.0948318 (0)	total: 14.6ms	remaining: 14.5s
250:	learn: 0.8858045	test: 0.8869384	best: 0.8869384 (250)	total: 1.59s	remaining: 4.75s
500:	learn: 0.8322711	test: 0.8761343	best: 0.8761343 (500)	total: 3.11s	remaining: 3.09s
750:	learn: 0.7866787	test: 0.8765372	best: 0.8742498 (684)	total: 4.57s	remaining: 1.51s
999:	learn: 0.7463874	test: 0.8839453	best: 0.8742498 (684)	total: 5.98s	remaining: 0us
bestTest = 0.8742498016
bestIteration = 684
Shrink model to first 685 iterations.


# Оценка качества

In [20]:
print(f'Validation ACCURACY SCORE: {accuracy_score(y_test, preds_class)}')

Validation ACCURACY SCORE: 0.6


In [21]:
print(f'Validation ROC AUC SCORE: {roc_auc_score(y_test, preds_proba, multi_class="ovr")}')

Validation ROC AUC SCORE: 0.7584175084175083


In [89]:
train_preds_class = model.model.predict(train.drop(columns=['target']))

In [93]:
train_preds_df = pd.DataFrame()
train_preds_df['pred_class'] = train_preds_class.ravel()

In [98]:
print('True target rate: ')
print(f"Class_0 rate {train.target.value_counts()[0] / train.shape[0]}")
print(f"Class_1 rate {train.target.value_counts()[1] / train.shape[0]}")
print(f"Class_3 rate {train.target.value_counts()[3] / train.shape[0]}")

True target rate: 
Class_0 rate 0.32275449101796405
Class_1 rate 0.23652694610778444
Class_3 rate 0.4407185628742515


In [97]:
print('Predict target rate:')
print(f"Class_0 rate {train_preds_df.pred_class.value_counts()[0] / train_preds_df.shape[0]}")
print(f"Class_1 rate {train_preds_df.pred_class.value_counts()[1] / train_preds_df.shape[0]}")
print(f"Class_3 rate {train_preds_df.pred_class.value_counts()[3] / train_preds_df.shape[0]}")

Predict target rate:
Class_0 rate 0.3461077844311377
Class_1 rate 0.05568862275449102
Class_3 rate 0.5982035928143713


In [22]:
results = pd.DataFrame()

results['true'] = y_test
results['pred_class'] = preds_class
results['score_0'] = preds_proba[:,0]
results['score_1'] = preds_proba[:,1]
results['score_3'] = preds_proba[:,2]

results

Unnamed: 0,true,pred_class,score_0,score_1,score_3
1720,3,3,0.200003,0.259659,0.540338
1721,3,3,0.143805,0.225741,0.630454
1722,0,0,0.527448,0.200994,0.271558
1723,1,3,0.271509,0.309225,0.419266
1724,0,3,0.247046,0.330031,0.422922
1725,1,3,0.12557,0.207403,0.667027
1726,0,0,0.707343,0.149805,0.142853
1727,0,3,0.212374,0.371285,0.416341
1728,3,3,0.280423,0.283762,0.435816
1729,0,3,0.296087,0.345812,0.358101


In [99]:
feature_importance = model.get_feature_importances()
feature_importance_df = pd.DataFrame()

feature_importance_df['feature'] = X_test.columns
feature_importance_df['importance'] = feature_importance

feature_importance_df.sort_values(by='importance', ascending=False).head(15)

Unnamed: 0,feature,importance
8,away_win_rate,11.249222
4,home_team,9.374967
6,home_win_rate,8.29396
5,away_team,7.318116
10,away_double_chance_rate,7.075056
7,draw_rate,4.757542
3,league,3.924642
2,season,3.814146
0,date,3.538609
11,no_draw_rate,3.489918


In [70]:
test_predictions = pd.DataFrame()
test_predictions['home_team'] = X_test.home_team.map(decode_teams)
test_predictions['away_team'] = X_test.away_team.map(decode_teams)
test_predictions['home_win_proba'] = preds_proba[:,0]
test_predictions['draw_proba'] = preds_proba[:,1]
test_predictions['away_win_proba'] = preds_proba[:,2]
test_predictions['home_win_rate'] = X_test.home_win_rate
test_predictions['draw_rate'] = X_test.draw_rate
test_predictions['away_win_rate'] = X_test.away_win_rate
test_predictions['result'] = y_test
test_predictions['predict'] = preds_class

In [27]:
test_predictions

Unnamed: 0,home_team,away_team,home_win_proba,draw_proba,away_win_proba,home_win_rate,draw_rate,away_win_rate,result,predict
1720,Arsenal,Wolves,0.200003,0.259659,0.540338,1.68,3.77,5.77,3,3
1721,Southampton,Norwich,0.143805,0.225741,0.630454,1.5,4.73,6.53,3,3
1722,Leeds,Tottenham,0.527448,0.200994,0.271558,3.89,3.77,1.96,0,0
1723,Crystal Palace,Burnley,0.271509,0.309225,0.419266,1.91,3.4,4.63,1,3
1724,Brentford,Newcastle,0.247046,0.330031,0.422922,2.42,3.13,3.33,0,3
1725,Manchester Utd,Watford,0.12557,0.207403,0.667027,1.38,5.18,8.5,1,3
1726,Everton,Manchester City,0.707343,0.149805,0.142853,11.45,6.37,1.27,0,0
1727,Brighton,Aston Villa,0.212374,0.371285,0.416341,2.25,3.26,3.53,0,3
1728,West Ham,Wolves,0.280423,0.283762,0.435816,1.93,3.37,4.57,3,3
1729,Burnley,Leicester,0.296087,0.345812,0.358101,3.03,3.32,2.5,0,3


# Считаем ROI - "коэффициент выгодности ставки"

In [40]:
class ROIChecker():
    def __init__(self, predictions):
        self.predictions = predictions
    
    def get_roi(self):
        
        self.predictions['home_win_ROI'] = self.predictions.home_win_rate * self.predictions.home_win_proba - 1
        self.predictions['away_win_ROI'] = self.predictions.away_win_rate * self.predictions.away_win_proba - 1
        self.predictions['draw_ROI'] = self.predictions.draw_rate * self.predictions.draw_proba - 1
        
        return self.predictions

In [41]:
checker = ROIChecker(test_predictions)
roi_info = checker.get_roi()

In [42]:
roi_info.head()

Unnamed: 0,home_team,away_team,home_win_proba,draw_proba,away_win_proba,home_win_rate,draw_rate,away_win_rate,result,predict,home_win_ROI,away_win_ROI,draw_ROI
1720,Arsenal,Wolves,0.200003,0.259659,0.540338,1.68,3.77,5.77,3,3,-0.663995,2.117749,-0.021084
1721,Southampton,Norwich,0.143805,0.225741,0.630454,1.5,4.73,6.53,3,3,-0.784293,3.116867,0.067754
1722,Leeds,Tottenham,0.527448,0.200994,0.271558,3.89,3.77,1.96,0,0,1.051772,-0.467746,-0.242253
1723,Crystal Palace,Burnley,0.271509,0.309225,0.419266,1.91,3.4,4.63,1,3,-0.481419,0.941202,0.051367
1724,Brentford,Newcastle,0.247046,0.330031,0.422922,2.42,3.13,3.33,0,3,-0.402148,0.408332,0.032998


In [45]:
def explain_roi_info(roi_info):
    roi_cols = ['home_win_ROI', 'away_win_ROI', 'draw_ROI']
    best_roi_df = pd.DataFrame()
    for index, row in roi_info.iterrows():
        print(f"Match #{index+1}: {row.home_team} vs {row.away_team}")
        
        max_roi = np.max(roi_info.loc[index, roi_cols])
        current_choice = 'home_win_ROI'
        
        for col in roi_cols:
            if row[col] == max_roi:
                current_choice = col
        current_choice = ' '.join(current_choice.split('_')[:1])
        
        if max_roi > 0:
            print(f"Maximal ROI = {max_roi*100}% on {current_choice}")
        
        else:
            print("Нет положительного ROI")
            
        print('_______________________________ \n')
        
        best_roi_df.loc[index, 'home_team'] = row.home_team
        best_roi_df.loc[index, 'away_team'] = row.away_team
        best_roi_df.loc[index, 'best_ROI'] = max_roi
        best_roi_df.loc[index, 'choice'] = current_choice
        best_roi_df.loc[index, 'home_win_rate'] = row.home_win_rate
        best_roi_df.loc[index, 'draw_rate'] = row.draw_rate
        best_roi_df.loc[index, 'away_win_rate'] = row.away_win_rate
        best_roi_df.loc[index, 'result'] = row.result
        best_roi_df.loc[index, 'predict'] = row.predict

        
    return best_roi_df

In [46]:
best_roi_df = explain_roi_info(roi_info)

Match #1721: Arsenal vs Wolves
Maximal ROI = 211.77489097258905% on away
_______________________________ 

Match #1722: Southampton vs Norwich
Maximal ROI = 311.68670343368774% on away
_______________________________ 

Match #1723: Leeds vs Tottenham
Maximal ROI = 105.17719395641758% on home
_______________________________ 

Match #1724: Crystal Palace vs Burnley
Maximal ROI = 94.12016590726147% on away
_______________________________ 

Match #1725: Brentford vs Newcastle
Maximal ROI = 40.833157784895114% on away
_______________________________ 

Match #1726: Manchester Utd vs Watford
Maximal ROI = 466.97254428741275% on away
_______________________________ 

Match #1727: Everton vs Manchester City
Maximal ROI = 709.9074276470485% on home
_______________________________ 

Match #1728: Brighton vs Aston Villa
Maximal ROI = 46.96849363017157% on away
_______________________________ 

Match #1729: West Ham vs Wolves
Maximal ROI = 99.1677241899261% on away
_______________________________ 


In [47]:
best_roi_df

Unnamed: 0,home_team,away_team,best_ROI,choice,home_win_rate,draw_rate,away_win_rate,result,predict
1720,Arsenal,Wolves,2.117749,away,1.68,3.77,5.77,3.0,3.0
1721,Southampton,Norwich,3.116867,away,1.5,4.73,6.53,3.0,3.0
1722,Leeds,Tottenham,1.051772,home,3.89,3.77,1.96,0.0,0.0
1723,Crystal Palace,Burnley,0.941202,away,1.91,3.4,4.63,1.0,3.0
1724,Brentford,Newcastle,0.408332,away,2.42,3.13,3.33,0.0,3.0
1725,Manchester Utd,Watford,4.669725,away,1.38,5.18,8.5,1.0,3.0
1726,Everton,Manchester City,7.099074,home,11.45,6.37,1.27,0.0,0.0
1727,Brighton,Aston Villa,0.469685,away,2.25,3.26,3.53,0.0,3.0
1728,West Ham,Wolves,0.991677,away,1.93,3.37,4.57,3.0,3.0
1729,Burnley,Leicester,0.148096,draw,3.03,3.32,2.5,0.0,3.0


In [67]:
def money_score(best_roi_df, bet=100):
    results = {'home':0, 'draw':1, 'away':2}
    cols = {'home':'home_win_rate', 'draw':'draw_rate', 'away':'away_win_rate'}
    profit = 0
    skipped_bets = 0
    accepted_bets = 0
    
    for index, row in best_roi_df.iterrows():
        if row[cols[row.choice]] > 100:
                skipped_bets += 1
        else:
            accepted_bets += 1

            if results[row.choice] == row.result:

                current_profit = bet * (row[cols[row.choice]] - 1)
                profit += current_profit
                print(f'Match {row.home_team} vs {row.away_team}')
                print(row.choice, row[cols[row.choice]], current_profit)
                print(f'Match score ')
                print('_____________________________________________________\n')
            else:
                profit -= bet
    return profit, skipped_bets, accepted_bets
    

In [68]:
score = money_score(best_roi_df, 100)

Match Leeds vs Tottenham
home 3.89 289.0
_____________________________________________________

Match Everton vs Manchester City
home 11.45 1045.0
_____________________________________________________

Match Burnley vs Chelsea
home 7.53 653.0
_____________________________________________________

Match Watford vs Arsenal
home 6.86 586.0
_____________________________________________________



In [62]:
print(f'Skipped bets: {score[1]} \nAccepted bets: {score[2]} \nProfit: {score[0]}$')

Skipped bets: 0 
Accepted bets: 20 
Profit: 973.0$


# [НАВЕРХ](#top)