# Пайплайн <a id='top'></a>

## Imports

In [2]:
import yaml

import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from data_transformer import DataTransformer
from models import BoostingModel
from scorer import MoneyScorer

2022-04-15 04:01:30.032354: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-15 04:01:30.032375: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# PATHS

In [3]:
DATA_PATH = 'data/all_england_v2.2.csv'
FEATURES_PATH = 'data/features.yaml'

## Load data

In [4]:
raw_train_data = pd.read_csv(DATA_PATH)

with open(FEATURES_PATH) as f:
    all_features_dict = yaml.safe_load(f)

## Preprocess data

In [5]:
def base_data_preprocess(data):
    
    preprocessed_data = data.copy()
    
    preprocessed_data = preprocessed_data.fillna(1.01)
    preprocessed_data['day'] = pd.to_datetime(preprocessed_data.date, format='%d.%m.%Y')
    preprocessed_data.date = preprocessed_data.day.values.astype(np.int64) // 10 ** 9
    preprocessed_data = preprocessed_data.sort_values(by='date')
    preprocessed_data = preprocessed_data.drop(columns=['link', 'country', 'time', 'month'])
    preprocessed_data.loc[preprocessed_data.league == 'premier-league', 'league'] = 'premier-league-2021-2022'
    preprocessed_data.loc[preprocessed_data.league == 'championship', 'league'] = 'championship-2021-2022'
    preprocessed_data.loc[:, 'season'] = preprocessed_data.apply(set_season, axis=1)
    preprocessed_data.loc[:, 'league'] = preprocessed_data.apply(set_league, axis=1)
    
    # ТЕКУЩАЯ ВЕРСИЯ
    preprocessed_data.loc[preprocessed_data.league == 'premier-league', 'league_level'] = 1


    return preprocessed_data

In [6]:
def set_league(row):
    l = row.league
    l = l.split('-')[:-2:1]
    return "-".join(l)
    
def set_season(row):
    s = row.league
    s = s.split('-')[-2:]
    return "-".join(s)    

In [7]:
train_data = base_data_preprocess(raw_train_data)

train_data.tail()

Unnamed: 0,date,day_of_week,season,league,league_level,home_team,away_team,home_win_rate,draw_rate,away_win_rate,home_double_chance_rate,away_double_chance_rate,no_draw_rate,total_over_1_rate,total_under_1_rate,total_over_15_rate,total_under_15_rate,total_over_2_rate,total_under_2_rate,total_over_25_rate,total_under_25_rate,total_over_3_rate,total_under_3_rate,total_over_35_rate,total_under_35_rate,both_team_to_score_yes,both_team_to_score_no,home_handicap_0,away_handicap_0,home_handicap_minus_15,away_handicap_minus_15,home_handicap_minus_1,away_handicap_minus_1,home_handicap_plus_15,away_handicap_plus_15,home_handicap_plus_1,away_handicap_plus_1,home_first_half_handicap_0,away_first_half_handicap_0,home_first_half_handicap_minus_15,away_first_half_handicap_minus_15,home_first_half_handicap_minus_1,away_first_half_handicap_minus_1,home_first_half_handicap_plus_15,away_first_half_handicap_plus_15,home_first_half_handicap_plus_1,away_first_half_handicap_plus_1,home_second_half_handicap_0,away_second_half_handicap_0,home_second_half_handicap_minus_15,away_second_half_handicap_minus_15,home_second_half_handicap_minus_1,away_second_half_handicap_minus_1,home_second_half_handicap_plus_15,away_second_half_handicap_plus_15,home_second_half_handicap_plus_1,away_second_half_handicap_plus_1,home_first_half_win_rate,draw_first_half_rate,away_first_half_win_rate,home_second_half_win_rate,draw_second_half_rate,away_second_half_win_rate,home_double_chance_first_half_rate,away_double_chance_first_half_rate,no_draw_first_half_rate,home_double_chance_second_half_rate,away_double_chance_second_half_rate,no_draw_second_half_rate,total_first_half_over_05_rate,total_first_half_under_05_rate,total_first_half_over_1_rate,total_first_half_under_1_rate,total_first_half_over_15_rate,total_first_half_under_15_rate,total_first_half_over_2_rate,total_first_half_under_2_rate,total_second_half_over_05_rate,total_second_half_under_05_rate,total_second_half_over_1_rate,total_second_half_under_1_rate,total_second_half_over_15_rate,total_second_half_under_15_rate,total_second_half_over_2_rate,total_second_half_under_2_rate,both_team_to_score_first_half_yes,both_team_to_score_first_half_no,both_team_to_score_second_half_yes,both_team_to_score_second_half_no,odd,even,odd_first,odd_second,even_first,even_second,correct_score10,correct_score20,correct_score21,correct_score30,correct_score31,correct_score32,correct_score40,correct_score41,correct_score00,correct_score11,correct_score22,correct_score33,correct_score01,correct_score02,correct_score12,correct_score03,correct_score13,correct_score23,correct_score04,correct_score14,home_home,home_draw,home_away,draw_home,draw_draw,draw_away,away_home,away_draw,away_away,home_scored,away_scored,home_first_half_scored,away_first_half_scored,home_second_half_scored,away_second_half_scored,home_manager_working_days,away_manager_working_days,home_manager_name,away_manager_name,home_manager_start_date,home_manager_birthday,home_manager_country,away_manager_start_date,away_manager_birthday,away_manager_country,home_squad_size,home_average_age,home_amount_of_foreigners,home_e_market_value,home_total_market_value,away_squad_size,away_average_age,away_amount_of_foreigners,away_e_market_value,away_total_market_value,home_stadium,home_stadium_capacity,home_city,away_city,is_derby,day
1691,1649548800,Sunday,2021-2022,premier-league,1,Manchester City,Liverpool FC,1.98,3.84,3.78,1.3,1.86,1.31,1.05,10.59,1.23,4.08,1.35,3.17,1.79,2.11,2.29,1.65,2.87,1.43,1.62,2.3,1.5,2.64,3.57,1.28,2.88,1.44,1.09,7.19,1.11,6.26,1.64,2.26,7.57,1.07,5.92,1.12,1.02,12.18,1.03,10.78,1.66,2.22,6.2,1.12,4.69,1.19,1.04,11.3,1.06,9.29,2.71,2.29,3.94,2.45,2.71,3.58,1.22,1.44,1.6,1.26,1.52,1.42,1.34,3.24,1.67,2.16,2.62,1.47,4.85,1.17,1.17,4.33,1.32,3.06,1.84,1.85,2.81,1.37,4.02,1.24,3.0,1.39,1.89,1.87,2.08,1.98,1.75,1.83,7.32,8.56,7.36,13.12,11.62,17.37,35.0,30.0,10.67,6.04,10.6,31.7,10.27,15.24,9.91,31.7,20.76,23.82,101.0,100.0,3.0,13.0,29.0,5.0,6.0,8.5,21.0,13.0,6.0,2,2,2,0,1,1,2286,2553,Pep Guardiola,Jürgen Klopp,1467310000.0,32979600.0,Spain,1444241000.0,-80377200.0,Germany,22,27.5,15,4360000000.0,95930000000.0,27,27.1,19,3335000000.0,90050000000.0,Etihad Stadium,55097,Manchester,Liverpool,0,2022-04-10
1702,1649548800,Sunday,2021-2022,premier-league,1,Brentford FC,West Ham United,2.71,3.36,2.73,1.5,1.5,1.35,1.08,7.53,1.32,3.41,1.54,2.53,2.06,1.82,2.78,1.44,3.49,1.3,1.75,2.09,1.97,1.9,5.32,1.15,4.33,1.21,1.14,5.33,1.2,4.38,2.02,1.79,11.02,1.03,9.32,1.05,1.03,10.63,1.05,8.72,1.97,1.84,9.32,1.06,7.3,1.09,1.06,9.07,1.1,7.18,3.48,2.11,3.36,3.08,2.51,2.98,1.32,1.31,1.72,1.36,1.35,1.49,1.44,2.74,1.93,1.83,3.04,1.37,6.28,1.11,1.23,3.72,1.43,2.59,2.05,1.67,3.44,1.26,4.65,1.19,3.31,1.33,1.89,1.87,2.17,2.01,1.69,1.8,7.16,10.14,8.55,19.24,16.32,23.82,80.0,60.0,8.12,5.48,10.92,41.8,7.6,11.02,8.61,20.36,16.99,23.86,90.0,70.0,4.33,13.0,29.0,6.0,4.75,6.0,29.0,13.0,4.33,2,0,0,2,0,0,1449,1010,Thomas Frank,David Moyes,1539623000.0,118947600.0,Denmark,1577552000.0,-211100400.0,Scotland,25,25.8,22,924000000.0,23090000000.0,24,28.8,16,1453000000.0,34875000000.0,Brentford Community Stadium,17250,"Brentford, Middlesex",London,0,2022-04-10
1515,1649548800,Sunday,2021-2022,premier-league,1,Leicester City,Crystal Palace,2.5,3.29,3.03,1.42,1.58,1.36,1.09,7.13,1.36,3.21,1.58,2.42,2.12,1.77,2.95,1.4,3.77,1.27,1.8,2.03,1.72,2.19,4.75,1.17,3.92,1.24,1.12,6.05,1.16,5.13,1.67,2.2,9.57,1.04,8.09,1.06,1.03,10.83,1.04,10.37,1.75,2.07,8.26,1.07,6.47,1.11,1.05,10.2,1.08,8.09,3.18,2.09,3.76,2.84,2.47,3.32,1.25,1.34,1.72,1.3,1.4,1.51,1.46,2.69,1.99,1.78,3.14,1.35,6.68,1.09,1.24,3.6,1.46,2.5,2.1,1.65,3.6,1.24,4.65,1.19,3.31,1.33,1.92,1.84,2.14,2.01,1.71,1.8,6.74,9.57,8.41,17.73,15.9,23.93,70.0,60.0,7.57,5.47,11.61,46.84,7.43,11.46,8.98,21.35,18.56,26.46,101.0,80.0,4.0,13.0,29.0,5.5,4.75,6.5,26.0,15.0,4.75,2,1,2,0,0,1,1315,457,Brendan Rodgers,Patrick Vieira,1551200000.0,96829200.0,Northern Ireland,1625332000.0,204310800.0,France,27,27.0,18,1907000000.0,51480000000.0,26,27.5,14,1000000000.0,25995000000.0,King Power Stadium,34310,"Leicester, Leicestershire",London,0,2022-04-10
4369,1649548800,Sunday,2021-2022,championship,2,Fulham FC,Coventry City,1.41,5.05,7.34,1.11,3.01,1.19,1.02,12.07,1.2,4.33,1.25,3.76,1.6,2.37,1.99,1.87,2.51,1.54,1.81,1.98,1.14,5.68,2.09,1.77,1.61,2.31,1.01,13.0,1.02,12.9,1.24,4.23,4.27,1.21,3.11,1.36,0.0,0.0,0.0,0.0,1.22,4.3,3.64,1.29,2.62,1.49,0.0,0.0,0.0,0.0,1.89,2.66,6.47,1.75,3.03,5.95,1.1,1.89,1.46,1.12,2.04,1.37,1.31,3.51,1.59,2.38,2.38,1.57,4.22,1.22,1.18,4.81,1.31,3.46,1.86,1.94,2.78,1.44,4.38,1.21,3.38,1.32,1.86,1.86,2.01,1.97,1.8,1.84,7.27,6.53,7.36,8.29,9.39,17.8,14.0,17.0,13.6,8.09,14.08,43.15,15.37,27.77,15.86,58.78,38.07,38.01,101.0,101.0,2.0,17.0,41.0,3.75,7.5,17.0,17.0,17.0,13.0,1,3,0,1,2,1,460,2038,Marco Silva,Mark Robins,1625072000.0,237488400.0,Portugal,1488733000.0,-889200.0,England,26,26.9,20,565000000.0,14700000000.0,25,25.4,10,796000.0,1990000000.0,Craven Cottage,25700,London,"Coventry, West Midlands",0,2022-04-10
4444,1649635200,Monday,2021-2022,championship,2,Huddersfield Town,Luton Town,2.43,3.16,3.16,1.38,1.59,1.38,1.11,6.01,1.41,2.89,1.75,2.13,2.33,1.62,3.47,1.29,4.2,1.21,1.93,1.86,1.69,2.19,4.56,1.17,3.69,1.26,1.1,6.43,1.13,5.46,1.73,2.1,10.65,1.04,8.4,1.07,1.02,13.96,1.02,12.36,1.79,2.03,8.9,1.06,6.92,1.1,1.04,11.2,1.06,8.98,3.17,2.05,3.87,2.89,2.39,3.37,1.24,1.34,1.75,1.29,1.39,1.54,1.54,2.51,2.26,1.63,3.52,1.31,8.26,1.07,1.31,3.21,1.63,2.17,2.42,1.51,4.84,1.17,5.52,1.15,3.8,1.27,1.9,1.83,2.17,2.04,1.69,1.78,6.25,9.01,8.57,17.29,16.2,25.26,60.0,60.0,7.14,5.46,12.61,48.25,7.15,11.82,9.72,25.09,20.81,27.9,101.0,100.0,4.0,15.0,34.0,5.5,4.5,6.5,29.0,15.0,5.0,2,0,0,2,0,0,834,890,Carlos Corberán,Nathan Jones,1595437000.0,418492800.0,Spain,1590599000.0,107370000.0,Wales,28,26.0,10,134000000.0,3765000000.0,26,27.1,11,808000.0,2100000000.0,John Smith's Stadium,24554,"Huddersfield, West Yorkshire","Luton, Bedfordshire",0,2022-04-11


In [8]:
train_data.shape

(4482, 156)

# Разбиение фичей на группы [ПРОПУСТИТЬ](#numeric)

In [None]:
enumerated_features = dict()

for idx, feature in enumerate(list(train_data.columns)):
    enumerated_features[idx] = feature

In [None]:
enumerated_features

In [None]:
base_features = list(train_data.iloc[0:2,0:7].columns)
base_features.append(enumerated_features[124])
base_features.append(enumerated_features[125])
base_features.append(enumerated_features[156])

base_features

In [None]:
squad_features = list(train_data.iloc[0:2,141:151].columns)
squad_features

In [None]:
city_features = list(train_data.iloc[0:2,151:156].columns)
city_features

In [None]:
result_coef_features = [enumerated_features[7], enumerated_features[8], enumerated_features[9]]
result_coef_features

In [None]:
double_chance_features = list(train_data.filter(like='chance', axis=1).columns)
double_chance_features

In [None]:
total_coef_features = list(train_data.filter(like='total_over', axis=1).columns) + list(train_data.filter(like='total_under', axis=1).columns)
total_coef_features

In [None]:
handicap_features = list(train_data.iloc[0:2,27:37].columns)
handicap_features

In [None]:
half_features = list(train_data.iloc[0:2,37:88].columns)
half_features[:10]

In [None]:
odd_features = list(train_data.iloc[0:2,89:95].columns)

odd_features

In [None]:
correct_score_features = list(train_data.filter(like='correct_score', axis=1).columns)
correct_score_features

In [None]:
time_match_features = list(train_data.iloc[0:2,115:124].columns)
time_match_features

In [None]:
both_scored_features = list(train_data.iloc[0:2,85:89].columns)
both_scored_features += list(train_data.iloc[0:2,126:130].columns)
both_scored_features.append(enumerated_features[25])
both_scored_features.append(enumerated_features[26])

both_scored_features

In [None]:
manager_features = tuple(train_data.filter(like='manager', axis=1).columns)

manager_features

In [11]:
def prepare_for_yaml(features_list) -> str:
    
    result_yaml = ''
    
    for feature in features_list:
        result_yaml += f"- {feature}\n"
        
    return result_yaml

In [None]:
print(prepare_for_yaml(manager_features))

<a id='numeric'></a>
## Запускать отсюда)

In [12]:
numeric_features = tuple(train_data.select_dtypes(include=['int', 'float']).columns)
print(prepare_for_yaml(numeric_features))

- date
- league_level
- home_win_rate
- draw_rate
- away_win_rate
- home_double_chance_rate
- away_double_chance_rate
- no_draw_rate
- total_over_1_rate
- total_under_1_rate
- total_over_15_rate
- total_under_15_rate
- total_over_2_rate
- total_under_2_rate
- total_over_25_rate
- total_under_25_rate
- total_over_3_rate
- total_under_3_rate
- total_over_35_rate
- total_under_35_rate
- both_team_to_score_yes
- both_team_to_score_no
- home_handicap_0
- away_handicap_0
- home_handicap_minus_15
- away_handicap_minus_15
- home_handicap_minus_1
- away_handicap_minus_1
- home_handicap_plus_15
- away_handicap_plus_15
- home_handicap_plus_1
- away_handicap_plus_1
- home_first_half_handicap_0
- away_first_half_handicap_0
- home_first_half_handicap_minus_15
- away_first_half_handicap_minus_15
- home_first_half_handicap_minus_1
- away_first_half_handicap_minus_1
- home_first_half_handicap_plus_15
- away_first_half_handicap_plus_15
- home_first_half_handicap_plus_1
- away_first_half_handicap_plus_1


In [13]:
categorical_features = tuple(train_data.select_dtypes(include=['object']).columns)
print(prepare_for_yaml(categorical_features))

- day_of_week
- season
- league
- home_team
- away_team
- home_manager_name
- away_manager_name
- home_manager_country
- away_manager_country
- home_stadium
- home_city
- away_city



In [14]:
money_features = tuple(['away_e_market_value', 'away_total_market_value', 'home_e_market_value', 'home_total_market_value'])
print(prepare_for_yaml(money_features))

- away_e_market_value
- away_total_market_value
- home_e_market_value
- home_total_market_value



<a id='transformer'></a>
# Трансформер 

[наверх](#top)

In [None]:
transformer_context = {'data':train_data,
                       'cat_features':categorical_features,
                       'num_features':numeric_features,
                       'money_features':money_features,
                       'grouped_features':all_features_dict
                      }

transformer = DataTransformer(transformer_context)

In [None]:
train, val, test, decode_teams, teams_labels = transformer.run_logic() 

In [None]:
test.head()

In [None]:
train.target.value_counts()

In [None]:
print(f"Class_0 rate {train.target.value_counts()[0] / train.shape[0]}")
print(f"Class_1 rate {train.target.value_counts()[1] / train.shape[0]}")
print(f"Class_3 rate {train.target.value_counts()[3] / train.shape[0]}")

In [None]:
corr_df = train.corr()
corr_df.style.background_gradient(cmap='Reds')

In [None]:
corr_df[['target']].style.background_gradient(cmap='Reds')

# Проверить данные на:

- мультиколлинеарность
- forward/backward selection
- 


# Переход к [модели](#model)

## Generating some features and testing hypotheses

# Оставим только "не-коэффициенты" и декодируем названия команд 

In [None]:
def _set_target(row):
    """ Set target feature from score """

    if row.home_scored > row.away_scored:
        return 3
    elif row.home_scored == row.away_scored:
        return 1
    else:
        return 0

train_data_base = train_data[all_features_dict['base_features'] + all_features_dict['squad_features']].copy()
train_data_base['target'] = train_data_base.apply(_set_target, axis=1)
train_data_base.tail()

In [None]:
for feature in money_features:
    train_data_base[f"log_{feature}"] = train_data_base[feature].apply(np.log)

train_data_base.sample(5)

# ПЕРЕДЕЛАТЬ! Считаем сезонные total :
- забитые
- пропущенные
- количество очков

In [None]:
query = '(season == @season) & (league == @league)'

total_features = train_data_base.copy()

for league in total_features.league.unique():
    print(league)

    for season in total_features.season.unique():
        print(season)
        
        season_data = total_features.query(query)

        for team in total_features.home_team.unique():   
            
            team_data = season_data.query('((home_team == @team) | (away_team == @team))')
            
            print(team)
            print('____')
            total_points = 0
            total_scored = 0
            total_missed = 0


            for idx in team_data.index:

                if team_data.loc[idx, 'home_team'] == team:

                    total_points += team_data.loc[idx, 'target']
                    total_scored += team_data.loc[idx, 'home_scored']
                    total_missed += team_data.loc[idx, 'away_scored']

                else:

                    home = team_data.loc[idx, 'home_scored']
                    away = team_data.loc[idx, 'away_scored']

                    away_match_score = 3 if home < away else 1 if home == away else 0

                    total_points += away_match_score
                    total_scored += team_data.loc[idx, 'away_scored']
                    total_missed += team_data.loc[idx, 'home_scored']

            season_data['total_points'] = total_points
            season_data['total_scored'] = total_scored
            season_data['total_missed'] = total_missed

#             print(f"TEAM: {team} LEAGUE: {league}\ntotal points:{total_points}\ttotal scored:{total_scored}\ttotal missed:{total_missed}")
#             print('_________')
            total_features = season_data

        
total_features.head()        

In [32]:
home_team_alltime = train_data_base.groupby(['home_team'], as_index=False) \
                        .agg(home_mean_scored_season = ('home_scored', 'mean'),
                            home_mean_missed_season = ('away_scored', 'mean')                                 
                            )

away_team_alltime = train_data_base.groupby(['away_team'], as_index=False) \
                        .agg(away_mean_scored_alltime = ('home_scored', 'mean'),
                            away_mean_missed_alltime = ('away_scored', 'mean')                                 
                            )

home_team_alltime.head()

Unnamed: 0,home_team,home_mean_scored_season,home_mean_missed_season
0,AFC Bournemouth,1.641304,1.293478
1,Arsenal FC,1.988889,1.044444
2,Aston Villa,1.637255,1.294118
3,Barnsley FC,1.173913,1.326087
4,Birmingham City,1.142857,1.375


In [27]:
train_data_base = train_data_base.merge(home_team_alltime, how='left', on=['home_team', 'season'])
train_data_base = train_data_base.merge(away_team_alltime, how='left', on=['away_team', 'season'])

train_data_base.head()

Unnamed: 0,date,day_of_week,season,league,league_level,home_team,away_team,home_scored,away_scored,day,home_squad_size,home_average_age,home_amount_of_foreigners,home_e_market_value,home_total_market_value,away_squad_size,away_average_age,away_amount_of_foreigners,away_e_market_value,away_total_market_value,log_away_e_market_value,log_away_total_market_value,log_home_e_market_value,log_home_total_market_value,home_scored_by_season,home_missed_by_season,home_mean_scored_season,home_mean_missed_season,away_scored_by_season,away_missed_by_season,away_mean_scored_season,away_mean_missed_season
0,1474588800,Friday,2016-2017,championship,2,Preston North End,Wigan Athletic,1,0,2016-09-23,33,26.2,19,580000.0,1915000000.0,57,24.5,20,530000.0,3023000000.0,13.180632,21.829516,13.270783,21.372983,29,20,1.705882,1.176471,14,16,0.823529,0.941176
1,1474675200,Saturday,2016-2017,championship,2,Sheffield Wednesday,Nottingham Forest,2,1,2016-09-24,42,26.3,24,160000000.0,6713000000.0,45,25.3,30,141000000.0,6340000000.0,18.76427,22.570145,18.890684,22.627312,27,16,1.588235,0.941176,15,32,0.882353,1.882353
2,1474675200,Saturday,2016-2017,championship,2,Leeds United,Ipswich Town,1,0,2016-09-24,37,24.7,22,136000000.0,5045000000.0,42,25.0,18,618000.0,2595000000.0,13.334244,21.676852,18.728165,22.341663,25,11,1.388889,0.611111,13,25,0.8125,1.5625
3,1474675200,Saturday,2016-2017,championship,2,Reading FC,Huddersfield Town,1,0,2016-09-24,42,24.6,28,104000000.0,4350000000.0,32,25.0,17,123000000.0,3935000000.0,18.627695,22.093177,18.459901,22.193442,27,12,1.5,0.666667,18,27,1.0,1.5
4,1474675200,Saturday,2016-2017,championship,2,Wolverhampton Wanderers,Brentford FC,3,1,2016-09-24,44,24.0,24,136000000.0,5963000000.0,38,23.5,27,888000.0,3375000000.0,13.696727,21.939661,18.728165,22.50884,18,22,1.125,1.375,24,31,1.5,1.9375


# ПЕРЕДЕЛАТЬ! Личные встречи за все время забитые/пропущенные

In [None]:
# personal_battles = train_with_names.groupby(['home_team', 'away_team'], as_index=False) \
#                         .agg(total_home_scored = ('home_scored', 'sum'),
#                             total_home_missed = ('away_scored', 'sum'),
                            
#                             )

# personal_battles

# Подсчет текущих очков и вин/луз стриков по сезонам

In [None]:
query = '((home_team == @team) | (away_team == @team)) & (league == @season)'

def calculate_win_streak(actual_win_streak: int, match_result: int) -> int:
    
    new_win_streak = actual_win_streak
    
    if match_result == 3:
        
        new_win_streak += 1
    
    else:
        
        new_win_streak = 0
        
    return new_win_streak

def calculate_lose_streak(actual_lose_streak: int, match_result: int) -> int:
    
    new_lose_streak = actual_lose_streak
    
    if match_result == 0:
        
        new_lose_streak += 1
    
    else:
        
        new_lose_streak = 0
        
    return new_lose_streak


train_with_current_points = train_with_names.copy()

for season in train_with_current_points.league.unique():
    
    for team in train_with_current_points.home_team.unique():    

        current_points = 0
        current_win_streak = 0
        current_lose_streak = 0

        team_season_data = train_with_current_points.query(query)

        for idx in team_season_data.index:
            
#             if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                 print(f"Match of {team_season_data.loc[idx, 'date']}\n"
#                          f"Home team: {team_season_data.loc[idx, 'home_team']}, "
#                          f"away team: {team_season_data.loc[idx, 'away_team']}"
#                          )

            if team_season_data.loc[idx, 'home_team'] == team:

                train_with_current_points.loc[idx, 'home_current_points'] = current_points

                current_points += team_season_data.loc[idx, 'target']

#                 if season == 'premier-league-2021-2022' and team == 'Arsenal':
    
#                     print(f"result is {team_season_data.loc[idx, 'target']} points for home team {team_season_data.loc[idx, 'home_team']}\n"
#                          f"Match score {team_season_data.loc[idx, 'home_scored']}:{team_season_data.loc[idx, 'away_scored']}")

                train_with_current_points.loc[idx, 'home_current_lose_streak'] = current_lose_streak

                train_with_current_points.loc[idx, 'home_current_win_streak'] = current_win_streak

                current_lose_streak = calculate_lose_streak(current_lose_streak, team_season_data.loc[idx, 'target'])

                current_win_streak = calculate_win_streak(current_win_streak, team_season_data.loc[idx, 'target'])

            else:

                train_with_current_points.loc[idx, 'away_current_points'] = current_points

                home = team_season_data.loc[idx, 'home_scored']
                away = team_season_data.loc[idx, 'away_scored']

                away_match_score = 3 if home < away else 1 if home == away else 0

                current_points += away_match_score
                
#                 if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                     print(f"result is {away_match_score} points for away team {team_season_data.loc[idx, 'away_team']}\n"
#                          f"Match score {home}:{away}")

                train_with_current_points.loc[idx, 'away_current_lose_streak'] = current_lose_streak

                train_with_current_points.loc[idx, 'away_current_win_streak'] = current_win_streak

                current_lose_streak = calculate_lose_streak(current_lose_streak, away_match_score)

                current_win_streak = calculate_win_streak(current_win_streak, away_match_score)
            
#             if season == 'premier-league-2021-2022' and team == 'Arsenal':

#                 print('______________________________________________________________ \n')

train_with_current_points.home_current_points = train_with_current_points.home_current_points.astype(int)
train_with_current_points.away_current_points = train_with_current_points.away_current_points.astype(int)
train_with_current_points.away_current_win_streak = train_with_current_points.away_current_win_streak.astype(int)
train_with_current_points.away_current_lose_streak = train_with_current_points.away_current_lose_streak.astype(int)
train_with_current_points.home_current_win_streak = train_with_current_points.home_current_win_streak.astype(int)
train_with_current_points.home_current_lose_streak = train_with_current_points.home_current_lose_streak.astype(int)

In [None]:
train_with_current_points[(train_with_current_points.home_team == 'Manchester City')|(train_with_current_points.away_team == 'Manchester City')].tail(15)

In [None]:
train_with_names = train_with_names.merge(train_with_current_points, how='left')
train_with_names.sample(5)

# <a id='model'></a> Model  
[go to transformer](#transformer)

[go to top](#top)

In [None]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

In [None]:
categorical_features.append('home_team')
categorical_features.append('away_team')
categorical_features

In [None]:
cat_features = list(categorical_features)

model_data = {
    'train':train,
    'val':val,
    'test':test,
    'target':'target',
    'cat_features':cat_features
}

In [None]:
X_test = test.drop(columns=['target'])
y_test = test.target

In [None]:
model_params = {'n_estimators':1000,
                'learning_rate':0.01,
                'loss_function':'MultiClass',
                'task_type':'GPU',
                'verbose':250
                }

model = BoostingModel(params=model_params, data=model_data)

model.fit()
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)

# Оценка качества

In [None]:
print(f'Validation ACCURACY SCORE: {accuracy_score(y_test, preds_class)}')

In [None]:
print(f'Validation ROC AUC SCORE: {roc_auc_score(y_test, preds_proba, multi_class="ovr")}')

In [None]:
train_preds_class = model.model.predict(train.drop(columns=['target']))

In [None]:
train_preds_df = pd.DataFrame()
train_preds_df['pred_class'] = train_preds_class.ravel()

In [None]:
print('True target rate: ')
print(f"Class_0 rate {train.target.value_counts()[0] / train.shape[0]}")
print(f"Class_1 rate {train.target.value_counts()[1] / train.shape[0]}")
print(f"Class_3 rate {train.target.value_counts()[3] / train.shape[0]}")

In [None]:
print('Predict target rate:')
print(f"Class_0 rate {train_preds_df.pred_class.value_counts()[0] / train_preds_df.shape[0]}")
print(f"Class_1 rate {train_preds_df.pred_class.value_counts()[1] / train_preds_df.shape[0]}")
print(f"Class_3 rate {train_preds_df.pred_class.value_counts()[3] / train_preds_df.shape[0]}")

In [None]:
results = pd.DataFrame()

results['true'] = y_test
results['pred_class'] = preds_class
results['score_0'] = preds_proba[:,0]
results['score_1'] = preds_proba[:,1]
results['score_3'] = preds_proba[:,2]

results

# Влияние фичей на качество модели

In [None]:
feature_importance = model.get_feature_importances()
feature_importance_df = pd.DataFrame()

feature_importance_df['feature'] = X_test.columns
feature_importance_df['importance'] = feature_importance

good_features = feature_importance_df[np.abs(feature_importance_df.importance) > 0].sort_values(by='importance',ascending=False).head(15)
bad_features = feature_importance_df.sort_values(by='importance',ascending=True).head(15)

print(f'Number of features {X_test.shape[1]}')

In [None]:
feature_importance_df.importance.sum()

In [None]:
good_features

In [None]:
bad_features

In [None]:
test_predictions = pd.DataFrame()
test_predictions['home_team'] = X_test.home_team.map(decode_teams)
test_predictions['away_team'] = X_test.away_team.map(decode_teams)
test_predictions['home_win_proba'] = preds_proba[:,0]
test_predictions['draw_proba'] = preds_proba[:,1]
test_predictions['away_win_proba'] = preds_proba[:,2]
test_predictions['home_win_rate'] = X_test.home_win_rate
test_predictions['draw_rate'] = X_test.draw_rate
test_predictions['away_win_rate'] = X_test.away_win_rate
test_predictions['result'] = y_test
test_predictions['predict'] = preds_class

In [None]:
test_predictions

# Считаем ROI - "коэффициент выгодности ставки"

In [None]:
class ROIChecker():
    def __init__(self, predictions):
        self.predictions = predictions
    
    def get_roi(self):
        
        self.predictions['home_win_ROI'] = self.predictions.home_win_rate * self.predictions.home_win_proba - 1
        self.predictions['away_win_ROI'] = self.predictions.away_win_rate * self.predictions.away_win_proba - 1
        self.predictions['draw_ROI'] = self.predictions.draw_rate * self.predictions.draw_proba - 1
        
        return self.predictions

In [None]:
checker = ROIChecker(test_predictions)
roi_info = checker.get_roi()

In [None]:
roi_info.head()

In [None]:
def explain_roi_info(roi_info):
    roi_cols = ['home_win_ROI', 'away_win_ROI', 'draw_ROI']
    best_roi_df = pd.DataFrame()
    for index, row in roi_info.iterrows():
        print(f"Match #{index+1}: {row.home_team} vs {row.away_team}")
        
        max_roi = np.max(roi_info.loc[index, roi_cols])
        current_choice = 'home_win_ROI'
        
        for col in roi_cols:
            if row[col] == max_roi:
                current_choice = col
        current_choice = ' '.join(current_choice.split('_')[:1])
        
        if max_roi > 0:
            print(f"Maximal ROI = {max_roi*100}% on {current_choice}")
        
        else:
            print("Нет положительного ROI")
            
        print('_______________________________ \n')
        
        best_roi_df.loc[index, 'home_team'] = row.home_team
        best_roi_df.loc[index, 'away_team'] = row.away_team
        best_roi_df.loc[index, 'best_ROI'] = max_roi
        best_roi_df.loc[index, 'choice'] = current_choice
        best_roi_df.loc[index, 'home_win_rate'] = row.home_win_rate
        best_roi_df.loc[index, 'draw_rate'] = row.draw_rate
        best_roi_df.loc[index, 'away_win_rate'] = row.away_win_rate
        best_roi_df.loc[index, 'result'] = row.result
        best_roi_df.loc[index, 'predict'] = row.predict

        
    return best_roi_df

In [None]:
best_roi_df = explain_roi_info(roi_info)

In [None]:
best_roi_df

In [None]:
def money_score(best_roi_df, bet=100):
    results = {'home':0, 'draw':1, 'away':2}
    cols = {'home':'home_win_rate', 'draw':'draw_rate', 'away':'away_win_rate'}
    profit = 0
    skipped_bets = 0
    accepted_bets = 0
    
    for index, row in best_roi_df.iterrows():
        if row[cols[row.choice]] > 100:
                skipped_bets += 1
        else:
            accepted_bets += 1

            if results[row.choice] == row.result:

                current_profit = bet * (row[cols[row.choice]] - 1)
                profit += current_profit
                print(f'Match {row.home_team} vs {row.away_team}')
                print(row.choice, row[cols[row.choice]], current_profit)
                print(f'Match score ')
                print('_____________________________________________________\n')
            else:
                profit -= bet
    return profit, skipped_bets, accepted_bets
    

In [None]:
score = money_score(best_roi_df, 100)

In [None]:
print(f'Skipped bets: {score[1]} \nAccepted bets: {score[2]} \nProfit: {score[0]}$')

# [НАВЕРХ](#top)