In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from data_transformer import DataTransformer
from scorer import MoneyScorer

In [47]:
# Load the two raw match tables from CSV files under data/.
# NOTE(review): presumably england.csv is historical results and future.csv
# holds upcoming fixtures — confirm against the data directory.
train_data = pd.read_csv('data/england.csv')
test_data = pd.read_csv('data/future.csv')

In [48]:
def base_data_preprocess(data):
    """Return a cleaned, date-ordered copy of a raw match table.

    The input frame is not modified. On a copy of it this function:

    * renames the misspelled ``away_tem`` column to ``away_team``;
    * replaces every missing value with ``1.01``;
    * parses ``date`` from ``DD.MM.YYYY`` strings into datetimes;
    * orders the rows by ``date``;
    * removes the ``link``, ``country`` and ``league_level`` columns;
    * relabels the bare ``'premier-league'`` league value as
      ``'premier-league-2021-2022'``.
    """
    cleaned = (
        data.copy()
        .rename(columns={'away_tem': 'away_team'})
        .fillna(1.01)
    )

    cleaned['date'] = pd.to_datetime(cleaned['date'], format='%d.%m.%Y')

    cleaned = (
        cleaned.sort_values(by='date')
        .drop(columns=['link', 'country', 'league_level'])
    )

    # Rows whose league carries no season suffix are assigned to 2021-2022.
    unlabelled_season = cleaned['league'] == 'premier-league'
    cleaned.loc[unlabelled_season, 'league'] = 'premier-league-2021-2022'

    return cleaned

In [49]:
# Apply the shared cleaning step to both tables, rebinding the same names.
# Not idempotent: base_data_preprocess drops 'link', 'country' and
# 'league_level', so running this cell a second time fails on the drop.
train_data = base_data_preprocess(train_data)
test_data = base_data_preprocess(test_data)

In [50]:
# List the distinct league labels present in the cleaned training table.
train_data.league.unique()

array(['premier-league-2016-2017', 'premier-league-2017-2018',
       'premier-league-2018-2019', 'premier-league-2019-2020',
       'premier-league-2020-2021', 'premier-league-2021-2022'],
      dtype=object)

In [51]:
# Hold out the last 20 rows of the date-sorted training table as the test set.
# NOTE: this rebinds `test_data`, discarding the frame loaded from
# data/future.csv earlier in the notebook.
# `.copy()` makes the hold-out an independent frame, so the later
# `test_data['season'] = ...` assignment does not write into a slice of
# `train_data` (which triggers pandas' SettingWithCopyWarning).
test_data = train_data.tail(20).copy()
drop_index = test_data.index
# Not idempotent: every re-run moves another 20 rows out of `train_data`.
train_data = train_data.drop(drop_index, axis=0)


In [52]:
# (rows, columns) of the training table after the hold-out rows were removed.
train_data.shape

(1720, 29)

In [53]:
# Names of all int/float columns; later passed to DataTransformer as 'num_features'.
# At this point the list still contains 'season' and the two score columns
# ('home_scored', 'away_scored').
numeric_features = train_data.select_dtypes(include=['int', 'float']).columns
numeric_features

Index(['season', 'home_win_rate', 'draw_rate', 'away_win_rate',
       'home_double_chance_rate', 'away_double_chance_rate', 'no_draw_rate',
       'total_over_1_rate', 'total_under_1_rate', 'total_over_15_rate',
       'total_under_15_rate', 'total_over_2_rate', 'total_under_2_rate',
       'total_over_25_rate', 'total_under_25_rate', 'total_over_3_rate',
       'total_under_3_rate', 'total_over_35_rate', 'total_under_35_rate',
       'both_team_to_score_yes', 'both_team_to_score_no', 'home_scored',
       'away_scored'],
      dtype='object')

In [54]:
# Columns that carry the match date and time. The same two names are
# hard-coded again when the categorical columns are selected below.
date_features = ['date', 'time']

In [55]:
# Cast `season` to object dtype so the object-dtype selection below picks it
# up as a categorical column.
# NOTE(review): `numeric_features` was computed before this cast, so 'season'
# ends up listed in both the numeric and the categorical feature sets.
train_data['season'] = train_data.season.astype('object')
test_data['season'] = test_data.season.astype('object')

In [56]:
# Every object-dtype column except 'date' and 'time' is treated as categorical.
categorical_features = train_data.drop(columns=['date', 'time']).select_dtypes(include=['object']).columns
categorical_features

Index(['day_of_week', 'season', 'league', 'home_team', 'away_team'], dtype='object')

In [57]:
# Bundle the two frames and the two feature-name lists into the context dict
# that the project-local DataTransformer is constructed from.
# NOTE(review): the expected keys are defined by data_transformer.py, which
# is not visible here — confirm they match.
transformer_context = {'train':train_data,
                       'test':test_data,
                       'cat_features':categorical_features,
                       'num_features':numeric_features
                      }

transformer = DataTransformer(transformer_context)

In [58]:
# Run the transformer; it returns four objects. From their use below,
# `train`/`test` are DataFrames with integer-encoded team columns plus a
# `target` column, and `decode_teams` maps team codes back to names.
# NOTE(review): how `val` is split off is decided inside DataTransformer — confirm.
train, val, test, decode_teams = transformer.run_logic() 

In [59]:
# Peek at the transformed hold-out frame.
test.head()

Unnamed: 0,date,time,day_of_week,season,league,home_team,away_team,home_win_rate,draw_rate,away_win_rate,home_double_chance_rate,away_double_chance_rate,no_draw_rate,total_over_1_rate,total_under_1_rate,total_over_15_rate,total_under_15_rate,total_over_2_rate,total_under_2_rate,total_over_25_rate,total_under_25_rate,total_over_3_rate,total_under_3_rate,total_over_35_rate,total_under_35_rate,both_team_to_score_yes,both_team_to_score_no,home_scored,away_scored,target
1587,2022-02-24,19:45,Thursday,2022,premier-league-2021-2022,0,30,1.68,3.77,5.77,1.15,2.28,1.3,1.09,6.94,1.38,3.07,1.63,2.34,2.19,1.72,3.07,1.37,3.86,1.26,2.07,1.76,2,1,3
1659,2022-02-25,20:00,Friday,2022,premier-league-2021-2022,22,20,1.5,4.73,6.53,1.13,2.7,1.23,1.04,11.12,1.22,4.3,1.33,3.27,1.72,2.2,2.19,1.71,2.73,1.47,1.79,2.05,2,0,3
1563,2022-02-26,12:30,Saturday,2022,premier-league-2021-2022,13,26,3.89,3.77,1.96,1.92,1.27,1.3,1.04,10.02,1.21,4.33,1.3,3.5,1.68,2.25,2.13,1.77,2.64,1.5,1.59,2.35,0,4,0
1695,2022-02-26,15:00,Saturday,2022,premier-league-2021-2022,8,5,1.91,3.4,4.63,1.22,1.94,1.36,1.11,6.29,1.42,2.9,1.73,2.19,2.27,1.65,3.31,1.32,4.1,1.23,2.04,1.79,1,1,1
1596,2022-02-26,15:00,Saturday,2022,premier-league-2021-2022,3,19,2.42,3.13,3.33,1.36,1.62,1.39,1.13,5.77,1.46,2.76,1.81,2.09,2.42,1.59,3.61,1.28,4.54,1.2,1.99,1.82,0,2,0


# Оставим только "не-коэффициенты" и декодируем названия команд 

In [60]:
# Columns that describe the fixture and its outcome; every *_rate column and
# both_team_to_score_* column is left out.
non_coef_features = ['date',
                     'time',
                     'day_of_week',
                     'league',
                     'home_team',
                     'away_team',
                     'home_scored',
                     'away_scored',
                     'target'                    
                    ]

# Work on an independent copy so `train` keeps its encoded team columns.
train_with_names = train[non_coef_features].copy()

# Turn the integer team codes back into team names.
train_with_names.home_team = train_with_names.home_team.map(decode_teams)
train_with_names.away_team = train_with_names.away_team.map(decode_teams)

train_with_names.head()

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target
217,2016-09-24,14:00,Saturday,premier-league-2016-2017,Middlesbrough,Tottenham,1,2,0
276,2016-09-24,14:00,Saturday,premier-league-2016-2017,Stoke,West Brom,1,1,1
34,2016-09-24,11:30,Saturday,premier-league-2016-2017,Manchester Utd,Leicester,4,1,3
6,2016-09-24,14:00,Saturday,premier-league-2016-2017,Swansea,Manchester City,1,3,0
190,2016-09-24,14:00,Saturday,premier-league-2016-2017,Bournemouth,Everton,1,0,3


# Считаем сезонные:
- средние (пропущенные и забитые)
- общие (количество очков, забитые, пропущенные)

In [61]:
# Per (home_team, league) totals and means of goals scored/conceded, plus the
# sum of `target`.
# NOTE(review): the grouping key is `home_team` only, so every figure covers
# that team's home fixtures in the season, not all of its matches.
# NOTE(review): the running points/streak cell further down assigns a
# different frame to the name `team_season_data`.
team_season_data = train_with_names.groupby(['home_team', 'league'], as_index=False) \
                        .agg(scored_by_season = ('home_scored','sum'), 
                                 missed_by_season = ('away_scored','sum'),
                                 total_points = ('target', 'sum'),
                                 mean_scored_season = ('home_scored', 'mean'),
                                 mean_missed_season = ('away_scored', 'mean')                                 
                                 )

team_season_data

Unnamed: 0,home_team,league,scored_by_season,missed_by_season,total_points,mean_scored_season,mean_missed_season
0,Arsenal,premier-league-2016-2017,32,11,39,2.000000,0.687500
1,Arsenal,premier-league-2017-2018,38,18,29,2.923077,1.384615
2,Arsenal,premier-league-2018-2019,38,16,39,2.235294,0.941176
3,Arsenal,premier-league-2019-2020,29,23,27,1.812500,1.437500
4,Arsenal,premier-league-2020-2021,21,15,24,1.400000,1.000000
...,...,...,...,...,...,...,...
115,West Ham,premier-league-2021-2022,19,16,15,1.900000,1.600000
116,Wolves,premier-league-2018-2019,24,20,28,1.411765,1.176471
117,Wolves,premier-league-2019-2020,24,15,26,1.600000,1.000000
118,Wolves,premier-league-2020-2021,18,17,22,1.200000,1.133333


# Личные встречи за все время забитые/пропущенные

In [62]:
# Head-to-head goal totals for each ordered (home_team, away_team) pairing
# across all seasons in the training frame. The reverse fixture (same clubs,
# venues swapped) is a separate row.
personal_battles = train_with_names.groupby(['home_team', 'away_team'], as_index=False) \
                        .agg(total_home_scored = ('home_scored', 'sum'),
                            total_home_missed = ('away_scored', 'sum'),
                            
                            )

personal_battles

Unnamed: 0,home_team,away_team,total_home_scored,total_home_missed
0,Arsenal,Aston Villa,6,6
1,Arsenal,Bournemouth,9,2
2,Arsenal,Brentford,2,1
3,Arsenal,Brighton,4,3
4,Arsenal,Burnley,10,3
...,...,...,...,...
710,Wolves,Southampton,4,2
711,Wolves,Tottenham,4,7
712,Wolves,Watford,2,2
713,Wolves,West Brom,2,3


# Подсчет текущих очков и вин/луз стриков по сезонам

In [155]:
# DataFrame.query expression selecting every match a team played (home or
# away) in one season. `@team` and `@season` are resolved by pandas from the
# Python variables of those names when .query() is called, so the loop below
# must keep using exactly `team` and `season` as its loop variables.
query = '((home_team == @team) | (away_team == @team)) & (league == @season)'

def calculate_win_streak(actual_win_streak: int, match_result: int) -> int:
    """Return the win streak after one more match.

    A result of 3 points extends the streak by one; any other result
    resets it to zero.
    """
    if match_result != 3:
        return 0
    return actual_win_streak + 1

def calculate_lose_streak(actual_lose_streak: int, match_result: int) -> int:
    """Return the losing streak after one more match.

    A result of 0 points extends the streak by one; any other result
    resets it to zero.
    """
    if match_result != 0:
        return 0
    return actual_lose_streak + 1




# For every match, record the points total and the win/lose streak that each
# side had accumulated in that season *before* the match was played.
#
# Changes versus the earlier draft of this cell:
#   * the per-team frame is called `team_matches`, so it no longer overwrites
#     the `team_season_data` aggregate built earlier in the notebook;
#   * each team's matches are explicitly put in date order (stable sort), since
#     the running totals are only meaningful chronologically;
#   * the home and away branches share one code path;
#   * the season whose matches are traced to stdout is a single constant.

# Season whose matches are printed while iterating; set to None to silence.
DEBUG_SEASON = 'premier-league-2021-2022'

train_with_current_points = train_with_names.copy()

for season in train_with_current_points.league.unique():

    for team in train_with_current_points.home_team.unique():

        current_points = 0
        current_win_streak = 0
        current_lose_streak = 0

        # `query` picks up the loop variables `team` and `season` via @-references.
        # mergesort is stable, so rows sharing a date keep their existing order.
        team_matches = train_with_current_points.query(query).sort_values(by='date', kind='mergesort')

        for match in team_matches.itertuples():

            trace = season == DEBUG_SEASON
            is_home = match.home_team == team
            side = 'home' if is_home else 'away'

            if trace:
                print(f"Match of {match.date}\n"
                      f"Home team: {match.home_team}, "
                      f"away team: {match.away_team}"
                      )

            # `target` holds the home side's points; derive the away side's
            # points from the score line.
            if is_home:
                match_points = match.target
            elif match.home_scored < match.away_scored:
                match_points = 3
            elif match.home_scored == match.away_scored:
                match_points = 1
            else:
                match_points = 0

            # Store the state *before* this match, then advance the counters.
            train_with_current_points.loc[match.Index, f'{side}_current_points'] = current_points
            train_with_current_points.loc[match.Index, f'{side}_current_lose_streak'] = current_lose_streak
            train_with_current_points.loc[match.Index, f'{side}_current_win_streak'] = current_win_streak

            current_points += match_points
            current_lose_streak = calculate_lose_streak(current_lose_streak, match_points)
            current_win_streak = calculate_win_streak(current_win_streak, match_points)

            if trace:
                team_name = match.home_team if is_home else match.away_team
                print(f"result is {match_points} points for {side} team {team_name}\n"
                      f"Match score {match.home_scored}:{match.away_scored}")
                print('______________________________________________________________ \n')

# The columns were created by row-wise assignment and are therefore float;
# cast them to int. This raises if any match was left without a value.
running_columns = ['home_current_points',
                   'away_current_points',
                   'away_current_win_streak',
                   'away_current_lose_streak',
                   'home_current_win_streak',
                   'home_current_lose_streak']

train_with_current_points = train_with_current_points.astype({column: int for column in running_columns})

Match of 2021-08-14 00:00:00
Home team: Manchester Utd, away team: Leeds
result is 3 points for home team Manchester Utd
Match score 5:1
______________________________________________________________ 

Match of 2021-08-22 00:00:00
Home team: Southampton, away team: Manchester Utd
result is 1 points for away team Manchester Utd
Match score 1:1
______________________________________________________________ 

Match of 2021-08-29 00:00:00
Home team: Wolves, away team: Manchester Utd
result is 3 points for away team Manchester Utd
Match score 0:1
______________________________________________________________ 

Match of 2021-09-11 00:00:00
Home team: Manchester Utd, away team: Newcastle
result is 3 points for home team Manchester Utd
Match score 4:1
______________________________________________________________ 

Match of 2021-09-19 00:00:00
Home team: West Ham, away team: Manchester Utd
result is 3 points for away team Manchester Utd
Match score 1:2
_________________________________________


Match of 2021-08-23 00:00:00
Home team: West Ham, away team: Leicester
result is 0 points for away team Leicester
Match score 4:1
______________________________________________________________ 

Match of 2021-08-28 00:00:00
Home team: Norwich, away team: Leicester
result is 3 points for away team Leicester
Match score 1:2
______________________________________________________________ 

Match of 2021-09-11 00:00:00
Home team: Leicester, away team: Manchester City
result is 0 points for home team Leicester
Match score 0:1
______________________________________________________________ 

Match of 2021-09-19 00:00:00
Home team: Brighton, away team: Leicester
result is 0 points for away team Leicester
Match score 2:1
______________________________________________________________ 

Match of 2021-09-25 00:00:00
Home team: Leicester, away team: Burnley
result is 1 points for home team Leicester
Match score 2:2
______________________________________________________________ 

Match of 2021-10-03

Match of 2021-08-14 00:00:00
Home team: Leicester, away team: Wolves
result is 0 points for away team Wolves
Match score 1:0
______________________________________________________________ 

Match of 2021-08-22 00:00:00
Home team: Wolves, away team: Tottenham
result is 0 points for home team Wolves
Match score 0:1
______________________________________________________________ 

Match of 2021-08-29 00:00:00
Home team: Wolves, away team: Manchester Utd
result is 0 points for home team Wolves
Match score 0:1
______________________________________________________________ 

Match of 2021-09-11 00:00:00
Home team: Watford, away team: Wolves
result is 3 points for away team Wolves
Match score 0:2
______________________________________________________________ 

Match of 2021-09-18 00:00:00
Home team: Wolves, away team: Brentford
result is 0 points for home team Wolves
Match score 0:2
______________________________________________________________ 

Match of 2021-09-26 00:00:00
Home team: Southam

In [154]:
# Spot-check the running columns on Manchester City's 15 most recent rows (home or away).
train_with_current_points[(train_with_current_points.home_team == 'Manchester City')|(train_with_current_points.away_team == 'Manchester City')].tail(15)

Unnamed: 0,date,time,day_of_week,league,home_team,away_team,home_scored,away_scored,target,home_current_points,home_current_lose_streak,home_current_win_streak,away_current_points,away_current_lose_streak,away_current_win_streak
1698,2021-10-03,15:30,Sunday,premier-league-2021-2022,Liverpool,Manchester City,2,2,1,14,0,0,13,0,1
1702,2021-10-16,14:00,Saturday,premier-league-2021-2022,Manchester City,Burnley,2,0,3,14,0,0,3,0,0
1552,2021-10-23,16:30,Saturday,premier-league-2021-2022,Brighton,Manchester City,1,4,0,15,0,0,17,0,1
1550,2021-10-30,14:00,Saturday,premier-league-2021-2022,Manchester City,Crystal Palace,0,2,0,20,0,2,9,0,0
1557,2021-11-06,12:30,Saturday,premier-league-2021-2022,Manchester Utd,Manchester City,0,2,0,17,0,1,20,1,0
1631,2021-11-21,14:00,Sunday,premier-league-2021-2022,Manchester City,Everton,3,0,3,23,0,1,15,0,0
1647,2021-11-28,14:00,Sunday,premier-league-2021-2022,Manchester City,West Ham,2,1,3,26,0,2,23,1,0
1562,2021-12-01,20:15,Wednesday,premier-league-2021-2022,Aston Villa,Manchester City,1,2,0,16,0,2,29,0,3
1520,2021-12-04,17:30,Saturday,premier-league-2021-2022,Watford,Manchester City,1,3,0,13,2,0,32,0,4
1619,2021-12-11,12:30,Saturday,premier-league-2021-2022,Manchester City,Wolves,1,0,3,35,0,5,21,1,0


# ДАЛЬШЕ НЕ ЗАПУСКАТЬ!!!!!!!!!!!!!!

# Baseline model

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

In [None]:
# Wrap the training and validation sets in CatBoost Pool objects, marking the
# categorical columns by name.
# NOTE(review): X_train, y_train, X_val and y_val are not defined in any
# visible cell of this notebook — this cell fails on a fresh kernel.
cat_features = list(categorical_features)

Train = Pool(data=X_train,
             label=y_train,
             cat_features=cat_features)
            
Valid = Pool(data=X_val,
             label=y_val,
             cat_features=cat_features)

In [None]:
# Baseline multi-class CatBoost model: fixed seed, up to 1000 trees at
# learning rate 0.01, progress logged every 100 iterations, training stopped
# once the validation metric has not improved for 100 rounds.
# task_type="GPU" requires a CUDA-capable device.
model = CatBoostClassifier( random_seed = 17,     
                           n_estimators = 1000,
                           learning_rate = 0.01,
                            thread_count = -1, 
                            verbose = 100,  
                            loss_function='MultiClass',
                            task_type = "GPU",
                          early_stopping_rounds=100)


# Fit with the validation pool as eval set, then score that same pool.
model.fit(Train, eval_set=Valid)
preds_class = model.predict(Valid)
preds_proba = model.predict_proba(Valid)

In [None]:
# Share of validation matches whose predicted class equals the true class.
print(f'Validation ACCURACY SCORE: {accuracy_score(y_val, preds_class)}')

In [None]:
# One-vs-rest ROC AUC computed from the predicted class probabilities.
print(f'Validation ROC AUC SCORE: {roc_auc_score(y_val, preds_proba, multi_class="ovr")}')

In [None]:
# Same one-vs-rest ROC AUC as printed in the previous cell, shown as a bare value.
roc_auc_score(y_val, preds_proba, multi_class='ovr')

In [None]:
# Class probabilities for the hold-out matches.
# NOTE(review): X_test is not defined in any visible cell of this notebook.
test_proba = model.predict_proba(X_test)

In [None]:
# Display the transformed hold-out frame in full.
test

In [None]:
# First ten rows of predicted class probabilities.
test_proba[:10]

In [None]:
# Build name->code and code->name lookups from the `all_teams` sequence.
# NOTE(review): the only definition of `all_teams` is the commented-out line
# below, so this cell raises NameError on a fresh kernel. It also rebinds
# `decode_teams`, which was already returned by transformer.run_logic();
# confirm which mapping is the intended one before re-enabling.
# all_teams = train_data.sort_values(by='home_team').home_team.unique()
teams_labels = {team:number for number, team in enumerate(all_teams)}
decode_teams = {number:team for number, team in enumerate(all_teams)}

In [None]:
# Assemble one row per hold-out match: decoded team names, the model's three
# class probabilities, the three match-result rate columns and the true label.
# NOTE(review): probability columns 0/1/2 are labelled home win / draw / away
# win here. CatBoost orders probability columns by class label, and the
# `target` column shown earlier uses 0 / 1 / 3 from the home side's point of
# view (3 = home win). If y_train is that `target`, column 0 is the home-LOSS
# probability and these labels are swapped — confirm the label encoding.
# NOTE(review): X_test and y_test are not defined in any visible cell.
test_predictions = pd.DataFrame()
test_predictions['home_team'] = X_test.home_team.map(decode_teams)
test_predictions['away_team'] = X_test.away_team.map(decode_teams)
test_predictions['home_win_proba'] = test_proba[:,0]
test_predictions['draw_proba'] = test_proba[:,1]
test_predictions['away_win_proba'] = test_proba[:,2]
test_predictions['home_win_rate'] = X_test.home_win_rate
test_predictions['draw_rate'] = X_test.draw_rate
test_predictions['away_win_rate'] = X_test.away_win_rate
test_predictions['result'] = y_test


In [None]:
# Display the assembled prediction table.
test_predictions

# Считаем ROI

In [None]:
class ROIChecker():
    """Adds per-outcome ROI columns to a predictions frame.

    For each outcome the ROI is ``rate * probability - 1``, where ``rate`` is
    the ``*_rate`` column (presumably the bookmaker's decimal odds) and
    ``probability`` is the matching ``*_proba`` column.
    """

    def __init__(self, predictions):
        # Empty placeholder; nothing in this class fills it.
        self.roi_info = pd.DataFrame()
        # Kept by reference: get_roi() writes its columns into this very frame.
        self.predictions = predictions

    def get_roi(self):
        """Write the three ``*_ROI`` columns in place and return the same frame."""
        frame = self.predictions

        # (ROI column, rate column, probability column), in write order.
        outcome_columns = (
            ('home_win_ROI', 'home_win_rate', 'home_win_proba'),
            ('away_win_ROI', 'away_win_rate', 'away_win_proba'),
            ('draw_ROI', 'draw_rate', 'draw_proba'),
        )

        for roi_column, rate_column, proba_column in outcome_columns:
            frame[roi_column] = frame[rate_column] * frame[proba_column] - 1

        return frame

In [None]:
# Compute the ROI columns. get_roi() writes them into `test_predictions`
# itself and returns that same frame, so `roi_info` is the same object.
checker = ROIChecker(test_predictions)
roi_info = checker.get_roi()

In [None]:
# Peek at the predictions with their ROI columns.
roi_info.head()

In [None]:
def explain_roi_info(roi_info):
    """Print and collect the highest-ROI outcome for every match.

    For each row of ``roi_info`` the largest of the three ``*_ROI`` values is
    found and reported on stdout (or a notice when it is not positive). When
    several outcomes tie for the maximum, the one listed last in
    home / away / draw order is chosen; when none compares equal, the choice
    falls back to ``'home'``.

    Returns a new frame, indexed like ``roi_info``, with the team names, the
    best ROI, the chosen outcome (``'home'``, ``'away'`` or ``'draw'``), the
    three rate columns and the actual result. A row is recorded for every
    match, including those whose best ROI is not positive.
    """
    roi_columns = ['home_win_ROI', 'away_win_ROI', 'draw_ROI']
    summary = pd.DataFrame()

    for index, match in roi_info.iterrows():
        print(f"Match #{index+1}: {match.home_team} vs {match.away_team}")

        best_roi = np.max(roi_info.loc[index, roi_columns])

        # Last column that equals the maximum wins; default to the home column.
        tied_columns = [column for column in roi_columns if match[column] == best_roi]
        best_column = tied_columns[-1] if tied_columns else 'home_win_ROI'
        # Keep just the leading word of the column name: home / away / draw.
        outcome = best_column.split('_')[0]

        if best_roi > 0:
            print(f"Maximal ROI = {best_roi*100}% on {outcome}")
        else:
            print("Нет положительного ROI")

        print('_______________________________ \n')

        record = {
            'home_team': match.home_team,
            'away_team': match.away_team,
            'best_ROI': best_roi,
            'choice': outcome,
            'home_win_rate': match.home_win_rate,
            'draw_rate': match.draw_rate,
            'away_win_rate': match.away_win_rate,
            'result': match.result,
        }
        for column, value in record.items():
            summary.loc[index, column] = value

    return summary

In [None]:
# Pick the best-ROI outcome per match; also prints one report block per match.
best_roi_df = explain_roi_info(roi_info)

In [None]:
# Display the per-match best-ROI table.
best_roi_df

In [None]:
def money_score(best_roi_df, bet=100, max_rate=2.5):
    """Simulate flat-stake betting on each match's chosen outcome.

    Parameters
    ----------
    best_roi_df : DataFrame
        One row per match with a ``choice`` column (``'home'``, ``'draw'`` or
        ``'away'``), the ``home_win_rate`` / ``draw_rate`` / ``away_win_rate``
        columns and a ``result`` column coded 0 = home, 1 = draw, 2 = away.
    bet : number, default 100
        Stake placed on every accepted bet.
    max_rate : number, default 2.5
        Bets whose rate for the chosen outcome is strictly above this value
        are skipped. The default reproduces the previously hard-coded cut-off.

    Returns
    -------
    tuple
        ``(profit, skipped_bets, accepted_bets)``. A winning bet adds
        ``bet * (rate - 1)`` to the profit and prints the choice, rate and
        gain; a losing bet subtracts ``bet``.

    Notes
    -----
    NOTE(review): a bet is placed on every row that passes the rate filter,
    including rows whose ``best_ROI`` is not positive — confirm this is
    intended.
    """
    outcome_codes = {'home':0, 'draw':1, 'away':2}
    rate_columns = {'home':'home_win_rate', 'draw':'draw_rate', 'away':'away_win_rate'}
    profit = 0
    skipped_bets = 0
    accepted_bets = 0

    for _, row in best_roi_df.iterrows():
        rate = row[rate_columns[row.choice]]

        if rate > max_rate:
            skipped_bets += 1
            continue

        accepted_bets += 1

        if outcome_codes[row.choice] == row.result:
            current_profit = bet * (rate - 1)
            profit += current_profit
            print(row.choice, rate, current_profit)
        else:
            profit -= bet

    return profit, skipped_bets, accepted_bets
    

In [None]:
# Simulate betting 100 per accepted match; `score` is (profit, skipped, accepted).
score = money_score(best_roi_df, 100)

In [None]:
# Report the simulation: score[0] = profit, score[1] = skipped, score[2] = accepted.
print(f'Skipped bets: {score[1]} \nAccepted bets: {score[2]} \nProfit: {score[0]}$')

# Наброски фичей

In [None]:
# Mean of every numeric column per home team.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# remaining string/datetime columns raises instead of silently dropping them.
train_data.groupby(['home_team'],as_index=False).mean(numeric_only=True).head()

In [None]:
# Total `away_scored` per (away_team, season), i.e. goals each team scored away from home.
train_data.loc[:,['away_team', 'season', 'away_scored']].groupby(['away_team','season'], as_index=False).sum()