# Prepare ice hockey data from swehockey

In [1]:
import datetime
import shutil

import numpy as np
import pandas as pd

import swehockey.swehockey_scraper as swe

In [2]:
# Make a timestamped copy of the current data files before they are regenerated.
# The files are copied byte-for-byte rather than parsed with pandas and written
# back: a read_csv/to_csv round trip skips malformed lines
# (error_bad_lines=False) and adds an extra index column, so the result would
# not be a faithful backup of the original file.
filename = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M.csv")

data_dir = "C:/Users/marcu/Documents/projects/icehockey_analytics/data/"
backup_dir = data_dir + "00_backup/"

# Games, team-level data and the previous predictions
for dataset in ("df_games", "df_teams", "df_2020_prediction"):
    shutil.copy2(data_dir + dataset + ".csv",
                 backup_dir + dataset + "_" + filename)

In [3]:
# Scrape the latest games from swehockey and rebuild the game- and team-level files.

# Read in needed schedule-ids (dtype=str keeps the ids exactly as written in the file).
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# on those versions the equivalent is `on_bad_lines="skip"`.
df_scheduleid = pd.read_csv("https://raw.githubusercontent.com/msjoelin/swehockey_scraper/master/data/scheduleid.csv",
                            error_bad_lines=False,
                            dtype=str)

# Get games for schedule ids
games = swe.getGames(df_scheduleid)

# Clean up the output with cleanGames (a copy is passed in, `games` keeps the raw scrape)
df_games_clean = swe.cleanGames(games.copy())

# Write to the same absolute location that the backup and modelling cells use.
# A path relative to the kernel's working directory only reaches that file when
# the notebook happens to be started from the project root.
df_games_clean.to_csv("C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_games.csv", index=False)

# Create dataframe on team level
df_teams = swe.getTeamData(df_games_clean)

df_teams.to_csv("C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_teams.csv", index=False)

In [4]:
# sklearn will be used to build the model
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# To save the model
import pickle

In [5]:
# Build one row per game that carries the team statistics of both the home and
# the away side, then pick out the current season.

# Columns kept from the game file and from the team file
game_cols = ['date', 'game', 'score', 'league', 'season', 'game_id',
             'home', 'away', 'score_home', 'score_away', 'result']
team_cols = ['game_id', 'date', 'team', 'h_a',
             'win_R5', 'draw_R5', 'lost_R5',
             'H2H_W', 'H2H_D', 'H2H_L',
             'scored_avg_R5', 'conceded_avg_R5',
             'points_cum_prev_avg', 'points_cum_h_a_prev_avg']

df_games = pd.read_csv("C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_games.csv", error_bad_lines=False)[game_cols]
print(df_games.shape[0], " matches imported")

df_teams = pd.read_csv("C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_teams.csv", error_bad_lines=False)[team_cols]
print(df_teams.shape[0], " teamrows imported")

# Home and away rows each get their own column suffix; 'date' and 'team' are
# renamed back because they serve as merge keys below.
is_home = df_teams['h_a'] == 'home'
is_away = df_teams['h_a'] == 'away'
df_home = (df_teams[is_home].copy()
           .add_suffix('_home')
           .rename(columns={"date_home": "date", "team_home": "team"}))
df_away = (df_teams[is_away].copy()
           .add_suffix('_away')
           .rename(columns={"date_away": "date", "team_away": "team"}))

# Attach the home-team statistics first, then the away-team statistics.
# In the second merge 'team' exists on both sides, which yields 'team_x'/'team_y'.
df_games_trend = df_games.merge(df_home,
                                how='left',
                                left_on=['date', 'home'],
                                right_on=['date', 'team'])
df_games_trend = df_games_trend.merge(df_away,
                                      how='left',
                                      left_on=['date', 'away'],
                                      right_on=['date', 'team'])

# keep=False: a (date, game) pair that occurs more than once is removed entirely
df_games_trend = df_games_trend.drop_duplicates(subset=['date', 'game'], keep=False)

# Drop the merge helper columns and the away-side head-to-head columns
df_games_trend = df_games_trend.drop(columns=['H2H_W_away', 'H2H_D_away', 'H2H_L_away',
                                              'team_x', 'team_y',
                                              'game_id_home', 'game_id_away',
                                              'h_a_home', 'h_a_away'])

# Only keep games for which every remaining team statistic is available
feature_cols = ['win_R5_home', 'draw_R5_home', 'lost_R5_home',
                'H2H_W_home', 'H2H_D_home', 'H2H_L_home',
                'scored_avg_R5_home', 'conceded_avg_R5_home',
                'points_cum_prev_avg_home', 'points_cum_h_a_prev_avg_home',
                'win_R5_away', 'draw_R5_away', 'lost_R5_away',
                'scored_avg_R5_away', 'conceded_avg_R5_away',
                'points_cum_prev_avg_away', 'points_cum_h_a_prev_avg_away']
df_games_trend = df_games_trend.dropna(subset=feature_cols).reset_index(drop=True)

# Games of the current season
df_2020 = df_games_trend[df_games_trend['season'] == '2020/21']

print(f"Data current season: {df_2020.shape[0]}")



9912  matches imported
19824  teamrows imported
Data current season: 149


In [6]:

# Split df_2020 by column position: the first nine columns are kept as game
# info, column 10 is the target and every column from 11 onwards is a feature.
# (This relies on the column order df_2020 was built with.)
info_2020 = df_2020.iloc[:, :9].values
y_2020 = df_2020.iloc[:, 10].values
X_2020 = df_2020.iloc[:, 11:].values

# Standardise the features.
# NOTE(review): the scaler is fitted on the prediction data itself. If the
# model was trained on features scaled with a scaler fitted on the training
# data, that fitted scaler should be reused here instead - confirm.
scaler = StandardScaler()
X_2020 = scaler.fit(X_2020).transform(X_2020)

In [7]:
# Load the pickled model that is used for the predictions below.
# The context manager closes the file handle as soon as the model is loaded.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
with open("C:/Users/marcu/Documents/projects/icehockey_analytics/ml_models/rf_predict_game.sav", 'rb') as model_file:
    rf = pickle.load(model_file)



In [8]:
# Predicted outcome and per-class probabilities for every game in X_2020
y_pred2020_rf = rf.predict(X_2020)
y_pred2020_rf_prob = rf.predict_proba(X_2020)

# Rebuild the game-info frame from the raw info array; the nine names
# correspond to the nine info columns held in info_2020.
info_columns = ['date', 'game', 'score', 'league', 'season',
                'game_id', 'home', 'away', 'score_home']
df_2020 = pd.DataFrame(info_2020, columns=info_columns)

df_2020.head()

Unnamed: 0,date,game,score,league,season,game_id,home,away,score_home
0,2020-10-01,Leksands IF - Örebro HK,3 - 5,shl,2020/21,490420,Leksands IF,Örebro HK,3
1,2020-10-01,Luleå HF - HV 71,5 - 4,shl,2020/21,490424,Luleå HF,HV 71,5
2,2020-10-01,Skellefteå AIK - Frölunda HC,3 - 1,shl,2020/21,490425,Skellefteå AIK,Frölunda HC,3
3,2020-10-01,Växjö Lakers HC - Rögle BK,2 - 1,shl,2020/21,490427,Växjö Lakers HC,Rögle BK,2
4,2020-10-01,Linköping HC - Brynäs IF,5 - 4,shl,2020/21,490428,Linköping HC,Brynäs IF,5


In [9]:
# Set the outcome variables and probabilities
df_2020['actual'] = y_2020
df_2020['predicted'] = y_pred2020_rf

# The probability columns are taken in the column order of predict_proba.
# NOTE(review): assumes the model's classes are ordered 0, 1, 2
# (home, draw, away) - confirm against rf.classes_.
df_2020['prob_home'] = y_pred2020_rf_prob[:,0]
df_2020['prob_draw'] = y_pred2020_rf_prob[:,1]
df_2020['prob_away'] = y_pred2020_rf_prob[:,2]

# Map back sign outcome
mapping = {0:'home', 1:'draw', 2:'away'}
df_2020 = df_2020.replace({'actual': mapping, 'predicted': mapping})

# 1 when the prediction matches the actual outcome, otherwise 0
df_2020['correct']=0
df_2020.loc[df_2020['actual']==df_2020['predicted'],'correct']=1

# Check mean values by sign. The summary is printed explicitly because a bare
# expression is only displayed when it is the last statement of a cell.
print("Correct ratio by prediction")
print(df_2020.groupby(['predicted'])['correct'].mean())

# Save the predictions
df_2020.to_csv("C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_2020_prediction.csv", index=False)

Correct ratio by prediction


In [10]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Local prediction file and the id of the Google Drive file it is uploaded to
PREDICTION_CSV = "C:/Users/marcu/Documents/projects/icehockey_analytics/data/df_2020_prediction.csv"
DRIVE_FILE_ID = "15A9q85U8KiA5bC-rRINlz64QZJFSLvq9JafLUT1h5bw"

# Authenticate through a local webserver flow (opens a browser window)
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

# Reference the Drive file by id, attach the local CSV as its content and upload
update_file = drive.CreateFile({"id": DRIVE_FILE_ID})
update_file.SetContentFile(PREDICTION_CSV)
update_file.Upload()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=71573049861-amtqabdfctmnqg31vqgcrkdfnnuvf4sj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [11]:
 # Link to report https://datastudio.google.com/u/0/reporting/59fa9193-7c85-414e-a49a-582ce03d29c5/page/LLkjB/edit