In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from scipy.stats import gmean
import sys

In [2]:
!{sys.executable} -m pip install psycopg2-binary



In [3]:
database_connect = 'postgres://doadmin:rdxo4w05qb3vq10l@db-postgresql-fra1-36671-do-user-4768937-0.db.ondigitalocean.com:25060/hockey'

In [4]:
class DataFrameTransformer:
    def __init__(self, df):
        self.df = df
        self.transforms = []
    
    def add_transform(self, transform):
        self.transforms.append(transform)
    
    def add_transforms(self, transforms):
        for transform in self.transforms:
            self.add_transform(transform)
    
    def fit(self):
        for transform in transforms:
            self.df = transform(self.df)
        return self.df

In [5]:
table_player_stats_name = 'data_for_model3'
df_player_stats = pd.read_sql_table(table_player_stats_name, database_connect)
df_player_stats.sample(5)

Unnamed: 0,player_stats_id,player_stats_khl_id,season_id,team_id,player_id,games,points,goals,assists,penalty,...,player_height,player_weight,player_site_id,player_age,player_name,player_unicode_name,player_khl_id,player_nhl_id,season_year,season_type
12764,13699.0,,77,74,38007,79,43.0,21.0,22.0,58.0,...,191.0,102.0,37998,28.0,Chris Kreider\n ...,"chris kreider a.k.a. ""christopher kreider""",,8475184.0,2015,regular
29335,,5959.0,76,188,1000,20,11.0,5.0,6.0,18.0,...,170.0,76.0,9347,31.0,Justin Azevedo,justin azevedo,362.0,,2015,playoff
6339,6846.0,,62,89,9089,61,11.0,4.0,7.0,68.0,...,191.0,91.0,9093,41.0,Matt Bradley,matt bradley,,8465059.0,2010,regular
27206,,3285.0,88,200,287,17,3.0,1.0,2.0,27.0,...,181.0,78.0,262660,23.0,Artyom Sergeyev,artyom sergeyev,888.0,,2019,playoff
35244,,12447.0,89,15526,1758,30,7.0,1.0,6.0,11.0,...,187.0,88.0,223794,25.0,Alexander Akmaldinov,alexander akmaldinov,351.0,,2019,regular


In [6]:
table_team_stats_name = 'team_stats'
df_team_stats = pd.read_sql_table(table_team_stats_name, database_connect)
df_team_stats.sample(5)

Unnamed: 0,id,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
15314,16412,198,92,37,41.0,97,94,15,8.0,False,4.0,
2787,2977,46,91,35,,1,97,8,,True,,
5080,5439,203,91,37,,3,99,2,,True,,
2368,2530,20132,91,37,,3,82,17,,True,,
4045,4327,204,91,37,,2,89,18,,True,,


In [7]:
df_temp = df_team_stats[:]
df_temp.drop('id', axis=1, inplace=True)
df_team_stats = df_temp.drop_duplicates(keep='first')
df_team_stats = df_team_stats.reset_index(drop=True)
df_team_stats

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
0,216,85,4,,6,10,14,,True,,
1,216,85,56,,10,116,11,,True,,
2,216,80,60,100.0,163,137,11,7.0,False,5.0,
3,216,83,60,104.0,145,124,9,6.0,False,4.0,
4,216,82,5,,9,13,11,,True,,
...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22


## Features for teams

In [8]:
def add_feature_from_dict(df, d, feature_name, attr_name, season_count, step):
    df[feature_name] = None
    for k, v in d.items():
        s = 0
        for i in range(1, season_count + 1):
            if d.get(k - step * i) != None:
                s += d.get(k - step * i)
        df[feature_name][df[attr_name] == k] = s / season_count
    return df

In [9]:
# transform methods for seasons
def add_avg_games_for_season(df):
    d = df.groupby('season_id')['games'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_games_count_season',  'season_id', 3, 3)

def add_avg_points_for_season(df):
    d = df.groupby('season_id')['points'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_points_season',  'season_id', 3, 3)

def add_avg_goals_scored_for_season(df):
    d = df.groupby('season_id')['goals_scored'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_scored_season', 'season_id', 3, 3)

def add_avg_goals_missed_for_season(df):
    d = df.groupby('season_id')['goals_missed'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_missed_season', 'season_id', 3, 3)

def add_avg_position_in_championship_for_season(df):
    d = df.groupby('season_id')['position_in_championship'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_championship_season', 'season_id', 3, 3)

def add_avg_position_in_conference_for_season(df):
    d = df.groupby('season_id')['position_in_conference'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_conference_season', 'season_id', 3, 3)

def add_avg_position_in_division_for_season(df):
    d = df.groupby('season_id')['position_in_division'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_division_season', 'season_id', 3, 3)

def add_team_count_for_season(df):
    d = df.groupby('season_id').apply(lambda x: len(x['team_id'].unique())).to_dict()
    return add_feature_from_dict(df, d, 'feature_team_count_season', 'season_id', 3, 3)

In [10]:
transforms = [add_avg_games_for_season, add_avg_points_for_season, add_avg_position_in_division_for_season, add_avg_position_in_conference_for_season, add_avg_position_in_championship_for_season, add_avg_goals_missed_for_season, add_avg_goals_scored_for_season, add_team_count_for_season]
transformer = DataFrameTransformer(df_team_stats[:])
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a s

Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season
0,216,85,4,,6,10,14,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
1,216,85,56,,10,116,11,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,,82,92.2011,4.18637,7.89573,15.2557,224.189,225.177,28.6667
3,216,83,60,104.0,145,124,9,6.0,False,4.0,,78.3977,91.7713,4.13833,7.79673,15.1081,211.707,212.742,38.3333
4,216,82,5,,9,13,11,,True,,,43.3372,,,,7.69208,119.826,10.9299,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333


In [11]:
conda install progressbar2

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [12]:
import progressbar
def calculate_team_feature(df, feature_name, attr, season_count, step):
    print("Calculating feature for", attr)
    df[feature_name] = None
    bar = progressbar.ProgressBar(maxval=12, \
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    step = 1
    for ind in range(len(df)):
        season = df.loc[ind, 'season_id']
        team = df.loc[ind, 'team_id']
        s = 0
        for i in range(1, season_count + 1):
            s += df.loc[(df['season_id'] == season - step * i) & (df['team_id'] == team)][attr].sum()
        df.loc[ind, feature_name] = s / season_count
        if ind % 100 == 0:
            bar.update(step)
            step += 1
    bar.finish()
    return df

In [13]:
# transform methods for teams
def add_avg_games_for_team(df):
    return calculate_team_feature(df, 'feature_mean_games_count_team', 'games', 3, 3)

def add_avg_points_for_team(df):
    return calculate_team_feature(df, 'feature_mean_points_team', 'points', 3, 3)

def add_avg_goals_scored_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_scored_team', 'goals_scored', 3, 3)

def add_avg_goals_missed_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_missed_team', 'goals_missed', 3, 3)

def add_avg_position_in_championship_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_championship_team', 'position_in_championship', 3, 3)

def add_avg_position_in_conference_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_conference_team', 'position_in_conference', 3, 3)

def add_avg_position_in_division_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_division_team', 'position_in_division', 3, 3)

In [14]:
transforms = [add_avg_games_for_team, add_avg_points_for_team, add_avg_goals_scored_for_team, add_avg_goals_missed_for_team, add_avg_position_in_championship_for_team, add_avg_position_in_conference_for_team, add_avg_position_in_division_for_team]
transformer = DataFrameTransformer(df_team_feature)
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
                                                                               [                                                                        ] N/A%

Calculating feature for games


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
[                                                                        ] N/A%

Calculating feature for points


[                                                                        ] N/A%

Calculating feature for goals_scored


[                                                                        ] N/A%

Calculating feature for goals_missed


[                                                                        ] N/A%

Calculating feature for position_in_championship


[                                                                        ] N/A%

Calculating feature for position_in_conference


[                                                                        ] N/A%

Calculating feature for position_in_division




Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,...,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,216,85,4,,6,10,14,,True,,...,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333
1,216,85,56,,10,116,11,,True,,...,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,...,224.189,225.177,28.6667,21.6667,0,4.33333,56.3333,8.66667,0,0
3,216,83,60,104.0,145,124,9,6.0,False,4.0,...,211.707,212.742,38.3333,23.6667,0,11,65,6.33333,0,0
4,216,82,5,,9,13,11,,True,,...,119.826,10.9299,16,41.6667,33.3333,58.6667,102,12.3333,2.33333,1.66667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,...,202.57,203.18,48.3333,0,0,0,0,0,0,0
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,...,202.57,203.18,48.3333,27.3333,30.6667,85.3333,85,5.66667,3,1.33333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,...,202.57,203.18,48.3333,27.3333,32.6667,84.6667,76.3333,4,2,1
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,...,202.57,203.18,48.3333,27.3333,31.6667,85.3333,83.6667,4.66667,2.66667,1


## Features for player

In [15]:
def av_games_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_games_overall'] = ssum/count
                if not np.isnan(list(b[j]["games"])[i]):
                    count+=1
                    ssum+=list(b[j]["games"])[i]
        return pd.concat(b)
def av_goals_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_goals_overall'] = ssum/count
                if not np.isnan(list(b[j]["goals"])[i]):
                    count+=1
                    ssum+=list(b[j]["goals"])[i]
        return pd.concat(b)
def av_assists_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_assists_overall'] = ssum/count
                if not np.isnan(list(b[j]["assists"])[i]):
                    count+=1
                    ssum+=list(b[j]["assists"])[i]
        return pd.concat(b)
def av_penalty_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_penalty_overall'] = ssum/count
                if not np.isnan(list(b[j]["penalty"])[i]):
                    count+=1
                    ssum+=list(b[j]["penalty"])[i]
        return pd.concat(b)
def av_p_m_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_p_m_overall'] = ssum/count
                if not np.isnan(list(b[j]["p_m"])[i]):
                    count+=1
                    ssum+=list(b[j]["p_m"])[i]
        return pd.concat(b)
def av_p_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_+_overall'] = ssum/count
                if not np.isnan(list(b[j]["+"])[i]):
                    count+=1
                    ssum+=list(b[j]["+"])[i]
        return pd.concat(b)
def av_p_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_-_overall'] = ssum/count
                if not np.isnan(list(b[j]["-"])[i]):
                    count+=1
                    ssum+=list(b[j]["-"])[i]
        return pd.concat(b)
def av_esg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_esg_overall'] = ssum/count
                if not np.isnan(list(b[j]["esg"])[i]):
                    count+=1
                    ssum+=list(b[j]["esg"])[i]
        return pd.concat(b)
def av_ppg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_ppg_overall'] = ssum/count
                if not np.isnan(list(b[j]["ppg"])[i]):
                    count+=1
                    ssum+=list(b[j]["ppg"])[i]
        return pd.concat(b)
def av_shg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_shg_overall'] = ssum/count
                if not np.isnan(list(b[j]["shg"])[i]):
                    count+=1
                    ssum+=list(b[j]["shg"])[i]
        return pd.concat(b)
def av_otg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_otg_overall'] = ssum/count
                if not np.isnan(list(b[j]["otg"])[i]):
                    count+=1
                    ssum+=list(b[j]["otg"])[i]
        return pd.concat(b)
def av_gwg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_gwg_overall'] = ssum/count
                if not np.isnan(list(b[j]["gwg"])[i]):
                    count+=1
                    ssum+=list(b[j]["gwg"])[i]
        return pd.concat(b)
def av_sds_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sds_overall'] = ssum/count
                if not np.isnan(list(b[j]["sds"])[i]):
                    count+=1
                    ssum+=list(b[j]["sds"])[i]
        return pd.concat(b)
def av_sog_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sog_overall'] = ssum/count
                if not np.isnan(list(b[j]["sog"])[i]):
                    count+=1
                    ssum+=list(b[j]["sog"])[i]
        return pd.concat(b)
def av__SOG_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%SOG_overall'] = ssum/count
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    count+=1
                    ssum+=list(b[j]["%SOG"])[i]
        return pd.concat(b)
def av_S_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_S/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["S/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["S/G"])[i]
        return pd.concat(b)
def av_fo_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fo_overall'] = ssum/count
                if not np.isnan(list(b[j]["fo"])[i]):
                    count+=1
                    ssum+=list(b[j]["fo"])[i]
        return pd.concat(b)
def av_fow_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fow_overall'] = ssum/count
                if not np.isnan(list(b[j]["fow"])[i]):
                    count+=1
                    ssum+=list(b[j]["fow"])[i]
        return pd.concat(b)
def av__FO_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%FO_overall'] = ssum/count
                if not np.isnan(list(b[j]["%FO"])[i]):
                    count+=1
                    ssum+=list(b[j]["%FO"])[i]
        return pd.concat(b)
def av_TOI_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_TOI/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["TOI/G"])[i]
        return pd.concat(b)
def av_SFT_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_SFT/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["SFT/G"])[i]
        return pd.concat(b)
def av_hits_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_hits_overall'] = ssum/count
                if not np.isnan(list(b[j]["hits"])[i]):
                    count+=1
                    ssum+=list(b[j]["hits"])[i]
        return pd.concat(b)
def av_bls_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_bls_overall'] = ssum/count
                if not np.isnan(list(b[j]["bls"])[i]):
                    count+=1
                    ssum+=list(b[j]["bls"])[i]
        return pd.concat(b)
def av_foa_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 0:
                        b[j].loc[b[j].index[i], 'av_foa_overall'] = ssum/count
                    if not np.isnan(list(b[j]["foa"])[i]):
                        count+=1
                        ssum+=list(b[j]["foa"])[i]
        return pd.concat(b)

In [16]:
transforms = [av_foa_overall, av_bls_overall, av_hits_overall, av_SFT_G_overall, av_fow_overall, av_fo_overall, av_S_G_overall, av__SOG_overall, av_sog_overall, av_sds_overall, av_gwg_overall, av_otg_overall, av_shg_overall, av_ppg_overall, av_esg_overall, av_games_overall, av_goals_overall, av_assists_overall, av_penalty_overall, av_p_m_overall, av_p_overall, av_p_overall]
transformer = DataFrameTransformer(df_player_stats)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_overall,av_-_overall,...,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded
24236,100.0,12.9,53.0,47.0,4.1,27.8,25:17,30.0,,,...,217.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24235,-,0.0,0.0,2.0,3.0,28.7,32:26,1.0,12.900000,47.000000,...,9.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24234,-,7.7,51.0,58.0,3.3,25.0,25:33,27.0,6.450000,24.500000,...,183.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24232,-,0.0,0.0,4.0,3.8,24.5,27:31,2.0,6.866667,35.666667,...,15.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24233,50.0,5.3,51.0,41.0,4.2,27.1,27:17,27.0,5.150000,27.750000,...,225.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29314,25.0,0.0,1.0,3.0,2.0,23.2,18:10,2.0,9.527273,18.272727,...,8.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921
29313,43.8,14.0,16.0,14.0,1.9,19.3,14:27,9.0,8.733333,17.000000,...,50.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29312,37.5,0.0,1.0,6.0,1.4,18.2,12:20,2.0,9.138462,16.769231,...,11.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29310,11.1,3.6,24.0,29.0,2.4,22.9,16:09,17.0,8.485714,16.000000,...,111.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921


In [17]:
def av_games_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [18]:
transforms = [av_games_in_last_2,
av_goals_in_last_2,
av_assists_in_last_2,
av_penalty_in_last_2,
av_p_m_in_last_2,
av_p_in_last_2,
av_m_in_last_2,
av_esg_in_last_2,
av_ppg_in_last_2,
av_shg_in_last_2,
av_otg_in_last_2,
av_gwg_in_last_2,
av_sds_in_last_2,
av_sog_in_last_2,
av__SOG_in_last_2,
av_S_G_in_last_2,
av_fo_in_last_2,
av_fow_in_last_2,
av_SFT_G_in_last_2,
av_hits_in_last_2,
av_bls_in_last_2,
av_foa_in_last_2]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_overall,...,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded
24236,100.0,12.9,53.0,47.0,4.1,27.8,25:17,30.0,,,...,217.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24235,-,0.0,0.0,2.0,3.0,28.7,32:26,1.0,,12.900000,...,9.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24234,-,7.7,51.0,58.0,3.3,25.0,25:33,27.0,6.45,6.450000,...,183.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24232,-,0.0,0.0,4.0,3.8,24.5,27:31,2.0,3.85,6.866667,...,15.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24233,50.0,5.3,51.0,41.0,4.2,27.1,27:17,27.0,3.85,5.150000,...,225.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29314,25.0,0.0,1.0,3.0,2.0,23.2,18:10,2.0,5.25,9.527273,...,8.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921
29313,43.8,14.0,16.0,14.0,1.9,19.3,14:27,9.0,0.00,8.733333,...,50.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29312,37.5,0.0,1.0,6.0,1.4,18.2,12:20,2.0,7.00,9.138462,...,11.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29310,11.1,3.6,24.0,29.0,2.4,22.9,16:09,17.0,1.80,8.485714,...,111.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921


In [19]:
def av_games_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_games_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_+_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_-_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [20]:
transforms = [av_games_in_last_3,
av_goals_in_last_3,
av_assists_in_last_3,
av_penalty_in_last_3,
av_p_m_in_last_3,
av_p_in_last_3,
av_m_in_last_3,
av_esg_in_last_3,
av_ppg_in_last_3,
av_shg_in_last_3,
av_otg_in_last_3,
av_gwg_in_last_3,
av_sds_in_last_3,
av_sog_in_last_3,
av__SOG_in_last_3,
av_S_G_in_last_3,
av_fo_in_last_3,
av_fow_in_last_3,
av_SFT_G_in_last_3,
av_hits_in_last_3,
av_bls_in_last_3,
av_foa_in_last_3]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,...,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded
24236,100.0,12.9,53.0,47.0,4.1,27.8,25:17,30.0,,,...,217.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24235,-,0.0,0.0,2.0,3.0,28.7,32:26,1.0,,,...,9.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24234,-,7.7,51.0,58.0,3.3,25.0,25:33,27.0,6.45,,...,183.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24232,-,0.0,0.0,4.0,3.8,24.5,27:31,2.0,3.85,6.866667,...,15.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24233,50.0,5.3,51.0,41.0,4.2,27.1,27:17,27.0,3.85,2.566667,...,225.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29314,25.0,0.0,1.0,3.0,2.0,23.2,18:10,2.0,5.25,7.000000,...,8.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921
29313,43.8,14.0,16.0,14.0,1.9,19.3,14:27,9.0,0.00,3.500000,...,50.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29312,37.5,0.0,1.0,6.0,1.4,18.2,12:20,2.0,7.00,8.166667,...,11.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29310,11.1,3.6,24.0,29.0,2.4,22.9,16:09,17.0,1.80,5.866667,...,111.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921


## Merge datasets

In [21]:
df_player_feature['team_id'] = df_player_feature['team_id'].astype(int)
result_input_df = pd.merge(df_player_feature, df_team_feature, on=['team_id', 'season_id'], suffixes=('_player', '_team'))
pd.set_option('display.max_columns', 500)
result_input_df

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,av_%SOG_overall,av_+_in_last_2,av_+_in_last_3,av_-_in_last_2,av_-_in_last_3,av_-_overall,av_S/G_in_last_2,av_S/G_in_last_3,av_S/G_overall,av_SFT/G_in_last_2,av_SFT/G_in_last_3,av_SFT/G_overall,av_assists_in_last_2,av_assists_in_last_3,av_assists_overall,av_bls_in_last_2,av_bls_in_last_3,av_bls_overall,av_esg_in_last_2,av_esg_in_last_3,av_esg_overall,av_fo_in_last_2,av_fo_in_last_3,av_fo_overall,av_foa_in_last_2,av_foa_in_last_3,av_foa_overall,av_fow_in_last_2,av_fow_in_last_3,av_fow_overall,av_games_in_last_2,av_games_in_last_3,av_games_overall,av_goals_in_last_2,av_goals_in_last_3,av_goals_overall,av_gwg_in_last_2,av_gwg_in_last_3,av_gwg_overall,av_hits_in_last_2,av_hits_in_last_3,av_hits_overall,av_otg_in_last_2,av_otg_in_last_3,av_otg_overall,av_p_m_in_last_2,av_p_m_in_last_3,av_p_m_overall,av_penalty_in_last_2,av_penalty_in_last_3,av_penalty_overall,av_ppg_in_last_2,av_ppg_in_last_3,av_ppg_overall,av_sds_in_last_2,av_sds_in_last_3,av_sds_overall,av_shg_in_last_2,av_shg_in_last_3,av_shg_overall,av_sog_in_last_2,av_sog_in_last_3,av_sog_overall,bls,esg,fo,foa,fow,games_player,goals,gwg,hits,league_full_name,league_id,league_short_name,otg,p_m,penalty,player_age,player_height,player_id,player_khl_id,player_name,player_nationality,player_nhl_id,player_position,player_shoots,player_site_id,player_stats_id,player_stats_khl_id,player_unicode_name,player_weight,player_youth_team,points_player,ppg,sds,season_id,season_type,season_year,shg,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,games_team,points_team,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,-,7.800000,14.000000,10.000000,3.40000,25.800000,24:41,9.0,9.400000,7.766667,5.442857,35.000000,40.333333,38.000000,26.666667,29.428571,3.75000,3.900000,3.56250,24.300000,25.100000,26.314286,29.5,20.333333,17.857143,,,,8.500000,7.333333,5.750000,0.000000,0.000000,0.571429,,,,0.000000,0.333333,0.375000,52.5,37.333333,32.571429,9.5,10.333333,11.000000,1.000000,1.666667,2.250000,,,,0.00000,0.000000,0.285714,20.0,14.333333,2.714286,17.5,25.000000,37.875000,6.000000,4.000000,4.428571,0.000000,0.000000,0.142857,0.000000,0.000000,0.125000,128.000000,160.333333,130.000000,,2.000000,0.000000,,0.000000,15,4.0,0.000000,,Kontinental Hockey League,1,KHL,0.00000,4,4.0,38.0,178.0,0,417,Kevin Dallman,Kazakhstan ...,8469795.0,D,R,9129,,9.0,kevin dallman,88.0,-,13.0,2.000000,0.000000,70,playoff,2013,0.000000,51.000000,"St. Petersburg, RUS",Sportivnyi Klub Armii St. Petersburg,203,17.0,https://www.eliteprospects.com/league/khl,SKA St. Petersburg\n \n\n \nKHL,,199,1946,52,,6,182,1,,True,,,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0
1,-,7.800000,14.000000,10.000000,3.40000,25.800000,24:41,9.0,9.400000,7.766667,5.442857,35.000000,40.333333,38.000000,26.666667,29.428571,3.75000,3.900000,3.56250,24.300000,25.100000,26.314286,29.5,20.333333,17.857143,,,,8.500000,7.333333,5.750000,0.000000,0.000000,0.571429,,,,0.000000,0.333333,0.375000,52.5,37.333333,32.571429,9.5,10.333333,11.000000,1.000000,1.666667,2.250000,,,,0.00000,0.000000,0.285714,20.0,14.333333,2.714286,17.5,25.000000,37.875000,6.000000,4.000000,4.428571,0.000000,0.000000,0.142857,0.000000,0.000000,0.125000,128.000000,160.333333,130.000000,,2.000000,0.000000,,0.000000,15,4.0,0.000000,,Kontinental Hockey League,1,KHL,0.00000,4,4.0,38.0,178.0,0,417,Kevin Dallman,Kazakhstan ...,8469795.0,D,R,9129,,9.0,kevin dallman,88.0,-,13.0,2.000000,0.000000,70,playoff,2013,0.000000,51.000000,"St. Petersburg, RUS",Sportivnyi Klub Armii St. Petersburg,203,17.0,https://www.eliteprospects.com/league/khl,SKA St. Petersburg\n \n\n \nKHL,,199,1946,15,,56,30,4,,True,,,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0
2,0.0,3.100000,7.000000,7.000000,2.50000,16.000000,12:52,3.0,10.400000,8.800000,10.100000,10.500000,9.000000,9.500000,9.333333,11.200000,2.45000,2.300000,2.40000,15.950000,16.833333,18.400000,2.5,4.333333,6.800000,,,,3.000000,3.000000,4.400000,2.500000,2.000000,3.000000,,,,0.500000,0.666667,0.800000,19.0,20.333333,24.600000,4.0,3.666667,5.600000,1.000000,1.000000,1.600000,,,,0.50000,0.333333,0.200000,-2.5,-0.333333,1.600000,20.0,14.666667,20.800000,0.000000,0.333333,1.000000,0.000000,0.000000,0.000000,0.500000,0.333333,0.200000,59.500000,47.666667,60.600000,,1.000000,1.000000,,0.000000,13,1.0,0.000000,,Kontinental Hockey League,1,KHL,0.00000,0,6.0,40.0,182.0,1017,509,Maxim Afinogenov,Russia,8466202.0,RW,L,8672,,6213.0,maxim afinogenov,88.0,Dynamo Moskva,4.0,0.000000,0.000000,70,playoff,2013,0.000000,32.000000,"St. Petersburg, RUS",Sportivnyi Klub Armii St. Petersburg,203,17.0,https://www.eliteprospects.com/league/khl,SKA St. Petersburg\n \n\n \nKHL,,199,1946,52,,6,182,1,,True,,,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0
3,0.0,3.100000,7.000000,7.000000,2.50000,16.000000,12:52,3.0,10.400000,8.800000,10.100000,10.500000,9.000000,9.500000,9.333333,11.200000,2.45000,2.300000,2.40000,15.950000,16.833333,18.400000,2.5,4.333333,6.800000,,,,3.000000,3.000000,4.400000,2.500000,2.000000,3.000000,,,,0.500000,0.666667,0.800000,19.0,20.333333,24.600000,4.0,3.666667,5.600000,1.000000,1.000000,1.600000,,,,0.50000,0.333333,0.200000,-2.5,-0.333333,1.600000,20.0,14.666667,20.800000,0.000000,0.333333,1.000000,0.000000,0.000000,0.000000,0.500000,0.333333,0.200000,59.500000,47.666667,60.600000,,1.000000,1.000000,,0.000000,13,1.0,0.000000,,Kontinental Hockey League,1,KHL,0.00000,0,6.0,40.0,182.0,1017,509,Maxim Afinogenov,Russia,8466202.0,RW,L,8672,,6213.0,maxim afinogenov,88.0,Dynamo Moskva,4.0,0.000000,0.000000,70,playoff,2013,0.000000,32.000000,"St. Petersburg, RUS",Sportivnyi Klub Armii St. Petersburg,203,17.0,https://www.eliteprospects.com/league/khl,SKA St. Petersburg\n \n\n \nKHL,,199,1946,15,,56,30,4,,True,,,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0
4,52.4,19.400000,14.000000,4.000000,2.10000,19.700000,15:15,6.0,24.650000,20.866667,14.687500,22.000000,18.000000,11.000000,8.333333,5.250000,1.10000,0.933333,1.16250,16.150000,15.033333,14.337500,8.0,7.333333,5.125000,,,,6.000000,4.666667,3.375000,357.000000,311.000000,243.875000,,,,189.500000,164.000000,130.750000,31.5,29.333333,24.500000,7.0,5.333333,3.875000,0.500000,0.333333,0.625000,,,,0.00000,0.000000,0.000000,11.0,9.666667,6.250000,7.0,6.000000,5.000000,1.000000,0.666667,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,33.500000,27.333333,23.750000,,6.000000,185.000000,,97.000000,15,6.0,2.000000,,Kontinental Hockey League,1,KHL,0.00000,10,4.0,32.0,181.0,1031,712,Alexander Kucheryavenko,Russia,,C,R,9590,,6484.0,alexander kucheryavenko,86.0,HK Belgorod,12.0,0.000000,0.000000,70,playoff,2013,0.000000,31.000000,"St. Petersburg, RUS",Sportivnyi Klub Armii St. Petersburg,203,17.0,https://www.eliteprospects.com/league/khl,SKA St. Petersburg\n \n\n \nKHL,,199,1946,52,,6,182,1,,True,,,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31398,,6.896228,11.990429,12.097877,1.43948,19.302574,,5.0,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,26.0,25.333333,20.500000,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,68.0,70.333333,58.750000,14.0,15.666667,15.250000,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,-3.5,-3.000000,-2.583333,23.0,24.000000,21.500000,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,15.939426,2.357327,83.831912,4.587027,41.769503,13,2.0,0.607895,20.399939,National Hockey League,9,NHL,0.07005,2,8.0,48.0,180.0,9072,,Stu Barnes,Canada \n ...,8455521.0,C,R,9074,4205.0,,stu barnes,83.0,-,7.0,1.000563,0.082516,38,regular,2002,0.098328,40.651681,"Dallas, TX",,70,,https://www.eliteprospects.com/league/nhl,Dallas Stars\n \n\n \nNHL,25.0,59,1993,82,111.0,245,169,2,1.0,True,1.0,25,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667
31399,,6.896228,11.990429,12.097877,1.43948,19.302574,,20.0,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,13.5,9.000000,8.454545,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,51.0,34.333333,34.272727,4.5,3.000000,3.545455,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,2.0,1.333333,-2.272727,57.0,38.000000,27.090909,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,15.939426,2.357327,83.831912,4.587027,41.769503,80,7.0,0.607895,20.399939,National Hockey League,9,NHL,0.07005,28,94.0,46.0,191.0,9082,,Philippe Boucher,Canada,8458527.0,D,R,9087,12540.0,,philippe boucher,99.0,-,27.0,1.000563,0.082516,38,regular,2002,0.098328,40.651681,"Dallas, TX",,70,,https://www.eliteprospects.com/league/nhl,Dallas Stars\n \n\n \nNHL,25.0,59,1993,82,111.0,245,169,2,1.0,True,1.0,25,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667
31400,,6.896228,11.990429,12.097877,1.43948,19.302574,,1.0,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,0.0,0.000000,0.000000,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,19.5,13.333333,13.333333,0.5,0.333333,0.333333,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,-1.5,-1.000000,-1.000000,41.0,27.333333,27.333333,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,15.939426,2.357327,83.831912,4.587027,41.769503,43,1.0,0.607895,20.399939,National Hockey League,9,NHL,0.07005,1,69.0,45.0,184.0,9134,,Aaron Downey,Canada,8465992.0,RW,R,9139,9748.0,,aaron downey,98.0,-,2.0,1.000563,0.082516,38,regular,2002,0.098328,40.651681,"Dallas, TX",,70,,https://www.eliteprospects.com/league/nhl,Dallas Stars\n \n\n \nNHL,25.0,59,1993,82,111.0,245,169,2,1.0,True,1.0,25,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667
31401,,6.896228,11.990429,12.097877,1.43948,19.302574,,0.0,,,6.896228,,,,,12.097877,,,1.43948,,,19.302574,,,1.000000,,,15.939426,,,2.357327,,,83.831912,,,4.587027,,,41.769503,,,33.000000,,,0.000000,,,0.607895,,,20.399939,,,0.070050,,,-8.000000,,,62.000000,,,1.000563,,,0.082516,,,0.098328,,,40.651681,15.939426,2.357327,83.831912,4.587027,41.769503,16,2.0,0.607895,20.399939,National Hockey League,9,NHL,0.07005,1,29.0,39.0,193.0,9145,,John Erskine,Canada,8467365.0,D,L,9148,15565.0,,john erskine,100.0,-,2.0,1.000563,0.082516,38,regular,2002,0.098328,40.651681,"Dallas, TX",,70,,https://www.eliteprospects.com/league/nhl,Dallas Stars\n \n\n \nNHL,25.0,59,1993,82,111.0,245,169,2,1.0,True,1.0,25,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667


## Data preprocessing

In [24]:
print('Binary features are ')
for column in result_input_df.columns:
    if len(result_input_df[column].unique()) == 2:
        print(column, len(result_input_df[column].unique()))

Binary features are 
season_type 2
playoff_fact 2


In [25]:
print('Categorical features are ')
for column in result_input_df.columns:
    if len(result_input_df[column].unique()) > 2 and len(result_input_df[column].unique()) <= 100:
        print(column, len(result_input_df[column].unique()))

Categorical features are 
+ 64
- 55
S/G 52
assists 92
av_esg_in_last_2 53
av_esg_in_last_3 79
av_foa_in_last_2 69
av_foa_in_last_3 100
av_gwg_in_last_2 22
av_gwg_in_last_3 32
av_otg_in_last_2 9
av_otg_in_last_3 12
av_ppg_in_last_2 36
av_ppg_in_last_3 55
av_sds_in_last_2 13
av_sds_in_last_3 19
av_shg_in_last_2 9
av_shg_in_last_3 15
esg 26
foa 38
games_player 85
goals 68
gwg 12
league_full_name 3
league_id 3
league_short_name 3
otg 6
player_age 60
player_height 41
player_position 26
player_shoots 3
player_weight 95
ppg 20
sds 7
season_id 41
season_year 30
shg 6
team_country 54
team_full_name 12
team_id 57
team_khl_id 28
team_league_link 4
team_name 57
team_nhl_id 31
team_site_id 57
team_year_founded 35
games_team 33
position_in_championship 31
position_in_conference 17
position_in_division 9
nhl_id 31
feature_mean_games_count_season 24
feature_mean_points_season 31
feature_mean_position_in_division_season 30
feature_mean_position_in_conference_season 29
feature_mean_position_in_champions

In [26]:
def target_encoding(df, cat_name, target, weight):
    mean = df[target].mean()
    agg = df.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    # Replace each value by the according smoothed mean
    return df[cat_name].map(smooth)

In [27]:
def data_preprocessing(data_input):
    
    # Select bad columns and remove them
    cols_2_drop_team = ['games_team', 'points_team', 'goals_scored',
                        'goals_missed','position_in_championship', 'position_in_conference',
                        'playoff_fact', 'position_in_division',  'nhl_id']
    data_input = data_input.drop(cols_2_drop_team, axis=1)
    
#     Index([
#  
#       
#       
#        'player_age', 
#   ],
#       dtype='object')
    cols_2_drop_player = ['team_id', 'season_id', 'player_stats_id', 'player_stats_khl_id', 
                          'player_id', 'games_player', 'points_player',
                          'assists', 'penalty', 'p_m', '+', '-', 'esg', 'ppg',
                          'shg', 'otg', 'gwg', 'sds', 'sog', '%SOG', 'S/G', 'fo', 
                          'fow', '%FO', 'TOI/G', 'SFT/G', 'hits', 'bls', 'foa', 'league_id', 
                          'league_full_name', 'team_full_name', 'team_league_link', 'team_site_id',
                          'team_khl_id', 'team_nhl_id', 'player_site_id', 'player_khl_id', 'player_nhl_id',
                          'player_name', 'player_unicode_name']
    
    data_input = data_input.drop(cols_2_drop_player, axis=1)
    
    # Encode categorical features using OneHotEncoding
    data_input = pd.get_dummies(data_input, columns=['player_shoots'], prefix_sep='=')
    data_input = data_input.drop(['player_shoots=-'], axis=1)
    
    data_input = pd.get_dummies(data_input, columns=['league_short_name'], prefix_sep='=')
    data_input = data_input.drop(['league_short_name=VHL'], axis=1)
    
    # Encode categorical features using TargetEncoding
    target_column = 'goals'
    data_input['team_name'] = target_encoding(df=data_input, cat_name='team_name', target=target_column, weight=10)
    data_input['team_country'] = target_encoding(df=data_input, cat_name='team_country', target=target_column, weight=10)
    data_input['player_nationality'] = target_encoding(df=data_input, cat_name='player_nationality', target=target_column, weight=10)    
    data_input['player_youth_team'] = target_encoding(df=data_input, cat_name='player_youth_team', target=target_column, weight=10)
    data_input['player_position'] = target_encoding(df=data_input, cat_name='player_position', target=target_column, weight=10)      

    # Encode binary features
    data_input.loc[:, 'season_type'] = data_input['season_type'].replace({'regular': 0, 'playoff': 1})
    
    # Get actual player age
    cur_year = 2019
    data_input['player_age'] = data_input['player_age'] - cur_year + data_input['season_year']
    
    return data_input

In [30]:
result_input_df.to_csv("../data/data_with_feature.csv")

In [31]:
data_train = result_input_df[result_input_df['season_id'] != 86]
data_test = result_input_df[result_input_df['season_id'] == 86]

In [32]:
prep_data_train = data_preprocessing(data_train)
prep_data_test = data_preprocessing(data_test)
prep_data_train

Unnamed: 0,av_%SOG_in_last_2,av_%SOG_in_last_3,av_%SOG_overall,av_+_in_last_2,av_+_in_last_3,av_-_in_last_2,av_-_in_last_3,av_-_overall,av_S/G_in_last_2,av_S/G_in_last_3,av_S/G_overall,av_SFT/G_in_last_2,av_SFT/G_in_last_3,av_SFT/G_overall,av_assists_in_last_2,av_assists_in_last_3,av_assists_overall,av_bls_in_last_2,av_bls_in_last_3,av_bls_overall,av_esg_in_last_2,av_esg_in_last_3,av_esg_overall,av_fo_in_last_2,av_fo_in_last_3,av_fo_overall,av_foa_in_last_2,av_foa_in_last_3,av_foa_overall,av_fow_in_last_2,av_fow_in_last_3,av_fow_overall,av_games_in_last_2,av_games_in_last_3,av_games_overall,av_goals_in_last_2,av_goals_in_last_3,av_goals_overall,av_gwg_in_last_2,av_gwg_in_last_3,av_gwg_overall,av_hits_in_last_2,av_hits_in_last_3,av_hits_overall,av_otg_in_last_2,av_otg_in_last_3,av_otg_overall,av_p_m_in_last_2,av_p_m_in_last_3,av_p_m_overall,av_penalty_in_last_2,av_penalty_in_last_3,av_penalty_overall,av_ppg_in_last_2,av_ppg_in_last_3,av_ppg_overall,av_sds_in_last_2,av_sds_in_last_3,av_sds_overall,av_shg_in_last_2,av_shg_in_last_3,av_shg_overall,av_sog_in_last_2,av_sog_in_last_3,av_sog_overall,goals,player_age,player_height,player_nationality,player_position,player_weight,player_youth_team,season_type,season_year,team_country,team_name,team_year_founded,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team,player_shoots=L,player_shoots=R,league_short_name=KHL,league_short_name=NHL
0,9.400000,7.766667,5.442857,35.000000,40.333333,38.000000,26.666667,29.428571,3.75000,3.900000,3.56250,24.300000,25.100000,26.314286,29.5,20.333333,17.857143,,,,8.500000,7.333333,5.750000,0.000000,0.000000,0.571429,,,,0.000000,0.333333,0.375000,52.5,37.333333,32.571429,9.5,10.333333,11.000000,1.000000,1.666667,2.250000,,,,0.00000,0.000000,0.285714,20.0,14.333333,2.714286,17.5,25.000000,37.875000,6.000000,4.000000,4.428571,0.000000,0.000000,0.142857,0.000000,0.000000,0.125000,128.000000,160.333333,130.000000,4.0,32.0,178.0,6.360204,2.673695,88.0,5.798342,1,2013,3.035964,3.035964,1946,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0,0,1,1,0
1,9.400000,7.766667,5.442857,35.000000,40.333333,38.000000,26.666667,29.428571,3.75000,3.900000,3.56250,24.300000,25.100000,26.314286,29.5,20.333333,17.857143,,,,8.500000,7.333333,5.750000,0.000000,0.000000,0.571429,,,,0.000000,0.333333,0.375000,52.5,37.333333,32.571429,9.5,10.333333,11.000000,1.000000,1.666667,2.250000,,,,0.00000,0.000000,0.285714,20.0,14.333333,2.714286,17.5,25.000000,37.875000,6.000000,4.000000,4.428571,0.000000,0.000000,0.142857,0.000000,0.000000,0.125000,128.000000,160.333333,130.000000,4.0,32.0,178.0,6.360204,2.673695,88.0,5.798342,1,2013,3.035964,3.035964,1946,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0,0,1,1,0
2,10.400000,8.800000,10.100000,10.500000,9.000000,9.500000,9.333333,11.200000,2.45000,2.300000,2.40000,15.950000,16.833333,18.400000,2.5,4.333333,6.800000,,,,3.000000,3.000000,4.400000,2.500000,2.000000,3.000000,,,,0.500000,0.666667,0.800000,19.0,20.333333,24.600000,4.0,3.666667,5.600000,1.000000,1.000000,1.600000,,,,0.50000,0.333333,0.200000,-2.5,-0.333333,1.600000,20.0,14.666667,20.800000,0.000000,0.333333,1.000000,0.000000,0.000000,0.000000,0.500000,0.333333,0.200000,59.500000,47.666667,60.600000,1.0,34.0,182.0,3.348200,8.567880,88.0,6.097482,1,2013,3.035964,3.035964,1946,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0,1,0,1,0
3,10.400000,8.800000,10.100000,10.500000,9.000000,9.500000,9.333333,11.200000,2.45000,2.300000,2.40000,15.950000,16.833333,18.400000,2.5,4.333333,6.800000,,,,3.000000,3.000000,4.400000,2.500000,2.000000,3.000000,,,,0.500000,0.666667,0.800000,19.0,20.333333,24.600000,4.0,3.666667,5.600000,1.000000,1.000000,1.600000,,,,0.50000,0.333333,0.200000,-2.5,-0.333333,1.600000,20.0,14.666667,20.800000,0.000000,0.333333,1.000000,0.000000,0.000000,0.000000,0.500000,0.333333,0.200000,59.500000,47.666667,60.600000,1.0,34.0,182.0,3.348200,8.567880,88.0,6.097482,1,2013,3.035964,3.035964,1946,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0,1,0,1,0
4,24.650000,20.866667,14.687500,22.000000,18.000000,11.000000,8.333333,5.250000,1.10000,0.933333,1.16250,16.150000,15.033333,14.337500,8.0,7.333333,5.125000,,,,6.000000,4.666667,3.375000,357.000000,311.000000,243.875000,,,,189.500000,164.000000,130.750000,31.5,29.333333,24.500000,7.0,5.333333,3.875000,0.500000,0.333333,0.625000,,,,0.00000,0.000000,0.000000,11.0,9.666667,6.250000,7.0,6.000000,5.000000,1.000000,0.666667,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,33.500000,27.333333,23.750000,6.0,26.0,181.0,3.348200,8.594177,86.0,3.287684,1,2013,3.035964,3.035964,1946,46.4861,,,,3.06944,144.514,17.25,6.33333,18,0,1.66667,57,2.33333,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31398,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,26.0,25.333333,20.500000,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,68.0,70.333333,58.750000,14.0,15.666667,15.250000,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,-3.5,-3.000000,-2.583333,23.0,24.000000,21.500000,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,2.0,31.0,180.0,11.570001,8.594177,83.0,5.798342,0,2002,7.124342,7.124342,1993,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667,0,1,0,1
31399,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,13.5,9.000000,8.454545,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,51.0,34.333333,34.272727,4.5,3.000000,3.545455,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,2.0,1.333333,-2.272727,57.0,38.000000,27.090909,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,7.0,29.0,191.0,6.378095,2.673695,99.0,5.798342,0,2002,7.124342,7.124342,1993,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667,0,1,0,1
31400,6.896228,6.896228,6.896228,11.990429,11.990429,12.097877,12.097877,12.097877,1.43948,1.439480,1.43948,19.302574,19.302574,19.302574,0.0,0.000000,0.000000,15.939426,15.939426,15.939426,2.357327,2.357327,2.357327,83.831912,83.831912,83.831912,4.587027,4.587027,4.587027,41.769503,41.769503,41.769503,19.5,13.333333,13.333333,0.5,0.333333,0.333333,0.607895,0.607895,0.607895,20.399939,20.399939,20.399939,0.07005,0.070050,0.070050,-1.5,-1.000000,-1.000000,41.0,27.333333,27.333333,1.000563,1.000563,1.000563,0.082516,0.082516,0.082516,0.098328,0.098328,0.098328,40.651681,40.651681,40.651681,1.0,28.0,184.0,6.378095,8.567880,98.0,5.798342,0,2002,7.124342,7.124342,1993,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667,0,1,0,1
31401,,,6.896228,,,,,12.097877,,,1.43948,,,19.302574,,,1.000000,,,15.939426,,,2.357327,,,83.831912,,,4.587027,,,41.769503,,,33.000000,,,0.000000,,,0.607895,,,20.399939,,,0.070050,,,-8.000000,,,62.000000,,,1.000563,,,0.082516,,,0.098328,,,40.651681,2.0,22.0,193.0,6.378095,2.673695,100.0,5.798342,0,2002,7.124342,7.124342,1993,82,86.5595,2.92099,7.78864,15.0983,220.37,222.089,26.3333,82.6667,101,249.667,215.667,5,2.33333,1.66667,1,0,0,1


In [34]:
y_column = 'goals'
X_columns = prep_data_train.columns[prep_data_train.columns != y_column]
X_columns

Index(['av_%SOG_in_last_2', 'av_%SOG_in_last_3', 'av_%SOG_overall',
       'av_+_in_last_2', 'av_+_in_last_3', 'av_-_in_last_2', 'av_-_in_last_3',
       'av_-_overall', 'av_S/G_in_last_2', 'av_S/G_in_last_3',
       'av_S/G_overall', 'av_SFT/G_in_last_2', 'av_SFT/G_in_last_3',
       'av_SFT/G_overall', 'av_assists_in_last_2', 'av_assists_in_last_3',
       'av_assists_overall', 'av_bls_in_last_2', 'av_bls_in_last_3',
       'av_bls_overall', 'av_esg_in_last_2', 'av_esg_in_last_3',
       'av_esg_overall', 'av_fo_in_last_2', 'av_fo_in_last_3', 'av_fo_overall',
       'av_foa_in_last_2', 'av_foa_in_last_3', 'av_foa_overall',
       'av_fow_in_last_2', 'av_fow_in_last_3', 'av_fow_overall',
       'av_games_in_last_2', 'av_games_in_last_3', 'av_games_overall',
       'av_goals_in_last_2', 'av_goals_in_last_3', 'av_goals_overall',
       'av_gwg_in_last_2', 'av_gwg_in_last_3', 'av_gwg_overall',
       'av_hits_in_last_2', 'av_hits_in_last_3', 'av_hits_overall',
       'av_otg_in_last_2', 

In [35]:
X_train, y_train = prep_data_train[X_columns].values, prep_data_train[y_column].values
X_test, y_test = prep_data_test[X_columns].values, prep_data_test[y_column].values

## LightGBM

In [36]:
conda install -c conda-forge lightgbm

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/gvyarduhin/opt/anaconda3

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.8.2                |           py37_0         3.0 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following packages will be SUPERSEDED by a higher-priority channel:

  conda                                           pkgs/main --> conda-forge



Downloading and Extracting Packages
conda-4.8.2          | 3.0 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [41]:
import lightgbm as ltb

In [113]:
model = ltb.LGBMRegressor()
model.fit(X_train, y_train)
predicted_y = model.predict(X_test)

In [115]:
from sklearn.metrics import mean_squared_error, r2_score
print('MSE %.2f' % mean_squared_error(y_test, predicted_y))
print('R^2 %.2f' % r2_score(y_test, predicted_y))

MSE 24.52
R^2 0.61


## CatBoost

In [44]:
conda install -c conda-forge catboost

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/gvyarduhin/opt/anaconda3

  added / updated specs:
    - catboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    catboost-0.22              |           py37_0         9.7 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         9.7 MB

The following NEW packages will be INSTALLED:

  catboost           conda-forge/osx-64::catboost-0.22-py37_0



Downloading and Extracting Packages
catboost-0.22        | 9.7 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [46]:
from catboost import CatBoostRegressor

In [121]:
best_acc = 0
best_depth = 2
best_rate = 0.05
best_iterations = 500
bar = progressbar.ProgressBar(maxval=6*3 + 1, \
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
step = 1
for depth in [2, 3, 4, 5, 7, 10]:
        for iterations in [500, 1000, 2000]:   
            model = CatBoostRegressor(iterations=iterations, learning_rate=0.05, depth=depth, silent=True)
            # Fit model
            model.fit(X_train, y_train)
            predicted_y = model.predict(X_test)
            r2 = r2_score(y_test, predicted_y)
            if r2 > best_acc:
                best_acc = r2
                best_depth = depth
                best_iterations = iterations
                best_rate = rate
            bar.update(step)
            step += 1
bar.finish()
best_depth, best_iterations



(4, 0.85, 2000)

In [122]:
print('R^2 %.2f' % best_acc)

R^2 0.62


## PLS

In [49]:
from sklearn.cross_decomposition import PLSRegression

In [62]:
X_train_noNan = prep_data_train[X_columns].fillna(X_train.mean())
X_test_noNan = prep_data_test[X_columns].fillna(X_test.mean())

In [76]:
for column in X_columns:
    X_train_noNan[column].fillna(np.nanmean(X_train_noNan[column].values), inplace=True)
    X_test_noNan[column].fillna(np.nanmean(X_test_noNan[column].values), inplace=True)
X_test_noNan.isnull().sum().sum()

0

In [119]:
best_acc = 0
best_n = 2
best_iter = 500
for n in [2, 3, 4, 5, 7, 10, 15, 20, 30, 50, 85]:
    for max_iter in [500, 1000, 2000, 4000]:
        model = PLSRegression(n_components=n, max_iter=max_iter)
        # Fit model
        model.fit(X_train_noNan, y_train)
        predicted_y = model.predict(X_test_noNan)
        r2 = r2_score(y_test, predicted_y)
        if r2 > best_acc:
            best_acc = r2
            best_n = n
            best_iter = max_iter
best_n, best_iter

(85, 500)

In [120]:
print('R^2 %.2f' % best_acc)

R^2 0.54
