In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from scipy.stats import gmean
import sys

In [2]:
!{sys.executable} -m pip install psycopg2-binary



In [3]:
database_connect = 'postgres://doadmin:rdxo4w05qb3vq10l@db-postgresql-fra1-36671-do-user-4768937-0.db.ondigitalocean.com:25060/hockey'

In [4]:
class DataFrameTransformer:
    def __init__(self, df):
        self.df = df
        self.transforms = []
    
    def add_transform(self, transform):
        self.transforms.append(transform)
    
    def add_transforms(self, transforms):
        for transform in self.transforms:
            self.add_transform(transform)
    
    def fit(self):
        for transform in transforms:
            self.df = transform(self.df)
        return self.df

In [5]:
table_player_stats_name = 'data_for_model2'
df_player_stats = pd.read_sql_table(table_player_stats_name, database_connect)
df_player_stats.sample(5)

Unnamed: 0,player_stats_khl_id,season_id,team,№,gp,g,assists,pts,+/-,+,...,player_height,player_weight,player_site_id,player_age,player_name,player_unicode_name,player_khl_id,player_nhl_id,season_year,season_type
2788,7823,85,188,55,19,3.0,6.0,9.0,10,12.0,...,179.0,72.0,211956,24.0,Vladimir Tkachyov ...,"vladimir tkachyov a.k.a. ""vladimir tkachev""",845.0,,2018,playoff
11166,13188,86,649,69,12,0.0,1.0,1.0,-3,1.0,...,180.0,86.0,54802,25.0,Nikolajs Jelisejevs ...,"nikolajs jelisejevs a.k.a. ""nikolajs jelisejevs""",614.0,,2018,regular
262,3496,80,13644,48,25,0.0,6.0,6.0,-4,16.0,...,190.0,100.0,43819,27.0,Ivan Glazkov,ivan glazkov,245.0,,2016,regular
4383,10159,77,194,15,26,1.0,7.0,8.0,-12,6.0,...,193.0,103.0,9586,34.0,Grigori Shafigulin,grigori shafigulin,,,2015,regular
5684,6724,79,200,74,19,2.0,5.0,7.0,-4,9.0,...,185.0,83.0,536898,46.0,Nikolai Prokhorkin,nikolai prokhorkin,97.0,,2016,playoff


In [6]:
table_team_stats_name = 'team_stats'
df_team_stats = pd.read_sql_table(table_team_stats_name, database_connect)
df_team_stats.sample(5)

Unnamed: 0,id,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
19443,20831,216,92,38,42.0,106,102,12,7.0,False,3.0,
15940,17083,216,92,38,42.0,106,102,12,7.0,False,3.0,
12739,13653,649,91,37,,3,69,23,,True,,
8707,9331,931,91,35,,2,79,14,,True,,
12808,13727,188,91,36,,6,111,1,,True,,


In [7]:
df_temp = df_team_stats[:]
df_temp.drop('id', axis=1, inplace=True)
df_team_stats = df_temp.drop_duplicates(keep='first')
df_team_stats = df_team_stats.reset_index(drop=True)
df_team_stats

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
0,216,85,4,,6,10,14,,True,,
1,216,85,56,,10,116,11,,True,,
2,216,80,60,100.0,163,137,11,7.0,False,5.0,
3,216,83,60,104.0,145,124,9,6.0,False,4.0,
4,216,82,5,,9,13,11,,True,,
...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22


## Features for teams

In [8]:
def add_feature_from_dict(df, d, feature_name, attr_name, season_count, step):
    df[feature_name] = None
    for k, v in d.items():
        s = 0
        for i in range(1, season_count + 1):
            if d.get(k - step * i) != None:
                s += d.get(k - step * i)
        df[feature_name][df[attr_name] == k] = s / season_count
    return df

In [9]:
# transform methods for seasons
def add_avg_games_for_season(df):
    d = df.groupby('season_id')['games'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_games_count_season',  'season_id', 3, 3)

def add_avg_points_for_season(df):
    d = df.groupby('season_id')['points'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_points_season',  'season_id', 3, 3)

def add_avg_goals_scored_for_season(df):
    d = df.groupby('season_id')['goals_scored'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_scored_season', 'season_id', 3, 3)

def add_avg_goals_missed_for_season(df):
    d = df.groupby('season_id')['goals_missed'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_missed_season', 'season_id', 3, 3)

def add_avg_position_in_championship_for_season(df):
    d = df.groupby('season_id')['position_in_championship'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_championship_season', 'season_id', 3, 3)

def add_avg_position_in_conference_for_season(df):
    d = df.groupby('season_id')['position_in_conference'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_conference_season', 'season_id', 3, 3)

def add_avg_position_in_division_for_season(df):
    d = df.groupby('season_id')['position_in_division'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_division_season', 'season_id', 3, 3)

def add_team_count_for_season(df):
    d = df.groupby('season_id').apply(lambda x: len(x['team_id'].unique())).to_dict()
    return add_feature_from_dict(df, d, 'feature_team_count_season', 'season_id', 3, 3)

In [10]:
transforms = [add_avg_games_for_season, add_avg_points_for_season, add_avg_position_in_division_for_season, add_avg_position_in_conference_for_season, add_avg_position_in_championship_for_season, add_avg_goals_missed_for_season, add_avg_goals_scored_for_season, add_team_count_for_season]
transformer = DataFrameTransformer(df_team_stats[:])
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a s

Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season
0,216,85,4,,6,10,14,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
1,216,85,56,,10,116,11,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,,82,92.2011,4.18637,7.89573,15.2557,224.189,225.177,28.6667
3,216,83,60,104.0,145,124,9,6.0,False,4.0,,78.3977,91.7713,4.13833,7.79673,15.1081,211.707,212.742,38.3333
4,216,82,5,,9,13,11,,True,,,43.3372,,,,7.69208,119.826,10.9299,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333


In [11]:
conda install progressbar2

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [12]:
import progressbar
def calculate_team_feature(df, feature_name, attr, season_count, step):
    print("Calculating feature for", attr)
    df[feature_name] = None
    bar = progressbar.ProgressBar(maxval=12, \
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    step = 1
    for ind in range(len(df)):
        season = df.loc[ind, 'season_id']
        team = df.loc[ind, 'team_id']
        s = 0
        for i in range(1, season_count + 1):
            s += df.loc[(df['season_id'] == season - step * i) & (df['team_id'] == team)][attr].sum()
        df.loc[ind, feature_name] = s / season_count
        if ind % 100 == 0:
            bar.update(step)
            step += 1
    bar.finish()
    return df

In [13]:
# transform methods for teams
def add_avg_games_for_team(df):
    return calculate_team_feature(df, 'feature_mean_games_count_team', 'games', 3, 3)

def add_avg_points_for_team(df):
    return calculate_team_feature(df, 'feature_mean_points_team', 'points', 3, 3)

def add_avg_goals_scored_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_scored_team', 'goals_scored', 3, 3)

def add_avg_goals_missed_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_missed_team', 'goals_missed', 3, 3)

def add_avg_position_in_championship_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_championship_team', 'position_in_championship', 3, 3)

def add_avg_position_in_conference_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_conference_team', 'position_in_conference', 3, 3)

def add_avg_position_in_division_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_division_team', 'position_in_division', 3, 3)

In [14]:
transforms = [add_avg_games_for_team, add_avg_points_for_team, add_avg_goals_scored_for_team, add_avg_goals_missed_for_team, add_avg_position_in_championship_for_team, add_avg_position_in_conference_for_team, add_avg_position_in_division_for_team]
transformer = DataFrameTransformer(df_team_feature)
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
                                                                               [                                                                        ] N/A%

Calculating feature for games


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
[                                                                        ] N/A%

Calculating feature for points


[                                                                        ] N/A%

Calculating feature for goals_scored


[                                                                        ] N/A%

Calculating feature for goals_missed


[                                                                        ] N/A%

Calculating feature for position_in_championship


[                                                                        ] N/A%

Calculating feature for position_in_conference


[                                                                        ] N/A%

Calculating feature for position_in_division




Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,...,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,216,85,4,,6,10,14,,True,,...,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333
1,216,85,56,,10,116,11,,True,,...,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,...,224.189,225.177,28.6667,21.6667,0,4.33333,56.3333,8.66667,0,0
3,216,83,60,104.0,145,124,9,6.0,False,4.0,...,211.707,212.742,38.3333,23.6667,0,11,65,6.33333,0,0
4,216,82,5,,9,13,11,,True,,...,119.826,10.9299,16,41.6667,33.3333,58.6667,102,12.3333,2.33333,1.66667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,...,202.57,203.18,48.3333,0,0,0,0,0,0,0
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,...,202.57,203.18,48.3333,27.3333,30.6667,85.3333,85,5.66667,3,1.33333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,...,202.57,203.18,48.3333,27.3333,32.6667,84.6667,76.3333,4,2,1
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,...,202.57,203.18,48.3333,27.3333,31.6667,85.3333,83.6667,4.66667,2.66667,1


## Features for player

In [15]:
def av_games_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_games_overall'] = ssum/count
                if not np.isnan(list(b[j]["games"])[i]):
                    count+=1
                    ssum+=list(b[j]["games"])[i]
        return pd.concat(b)
def av_goals_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_goals_overall'] = ssum/count
                if not np.isnan(list(b[j]["goals"])[i]):
                    count+=1
                    ssum+=list(b[j]["goals"])[i]
        return pd.concat(b)
def av_assists_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_assists_overall'] = ssum/count
                if not np.isnan(list(b[j]["assists"])[i]):
                    count+=1
                    ssum+=list(b[j]["assists"])[i]
        return pd.concat(b)
def av_penalty_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_penalty_overall'] = ssum/count
                if not np.isnan(list(b[j]["penalty"])[i]):
                    count+=1
                    ssum+=list(b[j]["penalty"])[i]
        return pd.concat(b)
def av_p_m_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_p_m_overall'] = ssum/count
                if not np.isnan(list(b[j]["p_m"])[i]):
                    count+=1
                    ssum+=list(b[j]["p_m"])[i]
        return pd.concat(b)
def av_p_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_+_overall'] = ssum/count
                if not np.isnan(list(b[j]["+"])[i]):
                    count+=1
                    ssum+=list(b[j]["+"])[i]
        return pd.concat(b)
def av_m_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_-_overall'] = ssum/count
                if not np.isnan(list(b[j]["-"])[i]):
                    count+=1
                    ssum+=list(b[j]["-"])[i]
        return pd.concat(b)
def av_esg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_esg_overall'] = ssum/count
                if not np.isnan(list(b[j]["esg"])[i]):
                    count+=1
                    ssum+=list(b[j]["esg"])[i]
        return pd.concat(b)
def av_ppg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_ppg_overall'] = ssum/count
                if not np.isnan(list(b[j]["ppg"])[i]):
                    count+=1
                    ssum+=list(b[j]["ppg"])[i]
        return pd.concat(b)
def av_shg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_shg_overall'] = ssum/count
                if not np.isnan(list(b[j]["shg"])[i]):
                    count+=1
                    ssum+=list(b[j]["shg"])[i]
        return pd.concat(b)
def av_otg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_otg_overall'] = ssum/count
                if not np.isnan(list(b[j]["otg"])[i]):
                    count+=1
                    ssum+=list(b[j]["otg"])[i]
        return pd.concat(b)
def av_gwg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_gwg_overall'] = ssum/count
                if not np.isnan(list(b[j]["gwg"])[i]):
                    count+=1
                    ssum+=list(b[j]["gwg"])[i]
        return pd.concat(b)
def av_sds_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sds_overall'] = ssum/count
                if not np.isnan(list(b[j]["sds"])[i]):
                    count+=1
                    ssum+=list(b[j]["sds"])[i]
        return pd.concat(b)
def av_sog_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sog_overall'] = ssum/count
                if not np.isnan(list(b[j]["sog"])[i]):
                    count+=1
                    ssum+=list(b[j]["sog"])[i]
        return pd.concat(b)
def av__SOG_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%SOG_overall'] = ssum/count
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    count+=1
                    ssum+=list(b[j]["%SOG"])[i]
        return pd.concat(b)
def av_S_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_S/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["S/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["S/G"])[i]
        return pd.concat(b)
def av_fo_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fo_overall'] = ssum/count
                if not np.isnan(list(b[j]["fo"])[i]):
                    count+=1
                    ssum+=list(b[j]["fo"])[i]
        return pd.concat(b)
def av_fow_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fow_overall'] = ssum/count
                if not np.isnan(list(b[j]["fow"])[i]):
                    count+=1
                    ssum+=list(b[j]["fow"])[i]
        return pd.concat(b)
def av__FO_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%FO_overall'] = ssum/count
                if not np.isnan(list(b[j]["%FO"])[i]):
                    count+=1
                    ssum+=list(b[j]["%FO"])[i]
        return pd.concat(b)
def av_TOI_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_TOI/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["TOI/G"])[i]
        return pd.concat(b)
def av_SFT_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_SFT/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["SFT/G"])[i]
        return pd.concat(b)
def av_hits_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_hits_overall'] = ssum/count
                if not np.isnan(list(b[j]["hits"])[i]):
                    count+=1
                    ssum+=list(b[j]["hits"])[i]
        return pd.concat(b)
def av_bls_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_bls_overall'] = ssum/count
                if not np.isnan(list(b[j]["bls"])[i]):
                    count+=1
                    ssum+=list(b[j]["bls"])[i]
        return pd.concat(b)
def av_foa_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 0:
                        b[j].loc[b[j].index[i], 'av_foa_overall'] = ssum/count
                    if not np.isnan(list(b[j]["foa"])[i]):
                        count+=1
                        ssum+=list(b[j]["foa"])[i]
        return pd.concat(b)

In [16]:
df_player_stats = df_player_stats.rename({'player_name': 'player_id', 'team': 'team_id', 'gp': 'games', 'g': 'goals', 'pim': 'penalty', '+/-': 'p_m'}, axis=1)
df_player_stats.columns

Index(['player_stats_khl_id', 'season_id', 'team_id', '№', 'games', 'goals',
       'assists', 'pts', 'p_m', '+', '-', 'penalty', 'esg', 'ppg', 'shg',
       'otg', 'gwg', 'sds', 'sog', '%SOG', 'S/G', 'fo', 'fow', '%FO', 'TOI/G',
       'SFT/G', 'hits', 'bls', 'foa', 'league_id', 'league_short_name',
       'league_full_name', 'team_name', 'team_country', 'team_year_founded',
       'team_full_name', 'team_league_link', 'team_site_id', 'team_khl_id',
       'team_nhl_id', 'player_nationality', 'player_youth_team',
       'player_position', 'player_shoots', 'player_height', 'player_weight',
       'player_site_id', 'player_age', 'player_id', 'player_unicode_name',
       'player_khl_id', 'player_nhl_id', 'season_year', 'season_type'],
      dtype='object')

In [21]:
transforms = [av_m_overall, av_foa_overall, av_bls_overall, av_hits_overall, av_SFT_G_overall, av_fow_overall, av_fo_overall, av_S_G_overall, av__SOG_overall, av_sog_overall, av_sds_overall, av_gwg_overall, av_otg_overall, av_shg_overall, av_ppg_overall, av_esg_overall, av_games_overall, av_goals_overall, av_assists_overall, av_penalty_overall, av_p_m_overall, av_p_overall]
transformer = DataFrameTransformer(df_player_stats)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_overall,av_+_overall,...,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,№
1522,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,6.200000,,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1521,47.3,6.2,14.0,12.0,2.3,18.1,14:08,5.0,,0.000000,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1449,50.0,16.7,9.0,7.0,2.7,19.7,16:22,4.0,2.066667,7.000000,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
1450,33.3,0.0,0.0,4.0,1.5,21.0,15:52,0.0,3.100000,7.666667,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
7905,-,0.0,13.0,11.0,1.8,24.3,21:12,4.0,,,...,"Cherepovets, RUS",HK Severstal Cherepovets,205,4.0,https://www.eliteprospects.com/league/khl,Severstal Cherepovets\n \n\n \nKHL,,198,1956,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4881,-,0.0,5.0,4.0,0.8,21.2,13:48,1.0,3.820000,8.400000,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,88
285,-,0.0,0.0,1.0,0.8,23.5,18:45,1.0,3.183333,7.833333,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
294,-,3.3,12.0,8.0,2.7,21.4,18:37,8.0,2.728571,6.714286,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
4865,-,0.0,3.0,1.0,1.4,24.1,15:37,1.0,2.800000,7.375000,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,84


In [22]:
def av_games_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [23]:
transforms = [av_games_in_last_2,
av_goals_in_last_2,
av_assists_in_last_2,
av_penalty_in_last_2,
av_p_m_in_last_2,
av_p_in_last_2,
av_m_in_last_2,
av_esg_in_last_2,
av_ppg_in_last_2,
av_shg_in_last_2,
av_otg_in_last_2,
av_gwg_in_last_2,
av_sds_in_last_2,
av_sog_in_last_2,
av__SOG_in_last_2,
av_S_G_in_last_2,
av_fo_in_last_2,
av_fow_in_last_2,
av_SFT_G_in_last_2,
av_hits_in_last_2,
av_bls_in_last_2,
av_foa_in_last_2]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_overall,...,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,№
1522,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,,6.200000,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1521,47.3,6.2,14.0,12.0,2.3,18.1,14:08,5.0,,,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1449,50.0,16.7,9.0,7.0,2.7,19.7,16:22,4.0,0.00,2.066667,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
1450,33.3,0.0,0.0,4.0,1.5,21.0,15:52,0.0,3.10,3.100000,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
7905,-,0.0,13.0,11.0,1.8,24.3,21:12,4.0,,,...,"Cherepovets, RUS",HK Severstal Cherepovets,205,4.0,https://www.eliteprospects.com/league/khl,Severstal Cherepovets\n \n\n \nKHL,,198,1956,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4881,-,0.0,5.0,4.0,0.8,21.2,13:48,1.0,2.00,3.820000,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,88
285,-,0.0,0.0,1.0,0.8,23.5,18:45,1.0,2.00,3.183333,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
294,-,3.3,12.0,8.0,2.7,21.4,18:37,8.0,0.00,2.728571,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
4865,-,0.0,3.0,1.0,1.4,24.1,15:37,1.0,1.65,2.800000,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,84


In [24]:
def av_games_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_games_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_+_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_-_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [25]:
transforms = [av_games_in_last_3,
av_goals_in_last_3,
av_assists_in_last_3,
av_penalty_in_last_3,
av_p_m_in_last_3,
av_p_in_last_3,
av_m_in_last_3,
av_esg_in_last_3,
av_ppg_in_last_3,
av_shg_in_last_3,
av_otg_in_last_3,
av_gwg_in_last_3,
av_sds_in_last_3,
av_sog_in_last_3,
av__SOG_in_last_3,
av_S_G_in_last_3,
av_fo_in_last_3,
av_fow_in_last_3,
av_SFT_G_in_last_3,
av_hits_in_last_3,
av_bls_in_last_3,
av_foa_in_last_3]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,...,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,№
1522,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,,,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1521,47.3,6.2,14.0,12.0,2.3,18.1,14:08,5.0,,,...,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17
1449,50.0,16.7,9.0,7.0,2.7,19.7,16:22,4.0,0.00,2.066667,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
1450,33.3,0.0,0.0,4.0,1.5,21.0,15:52,0.0,3.10,,...,"Minsk, BLR",,1577,10.0,https://www.eliteprospects.com/league/khl,Dinamo Minsk\n \n\n \nKHL,,1678,1976,21
7905,-,0.0,13.0,11.0,1.8,24.3,21:12,4.0,,,...,"Cherepovets, RUS",HK Severstal Cherepovets,205,4.0,https://www.eliteprospects.com/league/khl,Severstal Cherepovets\n \n\n \nKHL,,198,1956,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4881,-,0.0,5.0,4.0,0.8,21.2,13:48,1.0,2.00,1.333333,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,88
285,-,0.0,0.0,1.0,0.8,23.5,18:45,1.0,2.00,1.333333,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
294,-,3.3,12.0,8.0,2.7,21.4,18:37,8.0,0.00,1.333333,...,"Sochi, RUS",,15526,13.0,https://www.eliteprospects.com/league/khl,HK Sochi\n \n\n \nKHL,,17166,2014,8
4865,-,0.0,3.0,1.0,1.4,24.1,15:37,1.0,1.65,1.100000,...,Yaroslavl,,198,26.0,https://www.eliteprospects.com/league/khl,Lokomotiv Yaroslavl\n \n\n \nKHL,,191,1949,84


## Features for league

In [26]:
def med_goals_per_season(df):
    a = df.groupby('season_id')['goals'].median()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['featrue_median_goals_previous_season']))
    return df

def med_assists_per_season(df):
    a = df.groupby('season_id')['assists'].median()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['featrue_median_assists_previous_season']))
    return df


def med_penalty_per_season(df):
    a = df.groupby('season_id')['penalty'].median()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['featrue_median_penalty_previous_season']))
    return df


# Average goals per season
def avg_goals_per_season(df):
    a = df.groupby('season_id')['goals'].mean()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['feature_mean_goals_preious_season']))
    return df

# Average assists per season
def avg_assists_per_season(df):
    a = df.groupby('season_id')['assists'].mean()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['feature_mean_assists_previous_season']))
    return df

# Sum of penalty per season
def avg_penalty_per_season(df):
    a = df.groupby('season_id')['penalty'].mean()
    b = a.copy()
    for ind, point in b.iteritems():
        if ind < 3 or ind > 89:
            a[ind] == np.nan
        else:
            a[ind - 3] = b[ind]
    df = df.join(a, on='season_id', rsuffix='_')
    df.columns=np.append(np.array(df.columns[:-1]), np.array(['feature_mean_penalty_season']))
    return df

In [27]:
_tpr = df_player_feature.copy()

In [28]:
transforms = [med_goals_per_season,
med_assists_per_season,
med_penalty_per_season,
avg_goals_per_season,
avg_assists_per_season,
avg_penalty_per_season]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,...,team_nhl_id,team_site_id,team_year_founded,№,featrue_median_goals_previous_season,featrue_median_assists_previous_season,featrue_median_penalty_previous_season,feature_mean_goals_preious_season,feature_mean_assists_previous_season,feature_mean_penalty_season
1522,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,,,...,,1724,2006,17,0.0,1.0,4.0,0.996904,1.507740,7.383901
1521,47.3,6.2,14.0,12.0,2.3,18.1,14:08,5.0,,,...,,1724,2006,17,3.0,5.0,16.0,4.386445,6.877527,21.638526
1449,50.0,16.7,9.0,7.0,2.7,19.7,16:22,4.0,0.00,2.066667,...,,1678,1976,21,2.0,4.0,14.0,3.916760,6.052868,18.976378
1450,33.3,0.0,0.0,4.0,1.5,21.0,15:52,0.0,3.10,,...,,1678,1976,21,0.0,1.0,2.0,1.003040,1.586626,6.006079
7905,-,0.0,13.0,11.0,1.8,24.3,21:12,4.0,,,...,,198,1956,55,3.0,5.0,16.0,4.386445,6.877527,21.638526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4881,-,0.0,5.0,4.0,0.8,21.2,13:48,1.0,2.00,1.333333,...,,191,1949,88,3.0,4.0,14.0,4.506866,6.960050,17.739076
285,-,0.0,0.0,1.0,0.8,23.5,18:45,1.0,2.00,1.333333,...,,17166,2014,8,1.0,1.0,4.0,1.065868,1.634731,5.182635
294,-,3.3,12.0,8.0,2.7,21.4,18:37,8.0,0.00,1.333333,...,,17166,2014,8,3.0,4.0,14.0,4.506866,6.960050,17.739076
4865,-,0.0,3.0,1.0,1.4,24.1,15:37,1.0,1.65,1.100000,...,,191,1949,84,3.0,4.0,14.0,4.506866,6.960050,17.739076


## Merge datasets

In [29]:
df_player_feature['team_id'] = df_player_feature['team_id'].astype(int)
result_input_df = pd.merge(df_player_feature, df_team_feature, on=['team_id', 'season_id'], suffixes=('_player', '_team'))
pd.set_option('display.max_columns', 500)
result_input_df

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,av_%SOG_overall,av_+_in_last_2,av_+_in_last_3,av_+_overall,av_-_in_last_2,av_-_in_last_3,av_-_overall,av_S/G_in_last_2,av_S/G_in_last_3,av_S/G_overall,av_SFT/G_in_last_2,av_SFT/G_in_last_3,av_SFT/G_overall,av_assists_in_last_2,av_assists_in_last_3,av_assists_overall,av_bls_in_last_2,av_bls_in_last_3,av_bls_overall,av_esg_in_last_2,av_esg_in_last_3,av_esg_overall,av_fo_in_last_2,av_fo_in_last_3,av_fo_overall,av_foa_in_last_2,av_foa_in_last_3,av_foa_overall,av_fow_in_last_2,av_fow_in_last_3,av_fow_overall,av_games_in_last_2,av_games_in_last_3,av_games_overall,av_goals_in_last_2,av_goals_in_last_3,av_goals_overall,av_gwg_in_last_2,av_gwg_in_last_3,av_gwg_overall,av_hits_in_last_2,av_hits_in_last_3,av_hits_overall,av_otg_in_last_2,av_otg_in_last_3,av_otg_overall,av_p_m_in_last_2,av_p_m_in_last_3,av_p_m_overall,av_penalty_in_last_2,av_penalty_in_last_3,av_penalty_overall,av_ppg_in_last_2,av_ppg_in_last_3,av_ppg_overall,av_sds_in_last_2,av_sds_in_last_3,av_sds_overall,av_shg_in_last_2,av_shg_in_last_3,av_shg_overall,av_sog_in_last_2,av_sog_in_last_3,av_sog_overall,bls,esg,fo,foa,fow,games_player,goals,gwg,hits,league_full_name,league_id,league_short_name,otg,p_m,penalty,player_age,player_height,player_id,player_khl_id,player_nationality,player_nhl_id,player_position,player_shoots,player_site_id,player_stats_khl_id,player_unicode_name,player_weight,player_youth_team,ppg,pts,sds,season_id,season_type,season_year,shg,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,№,featrue_median_goals_previous_season,featrue_median_assists_previous_season,featrue_median_penalty_previous_season,feature_mean_goals_preious_season,feature_mean_assists_previous_season,feature_mean_penalty_season,games_team,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,,,6.200000,,,,,,12.000000,,,,,,18.100000,,,5.000000,,,7.000000,,,,,,55.000000,,,,,,,,,28.000000,,,,,,,,,,,,0.000000,,,2.000000,,,,,,0.000000,,,2.000000,,,,,,,0.0,0.0,1.0,1.0,0.0,5,0.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,-2,0.0,30.0,183.0,Aaron Palushaj,1046,USA,8474030.0,RW/LW,R,12513,11778,aaron palushaj,87.0,-,0,0.0,0.0,76,playoff,2015,0,4.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,,,,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
1,0.0,12.5,1.0,2.0,2.0,20.2,14:34,0.0,8.70,7.166667,9.230000,4.0,6.333333,8.800000,7.0,9.333333,7.800000,1.35,1.433333,1.580000,16.55,16.733333,15.800000,1.0,3.000000,3.500000,6.5,,8.000000,3.5,3.333333,3.100000,2.5,1.666667,1.500000,4.5,,6.000000,1.0,0.666667,0.500000,19.0,27.666667,29.100000,4.0,3.666667,3.800000,1.5,1.000000,1.100000,26.0,,38.000000,0.5,0.333333,0.200000,-3.0,-3.000000,1.000000,20.5,25.666667,19.400000,0.5,0.333333,0.600000,0.0,0.000000,0.000000,0.0,0.000000,0.100000,28.5,43.333333,43.500000,1.0,1.0,2.0,1.0,0.0,4,1.0,1,9.0,Kontinental Hockey League,1,KHL,0.0,-1,0.0,34.0,180.0,Alexander Nesterov,,Russia,,LW/RW,L,19390,9604,alexander nesterov,88.0,CSKA Moskva,0,1.0,0.0,76,playoff,2015,0,8.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,23,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,,,,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
2,44.2,16.7,1.0,0.0,1.2,23.4,19:23,0.0,6.90,5.733333,5.830000,8.5,8.000000,10.222222,10.5,9.666667,9.500000,1.05,1.400000,1.266667,19.50,18.033333,16.020000,7.5,5.666667,6.100000,,,19.000000,0.5,0.666667,1.666667,537.5,410.333333,288.500000,,,,120.0,110.333333,109.777778,37.0,32.000000,26.700000,0.5,0.666667,1.888889,0.0,0.000000,0.111111,,,,0.0,0.000000,0.000000,4.5,2.000000,1.500000,11.0,9.333333,9.111111,2.0,1.333333,0.600000,0.0,0.333333,0.100000,0.0,0.000000,0.000000,27.5,29.333333,31.666667,4.0,0.0,95.0,1.0,42.0,5,1.0,0,3.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,34.0,183.0,Alexander Rybakov,12,Russia,,C,L,19748,7533,alexander rybakov,94.0,Ak Bars Kazan,1,1.0,0.0,76,playoff,2015,0,6.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,12,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,,,,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
3,-,11.1,1.0,1.0,1.8,22.4,20:56,3.0,4.65,8.833333,8.083333,12.0,17.333333,13.500000,17.5,18.000000,16.166667,2.20,1.900000,1.575000,23.75,22.666667,19.900000,11.0,12.666667,11.083333,,,18.000000,2.0,5.000000,3.000000,6.0,6.000000,11.000000,,,16.000000,2.5,2.666667,5.000000,32.0,38.333333,29.583333,4.5,6.666667,4.083333,0.0,0.666667,0.583333,,,46.000000,0.0,0.000000,0.000000,-5.5,-0.666667,-2.666667,33.0,44.000000,22.333333,2.0,1.333333,1.000000,0.0,0.000000,0.000000,0.5,0.333333,0.083333,54.0,57.333333,42.000000,2.0,1.0,0.0,1.0,0.0,5,1.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,0,14.0,40.0,167.0,Alexei Simakov,,Russia,,RW,L,16738,6118,alexei simakov,79.0,Yunost Yekaterinburg,0,4.0,0.0,76,playoff,2015,0,9.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,14,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,,,,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
4,-,0.0,0.0,0.0,1.0,17.0,11:47,0.0,9.00,,9.000000,11.0,14.000000,14.000000,18.0,,18.000000,1.35,1.500000,1.500000,15.55,,15.550000,8.0,,8.000000,,,,3.0,4.666667,4.666667,0.5,,0.500000,,,3.000000,0.0,0.000000,0.000000,40.5,,40.500000,3.5,5.000000,5.000000,1.0,1.333333,1.333333,,,5.000000,0.0,,0.000000,0.0,,0.000000,18.0,20.000000,20.000000,0.5,,0.500000,0.0,,0.000000,0.0,0.000000,0.000000,41.5,54.000000,54.000000,0.0,0.0,0.0,0.0,0.0,1,0.0,0,0.0,Kontinental Hockey League,1,KHL,0.0,0,0.0,31.0,189.0,Alexei Yefimov,,Russia,,LW,R,91871,11803,alexei yefimov,101.0,-,0,0.0,0.0,76,playoff,2015,0,1.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,95,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,,,,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8069,52.5,0.0,0.0,2.0,0.8,23.2,16:33,1.0,14.45,11.700000,8.646667,12.0,15.666667,10.812500,7.0,7.000000,8.533333,1.60,1.700000,1.312500,25.30,25.133333,18.080000,7.0,7.333333,5.800000,6.5,6.666667,10.000000,5.0,5.666667,3.562500,236.5,212.333333,255.933333,6.5,7.333333,8.714286,184.0,191.000000,141.000000,22.0,18.333333,24.200000,6.0,7.000000,4.250000,2.5,3.333333,1.250000,30.0,27.666667,29.285714,0.0,0.000000,0.133333,5.5,3.666667,1.533333,13.0,21.666667,25.375000,0.5,0.333333,0.533333,0.0,0.000000,0.266667,0.0,0.333333,0.062500,47.0,56.000000,37.875000,3.0,0.0,59.0,0.0,31.0,4,0.0,0,8.0,Kontinental Hockey League,1,KHL,0.0,-2,0.0,31.0,180.0,Vladimir Galuzin,26,Russia,,C,L,20630,6920,vladimir galuzin,81.0,Torpedo Nizhny Novgorod,0,1.0,0.0,85,playoff,2018,0,3.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,10,1.0,1.0,4.0,1.065868,1.634731,5.182635,56,,10,116,11,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
8070,-,0.0,1.0,0.0,0.5,17.5,11:23,0.0,0.00,0.000000,0.541667,0.5,2.333333,4.818182,2.0,1.333333,8.500000,0.55,0.633333,0.536364,16.35,15.066667,15.733333,0.0,0.000000,1.416667,12.0,8.666667,20.200000,0.0,0.000000,0.090909,0.0,0.000000,0.333333,0.0,1.000000,1.750000,0.0,0.000000,0.090909,12.5,9.000000,19.083333,0.0,0.000000,0.181818,0.0,0.000000,0.000000,6.0,20.000000,29.000000,0.0,0.000000,0.000000,0.0,0.000000,-3.833333,13.5,15.000000,12.909091,0.0,0.000000,0.083333,0.0,0.000000,0.000000,0.0,0.000000,0.000000,2.5,10.000000,11.727273,1.0,0.0,0.0,0.0,0.0,2,0.0,0,1.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,31.0,184.0,Yevgeni Kurbatov ...,455,Russia,,D,L,15172,3729,"yevgeni kurbatov a.k.a. ""evgeny kurbatov""",95.0,Avtomobilist Yekaterinburg,0,0.0,0.0,85,playoff,2018,0,1.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,28,1.0,1.0,4.0,1.065868,1.634731,5.182635,4,,6,10,14,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333
8071,-,0.0,1.0,0.0,0.5,17.5,11:23,0.0,0.00,0.000000,0.541667,0.5,2.333333,4.818182,2.0,1.333333,8.500000,0.55,0.633333,0.536364,16.35,15.066667,15.733333,0.0,0.000000,1.416667,12.0,8.666667,20.200000,0.0,0.000000,0.090909,0.0,0.000000,0.333333,0.0,1.000000,1.750000,0.0,0.000000,0.090909,12.5,9.000000,19.083333,0.0,0.000000,0.181818,0.0,0.000000,0.000000,6.0,20.000000,29.000000,0.0,0.000000,0.000000,0.0,0.000000,-3.833333,13.5,15.000000,12.909091,0.0,0.000000,0.083333,0.0,0.000000,0.000000,0.0,0.000000,0.000000,2.5,10.000000,11.727273,1.0,0.0,0.0,0.0,0.0,2,0.0,0,1.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,31.0,184.0,Yevgeni Kurbatov ...,455,Russia,,D,L,15172,3729,"yevgeni kurbatov a.k.a. ""evgeny kurbatov""",95.0,Avtomobilist Yekaterinburg,0,0.0,0.0,85,playoff,2018,0,1.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,28,1.0,1.0,4.0,1.065868,1.634731,5.182635,56,,10,116,11,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
8072,50.0,0.0,1.0,0.0,2.2,23.8,14:51,1.0,12.85,13.633333,11.866667,8.0,6.000000,5.000000,3.5,6.000000,4.555556,1.95,1.700000,1.311111,20.40,20.533333,16.922222,2.0,2.333333,2.111111,7.5,6.333333,4.857143,2.5,1.666667,1.666667,64.5,64.666667,27.444444,5.0,4.333333,4.142857,34.0,22.666667,13.666667,25.0,25.000000,17.111111,4.0,3.000000,2.666667,1.0,0.666667,0.222222,14.5,12.000000,10.571429,0.0,0.000000,0.000000,4.5,1.333333,0.444444,14.0,11.333333,8.444444,1.5,1.666667,1.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,39.5,28.666667,23.888889,2.0,0.0,54.0,0.0,27.0,4,0.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,1,0.0,26.0,185.0,Yevgeni Mozer ...,531,Russia,,W/C,R,211957,10324,"yevgeni mozer a.k.a. ""evgeny mozer""",90.0,Avangard Omsk,0,1.0,0.0,85,playoff,2018,0,9.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,11,1.0,1.0,4.0,1.065868,1.634731,5.182635,4,,6,10,14,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333


In [36]:
result_input_df.drop(['feature_mean_points_season', 'feature_mean_position_in_division_season', 'feature_mean_position_in_conference_season'], axis=1, inplace=True)
result_input_df

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_in_last_2,av_%SOG_in_last_3,av_%SOG_overall,av_+_in_last_2,av_+_in_last_3,av_+_overall,av_-_in_last_2,av_-_in_last_3,av_-_overall,av_S/G_in_last_2,av_S/G_in_last_3,av_S/G_overall,av_SFT/G_in_last_2,av_SFT/G_in_last_3,av_SFT/G_overall,av_assists_in_last_2,av_assists_in_last_3,av_assists_overall,av_bls_in_last_2,av_bls_in_last_3,av_bls_overall,av_esg_in_last_2,av_esg_in_last_3,av_esg_overall,av_fo_in_last_2,av_fo_in_last_3,av_fo_overall,av_foa_in_last_2,av_foa_in_last_3,av_foa_overall,av_fow_in_last_2,av_fow_in_last_3,av_fow_overall,av_games_in_last_2,av_games_in_last_3,av_games_overall,av_goals_in_last_2,av_goals_in_last_3,av_goals_overall,av_gwg_in_last_2,av_gwg_in_last_3,av_gwg_overall,av_hits_in_last_2,av_hits_in_last_3,av_hits_overall,av_otg_in_last_2,av_otg_in_last_3,av_otg_overall,av_p_m_in_last_2,av_p_m_in_last_3,av_p_m_overall,av_penalty_in_last_2,av_penalty_in_last_3,av_penalty_overall,av_ppg_in_last_2,av_ppg_in_last_3,av_ppg_overall,av_sds_in_last_2,av_sds_in_last_3,av_sds_overall,av_shg_in_last_2,av_shg_in_last_3,av_shg_overall,av_sog_in_last_2,av_sog_in_last_3,av_sog_overall,bls,esg,fo,foa,fow,games_player,goals,gwg,hits,league_full_name,league_id,league_short_name,otg,p_m,penalty,player_age,player_height,player_id,player_khl_id,player_nationality,player_nhl_id,player_position,player_shoots,player_site_id,player_stats_khl_id,player_unicode_name,player_weight,player_youth_team,ppg,pts,sds,season_id,season_type,season_year,shg,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded,№,featrue_median_goals_previous_season,featrue_median_assists_previous_season,featrue_median_penalty_previous_season,feature_mean_goals_preious_season,feature_mean_assists_previous_season,feature_mean_penalty_season,games_team,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,0.0,0.0,0.0,2.0,0.8,13.0,8:29,0.0,,,6.200000,,,,,,12.000000,,,,,,18.100000,,,5.000000,,,7.000000,,,,,,55.000000,,,,,,,,,28.000000,,,,,,,,,,,,0.000000,,,2.000000,,,,,,0.000000,,,2.000000,,,,,,,0.0,0.0,1.0,1.0,0.0,5,0.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,-2,0.0,30.0,183.0,Aaron Palushaj,1046,USA,8474030.0,RW/LW,R,12513,11778,aaron palushaj,87.0,-,0,0.0,0.0,76,playoff,2015,0,4.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,17,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
1,0.0,12.5,1.0,2.0,2.0,20.2,14:34,0.0,8.70,7.166667,9.230000,4.0,6.333333,8.800000,7.0,9.333333,7.800000,1.35,1.433333,1.580000,16.55,16.733333,15.800000,1.0,3.000000,3.500000,6.5,,8.000000,3.5,3.333333,3.100000,2.5,1.666667,1.500000,4.5,,6.000000,1.0,0.666667,0.500000,19.0,27.666667,29.100000,4.0,3.666667,3.800000,1.5,1.000000,1.100000,26.0,,38.000000,0.5,0.333333,0.200000,-3.0,-3.000000,1.000000,20.5,25.666667,19.400000,0.5,0.333333,0.600000,0.0,0.000000,0.000000,0.0,0.000000,0.100000,28.5,43.333333,43.500000,1.0,1.0,2.0,1.0,0.0,4,1.0,1,9.0,Kontinental Hockey League,1,KHL,0.0,-1,0.0,34.0,180.0,Alexander Nesterov,,Russia,,LW/RW,L,19390,9604,alexander nesterov,88.0,CSKA Moskva,0,1.0,0.0,76,playoff,2015,0,8.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,23,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
2,44.2,16.7,1.0,0.0,1.2,23.4,19:23,0.0,6.90,5.733333,5.830000,8.5,8.000000,10.222222,10.5,9.666667,9.500000,1.05,1.400000,1.266667,19.50,18.033333,16.020000,7.5,5.666667,6.100000,,,19.000000,0.5,0.666667,1.666667,537.5,410.333333,288.500000,,,,120.0,110.333333,109.777778,37.0,32.000000,26.700000,0.5,0.666667,1.888889,0.0,0.000000,0.111111,,,,0.0,0.000000,0.000000,4.5,2.000000,1.500000,11.0,9.333333,9.111111,2.0,1.333333,0.600000,0.0,0.333333,0.100000,0.0,0.000000,0.000000,27.5,29.333333,31.666667,4.0,0.0,95.0,1.0,42.0,5,1.0,0,3.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,34.0,183.0,Alexander Rybakov,12,Russia,,C,L,19748,7533,alexander rybakov,94.0,Ak Bars Kazan,1,1.0,0.0,76,playoff,2015,0,6.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,12,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
3,-,11.1,1.0,1.0,1.8,22.4,20:56,3.0,4.65,8.833333,8.083333,12.0,17.333333,13.500000,17.5,18.000000,16.166667,2.20,1.900000,1.575000,23.75,22.666667,19.900000,11.0,12.666667,11.083333,,,18.000000,2.0,5.000000,3.000000,6.0,6.000000,11.000000,,,16.000000,2.5,2.666667,5.000000,32.0,38.333333,29.583333,4.5,6.666667,4.083333,0.0,0.666667,0.583333,,,46.000000,0.0,0.000000,0.000000,-5.5,-0.666667,-2.666667,33.0,44.000000,22.333333,2.0,1.333333,1.000000,0.0,0.000000,0.000000,0.5,0.333333,0.083333,54.0,57.333333,42.000000,2.0,1.0,0.0,1.0,0.0,5,1.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,0,14.0,40.0,167.0,Alexei Simakov,,Russia,,RW,L,16738,6118,alexei simakov,79.0,Yunost Yekaterinburg,0,4.0,0.0,76,playoff,2015,0,9.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,14,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
4,-,0.0,0.0,0.0,1.0,17.0,11:47,0.0,9.00,,9.000000,11.0,14.000000,14.000000,18.0,,18.000000,1.35,1.500000,1.500000,15.55,,15.550000,8.0,,8.000000,,,,3.0,4.666667,4.666667,0.5,,0.500000,,,3.000000,0.0,0.000000,0.000000,40.5,,40.500000,3.5,5.000000,5.000000,1.0,1.333333,1.333333,,,5.000000,0.0,,0.000000,0.0,,0.000000,18.0,20.000000,20.000000,0.5,,0.500000,0.0,,0.000000,0.0,0.000000,0.000000,41.5,54.000000,54.000000,0.0,0.0,0.0,0.0,0.0,1,0.0,0,0.0,Kontinental Hockey League,1,KHL,0.0,0,0.0,31.0,189.0,Alexei Yefimov,,Russia,,LW,R,91871,11803,alexei yefimov,101.0,-,0,0.0,0.0,76,playoff,2015,0,1.0,Yekaterinburg,,1621,16.0,https://www.eliteprospects.com/league/khl,Avtomobilist Yekaterinburg\n \n\n ...,,1724,2006,95,0.0,1.0,4.0,0.996904,1.507740,7.383901,5,,9,11,12,,True,,,43.297,4.45726,124.959,14.5064,8.66667,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8069,52.5,0.0,0.0,2.0,0.8,23.2,16:33,1.0,14.45,11.700000,8.646667,12.0,15.666667,10.812500,7.0,7.000000,8.533333,1.60,1.700000,1.312500,25.30,25.133333,18.080000,7.0,7.333333,5.800000,6.5,6.666667,10.000000,5.0,5.666667,3.562500,236.5,212.333333,255.933333,6.5,7.333333,8.714286,184.0,191.000000,141.000000,22.0,18.333333,24.200000,6.0,7.000000,4.250000,2.5,3.333333,1.250000,30.0,27.666667,29.285714,0.0,0.000000,0.133333,5.5,3.666667,1.533333,13.0,21.666667,25.375000,0.5,0.333333,0.533333,0.0,0.000000,0.266667,0.0,0.333333,0.062500,47.0,56.000000,37.875000,3.0,0.0,59.0,0.0,31.0,4,0.0,0,8.0,Kontinental Hockey League,1,KHL,0.0,-2,0.0,31.0,180.0,Vladimir Galuzin,26,Russia,,C,L,20630,6920,vladimir galuzin,81.0,Torpedo Nizhny Novgorod,0,1.0,0.0,85,playoff,2018,0,3.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,10,1.0,1.0,4.0,1.065868,1.634731,5.182635,56,,10,116,11,,True,,,39.3224,11.0884,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
8070,-,0.0,1.0,0.0,0.5,17.5,11:23,0.0,0.00,0.000000,0.541667,0.5,2.333333,4.818182,2.0,1.333333,8.500000,0.55,0.633333,0.536364,16.35,15.066667,15.733333,0.0,0.000000,1.416667,12.0,8.666667,20.200000,0.0,0.000000,0.090909,0.0,0.000000,0.333333,0.0,1.000000,1.750000,0.0,0.000000,0.090909,12.5,9.000000,19.083333,0.0,0.000000,0.181818,0.0,0.000000,0.000000,6.0,20.000000,29.000000,0.0,0.000000,0.000000,0.0,0.000000,-3.833333,13.5,15.000000,12.909091,0.0,0.000000,0.083333,0.0,0.000000,0.000000,0.0,0.000000,0.000000,2.5,10.000000,11.727273,1.0,0.0,0.0,0.0,0.0,2,0.0,0,1.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,31.0,184.0,Yevgeni Kurbatov ...,455,Russia,,D,L,15172,3729,"yevgeni kurbatov a.k.a. ""evgeny kurbatov""",95.0,Avtomobilist Yekaterinburg,0,0.0,0.0,85,playoff,2018,0,1.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,28,1.0,1.0,4.0,1.065868,1.634731,5.182635,4,,6,10,14,,True,,,39.3224,11.0884,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333
8071,-,0.0,1.0,0.0,0.5,17.5,11:23,0.0,0.00,0.000000,0.541667,0.5,2.333333,4.818182,2.0,1.333333,8.500000,0.55,0.633333,0.536364,16.35,15.066667,15.733333,0.0,0.000000,1.416667,12.0,8.666667,20.200000,0.0,0.000000,0.090909,0.0,0.000000,0.333333,0.0,1.000000,1.750000,0.0,0.000000,0.090909,12.5,9.000000,19.083333,0.0,0.000000,0.181818,0.0,0.000000,0.000000,6.0,20.000000,29.000000,0.0,0.000000,0.000000,0.0,0.000000,-3.833333,13.5,15.000000,12.909091,0.0,0.000000,0.083333,0.0,0.000000,0.000000,0.0,0.000000,0.000000,2.5,10.000000,11.727273,1.0,0.0,0.0,0.0,0.0,2,0.0,0,1.0,Kontinental Hockey League,1,KHL,0.0,1,2.0,31.0,184.0,Yevgeni Kurbatov ...,455,Russia,,D,L,15172,3729,"yevgeni kurbatov a.k.a. ""evgeny kurbatov""",95.0,Avtomobilist Yekaterinburg,0,0.0,0.0,85,playoff,2018,0,1.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,28,1.0,1.0,4.0,1.065868,1.634731,5.182635,56,,10,116,11,,True,,,39.3224,11.0884,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
8072,50.0,0.0,1.0,0.0,2.2,23.8,14:51,1.0,12.85,13.633333,11.866667,8.0,6.000000,5.000000,3.5,6.000000,4.555556,1.95,1.700000,1.311111,20.40,20.533333,16.922222,2.0,2.333333,2.111111,7.5,6.333333,4.857143,2.5,1.666667,1.666667,64.5,64.666667,27.444444,5.0,4.333333,4.142857,34.0,22.666667,13.666667,25.0,25.000000,17.111111,4.0,3.000000,2.666667,1.0,0.666667,0.222222,14.5,12.000000,10.571429,0.0,0.000000,0.000000,4.5,1.333333,0.444444,14.0,11.333333,8.444444,1.5,1.666667,1.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,39.5,28.666667,23.888889,2.0,0.0,54.0,0.0,27.0,4,0.0,0,2.0,Kontinental Hockey League,1,KHL,0.0,1,0.0,26.0,185.0,Yevgeni Mozer ...,531,Russia,,W/C,R,211957,10324,"yevgeni mozer a.k.a. ""evgeny mozer""",90.0,Avangard Omsk,0,1.0,0.0,85,playoff,2018,0,9.0,"Nizhny Novgorod, RUS",,216,2.0,https://www.eliteprospects.com/league/khl,Torpedo Nizhny Novgorod\n \n\n \nKHL,,200,1946,11,1.0,1.0,4.0,1.065868,1.634731,5.182635,4,,6,10,14,,True,,,39.3224,11.0884,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333


In [50]:
result_input_df.to_csv("../data/data_with_feature2.csv")

## Data preprocessing

In [37]:
print('Binary features are ')
for column in result_input_df.columns:
    if len(result_input_df[column].unique()) == 2:
        print(column, len(result_input_df[column].unique()))

Binary features are 
league_full_name 2
league_id 2
league_short_name 2
season_type 2
team_league_link 2
playoff_fact 2


In [38]:
print('Categorical features are ')
for column in result_input_df.columns:
    if len(result_input_df[column].unique()) > 2 and len(result_input_df[column].unique()) <= 100:
        print(column, len(result_input_df[column].unique()))

Categorical features are 
+ 63
- 54
S/G 51
assists 51
av_-_in_last_2 91
av_assists_in_last_2 74
av_assists_in_last_3 97
av_esg_in_last_2 41
av_esg_in_last_3 48
av_foa_in_last_2 57
av_foa_in_last_3 72
av_goals_in_last_2 57
av_goals_in_last_3 71
av_gwg_in_last_2 15
av_gwg_in_last_3 21
av_otg_in_last_2 6
av_otg_in_last_3 7
av_otg_overall 86
av_p_m_in_last_2 90
av_ppg_in_last_2 27
av_ppg_in_last_3 36
av_sds_in_last_2 8
av_sds_in_last_3 9
av_shg_in_last_2 7
av_shg_in_last_3 8
esg 25
foa 37
games_player 62
goals 34
gwg 11
otg 5
p_m 60
penalty 99
player_age 42
player_height 37
player_nationality 57
player_position 19
player_shoots 3
player_weight 64
ppg 19
pts 73
sds 6
season_id 16
season_year 12
shg 5
team_country 25
team_full_name 10
team_id 27
team_khl_id 27
team_name 27
team_site_id 27
team_year_founded 19
№ 98
featrue_median_goals_previous_season 4
featrue_median_assists_previous_season 3
featrue_median_penalty_previous_season 4
feature_mean_goals_preious_season 14
feature_mean_assists_p

In [39]:
def target_encoding(df, cat_name, target, weight):
    mean = df[target].mean()
    agg = df.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    # Replace each value by the according smoothed mean
    return df[cat_name].map(smooth)

In [47]:
pd.set_option('display.width', 200)

In [49]:
np.array(result_input_df.columns)

array(['%FO', '%SOG', '+', '-', 'S/G', 'SFT/G', 'TOI/G', 'assists',
       'av_%SOG_in_last_2', 'av_%SOG_in_last_3', 'av_%SOG_overall',
       'av_+_in_last_2', 'av_+_in_last_3', 'av_+_overall',
       'av_-_in_last_2', 'av_-_in_last_3', 'av_-_overall',
       'av_S/G_in_last_2', 'av_S/G_in_last_3', 'av_S/G_overall',
       'av_SFT/G_in_last_2', 'av_SFT/G_in_last_3', 'av_SFT/G_overall',
       'av_assists_in_last_2', 'av_assists_in_last_3',
       'av_assists_overall', 'av_bls_in_last_2', 'av_bls_in_last_3',
       'av_bls_overall', 'av_esg_in_last_2', 'av_esg_in_last_3',
       'av_esg_overall', 'av_fo_in_last_2', 'av_fo_in_last_3',
       'av_fo_overall', 'av_foa_in_last_2', 'av_foa_in_last_3',
       'av_foa_overall', 'av_fow_in_last_2', 'av_fow_in_last_3',
       'av_fow_overall', 'av_games_in_last_2', 'av_games_in_last_3',
       'av_games_overall', 'av_goals_in_last_2', 'av_goals_in_last_3',
       'av_goals_overall', 'av_gwg_in_last_2', 'av_gwg_in_last_3',
       'av_gwg_overall

In [129]:
def data_preprocessing(data_input):
    
    # Select bad columns and remove them
    cols_2_drop_team = ['games_team', 'goals_scored',
                        'goals_missed','position_in_championship', 'position_in_conference',
                        'playoff_fact', 'position_in_division',  'nhl_id', 'points']
    data_input = data_input.drop(cols_2_drop_team, axis=1)
    
#     Index([
#  
#       
#       
#        'player_age', 
#   ],
#       dtype='object')
    cols_2_drop_player = ['team_id', 'season_id', 'player_stats_khl_id', 
                          'player_id', 'games_player', 'player_unicode_name',
                          'assists', 'penalty', 'p_m', '+', '-', 'esg', 'ppg',
                          'shg', 'otg', 'gwg', 'sds', 'sog', '%SOG', 'S/G', 'fo', 'pts',
                          'fow', '%FO', 'TOI/G', 'SFT/G', 'hits', 'bls', 'foa', 'league_id', 
                          'league_full_name', 'team_full_name', 'team_league_link', 'team_site_id',
                          'team_khl_id', 'team_nhl_id', 'player_site_id', 'player_khl_id', 'player_nhl_id']
    
    data_input = data_input.drop(cols_2_drop_player, axis=1)
    
    # Encode categorical features using OneHotEncoding
    data_input = pd.get_dummies(data_input, columns=['player_shoots'], prefix_sep='=')
    data_input = data_input.drop(['player_shoots=-'], axis=1)
    
    data_input = pd.get_dummies(data_input, columns=['league_short_name'], prefix_sep='=')
    data_input = data_input.drop(['league_short_name=VHL'], axis=1)
    
    # Encode categorical features using TargetEncoding
    target_column = 'goals'
    data_input['team_name'] = target_encoding(df=data_input, cat_name='team_name', target=target_column, weight=10)
    data_input['team_country'] = target_encoding(df=data_input, cat_name='team_country', target=target_column, weight=10)
    data_input['player_nationality'] = target_encoding(df=data_input, cat_name='player_nationality', target=target_column, weight=10)    
    data_input['player_youth_team'] = target_encoding(df=data_input, cat_name='player_youth_team', target=target_column, weight=10)
    data_input['player_position'] = target_encoding(df=data_input, cat_name='player_position', target=target_column, weight=10)      

    # Encode binary features
    data_input.loc[:, 'season_type'] = data_input['season_type'].replace({'regular': 0, 'playoff': 1})
    
    # Get actual player age
    cur_year = 2019
    data_input['player_age'] = data_input['player_age'] - cur_year + data_input['season_year']
    
    return data_input

In [130]:
result_input_df.drop('№', axis=1, inplace=True)
result_input_df

KeyError: "['№'] not found in axis"

In [131]:
data_train = result_input_df[result_input_df['season_id'] != 86]
data_test = result_input_df[result_input_df['season_id'] == 86]

In [132]:
prep_data_train = data_preprocessing(data_train)
prep_data_test = data_preprocessing(data_test)

In [133]:
def change_type(df, columns):
    df[columns] = df[columns].astype(float)
    return df

In [134]:
arr = []
for column in prep_data_train.columns:
    if column not in prep_data_train.corr().columns:
        arr.append(column)
prep_data_train = change_type(prep_data_train, arr)
prep_data_test = change_type(prep_data_test, arr)

In [135]:
columns = prep_data_train.columns
cor_ind = []
for column in columns:
    if column in cor_ind:
        continue
    a = prep_data_train.corr()[column]
    temp = np.abs(a)
    temp = temp.index[temp > 0.9]
    cor_ind.extend(list(temp[temp != column]))
cor_ind

['av_S/G_in_last_3',
 'av_SFT/G_in_last_3',
 'av_bls_in_last_3',
 'av_bls_in_last_3',
 'av_goals_in_last_2',
 'av_goals_in_last_3',
 'av_goals_overall',
 'av_fo_in_last_3',
 'av_fow_in_last_2',
 'av_fow_in_last_3',
 'av_fo_in_last_3',
 'av_fow_in_last_3',
 'av_fow_overall',
 'av_foa_overall',
 'av_hits_overall',
 'av_ppg_in_last_3',
 'featrue_median_assists_previous_season',
 'featrue_median_penalty_previous_season',
 'feature_mean_goals_preious_season',
 'feature_mean_assists_previous_season',
 'feature_mean_penalty_season',
 'feature_mean_games_count_season',
 'feature_mean_goals_missed_season',
 'feature_mean_goals_scored_season',
 'team_name',
 'feature_team_count_season',
 'feature_mean_goals_missed_team',
 'feature_mean_position_in_division_team',
 'player_shoots=R']

In [136]:
prep_data_train = prep_data_train.drop(cor_ind, axis=1)
prep_data_test = prep_data_test.drop(cor_ind, axis=1)

In [140]:
pd.reset_option('^display.', silent=True)

In [141]:
y_column = 'goals'
X_columns = prep_data_train.columns[prep_data_train.columns != y_column]
X_columns

Index(['av_%SOG_in_last_2', 'av_%SOG_in_last_3', 'av_%SOG_overall',
       'av_+_in_last_2', 'av_+_in_last_3', 'av_+_overall', 'av_-_in_last_2',
       'av_-_in_last_3', 'av_-_overall', 'av_S/G_in_last_2', 'av_S/G_overall',
       'av_SFT/G_in_last_2', 'av_SFT/G_overall', 'av_assists_in_last_2',
       'av_assists_in_last_3', 'av_assists_overall', 'av_bls_in_last_2',
       'av_bls_overall', 'av_esg_in_last_2', 'av_esg_in_last_3',
       'av_esg_overall', 'av_fo_in_last_2', 'av_fo_overall',
       'av_foa_in_last_2', 'av_foa_in_last_3', 'av_games_in_last_2',
       'av_games_in_last_3', 'av_games_overall', 'av_gwg_in_last_2',
       'av_gwg_in_last_3', 'av_gwg_overall', 'av_hits_in_last_2',
       'av_hits_in_last_3', 'av_otg_in_last_2', 'av_otg_in_last_3',
       'av_otg_overall', 'av_p_m_in_last_2', 'av_p_m_in_last_3',
       'av_p_m_overall', 'av_penalty_in_last_2', 'av_penalty_in_last_3',
       'av_penalty_overall', 'av_ppg_in_last_2', 'av_ppg_overall',
       'av_sds_in_last_2', 

In [142]:
X_train, y_train = prep_data_train[X_columns].values, prep_data_train[y_column].values
X_test, y_test = prep_data_test[X_columns].values, prep_data_test[y_column].values

In [143]:
from sklearn.preprocessing import StandardScaler
X_scaled_train = StandardScaler().fit_transform(X_train)
X_scaled_test = StandardScaler().fit_transform(X_test)

## LightGBM

In [144]:
import lightgbm as ltb

In [145]:
model = ltb.LGBMRegressor()
model.fit(X_scaled_train, y_train)
predicted_y = model.predict(X_scaled_test)

In [146]:
def percentage_error(actual, predicted):
    res = np.empty(actual.shape)
    for j in range(actual.shape[0]):
        if actual[j] != 0:
            res[j] = (actual[j] - predicted[j]) / actual[j]
        else:
            res[j] = predicted[j] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs(percentage_error(np.asarray(y_true), np.asarray(y_pred)))) * 100.0

In [147]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print('MSE %.2f' % mean_squared_error(y_test, predicted_y))
print('R^2 %.2f' % r2_score(y_test, predicted_y))
print('MAPE %.2f' % mean_absolute_percentage_error(y_test, predicted_y))

MSE 15.34
R^2 0.34
MAPE 57.01
