In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from scipy.stats import gmean
import sys

In [2]:
!{sys.executable} -m pip install psycopg2-binary



In [3]:
database_connect = 'postgres://doadmin:rdxo4w05qb3vq10l@db-postgresql-fra1-36671-do-user-4768937-0.db.ondigitalocean.com:25060/hockey'

In [4]:
class DataFrameTransformer:
    def __init__(self, df):
        self.df = df
        self.transforms = []
    
    def add_transform(self, transform):
        self.transforms.append(transform)
    
    def add_transforms(self, transforms):
        for transform in self.transforms:
            self.add_transform(transform)
    
    def fit(self):
        for transform in transforms:
            self.df = transform(self.df)
        return self.df

In [5]:
table_player_stats_name = 'data_for_model3'
df_player_stats = pd.read_sql_table(table_player_stats_name, database_connect)
df_player_stats.sample(5)

Unnamed: 0,player_stats_id,player_stats_khl_id,season_id,team_id,player_id,games,points,goals,assists,penalty,...,player_height,player_weight,player_site_id,player_age,player_name,player_unicode_name,player_khl_id,player_nhl_id,season_year,season_type
28169,,4464.0,59,877,579,19,5.0,2.0,3.0,26.0,...,193.0,93.0,22179,38.0,Alexander Gorelov,alexander gorelov,,,2009,regular
5553,5998.0,,29,88,14820,76,23.0,9.0,14.0,28.0,...,190.0,90.0,14815,47.0,Craig Johnson,craig johnson,,8456729.0,1999,regular
3847,4124.0,,59,82,4155,73,34.0,6.0,28.0,37.0,...,184.0,86.0,4152,33.0,Anton Strålman,anton stralman,,8471873.0,2009,regular
7004,7564.0,,68,76,10450,48,24.0,4.0,20.0,38.0,...,184.0,99.0,10458,31.0,Kyle Okposo,kyle okposo,,8473449.0,2012,regular
16126,17192.0,,29,65,4723,79,25.0,6.0,19.0,139.0,...,188.0,100.0,4727,44.0,Bryan McCabe,bryan mccabe,,8459462.0,1999,regular


In [6]:
table_team_stats_name = 'team_stats'
df_team_stats = pd.read_sql_table(table_team_stats_name, database_connect)
df_team_stats.sample(5)

Unnamed: 0,id,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
30794,33434,200,92,38,45.0,91,84,7,4.0,False,3.0,
18523,19846,206,92,36,45.0,81,80,9,5.0,False,2.0,
31224,33895,206,92,36,45.0,81,80,9,5.0,False,2.0,
11221,12026,13644,88,62,,5,139,21,,True,,
28731,31228,20132,92,37,41.0,82,87,17,9.0,False,5.0,


In [7]:
df_temp = df_team_stats[:]
df_temp.drop('id', axis=1, inplace=True)
df_team_stats = df_temp.drop_duplicates(keep='first')
df_team_stats = df_team_stats.reset_index(drop=True)
df_team_stats

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
0,216,85,4,,6,10,14,,True,,
1,216,85,56,,10,116,11,,True,,
2,216,80,60,100.0,163,137,11,7.0,False,5.0,
3,216,83,60,104.0,145,124,9,6.0,False,4.0,
4,216,82,5,,9,13,11,,True,,
...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22


## Features for teams

In [8]:
def add_feature_from_dict(df, d, feature_name, attr_name, season_count, step):
    df[feature_name] = None
    for k, v in d.items():
        s = 0
        for i in range(1, season_count + 1):
            if d.get(k - step * i) != None:
                s += d.get(k - step * i)
        df[feature_name][df[attr_name] == k] = s / season_count
    return df

In [9]:
# transform methods for seasons
def add_avg_games_for_season(df):
    d = df.groupby('season_id')['games'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_games_count_season',  'season_id', 3, 3)

def add_avg_points_for_season(df):
    d = df.groupby('season_id')['points'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_points_season',  'season_id', 3, 3)

def add_avg_goals_scored_for_season(df):
    d = df.groupby('season_id')['goals_scored'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_scored_season', 'season_id', 3, 3)

def add_avg_goals_missed_for_season(df):
    d = df.groupby('season_id')['goals_missed'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_goals_missed_season', 'season_id', 3, 3)

def add_avg_position_in_championship_for_season(df):
    d = df.groupby('season_id')['position_in_championship'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_championship_season', 'season_id', 3, 3)

def add_avg_position_in_conference_for_season(df):
    d = df.groupby('season_id')['position_in_conference'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_conference_season', 'season_id', 3, 3)

def add_avg_position_in_division_for_season(df):
    d = df.groupby('season_id')['position_in_division'].mean().to_dict()
    return add_feature_from_dict(df, d, 'feature_mean_position_in_division_season', 'season_id', 3, 3)

def add_team_count_for_season(df):
    d = df.groupby('season_id').apply(lambda x: len(x['team_id'].unique())).to_dict()
    return add_feature_from_dict(df, d, 'feature_team_count_season', 'season_id', 3, 3)

In [10]:
transforms = [add_avg_games_for_season, add_avg_points_for_season, add_avg_position_in_division_for_season, add_avg_position_in_conference_for_season, add_avg_position_in_championship_for_season, add_avg_goals_missed_for_season, add_avg_goals_scored_for_season, add_team_count_for_season]
transformer = DataFrameTransformer(df_team_stats)
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id,feature_mean_games_count_season,feature_mean_points_season,feature_mean_position_in_division_season,feature_mean_position_in_conference_season,feature_mean_position_in_championship_season,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season
0,216,85,4,,6,10,14,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
1,216,85,56,,10,116,11,,True,,,39.3224,,,,11.0884,100.221,12.4836,24.3333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,,82,92.2011,4.18637,7.89573,15.2557,224.189,225.177,28.6667
3,216,83,60,104.0,145,124,9,6.0,False,4.0,,78.3977,91.7713,4.13833,7.79673,15.1081,211.707,212.742,38.3333
4,216,82,5,,9,13,11,,True,,,43.3372,,,,7.69208,119.826,10.9299,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,53,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,23,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,24,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,22,74.7931,91.1064,4.16561,7.84836,15.2258,202.57,203.18,48.3333


In [11]:
conda install progressbar2

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/gvyarduhin/opt/anaconda3

  added / updated specs:
    - progressbar2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.8.2                |           py37_0         2.8 MB
    ------------------------------------------------------------
                                           Total:         2.8 MB

The following packages will be SUPERSEDED by a higher-priority channel:

  conda                                         conda-forge --> pkgs/main



Downloading and Extracting Packages
conda-4.8.2          | 2.8 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [12]:
import progressbar
def calculate_team_feature(df, feature_name, attr, season_count, step):
    print("Calculating feature for", attr)
    df[feature_name] = None
    bar = progressbar.ProgressBar(maxval=12, \
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    step = 1
    for ind in range(len(df)):
        season = df.loc[ind, 'season_id']
        team = df.loc[ind, 'team_id']
        s = 0
        for i in range(1, season_count + 1):
            s += df.loc[(df['season_id'] == season - step * i) & (df['team_id'] == team)][attr].sum()
        df.loc[ind, feature_name] = s / season_count
        if ind % 100 == 0:
            bar.update(step)
            step += 1
    bar.finish()
    return df

In [13]:
# transform methods for teams
def add_avg_games_for_team(df):
    return calculate_team_feature(df, 'feature_mean_games_count_team', 'games', 3, 3)

def add_avg_points_for_team(df):
    return calculate_team_feature(df, 'feature_mean_points_team', 'points', 3, 3)

def add_avg_goals_scored_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_scored_team', 'goals_scored', 3, 3)

def add_avg_goals_missed_for_team(df):
    return calculate_team_feature(df, 'feature_mean_goals_missed_team', 'goals_missed', 3, 3)

def add_avg_position_in_championship_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_championship_team', 'position_in_championship', 3, 3)

def add_avg_position_in_conference_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_conference_team', 'position_in_conference', 3, 3)

def add_avg_position_in_division_for_team(df):
    return calculate_team_feature(df, 'feature_mean_position_in_division_team', 'position_in_division', 3, 3)

In [14]:
transforms = [add_avg_games_for_team, add_avg_points_for_team, add_avg_goals_scored_for_team, add_avg_goals_missed_for_team, add_avg_position_in_championship_for_team, add_avg_position_in_conference_for_team, add_avg_position_in_division_for_team]
transformer = DataFrameTransformer(df_team_feature)
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

                                                                               [                                                                        ] N/A%

Calculating feature for games


[                                                                        ] N/A%

Calculating feature for points


[                                                                        ] N/A%

Calculating feature for goals_scored


[                                                                        ] N/A%

Calculating feature for goals_missed


[                                                                        ] N/A%

Calculating feature for position_in_championship


[                                                                        ] N/A%

Calculating feature for position_in_conference


[                                                                        ] N/A%

Calculating feature for position_in_division




Unnamed: 0,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,...,feature_mean_goals_missed_season,feature_mean_goals_scored_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_mean_goals_scored_team,feature_mean_goals_missed_team,feature_mean_position_in_championship_team,feature_mean_position_in_conference_team,feature_mean_position_in_division_team
0,216,85,4,,6,10,14,,True,,...,100.221,12.4836,24.3333,41.6667,34.6667,53.6667,94,9.66667,2,1.33333
1,216,85,56,,10,116,11,,True,,...,100.221,12.4836,24.3333,43.6667,34.6667,59.3333,106.333,9.33333,2,1.33333
2,216,80,60,100.0,163,137,11,7.0,False,5.0,...,224.189,225.177,28.6667,21.6667,0,4.33333,56.3333,8.66667,0,0
3,216,83,60,104.0,145,124,9,6.0,False,4.0,...,211.707,212.742,38.3333,23.6667,0,11,65,6.33333,0,0
4,216,82,5,,9,13,11,,True,,...,119.826,10.9299,16,41.6667,33.3333,58.6667,102,12.3333,2.33333,1.66667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,87,86,82,86.0,213,223,18,9.0,False,4.0,...,202.57,203.18,48.3333,0,0,0,0,0,0,0
1132,84,86,82,81.0,225,254,23,12.0,False,5.0,...,202.57,203.18,48.3333,27.3333,30.6667,85.3333,85,5.66667,3,1.33333
1133,1479,86,82,80.0,199,251,24,13.0,False,6.0,...,202.57,203.18,48.3333,27.3333,32.6667,84.6667,76.3333,4,2,1
1134,71,86,82,79.0,232,274,25,14.0,False,7.0,...,202.57,203.18,48.3333,27.3333,31.6667,85.3333,83.6667,4.66667,2.66667,1


## Features for player

In [15]:
def av_games_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_games_overall'] = ssum/count
                if not np.isnan(list(b[j]["games"])[i]):
                    count+=1
                    ssum+=list(b[j]["games"])[i]
        return pd.concat(b)
def av_goals_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_goals_overall'] = ssum/count
                if not np.isnan(list(b[j]["goals"])[i]):
                    count+=1
                    ssum+=list(b[j]["goals"])[i]
        return pd.concat(b)
def av_assists_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_assists_overall'] = ssum/count
                if not np.isnan(list(b[j]["assists"])[i]):
                    count+=1
                    ssum+=list(b[j]["assists"])[i]
        return pd.concat(b)
def av_penalty_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_penalty_overall'] = ssum/count
                if not np.isnan(list(b[j]["penalty"])[i]):
                    count+=1
                    ssum+=list(b[j]["penalty"])[i]
        return pd.concat(b)
def av_p_m_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_p_m_overall'] = ssum/count
                if not np.isnan(list(b[j]["p_m"])[i]):
                    count+=1
                    ssum+=list(b[j]["p_m"])[i]
        return pd.concat(b)
def av_p_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_+_overall'] = ssum/count
                if not np.isnan(list(b[j]["+"])[i]):
                    count+=1
                    ssum+=list(b[j]["+"])[i]
        return pd.concat(b)
def av_p_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_-_overall'] = ssum/count
                if not np.isnan(list(b[j]["-"])[i]):
                    count+=1
                    ssum+=list(b[j]["-"])[i]
        return pd.concat(b)
def av_esg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_esg_overall'] = ssum/count
                if not np.isnan(list(b[j]["esg"])[i]):
                    count+=1
                    ssum+=list(b[j]["esg"])[i]
        return pd.concat(b)
def av_ppg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_ppg_overall'] = ssum/count
                if not np.isnan(list(b[j]["ppg"])[i]):
                    count+=1
                    ssum+=list(b[j]["ppg"])[i]
        return pd.concat(b)
def av_shg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_shg_overall'] = ssum/count
                if not np.isnan(list(b[j]["shg"])[i]):
                    count+=1
                    ssum+=list(b[j]["shg"])[i]
        return pd.concat(b)
def av_otg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_otg_overall'] = ssum/count
                if not np.isnan(list(b[j]["otg"])[i]):
                    count+=1
                    ssum+=list(b[j]["otg"])[i]
        return pd.concat(b)
def av_gwg_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_gwg_overall'] = ssum/count
                if not np.isnan(list(b[j]["gwg"])[i]):
                    count+=1
                    ssum+=list(b[j]["gwg"])[i]
        return pd.concat(b)
def av_sds_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sds_overall'] = ssum/count
                if not np.isnan(list(b[j]["sds"])[i]):
                    count+=1
                    ssum+=list(b[j]["sds"])[i]
        return pd.concat(b)
def av_sog_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_sog_overall'] = ssum/count
                if not np.isnan(list(b[j]["sog"])[i]):
                    count+=1
                    ssum+=list(b[j]["sog"])[i]
        return pd.concat(b)
def av__SOG_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%SOG_overall'] = ssum/count
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    count+=1
                    ssum+=list(b[j]["%SOG"])[i]
        return pd.concat(b)
def av_S_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_S/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["S/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["S/G"])[i]
        return pd.concat(b)
def av_fo_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fo_overall'] = ssum/count
                if not np.isnan(list(b[j]["fo"])[i]):
                    count+=1
                    ssum+=list(b[j]["fo"])[i]
        return pd.concat(b)
def av_fow_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_fow_overall'] = ssum/count
                if not np.isnan(list(b[j]["fow"])[i]):
                    count+=1
                    ssum+=list(b[j]["fow"])[i]
        return pd.concat(b)
def av__FO_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_%FO_overall'] = ssum/count
                if not np.isnan(list(b[j]["%FO"])[i]):
                    count+=1
                    ssum+=list(b[j]["%FO"])[i]
        return pd.concat(b)
def av_TOI_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_TOI/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["TOI/G"])[i]
        return pd.concat(b)
def av_SFT_G_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_SFT/G_overall'] = ssum/count
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    count+=1
                    ssum+=list(b[j]["SFT/G"])[i]
        return pd.concat(b)
def av_hits_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_hits_overall'] = ssum/count
                if not np.isnan(list(b[j]["hits"])[i]):
                    count+=1
                    ssum+=list(b[j]["hits"])[i]
        return pd.concat(b)
def av_bls_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if count != 0:
                    b[j].loc[b[j].index[i], 'av_bls_overall'] = ssum/count
                if not np.isnan(list(b[j]["bls"])[i]):
                    count+=1
                    ssum+=list(b[j]["bls"])[i]
        return pd.concat(b)
def av_foa_overall(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            ssum = 0
            for i in range(len(b[j])):
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 0:
                        b[j].loc[b[j].index[i], 'av_foa_overall'] = ssum/count
                    if not np.isnan(list(b[j]["foa"])[i]):
                        count+=1
                        ssum+=list(b[j]["foa"])[i]
        return pd.concat(b)

In [18]:
transforms = [av_foa_overall, av_bls_overall, av_hits_overall, av_SFT_G_overall, av_fow_overall, av_fo_overall, av_S_G_overall, av__SOG_overall, av_sog_overall, av_sds_overall, av_gwg_overall, av_otg_overall, av_shg_overall, av_ppg_overall, av_esg_overall, av_games_overall, av_goals_overall, av_assists_overall, av_penalty_overall, av_p_m_overall, av_p_overall, av_p_overall]
transformer = DataFrameTransformer(df_player_stats)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas 

Unnamed: 0,%FO,%SOG,+,-,S/G,SFT/G,TOI/G,assists,av_%SOG_overall,av_-_overall,...,sog,team_country,team_full_name,team_id,team_khl_id,team_league_link,team_name,team_nhl_id,team_site_id,team_year_founded
24236,100.0,12.9,53.0,47.0,4.1,27.8,25:17,30.0,,,...,217.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24235,-,0.0,0.0,2.0,3.0,28.7,32:26,1.0,12.900000,47.000000,...,9.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24234,-,7.7,51.0,58.0,3.3,25.0,25:33,27.0,6.450000,24.500000,...,183.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24232,-,0.0,0.0,4.0,3.8,24.5,27:31,2.0,6.866667,35.666667,...,15.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
24233,50.0,5.3,51.0,41.0,4.2,27.1,27:17,27.0,5.150000,27.750000,...,225.0,"Nur-Sultan, KAZ",,2338,22.0,https://www.eliteprospects.com/league/khl,Barys Nur-Sultan\n \n\n \nKHL,,2498,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29314,25.0,0.0,1.0,3.0,2.0,23.2,18:10,2.0,9.527273,18.272727,...,8.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921
29313,43.8,14.0,16.0,14.0,1.9,19.3,14:27,9.0,8.733333,17.000000,...,50.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29312,37.5,0.0,1.0,6.0,1.4,18.2,12:20,2.0,9.138462,16.769231,...,11.0,"Moskva, RUS",HK Dynamo Moskva,6116,11.0,https://www.eliteprospects.com/league/khl,Dynamo Moskva\n \n\n \nKHL,,6815,1946
29310,11.1,3.6,24.0,29.0,2.4,22.9,16:09,17.0,8.485714,16.000000,...,111.0,Bratislava,,174,,https://www.eliteprospects.com/league/slovakia,HC Slovan Bratislava\n \n\n \nSlov...,,169,1921


In [20]:
def av_games_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [24]:
transforms = [av_games_in_last_2,
av_goals_in_last_2,
av_assists_in_last_2,
av_penalty_in_last_2,
av_p_m_in_last_2,
av_p_in_last_2,
av_m_in_last_2,
av_esg_in_last_2,
av_ppg_in_last_2,
av_shg_in_last_2,
av_otg_in_last_2,
av_gwg_in_last_2,
av_sds_in_last_2,
av_sog_in_last_2,
av__SOG_in_last_2,
av_S_G_in_last_2,
av_fo_in_last_2,
av_fow_in_last_2,
av_SFT_G_in_last_2,
av_hits_in_last_2,
av_bls_in_last_2,
av_foa_in_last_2]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




KeyboardInterrupt: 

In [None]:
def av_games_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sog"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["sog"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sog"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_sog_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__SOG_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%SOG"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%SOG"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%SOG"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%SOG_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_S_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["S/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["S/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["S/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_S/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fo_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fo"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fo"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fo"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fo_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_fow_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["fow"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["fow"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["fow"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_fow_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av__FO_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["%FO"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["%FO"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["%FO"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_%FO_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_TOI_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["TOI/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["TOI/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["TOI/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_TOI/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_SFT_G_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["SFT/G"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["SFT/G"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["SFT/G"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_SFT/G_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_hits_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["hits"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["hits"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["hits"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_hits_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_bls_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [None]:
transforms = [av_games_in_last_3,
av_goals_in_last_3,
av_assists_in_last_3,
av_penalty_in_last_3,
av_p_m_in_last_3,
av_p_in_last_3,
av_m_in_last_3,
av_esg_in_last_3,
av_ppg_in_last_3,
av_shg_in_last_3,
av_otg_in_last_3,
av_gwg_in_last_3,
av_sds_in_last_3,
av_sog_in_last_3,
av__SOG_in_last_3,
av_S_G_in_last_3,
av_fo_in_last_3,
av_fow_in_last_3,
av_SFT_G_in_last_3,
av_hits_in_last_3,
av_bls_in_last_3,
av_foa_in_last_3]
transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

## Merge datasets

In [None]:
result_input_df = pd.merge(df_player_feature, df_team_feature, on=['team_id', 'season_id'], suffixes=('_player', '_team'))
pd.set_option('display.max_columns', 500)
result_input_df

## Data preprocessing

In [None]:
result_input_df['season_type'].value_counts()

In [None]:
print('Categorical features are ')
for column in result_input_df.columns:
    if len(result_input_df[column].unique()) > 2 and len(result_input_df[column].unique()) <= 500 and 'int64' != result_input_df[column].dtype and 'float64' != result_input_df[column].dtype:
        print(column, len(result_input_df[column].unique()))

In [None]:
def data_preprocessing(data_input):
    
    # Select bad columns and remove them
    cols_2_drop = ['league_short_name', 'league_full_name', 'team_full_name', 'team_league_link', 'team_site_id', 'team_khl_id', 'team_nhl_id', 'season_type']
    data_input = data_input.drop(cols_2_drop, axis=1)
    
    # Encode categorical features using OneHotEncoding
    data_input = pd.get_dummies(data_input, columns=['player_shoots'], prefix_sep='=')
    
    return data_input

In [None]:
result_df = data_preprocessing(result_input_df)
result_df = result_df.drop(['player_shoots=-'], axis=1)
result_df

In [None]:
y_column = 'goals'

X_columns = result_df.columns[result_df.columns != y_column]

data_train = result_df[result_df['season_id'] != 86]
data_test = result_df[result_df['season_id'] == 86]
data_test

In [None]:
X_train, y_train = data_train[X_columns].values, data_train[y_column].values
X_test, y_test = data_test[X_columns].values, data_test[y_column].values

## LightGBM

In [None]:
conda install -c conda-forge lightgbm

In [None]:
!{sys.executable} -m pip install lightgbm

In [None]:
import lightgbm as ltb

In [None]:
model = ltb.LGBMRegressor()
model.fit(X_train, y_train)
predicted_y = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted_y)