In [1]:
import os
import pandas as pd

In [2]:
# File name of the CSV to load; it is resolved against the working directory.
table_name = 'temptable.csv'
# NOTE: os.chdir changes the working directory of the whole kernel process,
# so every relative path used by later cells is resolved from the new location.
os.chdir('../data')
df = pd.read_csv(table_name)

In [3]:
# Show five randomly chosen rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

Unnamed: 0,player_stats_id,season_id,team_id,player_id,games,points,goals,assists,penalty,p_m,...,player_height,player_weight,player_site_id,player_age,player_name,player_unicode_name,player_khl_id,player_nhl_id,season_year,season_type
3752,4028,26,79,27413,7,1,0,1,4,4,...,186.0,86.0,27402,50.0,Greg Andrusak,greg andrusak,,8455452,1998,regular
18831,20069,50,63,9088,62,34,13,21,25,-17,...,184.0,90.0,9092,37.0,Brad Boyes ...,"brad boyes a.k.a. ""bradley boyes""",,8468504,2006,regular
9155,9819,14,78,17094,19,11,6,5,10,2,...,186.0,88.0,17100,34.0,David Robertson,david robertson,,8457816,1994,regular
23880,25466,65,75,9160,82,16,2,14,39,14,...,185.0,93.0,9164,35.0,Josh Gorges,josh gorges,,8470324,2011,regular
5620,6076,32,77,53594,8,0,0,0,4,-5,...,183.0,91.0,53544,51.0,Keith Jones,keith jones,,8456745,2000,regular


In [4]:
class DataFrameTransformer:
    """Apply a sequence of transform functions to a frame, in registration order.

    A transform is any callable that takes the current frame and returns the
    frame that should be handed to the next transform.
    """

    def __init__(self, df):
        """Store the frame to transform and start with an empty transform list."""
        self.df = df
        self.transforms = []

    def add_transform(self, transform):
        """Register one transform; it runs after all previously added ones."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every transform from an iterable, keeping its order."""
        for transform in transforms:
            self.add_transform(transform)

    def fit(self):
        """Run all registered transforms and return the resulting frame.

        The result is also written back to ``self.df``, so a second call to
        ``fit`` applies every transform again to the already transformed frame.
        """
        # Use the transforms registered on this instance. Reading a
        # module-level ``transforms`` name would ignore add_transform() calls
        # and raise NameError when no such global exists.
        for transform in self.transforms:
            self.df = transform(self.df)
        return self.df

In [5]:
# Example: add a column holding the total number of points for each player

def add_total_points(df):
    """Add a ``total_points`` column with the per-player sum of ``points``.

    The column is written into ``df`` itself (the input frame is modified),
    and that same frame object is returned.
    """
    points_by_player = df['points'].groupby(df['player_id'])
    # transform('sum') broadcasts each group's sum back onto its rows.
    df['total_points'] = points_by_player.transform('sum')
    return df

# ...

In [6]:
# To add a transform, just write a function and put it into the `transforms` list below

transforms = [add_total_points]
transformer = DataFrameTransformer(df)
transformer.add_transforms(transforms)
# fit() applies the transforms and returns the resulting frame.
ext_df = transformer.fit()

In [7]:
# Spot-check the new column on five randomly chosen rows.
ext_df[['player_id', 'total_points']].sample(5)

Unnamed: 0,player_id,total_points
3169,10297,16
15494,32790,50
9526,8561,111
4068,54636,18
14236,3663,618


## Features for teams

In [8]:
# Reuses (overwrites) the `table_name` variable from the earlier load cell.
table_name = 'team_stats.csv'
# NOTE(review): this path is relative to the *current* working directory, which
# an earlier cell already changed with the same call. Presumably it still
# lands in the data directory (confirm).
os.chdir('../data')
team_stats_df = pd.read_csv(table_name)

In [9]:
# Show five randomly chosen rows of the team statistics table.
team_stats_df.sample(5)

Unnamed: 0,id,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
18756,20097,188,92,36,56.0,111,62,1,1.0,False,1.0,
17318,18556,649,92,37,27.0,69,115,23,11.0,False,6.0,
21676,23221,649,92,37,27.0,69,115,23,11.0,False,6.0,
15485,16595,192,92,38,50.0,94,76,4,2.0,False,1.0,
822,866,724,91,36,,3,94,10,,True,,


In [10]:
import numpy as np

# shared implementation for the aggregate-and-join transforms below
def _join_group_stat(df, team_stats_df, key, column, agg, feature_name):
    """Left-join one grouped aggregate of ``team_stats_df`` onto ``df``.

    Args:
        df: frame to extend; must contain the column ``key``.
        team_stats_df: frame containing ``key`` and ``column``.
        key: column used both to group ``team_stats_df`` and to join on ``df``.
        column: column of ``team_stats_df`` to aggregate.
        agg: aggregation name handed to ``agg`` (``'mean'`` or ``'sum'`` here).
        feature_name: name of the column appended to the result.

    Returns:
        A new frame equal to ``df`` plus ``feature_name`` as its last column.
        ``df`` itself is not modified. Rows whose key has no matching group
        get NaN (default left join).
    """
    # Naming the aggregate before joining gives the new column its final name
    # directly, so no suffix and no positional rename of the last column is needed.
    stat = team_stats_df.groupby(key)[column].agg(agg).rename(feature_name)
    return df.join(stat, on=key)


# transform methods for seasons
def add_avg_games_for_season(df, team_stats_df):
    """Append ``feature_mean_games_count_season``: mean ``games`` per ``season_id``."""
    return _join_group_stat(df, team_stats_df, 'season_id', 'games', 'mean',
                            'feature_mean_games_count_season')


def add_avg_points_for_season(df, team_stats_df):
    """Append ``feature_mean_points_season``: mean ``points`` per ``season_id``."""
    return _join_group_stat(df, team_stats_df, 'season_id', 'points', 'mean',
                            'feature_mean_points_season')


def add_sum_points_for_season(df, team_stats_df):
    """Append ``feature_sum_points_season``: summed ``points`` per ``season_id``."""
    return _join_group_stat(df, team_stats_df, 'season_id', 'points', 'sum',
                            'feature_sum_points_season')


def add_team_count_for_season(df, team_stats_df):
    """Append ``feature_team_count_season``: distinct ``team_id`` values per season.

    NOTE(review): unlike the other transforms this uses ``merge`` with its
    default inner join, so rows of ``df`` whose ``season_id`` is missing from
    ``team_stats_df`` are dropped and the result gets a fresh index. That
    behaviour is preserved here; confirm whether a left join was intended.
    """
    # dropna=False counts a missing team_id as one value, matching
    # len(x['team_id'].unique()) without a per-group Python lambda.
    team_counts = (
        team_stats_df.groupby('season_id')['team_id']
        .nunique(dropna=False)
        .rename('feature_team_count_season')
    )
    return df.merge(team_counts, on='season_id')


# transform methods for teams
def add_avg_games_for_team(df, team_stats_df):
    """Append ``feature_mean_games_count_team``: mean ``games`` per ``team_id``."""
    return _join_group_stat(df, team_stats_df, 'team_id', 'games', 'mean',
                            'feature_mean_games_count_team')


def add_avg_points_for_team(df, team_stats_df):
    """Append ``feature_mean_points_team``: mean ``points`` per ``team_id``."""
    return _join_group_stat(df, team_stats_df, 'team_id', 'points', 'mean',
                            'feature_mean_points_team')


def add_sum_points_for_team(df, team_stats_df):
    """Append ``feature_sum_points_team``: summed ``points`` per ``team_id``."""
    return _join_group_stat(df, team_stats_df, 'team_id', 'points', 'sum',
                            'feature_sum_points_team')


In [11]:
class DataFrameTransformer:
    """Apply transforms that extend a frame using a team statistics frame.

    A transform is any callable ``transform(df, team_stats_df)`` that returns
    the frame to hand to the next transform. Transforms run in registration
    order.
    """

    def __init__(self, df, team_stats_df):
        """Store both frames and start with an empty transform list."""
        self.df = df
        self.team_stats_df = team_stats_df
        self.transforms = []

    def add_transform(self, transform):
        """Register one transform; it runs after all previously added ones."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every transform from an iterable, keeping its order."""
        for transform in transforms:
            self.add_transform(transform)

    def fit(self):
        """Run all registered transforms and return the resulting frame.

        Every transform receives the current frame and ``self.team_stats_df``.
        The result is also written back to ``self.df``, so a second call to
        ``fit`` applies every transform again to the already transformed frame.
        """
        # Use the transforms registered on this instance. Reading a
        # module-level ``transforms`` name would ignore add_transform() calls
        # and raise NameError when no such global exists.
        for transform in self.transforms:
            self.df = transform(self.df, self.team_stats_df)
        return self.df

In [13]:
# Season-level and team-level feature transforms, listed in the order they are applied.
transforms = [add_avg_games_for_season, add_avg_points_for_season, add_sum_points_for_season, add_team_count_for_season, add_avg_games_for_team, add_avg_points_for_team, add_sum_points_for_team]
transformer = DataFrameTransformer(df, team_stats_df)
transformer.add_transforms(transforms)
# fit() applies the transforms and returns the extended frame.
ext_df = transformer.fit()

In [14]:
# Display the extended frame to check that the feature_* columns were added.
ext_df

Unnamed: 0,player_stats_id,season_id,team_id,player_id,games,points,goals,assists,penalty,p_m,...,season_year,season_type,total_points,feature_mean_games_count_season,feature_mean_points_season,feature_sum_points_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_sum_points_team
0,0,41,1479,8517,21,6,0,6,12,1,...,2003,regular,619,82.0,87.370370,2359.0,27,79.360000,86.280000,2157.0
1,18,41,78,9665,20,4,2,2,4,-9,...,2003,regular,11,82.0,87.370370,2359.0,27,79.571429,91.142857,2552.0
2,38,41,82,10673,2,0,0,0,0,0,...,2003,regular,5,82.0,87.370370,2359.0,27,80.111111,78.833333,1419.0
3,48,41,86,8658,69,19,2,17,12,8,...,2003,regular,132,82.0,87.370370,2359.0,27,79.555556,85.629630,2312.0
4,54,41,77,8833,1,0,0,0,0,0,...,2003,regular,71,82.0,87.370370,2359.0,27,79.571429,89.892857,2517.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24214,25752,35,82,5517,81,8,2,6,90,-29,...,2001,regular,16,82.0,86.555556,2337.0,27,80.111111,78.833333,1419.0
24215,25764,35,76,69981,2,0,0,0,5,-1,...,2001,regular,4,82.0,86.555556,2337.0,27,79.571429,76.107143,2131.0
24216,25767,35,64,3734,3,0,0,0,0,0,...,2001,regular,0,82.0,86.555556,2337.0,27,80.521739,90.565217,2083.0
24217,25771,35,69,8634,62,24,5,19,32,-6,...,2001,regular,333,82.0,86.555556,2337.0,27,80.111111,87.500000,1575.0
