In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from scipy.stats import gmean
import sys

In [2]:
# Install the psycopg2-binary package with pip, run through the same
# interpreter that executes this kernel (sys.executable).
# NOTE(review): no version is pinned here.
!{sys.executable} -m pip install psycopg2-binary



In [3]:
# Connection URL used by the pd.read_sql_table calls in this notebook.
# An exported HOCKEY_DATABASE_URL takes precedence, so the credentials can be
# supplied from outside the notebook.
# TODO(security): the literal fallback embeds a password in the notebook; rotate it
# and remove the fallback once every environment sets HOCKEY_DATABASE_URL.
database_connect = os.environ.get(
    'HOCKEY_DATABASE_URL',
    'postgres://doadmin:rdxo4w05qb3vq10l@db-postgresql-fra1-36671-do-user-4768937-0.db.ondigitalocean.com:25060/hockey',
)

In [4]:
class DataFrameTransformer:
    """Apply an ordered list of DataFrame -> DataFrame callables to a frame.

    Attributes:
        df: the frame being transformed; replaced by each transform's result in fit().
        transforms: callables registered so far, in application order.
    """

    def __init__(self, df):
        """Store the starting frame and begin with no registered transforms."""
        self.df = df
        self.transforms = []

    def add_transform(self, transform):
        """Register a single callable to be applied by fit()."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every callable from the iterable `transforms`, preserving order."""
        # Iterate the argument, not self.transforms: looping over the instance list
        # registered nothing when it was empty and never terminated otherwise.
        for transform in transforms:
            self.add_transform(transform)

    def fit(self):
        """Apply the registered transforms in order and return the resulting frame.

        Each transform receives the previous transform's result; the final result
        is also kept on self.df.
        """
        # Use the transforms registered on this instance rather than whatever
        # global named `transforms` happens to exist in the notebook.
        for transform in self.transforms:
            self.df = transform(self.df)
        return self.df

In [None]:
# Load the 'data_for_model3' table, reusing the shared connection URL instead of
# repeating the credentials inline.
df = pd.read_sql_table('data_for_model3', database_connect)

In [None]:
# Display the frame loaded from 'data_for_model3'.
df

In [None]:
# Read the player statistics table and preview five randomly sampled rows.
table_player_stats_name = 'data_for_model3'
df_player_stats = pd.read_sql_table(
    table_name=table_player_stats_name,
    con=database_connect,
)
df_player_stats.sample(n=5)

In [None]:
# Read the team statistics table and preview five randomly sampled rows.
table_team_stats_name = 'team_stats'
df_team_stats = pd.read_sql_table(
    table_name=table_team_stats_name,
    con=database_connect,
)
df_team_stats.sample(n=5)

In [None]:
# Remove the 'id' column, then collapse duplicate rows and renumber the index.
# drop() returns a new frame, so nothing is mutated in place through a slice of
# df_team_stats, and errors='ignore' keeps the cell re-runnable once 'id' is gone.
df_temp = df_team_stats.drop(columns=['id'], errors='ignore')
df_team_stats = df_temp.drop_duplicates(keep='first').reset_index(drop=True)
df_team_stats

## Features for teams

In [None]:
def add_feature_from_dict(df, d, feature_name, attr_name, season_count, step):
    """Add `feature_name` to `df` as an average over earlier keys of `d`.

    For every key k in `d`, the values stored under k - step*1 ... k - step*season_count
    are summed (keys absent from `d` contribute nothing) and divided by `season_count`.
    The result is written to all rows where df[attr_name] == k.

    Args:
        df: frame to extend; it is modified in place and also returned.
        d: mapping from a numeric key (the values found in `attr_name`) to a number.
        feature_name: name of the column to create; it is reset to None first.
        attr_name: column of `df` that holds the keys of `d`.
        season_count: how many earlier keys to look back over (also the divisor).
        step: distance between consecutive keys.

    Returns:
        The same `df` object with the new column filled in. Rows whose `attr_name`
        value is not a key of `d` keep None.
    """
    df[feature_name] = None
    for key in d:
        total = 0
        for offset in range(1, season_count + 1):
            earlier = d.get(key - step * offset)
            if earlier is not None:
                total += earlier
        # Single .loc assignment: the chained form df[col][mask] = ... writes to a
        # temporary under pandas copy-on-write and warns in older versions.
        df.loc[df[attr_name] == key, feature_name] = total / season_count
    return df

In [None]:
# transform methods for seasons
def add_avg_games_for_season(df):
    """Add 'feature_mean_games_count_season' from the per-season mean of 'games'.

    The per-'season_id' means are passed to add_feature_from_dict with
    season_count=3 and step=3, and its result is returned.
    """
    games_by_season = df.groupby('season_id')['games'].mean()
    return add_feature_from_dict(
        df,
        games_by_season.to_dict(),
        'feature_mean_games_count_season',
        'season_id',
        3,
        3,
    )

def add_avg_points_for_season(df):
    """Add 'feature_mean_points_season' from the per-season mean of 'points'.

    The per-'season_id' means are passed to add_feature_from_dict with
    season_count=3 and step=3, and its result is returned.
    """
    points_by_season = df.groupby('season_id')['points'].mean()
    return add_feature_from_dict(
        df,
        points_by_season.to_dict(),
        'feature_mean_points_season',
        'season_id',
        3,
        3,
    )

def add_avg_goals_scored_for_season(df):
    """Add 'feature_mean_goals_scored_season' from the per-season mean of 'goals_scored'.

    The per-'season_id' means are passed to add_feature_from_dict with
    season_count=3 and step=3, and its result is returned.
    """
    scored_by_season = df.groupby('season_id')['goals_scored'].mean()
    return add_feature_from_dict(
        df,
        scored_by_season.to_dict(),
        'feature_mean_goals_scored_season',
        'season_id',
        3,
        3,
    )

def add_avg_goals_missed_for_season(df):
    """Add 'feature_mean_goals_missed_season' from the per-season mean of 'goals_missed'.

    The per-'season_id' means are passed to add_feature_from_dict with
    season_count=3 and step=3, and its result is returned.
    """
    missed_by_season = df.groupby('season_id')['goals_missed'].mean()
    return add_feature_from_dict(
        df,
        missed_by_season.to_dict(),
        'feature_mean_goals_missed_season',
        'season_id',
        3,
        3,
    )

def add_avg_position_in_championship_for_season(df):
    """Add 'feature_mean_position_in_championship_season' from per-season means.

    The per-'season_id' mean of 'position_in_championship' is passed to
    add_feature_from_dict with season_count=3 and step=3, and its result is returned.
    """
    position_by_season = df.groupby('season_id')['position_in_championship'].mean()
    return add_feature_from_dict(
        df,
        position_by_season.to_dict(),
        'feature_mean_position_in_championship_season',
        'season_id',
        3,
        3,
    )

def add_avg_position_in_conference_for_season(df):
    """Add 'feature_mean_position_in_conference_season' from per-season means.

    The per-'season_id' mean of 'position_in_conference' is passed to
    add_feature_from_dict with season_count=3 and step=3, and its result is returned.
    """
    position_by_season = df.groupby('season_id')['position_in_conference'].mean()
    return add_feature_from_dict(
        df,
        position_by_season.to_dict(),
        'feature_mean_position_in_conference_season',
        'season_id',
        3,
        3,
    )

def add_avg_position_in_division_for_season(df):
    """Add 'feature_mean_position_in_division_season' from per-season means.

    The per-'season_id' mean of 'position_in_division' is passed to
    add_feature_from_dict with season_count=3 and step=3, and its result is returned.
    """
    position_by_season = df.groupby('season_id')['position_in_division'].mean()
    return add_feature_from_dict(
        df,
        position_by_season.to_dict(),
        'feature_mean_position_in_division_season',
        'season_id',
        3,
        3,
    )

def add_team_count_for_season(df):
    """Add 'feature_team_count_season' from the number of distinct teams per season.

    For each 'season_id' the distinct 'team_id' values are counted; the counts are
    passed to add_feature_from_dict with season_count=3 and step=3, and its result
    is returned.
    """
    # Apply the counter to the 'team_id' column only. Calling apply on the whole
    # grouped frame also hands the grouping column to the lambda, which pandas
    # has deprecated.
    teams_per_season = df.groupby('season_id')['team_id'].apply(
        lambda team_ids: len(team_ids.unique())
    )
    return add_feature_from_dict(
        df,
        teams_per_season.to_dict(),
        'feature_team_count_season',
        'season_id',
        3,
        3,
    )

In [None]:
# Season-level transforms, applied in this order.
transforms = [
    add_avg_games_for_season,
    add_avg_points_for_season,
    add_avg_position_in_division_for_season,
    add_avg_position_in_conference_for_season,
    add_avg_position_in_championship_for_season,
    add_avg_goals_missed_for_season,
    add_avg_goals_scored_for_season,
    add_team_count_for_season,
]
# The transforms add columns in place, so hand them an explicit copy rather than
# a [:] slice of df_team_stats.
transformer = DataFrameTransformer(df_team_stats.copy())
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

In [None]:
# Install the progressbar2 package with conda.
# NOTE(review): as a bare line this presumably relies on IPython automagic to run
# `conda` as a magic command — confirm it works in the target environment.
conda install progressbar2

In [None]:
import progressbar
def calculate_team_feature(df, feature_name, attr, season_count, step):
    """Add `feature_name`: a per-team average of `attr` over earlier seasons.

    For each row, the `attr` values of rows with the same 'team_id' and a
    'season_id' equal to season - step*1 ... season - step*season_count are summed
    and divided by `season_count` (seasons with no matching rows contribute 0).

    Args:
        df: frame with 'season_id', 'team_id' and `attr` columns. Rows are addressed
            with df.loc[0 .. len(df)-1], so a default integer index is assumed.
            It is modified in place and also returned.
        feature_name: name of the column to create; it is reset to None first.
        attr: name of the column to average.
        season_count: how many earlier seasons to look back over (also the divisor).
        step: distance between consecutive 'season_id' values.

    Returns:
        The same `df` object with the new column filled in.
    """
    print("Calculating feature for", attr)
    df[feature_name] = None
    row_count = len(df)
    # Size the bar from the data: a fixed maximum makes update() fail once the
    # reported progress exceeds it.
    bar = progressbar.ProgressBar(maxval=max(row_count, 1), \
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    for ind in range(row_count):
        season = df.loc[ind, 'season_id']
        team = df.loc[ind, 'team_id']
        total = 0
        for i in range(1, season_count + 1):
            # `step` is the caller's season spacing. It must not double as the
            # progress counter, or the look-back offset changes as rows go by.
            earlier = (df['season_id'] == season - step * i) & (df['team_id'] == team)
            total += df.loc[earlier, attr].sum()
        df.loc[ind, feature_name] = total / season_count
        if ind % 100 == 0:
            bar.update(ind)
    bar.finish()
    return df

In [None]:
# transform methods for teams
def add_avg_games_for_team(df):
    """Add 'feature_mean_games_count_team' by running calculate_team_feature on 'games'."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_games_count_team',
        attr='games',
        season_count=3,
        step=3,
    )

def add_avg_points_for_team(df):
    """Add 'feature_mean_points_team' by running calculate_team_feature on 'points'."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_points_team',
        attr='points',
        season_count=3,
        step=3,
    )

def add_avg_goals_scored_for_team(df):
    """Add 'feature_mean_goals_scored_team' by running calculate_team_feature on 'goals_scored'."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_goals_scored_team',
        attr='goals_scored',
        season_count=3,
        step=3,
    )

def add_avg_goals_missed_for_team(df):
    """Add 'feature_mean_goals_missed_team' by running calculate_team_feature on 'goals_missed'."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_goals_missed_team',
        attr='goals_missed',
        season_count=3,
        step=3,
    )

def add_avg_position_in_championship_for_team(df):
    """Add 'feature_mean_position_in_championship_team' via calculate_team_feature."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_position_in_championship_team',
        attr='position_in_championship',
        season_count=3,
        step=3,
    )

def add_avg_position_in_conference_for_team(df):
    """Add 'feature_mean_position_in_conference_team' via calculate_team_feature."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_position_in_conference_team',
        attr='position_in_conference',
        season_count=3,
        step=3,
    )

def add_avg_position_in_division_for_team(df):
    """Add 'feature_mean_position_in_division_team' via calculate_team_feature."""
    return calculate_team_feature(
        df,
        feature_name='feature_mean_position_in_division_team',
        attr='position_in_division',
        season_count=3,
        step=3,
    )

In [None]:
# Team-level transforms, applied in this order on top of the season features.
transforms = [
    add_avg_games_for_team,
    add_avg_points_for_team,
    add_avg_goals_scored_for_team,
    add_avg_goals_missed_for_team,
    add_avg_position_in_championship_for_team,
    add_avg_position_in_conference_for_team,
    add_avg_position_in_division_for_team,
]
transformer = DataFrameTransformer(df_team_feature)
transformer.add_transforms(transforms)
df_team_feature = transformer.fit()
df_team_feature

## Features for player

In [None]:
def _add_prior_mean(df, column, feature, require_current=False):
    """Add `feature`: each player's mean of `column` over their earlier rows.

    Rows are ordered by 'season_year' and grouped by 'player_id'. For every row the
    feature is the sum of the non-missing `column` values seen so far for that
    player divided by how many there were; rows with no earlier non-missing value
    are left as NaN.

    Args:
        df: frame with 'player_id', 'season_year' and `column`; it is not modified.
        column: name of the column to average.
        feature: name of the column to create.
        require_current: when True, rows whose own `column` value is missing are
            left as NaN even if earlier values exist.

    Returns:
        A new frame with the rows grouped by player (season order inside each
        group, original index labels kept) and the feature column appended. An
        input without any groups yields a copy of `df` with an all-NaN feature
        column instead of failing in pd.concat.
    """
    ordered = df.sort_values(by=['season_year'])
    pieces = []
    for _, group in ordered.groupby('player_id'):
        group = group.copy()
        # Read the column once per player; rebuilding a list for every row made
        # the original loops quadratic in the group size.
        values = group[column].tolist()
        averages = [np.nan] * len(values)
        seen = 0
        total = 0
        for pos, value in enumerate(values):
            missing = pd.isna(value)
            if seen and not (require_current and missing):
                averages[pos] = total / seen
            if not missing:
                seen += 1
                total += value
        group[feature] = averages
        pieces.append(group)
    if not pieces:
        empty = df.copy()
        empty[feature] = np.nan
        return empty
    return pd.concat(pieces)


def av_games_overall(df):
    """Add 'av_games_overall': prior-row mean of 'games' per player."""
    return _add_prior_mean(df, 'games', 'av_games_overall')


def av_goals_overall(df):
    """Add 'av_goals_overall': prior-row mean of 'goals' per player."""
    return _add_prior_mean(df, 'goals', 'av_goals_overall')


def av_assists_overall(df):
    """Add 'av_assists_overall': prior-row mean of 'assists' per player."""
    return _add_prior_mean(df, 'assists', 'av_assists_overall')


def av_penalty_overall(df):
    """Add 'av_penalty_overall': prior-row mean of 'penalty' per player."""
    return _add_prior_mean(df, 'penalty', 'av_penalty_overall')


def av_p_m_overall(df):
    """Add 'av_p_m_overall': prior-row mean of 'p_m' per player."""
    return _add_prior_mean(df, 'p_m', 'av_p_m_overall')


def av_plus_overall(df):
    """Add 'av_+_overall': prior-row mean of '+' per player.

    This logic used to live in the first of two functions both named
    av_p_overall, where the second definition replaced it, so it could never be
    called. It is exposed under its own name here.
    """
    return _add_prior_mean(df, '+', 'av_+_overall')


def av_m_overall(df):
    """Add 'av_-_overall': prior-row mean of '-' per player."""
    return _add_prior_mean(df, '-', 'av_-_overall')


def av_p_overall(df):
    """Add 'av_-_overall' (kept for backward compatibility; same as av_m_overall).

    Of the two original definitions with this name, the '-' one was the one in
    effect, so existing callers keep getting the '-' feature.
    NOTE(review): the name suggests '+'; prefer av_plus_overall / av_m_overall.
    """
    return av_m_overall(df)


def av_esg_overall(df):
    """Add 'av_esg_overall': prior-row mean of 'esg' per player."""
    return _add_prior_mean(df, 'esg', 'av_esg_overall')


def av_ppg_overall(df):
    """Add 'av_ppg_overall': prior-row mean of 'ppg' per player."""
    return _add_prior_mean(df, 'ppg', 'av_ppg_overall')


def av_shg_overall(df):
    """Add 'av_shg_overall': prior-row mean of 'shg' per player."""
    return _add_prior_mean(df, 'shg', 'av_shg_overall')


def av_otg_overall(df):
    """Add 'av_otg_overall': prior-row mean of 'otg' per player."""
    return _add_prior_mean(df, 'otg', 'av_otg_overall')


def av_gwg_overall(df):
    """Add 'av_gwg_overall': prior-row mean of 'gwg' per player."""
    return _add_prior_mean(df, 'gwg', 'av_gwg_overall')


def av_sds_overall(df):
    """Add 'av_sds_overall': prior-row mean of 'sds' per player."""
    return _add_prior_mean(df, 'sds', 'av_sds_overall')


def av_sog_overall(df):
    """Add 'av_sog_overall': prior-row mean of 'sog' per player."""
    return _add_prior_mean(df, 'sog', 'av_sog_overall')


def av__SOG_overall(df):
    """Add 'av_%SOG_overall': prior-row mean of '%SOG' per player."""
    return _add_prior_mean(df, '%SOG', 'av_%SOG_overall')


def av_S_G_overall(df):
    """Add 'av_S/G_overall': prior-row mean of 'S/G' per player."""
    return _add_prior_mean(df, 'S/G', 'av_S/G_overall')


def av_fo_overall(df):
    """Add 'av_fo_overall': prior-row mean of 'fo' per player."""
    return _add_prior_mean(df, 'fo', 'av_fo_overall')


def av_fow_overall(df):
    """Add 'av_fow_overall': prior-row mean of 'fow' per player."""
    return _add_prior_mean(df, 'fow', 'av_fow_overall')


def av__FO_overall(df):
    """Add 'av_%FO_overall': prior-row mean of '%FO' per player."""
    return _add_prior_mean(df, '%FO', 'av_%FO_overall')


def av_TOI_G_overall(df):
    """Add 'av_TOI/G_overall': prior-row mean of 'TOI/G' per player."""
    return _add_prior_mean(df, 'TOI/G', 'av_TOI/G_overall')


def av_SFT_G_overall(df):
    """Add 'av_SFT/G_overall': prior-row mean of 'SFT/G' per player."""
    return _add_prior_mean(df, 'SFT/G', 'av_SFT/G_overall')


def av_hits_overall(df):
    """Add 'av_hits_overall': prior-row mean of 'hits' per player."""
    return _add_prior_mean(df, 'hits', 'av_hits_overall')


def av_bls_overall(df):
    """Add 'av_bls_overall': prior-row mean of 'bls' per player."""
    return _add_prior_mean(df, 'bls', 'av_bls_overall')


def av_foa_overall(df):
    """Add 'av_foa_overall': prior-row mean of 'foa' per player.

    Unlike the sibling features, rows whose own 'foa' is missing stay NaN.
    NOTE(review): this difference is carried over from the original code; confirm
    whether it is intentional.
    """
    return _add_prior_mean(df, 'foa', 'av_foa_overall', require_current=True)

In [None]:
# Player-level "overall" transforms, applied in this order.
# NOTE(review): the original list named av_p_overall twice; the repeat is dropped
# because a second pass only recomputes the same column.
transforms = [
    av_foa_overall,
    av_bls_overall,
    av_hits_overall,
    av_SFT_G_overall,
    av_fow_overall,
    av_fo_overall,
    av_S_G_overall,
    av__SOG_overall,
    av_sog_overall,
    av_sds_overall,
    av_gwg_overall,
    av_otg_overall,
    av_shg_overall,
    av_ppg_overall,
    av_esg_overall,
    av_games_overall,
    av_goals_overall,
    av_assists_overall,
    av_penalty_overall,
    av_p_m_overall,
    av_p_overall,
]
transformer = DataFrameTransformer(df_player_stats)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

In [None]:
def av_games_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["games"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["games"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["games"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_games_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_goals_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["goals"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["goals"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["goals"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_goals_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_assists_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["assists"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["assists"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["assists"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_assists_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_penalty_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["penalty"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["penalty"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["penalty"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_penalty_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["p_m"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["p_m"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["p_m"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_p_m_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_p_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["+"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["+"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["+"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_+_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_m_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["-"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["-"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["-"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_-_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_esg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["esg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["esg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["esg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_esg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_ppg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["ppg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["ppg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["ppg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_ppg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_shg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["shg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["shg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["shg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_shg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_otg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["otg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["otg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["otg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_otg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_gwg_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["gwg"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["gwg"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["gwg"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_gwg_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sds_in_last_2(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["sds"])[i]):
                    if count != 2:
                        count+=1
                        res.append(list(b[j]["sds"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["sds"])[i])
                else:
                    if count == 2:
                        b[j].loc[b[j].index[i], 'av_sds_in_last_2'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_sog_in_last_2(df):
    """Add 'av_sog_in_last_2': mean 'sog' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'sog'; the row's own 'sog' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'sog' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "sog", 2, "av_sog_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av__SOG_in_last_2(df):
    """Add 'av_%SOG_in_last_2': mean '%SOG' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN '%SOG'; the row's own '%SOG' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '%SOG' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "%SOG", 2, "av_%SOG_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_S_G_in_last_2(df):
    """Add 'av_S/G_in_last_2': mean 'S/G' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'S/G'; the row's own 'S/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'S/G' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "S/G", 2, "av_S/G_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_fo_in_last_2(df):
    """Add 'av_fo_in_last_2': mean 'fo' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'fo'; the row's own 'fo' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'fo' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "fo", 2, "av_fo_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_fow_in_last_2(df):
    """Add 'av_fow_in_last_2': mean 'fow' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'fow'; the row's own 'fow' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'fow' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "fow", 2, "av_fow_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av__FO_in_last_2(df):
    """Add 'av_%FO_in_last_2': mean '%FO' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN '%FO'; the row's own '%FO' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '%FO' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "%FO", 2, "av_%FO_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_TOI_G_in_last_2(df):
    """Add 'av_TOI/G_in_last_2': mean 'TOI/G' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'TOI/G'; the row's own 'TOI/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'TOI/G'
            columns; it is left unmodified.
            NOTE(review): 'TOI/G' must be numeric for the averaging to work —
            confirm it is not stored as text.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "TOI/G", 2, "av_TOI/G_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_SFT_G_in_last_2(df):
    """Add 'av_SFT/G_in_last_2': mean 'SFT/G' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'SFT/G'; the row's own 'SFT/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'SFT/G'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "SFT/G", 2, "av_SFT/G_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_hits_in_last_2(df):
    """Add 'av_hits_in_last_2': mean 'hits' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'hits'; the row's own 'hits' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'hits' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "hits", 2, "av_hits_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_bls_in_last_2(df):
    """Add 'av_bls_in_last_2': mean 'bls' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'bls'; the row's own 'bls' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'bls' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "bls", 2, "av_bls_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_foa_in_last_2(df):
    """Add 'av_foa_in_last_2': mean 'foa' of each player's 2 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 2 rows right before it (same player) all have a
    non-NaN 'foa'; the row's own 'foa' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'foa' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "foa", 2, "av_foa_in_last_2"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})

In [None]:
# Player "average over the previous 2 rows" features. The list must stay bound
# to the module-level name `transforms`: DataFrameTransformer.fit() iterates
# that global rather than the transformer's own list.
# NOTE(review): av__FO_in_last_2 and av_TOI_G_in_last_2 are defined but not
# listed here — confirm whether that omission is intentional.
transforms = [
    av_games_in_last_2,
    av_goals_in_last_2,
    av_assists_in_last_2,
    av_penalty_in_last_2,
    av_p_m_in_last_2,
    av_p_in_last_2,
    av_m_in_last_2,
    av_esg_in_last_2,
    av_ppg_in_last_2,
    av_shg_in_last_2,
    av_otg_in_last_2,
    av_gwg_in_last_2,
    av_sds_in_last_2,
    av_sog_in_last_2,
    av__SOG_in_last_2,
    av_S_G_in_last_2,
    av_fo_in_last_2,
    av_fow_in_last_2,
    av_SFT_G_in_last_2,
    av_hits_in_last_2,
    av_bls_in_last_2,
    av_foa_in_last_2,
]
transformer = DataFrameTransformer(df_player_feature)
# add_transforms() loops over the (still empty) self.transforms, so it would
# register nothing; add each transform individually instead.
for transform in transforms:
    transformer.add_transform(transform)
df_player_feature = transformer.fit()
# Preview only the first rows instead of rendering the whole frame.
df_player_feature.head()

In [None]:
def av_games_in_last_3(df):
    """Add 'av_games_in_last_3': mean 'games' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'games'; the row's own 'games' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'games'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "games", 3, "av_games_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_goals_in_last_3(df):
    """Add 'av_goals_in_last_3': mean 'goals' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'goals'; the row's own 'goals' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'goals'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "goals", 3, "av_goals_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_assists_in_last_3(df):
    """Add 'av_assists_in_last_3': mean 'assists' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'assists'; the row's own 'assists' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'assists'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "assists", 3, "av_assists_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_penalty_in_last_3(df):
    """Add 'av_penalty_in_last_3': mean 'penalty' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'penalty'; the row's own 'penalty' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'penalty'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "penalty", 3, "av_penalty_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_p_m_in_last_3(df):
    """Add 'av_p_m_in_last_3': mean 'p_m' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'p_m'; the row's own 'p_m' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'p_m' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "p_m", 3, "av_p_m_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_p_in_last_3(df):
    """Add 'av_+_in_last_3': mean '+' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN '+'; the row's own '+' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '+' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "+", 3, "av_+_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_m_in_last_3(df):
    """Add 'av_-_in_last_3': mean '-' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN '-'; the row's own '-' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '-' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "-", 3, "av_-_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_esg_in_last_3(df):
    """Add 'av_esg_in_last_3': mean 'esg' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'esg'; the row's own 'esg' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'esg' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "esg", 3, "av_esg_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_ppg_in_last_3(df):
    """Add 'av_ppg_in_last_3': mean 'ppg' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'ppg'; the row's own 'ppg' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'ppg' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "ppg", 3, "av_ppg_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_shg_in_last_3(df):
    """Add 'av_shg_in_last_3': mean 'shg' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'shg'; the row's own 'shg' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'shg' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "shg", 3, "av_shg_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_otg_in_last_3(df):
    """Add 'av_otg_in_last_3': mean 'otg' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'otg'; the row's own 'otg' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'otg' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "otg", 3, "av_otg_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_gwg_in_last_3(df):
    """Add 'av_gwg_in_last_3': mean 'gwg' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'gwg'; the row's own 'gwg' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'gwg' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "gwg", 3, "av_gwg_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_sds_in_last_3(df):
    """Add 'av_sds_in_last_3': mean 'sds' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'sds'; the row's own 'sds' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'sds' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "sds", 3, "av_sds_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_sog_in_last_3(df):
    """Add 'av_sog_in_last_3': mean 'sog' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'sog'; the row's own 'sog' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'sog' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "sog", 3, "av_sog_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av__SOG_in_last_3(df):
    """Add 'av_%SOG_in_last_3': mean '%SOG' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN '%SOG'; the row's own '%SOG' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '%SOG' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "%SOG", 3, "av_%SOG_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_S_G_in_last_3(df):
    """Add 'av_S/G_in_last_3': mean 'S/G' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'S/G'; the row's own 'S/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'S/G' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "S/G", 3, "av_S/G_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_fo_in_last_3(df):
    """Add 'av_fo_in_last_3': mean 'fo' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'fo'; the row's own 'fo' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'fo' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "fo", 3, "av_fo_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_fow_in_last_3(df):
    """Add 'av_fow_in_last_3': mean 'fow' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'fow'; the row's own 'fow' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'fow' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "fow", 3, "av_fow_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av__FO_in_last_3(df):
    """Add 'av_%FO_in_last_3': mean '%FO' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN '%FO'; the row's own '%FO' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', '%FO' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "%FO", 3, "av_%FO_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_TOI_G_in_last_3(df):
    """Add 'av_TOI/G_in_last_3': mean 'TOI/G' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'TOI/G'; the row's own 'TOI/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'TOI/G'
            columns; it is left unmodified.
            NOTE(review): 'TOI/G' must be numeric for the averaging to work —
            confirm it is not stored as text.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "TOI/G", 3, "av_TOI/G_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_SFT_G_in_last_3(df):
    """Add 'av_SFT/G_in_last_3': mean 'SFT/G' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'SFT/G'; the row's own 'SFT/G' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'SFT/G'
            columns; it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "SFT/G", 3, "av_SFT/G_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_hits_in_last_3(df):
    """Add 'av_hits_in_last_3': mean 'hits' of each player's 3 preceding rows.

    Rows are ordered by player_id, season_year, team_id. A row receives a
    value only when the 3 rows right before it (same player) all have a
    non-NaN 'hits'; the row's own 'hits' is never part of its average.

    Args:
        df: frame with 'player_id', 'season_year', 'team_id', 'hits' columns;
            it is left unmodified.

    Returns:
        A new, re-ordered DataFrame (original index labels kept, rows with a
        NaN player_id dropped) that always contains the feature column.
    """
    column, window, feature = "hits", 3, "av_hits_in_last_3"
    ordered = df.dropna(subset=["player_id"]).sort_values(
        ["player_id", "season_year", "team_id"]
    )
    by_player = ordered.groupby("player_id")[column]
    # shift(k) is the value k rows earlier for the same player; summing the
    # lags oldest-first gives NaN as soon as one of those rows is missing.
    lags = [by_player.shift(k) for k in range(window, 0, -1)]
    return ordered.assign(**{feature: sum(lags[1:], lags[0]) / window})
def av_bls_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["bls"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["bls"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["bls"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_bls_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)
def av_foa_in_last_3(df):
        a = df.sort_values(by = ['season_year']).groupby(["player_id"])
        b = [pd.DataFrame(a.get_group(x)) for x in a.groups]
        for j in range(len(b)):
            count = 0
            res = []
            for i in range(len(b[j])):
                b[j] = b[j].sort_values(by = ['season_year', 'team_id'])
                if not np.isnan(list(b[j]["foa"])[i]):
                    if count != 3:
                        count+=1
                        res.append(list(b[j]["foa"])[i])
                    else:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                        del res[0]
                        res.append(list(b[j]["foa"])[i])
                else:
                    if count == 3:
                        b[j].loc[b[j].index[i], 'av_foa_in_last_3'] = sum(res)/count
                    res = []
                    count = 0
        return pd.concat(b)

In [None]:
# Rolling per-player features; each entry takes a DataFrame and returns a new one.
# NOTE(review): DataFrameTransformer.fit(), as defined at the top of the notebook,
# iterates the module-level name `transforms` rather than the list stored on the
# instance, so this variable must keep the name `transforms`.
transforms = [
    av_games_in_last_3,
    av_goals_in_last_3,
    av_assists_in_last_3,
    av_penalty_in_last_3,
    av_p_m_in_last_3,
    av_p_in_last_3,
    av_m_in_last_3,
    av_esg_in_last_3,
    av_ppg_in_last_3,
    av_shg_in_last_3,
    av_otg_in_last_3,
    av_gwg_in_last_3,
    av_sds_in_last_3,
    av_sog_in_last_3,
    av__SOG_in_last_3,
    av_S_G_in_last_3,
    av_fo_in_last_3,
    av_fow_in_last_3,
    av_SFT_G_in_last_3,
    av_hits_in_last_3,
    av_bls_in_last_3,
    av_foa_in_last_3,
]

transformer = DataFrameTransformer(df_player_feature)
transformer.add_transforms(transforms)
df_player_feature = transformer.fit()
df_player_feature

## Merge datasets

In [None]:
# Cast the join key to int before merging (presumably so it matches the dtype of
# df_team_feature['team_id'] -- TODO confirm).
df_player_feature['team_id'] = df_player_feature['team_id'].astype(int)
# pd.merge defaults to an inner join: player rows without a matching
# (team_id, season_id) row in df_team_feature are dropped. Overlapping column
# names get the '_player' / '_team' suffixes.
result_input_df = pd.merge(df_player_feature, df_team_feature, on=['team_id', 'season_id'], suffixes=('_player', '_team'))
# Display option only; it affects how wide frames render from here on.
pd.set_option('display.max_columns', 500)
result_input_df

## Features for league

In [None]:
def _join_shifted_season_stat(df, column, aggregate, feature_name):
    """Join a per-season aggregate of ``column``, shifted by 3 season ids, onto ``df``.

    The aggregate (``'median'`` or ``'mean'``) is computed per ``season_id``.
    For every season id ``s`` in ``[3, 89]`` its aggregate is stored under id
    ``s - 3``; season ids below 3 or above 89 are set to NaN. An id that is
    neither overwritten nor out of range keeps its own aggregate.

    NOTE(review): the callers are named "previous season", yet the value stored
    for season ``s`` is the aggregate of season ``s + 3``. Whether that is the
    previous season depends on how season ids are ordered -- confirm. Seasons in
    ``[3, 89]`` with no ``s + 3`` in the data still carry their own aggregate,
    which for the target column leaks the current season.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season_id`` and ``column``.
    column : str
        Column to aggregate.
    aggregate : str
        Aggregation name understood by ``groupby(...).agg``.
    feature_name : str
        Name given to the joined column (always the last column of the result).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``feature_name`` appended.
    """
    per_season = df.groupby('season_id')[column].agg(aggregate)
    shifted = per_season.copy()
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for season_id, value in per_season.items():
        if season_id < 3 or season_id > 89:
            # Was `a[ind] == np.nan`: a comparison whose result was discarded.
            shifted.loc[season_id] = np.nan
        else:
            shifted.loc[season_id - 3] = value
    # The Series is named like an existing column, so the joined copy arrives as
    # '<column>_' in last position and is renamed positionally.
    joined = df.join(shifted, on='season_id', rsuffix='_')
    joined.columns = list(joined.columns[:-1]) + [feature_name]
    return joined


# Median goals per season
def med_goals_per_season(df):
    """Append the shifted per-season median of ``goals``."""
    return _join_shifted_season_stat(df, 'goals', 'median', 'featrue_median_goals_previous_season')


# Median assists per season
def med_assists_per_season(df):
    """Append the shifted per-season median of ``assists``."""
    return _join_shifted_season_stat(df, 'assists', 'median', 'featrue_median_assists_previous_season')


# Median penalty per season
def med_penalty_per_season(df):
    """Append the shifted per-season median of ``penalty``."""
    return _join_shifted_season_stat(df, 'penalty', 'median', 'featrue_median_penalty_previous_season')


# Average goals per season
def avg_goals_per_season(df):
    """Append the shifted per-season mean of ``goals``."""
    return _join_shifted_season_stat(df, 'goals', 'mean', 'feature_mean_goals_preious_season')


# Average assists per season
def avg_assists_per_season(df):
    """Append the shifted per-season mean of ``assists``."""
    return _join_shifted_season_stat(df, 'assists', 'mean', 'feature_mean_assists_previous_season')


# Average penalty per season
def avg_penalty_per_season(df):
    """Append the shifted per-season mean of ``penalty``."""
    return _join_shifted_season_stat(df, 'penalty', 'mean', 'feature_mean_penalty_season')

In [None]:
_tpr = result_input_df.copy()

In [None]:
# League-level features, applied in the order listed.
# NOTE(review): DataFrameTransformer.fit(), as defined at the top of the notebook,
# reads the module-level name `transforms`, so this variable must keep that name.
transforms = [
    med_goals_per_season,
    med_assists_per_season,
    med_penalty_per_season,
    avg_goals_per_season,
    avg_assists_per_season,
    avg_penalty_per_season,
]

transformer = DataFrameTransformer(result_input_df)
transformer.add_transforms(transforms)
result_input_df = transformer.fit()
result_input_df

## Быстрое получение таблички

In [None]:
result_input_df = pd.read_csv('Замердженный_датафрейм_20_03.csv')

In [None]:
result_input_df

In [None]:
result_input_df.to_csv(r'C:\Users\ego-k\OneDrive\Рабочий стол\Замердженный_датафрейм_20_03.csv', index = False, header=True)

## Data preprocessing

In [None]:
# List every column that takes exactly two distinct values (NaN counts as a value).
print('Binary features are ')
for column in result_input_df.columns:
    n_unique = len(result_input_df[column].unique())
    if n_unique == 2:
        print(column, n_unique)

In [None]:
# List every column with 3 to 100 distinct values (NaN counts as a value).
print('Categorical features are ')
for column in result_input_df.columns:
    n_unique = len(result_input_df[column].unique())
    if 2 < n_unique <= 100:
        print(column, n_unique)

In [None]:
def target_encoding(df, cat_name, target, weight):
    """Encode a categorical column by the smoothed mean of the target.

    For a category with ``n`` rows and category mean ``m`` the encoded value is
    ``(n * m + weight * global_mean) / (n + weight)``, where ``global_mean`` is
    the mean of ``target`` over all of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``cat_name`` and ``target``.
    cat_name : str
        Categorical column to encode.
    target : str
        Numeric column whose mean is used.
    weight : number
        Pull towards the global mean; larger means stronger smoothing.

    Returns
    -------
    pandas.Series
        Encoded values aligned with ``df``; NaN where the category is missing.
    """
    global_mean = df[target].mean()
    per_category = df.groupby(cat_name)[target].agg(['count', 'mean'])

    # Weighted blend of the category mean and the global mean.
    numerator = per_category['count'] * per_category['mean'] + weight * global_mean
    denominator = per_category['count'] + weight

    return df[cat_name].map(numerator / denominator)

In [None]:
def data_preprocessing(data_input, cur_year=2019):
    """Turn the merged player/team frame into a numeric model matrix.

    Steps, in order:
      1. drop raw team columns and raw per-season player statistics / ids;
      2. one-hot encode ``player_shoots`` and ``league_short_name`` (separator
         ``=``), dropping the ``-`` and ``VHL`` levels as baselines;
      3. target-encode the remaining categorical columns against ``goals``
         via ``target_encoding`` with weight 10;
      4. map ``season_type`` to 0 (regular) / 1 (playoff);
      5. rewrite ``player_age`` as the age in the row's season.

    NOTE(review): the encodings are fitted on whatever frame is passed in. The
    notebook calls this separately on the train and test splits, so the test
    encoding uses test-set targets and the dummy columns can differ between
    the two results.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Merged frame containing every column referenced below.
    cur_year : int, default 2019
        Year in which ``player_age`` was recorded; previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        New frame; ``data_input`` itself is not modified.

    Raises
    ------
    KeyError
        If a column to drop (including a baseline dummy level) is absent.
    """
    # Select bad columns and remove them
    cols_2_drop_team = ['games_team', 'points_team', 'goals_scored',
                        'goals_missed', 'position_in_championship', 'position_in_conference',
                        'playoff_fact', 'position_in_division', 'nhl_id']
    cols_2_drop_player = ['team_id', 'season_id', 'player_stats_id', 'player_stats_khl_id',
                          'player_id', 'games_player', 'points_player',
                          'assists', 'penalty', 'p_m', '+', '-', 'esg', 'ppg',
                          'shg', 'otg', 'gwg', 'sds', 'sog', '%SOG', 'S/G', 'fo',
                          'fow', '%FO', 'TOI/G', 'SFT/G', 'hits', 'bls', 'foa', 'league_id',
                          'league_full_name', 'team_full_name', 'team_league_link', 'team_site_id',
                          'team_khl_id', 'team_nhl_id', 'player_site_id', 'player_khl_id', 'player_nhl_id',
                          'player_name', 'player_unicode_name']
    data_input = data_input.drop(cols_2_drop_team, axis=1)
    data_input = data_input.drop(cols_2_drop_player, axis=1)

    # Encode categorical features using OneHotEncoding; one level per feature
    # is dropped as the baseline.
    for column, baseline_level in (('player_shoots', '-'), ('league_short_name', 'VHL')):
        data_input = pd.get_dummies(data_input, columns=[column], prefix_sep='=')
        data_input = data_input.drop([column + '=' + baseline_level], axis=1)

    # Encode categorical features using TargetEncoding
    target_column = 'goals'
    for column in ('team_name', 'team_country', 'player_nationality',
                   'player_youth_team', 'player_position'):
        data_input[column] = target_encoding(df=data_input, cat_name=column, target=target_column, weight=10)

    # Encode binary features
    data_input.loc[:, 'season_type'] = data_input['season_type'].replace({'regular': 0, 'playoff': 1})

    # Get actual player age: age recorded in cur_year, moved back to the season's year.
    data_input['player_age'] = data_input['player_age'] - cur_year + data_input['season_year']

    return data_input

In [None]:
result_input_df.to_csv("../data/data_with_feature.csv")

In [None]:
# Hold out season_id 86 as the test set; every other season is training data.
is_test_season = result_input_df['season_id'] == 86
data_train = result_input_df[~is_test_season]
data_test = result_input_df[is_test_season]

In [None]:
prep_data_train = data_preprocessing(data_train)
prep_data_test = data_preprocessing(data_test)
prep_data_train

In [None]:
y_column = 'goals'
X_columns = prep_data_train.columns[prep_data_train.columns != y_column]
X_columns

In [None]:
X_train, y_train = prep_data_train[X_columns].values, prep_data_train[y_column].values
X_test, y_test = prep_data_test[X_columns].values, prep_data_test[y_column].values

## Отброс коррелируемых фичей

In [None]:
# `scipy.stats.stats` is a deprecated private namespace; pearsonr is public in scipy.stats.
from scipy.stats import pearsonr

In [None]:
pearsonr_data = []

In [None]:
import progressbar

# Screen feature/feature correlations only. The target (`y_column`, defined in an
# earlier cell) is excluded so it can never end up in `features_to_delete`.
feature_columns = [column for column in prep_data_train.columns if column != y_column]

# Convert each column once instead of re-zipping and re-filtering it per pair.
column_values = {column: prep_data_train[column].to_numpy(dtype=float) for column in feature_columns}
finite_masks = {column: np.isfinite(values) for column, values in column_values.items()}

# Both (i, j) and (j, i) are recorded, as before: the pruning cell matches on
# either position.
n_pairs = len(feature_columns) * (len(feature_columns) - 1)
bar = progressbar.ProgressBar(maxval=max(n_pairs, 1),
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
step = 0
for i in feature_columns:
    for j in feature_columns:
        if i != j:
            # Keep only rows where both columns are finite.
            both_finite = finite_masks[i] & finite_masks[j]
            correlation = pearsonr(column_values[i][both_finite], column_values[j][both_finite])[0]
            pearsonr_data.append((correlation, i, j))
            step += 1
            bar.update(step)
bar.finish()

In [None]:
pearsonr_data2 = pearsonr_data

In [None]:
features_to_delete = []

In [None]:
# Strongest absolute correlation first. pearsonr returns NaN for a constant column;
# NaN compares false against everything, which would make the sort order undefined
# and could stop the pruning loop early, so such pairs are left out.
sorted_pearsonr_data = sorted(
    [(abs(x[0]), x[1], x[2]) for x in pearsonr_data if np.isfinite(x[0])],
    reverse=True,
)

In [None]:
# Greedy pruning: while the strongest remaining pair has |r| > 0.85, drop the first
# feature of that pair and discard every pair that mentions it.
# The emptiness check avoids an IndexError once every pair has been removed.
while sorted_pearsonr_data and sorted_pearsonr_data[0][0] > 0.85:
    redundant_feature = sorted_pearsonr_data[0][1]
    features_to_delete.append(redundant_feature)
    # Slice assignment filters the list in place (same list object, order kept).
    sorted_pearsonr_data[:] = [
        entry for entry in sorted_pearsonr_data
        if redundant_feature != entry[1] and redundant_feature != entry[2]
    ]

In [None]:
sorted_pearsonr_data[:5]

In [None]:
features_to_delete

In [None]:
prep_data_train = prep_data_train.drop(features_to_delete, axis=1)
prep_data_test = prep_data_test.drop(features_to_delete, axis=1)

In [None]:
y_column = 'goals'
X_columns = prep_data_train.columns[prep_data_train.columns != y_column]
X_columns

In [None]:
X_train, y_train = prep_data_train[X_columns].values, prep_data_train[y_column].values
X_test, y_test = prep_data_test[X_columns].values, prep_data_test[y_column].values

## LightGBM

In [None]:
conda install -c conda-forge lightgbm

In [None]:
import lightgbm as ltb

In [None]:
model = ltb.LGBMRegressor()
model.fit(X_train, y_train)
predicted_y = model.predict(X_test)

In [None]:
def percentage_error(actual, predicted):
    """Element-wise signed relative error of ``predicted`` against ``actual``.

    Where ``actual[j]`` is non-zero the result is
    ``(actual[j] - predicted[j]) / actual[j]``. Where it is zero the relative
    error is undefined, so ``predicted[j] / mean(actual)`` is used instead.

    Parameters
    ----------
    actual, predicted : numpy.ndarray
        One-dimensional arrays of equal length.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``actual``.
    """
    errors = np.empty(actual.shape)
    for position, truth in enumerate(actual):
        if truth == 0:
            # Fall back to the mean of the true values as the denominator.
            errors[position] = predicted[position] / np.mean(actual)
        else:
            errors[position] = (truth - predicted[position]) / truth
    return errors

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Both inputs are converted with ``np.asarray``; zero targets are handled by
    ``percentage_error``.
    """
    relative_errors = percentage_error(np.asarray(y_true), np.asarray(y_pred))
    return np.abs(relative_errors).mean() * 100.0

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print('MSE %.2f' % mean_squared_error(y_test, predicted_y))
print('R^2 %.2f' % r2_score(y_test, predicted_y))
# The value previously printed as "MAPE" was the mean absolute error.
print('MAE %.2f' % mean_absolute_error(y_test, predicted_y))
# Actual MAPE, using the helper defined in the previous cell.
print('MAPE %.2f' % mean_absolute_percentage_error(y_test, predicted_y))

## CatBoost

In [None]:
conda install -c conda-forge catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Grid search over tree depth and iteration count at a fixed learning rate.
# NOTE(review): configurations are scored on the test split, so the reported R^2
# is optimistic; a validation split or cross-validation would avoid that.
best_acc = float('-inf')  # starting from 0 hid the case where every R^2 is <= 0
best_depth = 2
best_rate = 0.05
best_iterations = 500
best_model = None
best_predicted_y = None

depths = [2, 3, 4, 5, 7, 10]
iteration_counts = [500, 1000, 2000]

bar = progressbar.ProgressBar(maxval=len(depths) * len(iteration_counts) + 1,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
step = 1
for depth in depths:
    for iterations in iteration_counts:
        model = CatBoostRegressor(iterations=iterations, learning_rate=best_rate, depth=depth, silent=True)
        # Fit model
        model.fit(X_train, y_train)
        predicted_y = model.predict(X_test)
        r2 = r2_score(y_test, predicted_y)
        if r2 > best_acc:
            best_acc = r2
            best_depth = depth
            best_iterations = iterations
            best_model = model
            best_predicted_y = predicted_y

        bar.update(step)
        step += 1
bar.finish()

# Leave the best configuration in `model` / `predicted_y`. The loop itself ends on
# the last configuration, which is what the following cells used to report on.
if best_model is not None:
    model, predicted_y = best_model, best_predicted_y
best_depth, best_iterations

In [None]:
print('R^2 %.2f' % best_acc)

In [None]:
# Metrics for whatever `predicted_y` the preceding grid-search cell left in the kernel.
print('MSE %.2f' % mean_squared_error(y_test, predicted_y))
print('R^2 %.2f' % r2_score(y_test, predicted_y))
# The value previously printed as "MAPE" was the mean absolute error.
print('MAE %.2f' % mean_absolute_error(y_test, predicted_y))
print('MAPE %.2f' % mean_absolute_percentage_error(y_test, predicted_y))

In [None]:
abs(y_test-predicted_y).mean()/y_test.mean()

In [None]:
rr = np.array(list(enumerate(model.feature_importances_)))

In [None]:
list(zip(X_columns, rr[:, 1]))

In [None]:
ww =(rr[rr[:, 1] > 1])

In [None]:
# Print the name and the (position, importance) row of every feature kept in `ww`.
feature_names = list(X_columns)
for row_number, feature_position in enumerate(ww[:, 0].astype(int)):
    print(feature_names[feature_position], ww[row_number])

In [None]:
from matplotlib import pyplot
# Materialise the pairs: printing the bare enumerate object only shows its repr.
print(list(enumerate(model.feature_importances_)))
# plot
pyplot.figure(figsize = (20, 12))
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.xlabel('feature position in X_columns')
pyplot.ylabel('importance')
pyplot.show()

In [None]:
model = CatBoostRegressor(iterations=2000, learning_rate=0.05, depth=16, silent=True)
model.fit(X_train, y_train)
predicted_y = model.predict(X_test)

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot

# `model` is a CatBoostRegressor. xgboost's plot_importance accepts only a Booster,
# an XGBModel or a plain {feature: score} dict, so the CatBoost importances are
# passed as a dict keyed by column name.
importance_by_feature = {
    name: float(score) for name, score in zip(X_columns, model.feature_importances_)
}
plot_importance(importance_by_feature)
pyplot.show()

In [None]:
np.array(list(filter(lambda x: np.isfinite(x), (abs(y_test-predicted_y)/y_test)))).mean()

## PLS

In [None]:
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Impute missing values with per-column means. The previous `X_train.mean()` was a
# single scalar over the whole ndarray (NaN as soon as any value is missing), so the
# fillna did nothing useful.
# NOTE(review): the test frame is imputed with its own column means, matching the
# per-dataset imputation in the next cell; using the training means would avoid
# leaking test statistics.
X_train_noNan = prep_data_train[X_columns].fillna(prep_data_train[X_columns].mean())
X_test_noNan = prep_data_test[X_columns].fillna(prep_data_test[X_columns].mean())

In [None]:
# Fill remaining gaps column by column with that column's NaN-aware mean.
# Assign the result back: `df[col].fillna(..., inplace=True)` acts on a temporary
# Series and is not guaranteed to update the frame.
for column in X_columns:
    X_train_noNan[column] = X_train_noNan[column].fillna(np.nanmean(X_train_noNan[column].values))
    X_test_noNan[column] = X_test_noNan[column].fillna(np.nanmean(X_test_noNan[column].values))
# Sanity check: number of missing values left in the test matrix.
X_test_noNan.isnull().sum().sum()

In [None]:
# Grid search over the number of PLS components and the iteration cap.
# NOTE(review): best_acc starts at 0, so a configuration whose R^2 is <= 0 never
# replaces the defaults below; and candidates are scored on the test split.
best_acc = 0
best_n = 2
best_iter = 500
for n in [2, 3, 4, 5, 7, 10, 15, 20, 30, 50, 85]:
    for max_iter in [500, 1000, 2000, 4000]:
        model = PLSRegression(n_components=n, max_iter=max_iter)
        # Fit model
        model.fit(X_train_noNan, y_train)
        predicted_y = model.predict(X_test_noNan)
        r2 = r2_score(y_test, predicted_y)
        # Keep the configuration with the highest R^2 seen so far.
        if r2 > best_acc:
            best_acc = r2
            best_n = n
            best_iter = max_iter
# After the loop `model` / `predicted_y` hold the LAST configuration tried, not the best.
best_n, best_iter

In [None]:
print('R^2 %.2f' % best_acc)

In [None]:
# Raw string: in a normal literal '\U' starts a unicode escape, which made this
# line a SyntaxError.
pd.read_csv(r'C:\Users\ego-k\OneDrive\Рабочий стол\Замердженный_датафрейм_13_03.csv')