# Inżynieria cech
 - badanie wartości NA, błędów grubych
 - badanie typów danych i zmniejszenie zużycia pamięci
 - stworzenie danych średnich w różnych oknach czasowych (10 ostatnich spotkań, 20, 30 i od początku sezonu)

In [245]:
import pandas as pd
import sqlite3
import numpy as np

pd.set_option('display.max_columns', 40)

In [246]:
con = sqlite3.connect('data/start/NBA-Game-Database-combined.sqlite')
game_info = pd.read_sql_query(f"SELECT * FROM \"{'game_info'}\" ORDER BY date", con)
team_stats = pd.read_sql_query(f"SELECT * FROM \"{'team_stats'}\"", con)
player_stats = pd.read_sql_query(f"SELECT * FROM \"{'player_stats'}\"", con)
con.close()

## Badanie wartości NA, outlierów i błędów logicznych w danych

### Dane drużynowe

In [247]:
def analyze_na_outliers(df, context_cols):
    missing_data = pd.isnull(df).mean()*100

    # Wykrywanie grubych błędów
    outliers = {}
    for col in df.select_dtypes(include=['float', 'int']).columns:
        mean = df[col].mean()
        std = df[col].std()
        threshold = 4

        outliers[col] = df.loc[
            (df[col] < mean - threshold * std) | (df[col] > mean + threshold * std),
            context_cols + [col]
        ]

    errors = {}
    for col in df.loc[:, df.columns.str.contains('%')].columns:
        errors[col] = df.loc[
            df[col]<0,
            context_cols + [col]
        ]

    return missing_data, outliers, errors

In [248]:
team_missing_data, team_outliers, team_errors = analyze_na_outliers(team_stats, context_cols=['game_id', 'team'])
team_missing_data[team_missing_data > 0]

FT%      0.006031
+/-    100.000000
dtype: float64

In [249]:
team_stats[team_stats['FT%'].isna()]

Unnamed: 0,team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,Pace,game_id,win
16476,BOS,240,37.0,93.0,0.398,17.0,52.0,0.327,0.0,0.0,,12.0,26.0,38.0,27.0,11.0,4.0,12.0,8.0,91.0,-99,,0.489,0.489,0.559,0.0,23.5,72.2,43.7,73.0,12.3,8.9,11.4,100.0,101.4,115.9,89.7,232404091179,0


Kolumnę +/- można usunąć, Net Rating przekazuje tą samą informację, ale z uwzględnieniem tempa gry

In [250]:
team_stats.loc[16476, 'FT%'] = 0.0
team_stats = team_stats.drop(columns=['+/-', 'GmSc', 'USG%'])

In [251]:
for column, outlier_values in team_outliers.items():
    if not outlier_values.empty:
        print(outlier_values, end='\n\n')

            game_id team    FG
8969   202105030965  WAS  63.0
13460  222302240901  SAC  65.0
16557  232404141219  IND  65.0

            game_id team    FGA
3737   181901130639  WAS  118.0
4165   181902130853  CLE  120.0
4221   181902220881  OKC  118.0
4299   181902270920  SAC  118.0
4314   181903010928  CHI  119.0
4315   181903010928  ATL  123.0
4671   181903251106  POR  121.0
4749   181903311145  ATL  118.0
9208   212210200005  BOS  117.0
9780   212211260291  SAC  120.0
12775  222301020558  GSW  121.0
15373  232401200627  HOU  117.0
15475  232401270678  GSW  119.0
16269  232403261075  MIL  119.0

           game_id team    FG%
9848  212212010325  DAL  0.687

           game_id team    3P
7138  202112290050  MIL  29.0
7630  202102010296  HOU  28.0
7943  202102220452  UTA  28.0

            game_id team   3PA
3777   181901160659  HOU  70.0
3831   181901190686  HOU  68.0
9180   202105161071  HOU  63.0
15597  232402040739  OKC  63.0

           game_id team    3P%
1068  171812300535  MEM

Outliery wskazują na rzadko spotykane wydarzenia i ciekawostki statystyczne, nie widać niepoprawnych wartości

In [252]:
for column, error_values in team_errors.items():
    if not error_values.empty:
        print(error_values, end='\n\n')

Brak wychwyconych błędów logicznych w danych drużynowych

### Dane wg zawodników

In [253]:
player_missing_data, player_outliers, player_errors = analyze_na_outliers(player_stats, context_cols=['game_id', 'player'])
player_missing_data

game_id     0.000000
player      0.000000
team        0.000000
MP          0.000000
FG         17.112934
FGA        17.112934
FG%        20.997485
3P         17.112934
3PA        17.112934
3P%        35.971538
FT         17.112934
FTA        17.112934
FT%        53.566215
ORB        17.112934
DRB        17.112934
TRB        17.112934
AST        17.112934
STL        17.112934
BLK        17.112934
TOV        17.112934
PF         17.112934
PTS        17.112934
GmSc       17.112934
+/-        17.225482
TS%        20.552945
eFG%       20.997485
3PAr       20.997485
FTr        20.997485
ORB%       17.133654
DRB%       17.133654
TRB%       17.133654
AST%       17.134125
STL%       17.133654
BLK%       17.133654
TOV%       19.944056
USG%       17.133654
ORtg       17.133654
DRtg       17.133654
BPM        17.125649
PIE        17.112934
dtype: float64

Nie będziemy imputować danych w tabeli player_stats, ponieważ kolumny z brakami nie będą wykorzystywane.

In [254]:
for column, outlier_values in player_outliers.items():
    if not outlier_values.empty:
        print(outlier_values, end='\n\n')

             game_id                 player    FG
542     171810200022         Nikola Vučević  17.0
863     171810210034  Giannis Antetokounmpo  17.0
3101    171811030122           LeBron James  23.0
3564    171811050140           James Harden  19.0
5578    171811170220           LeBron James  17.0
...              ...                    ...   ...
209588  232404021128           Nikola Jokić  18.0
209733  232404031134          Malachi Flynn  18.0
210712  232404071171           Tyrese Maxey  19.0
211046  232404091183        Anthony Edwards  17.0
212204  232404141226          GG Jackson II  17.0

[285 rows x 3 columns]

             game_id         player   FGA
3101    171811030122   LeBron James  34.0
8633    171812040341   Devin Booker  32.0
8904    171812050352   Bradley Beal  37.0
10103   171812120400   Bradley Beal  33.0
10167   171812120402   Jimmy Butler  33.0
...              ...            ...   ...
208851  232403291100  Jalen Brunson  47.0
209588  232404021128   Nikola Jokić  32

Po analizie outlierów można stwierdzić, że część wartości to rekordowe, rzadko spotykane liczby - tyczy się to wartości nominalnych.

Outliery w kolumnach procentowych (druga połowa outputu, np. TRB% - procent wszystkich piłek możliwych do zebrania zebranych przez danego zawodnika) są często spowodowane krótkim czasem gry zawodników, co przełożyło się na nienaturalnie duże wartości

In [255]:
for column, error_values in player_errors.items():
    if not error_values.empty:
        print(error_values, end='\n\n')

             game_id              player    AST%
17628   171801230699           Ron Baker  -611.5
45850   181901070594         Ricky Rubio  -186.0
49464   181901270738      Ryan Broekhoff  -131.4
51510   181902080821        Trevon Duval -1000.0
70540   192012090350        Tony Bradley  -106.7
82103   192002120813        Justin James   -78.3
116112  202105121041      Udoka Azubuike  -309.7
142258  212203090982      Omer Yurtseven -1000.0
148774  222310190012        Ochai Agbaji  -505.3
183823  232411120142  Brandin Podziemski -1000.0
193892  232401060525        Jared Butler -1000.0
206608  232403171012       Jordan Miller  -178.2



AST% jest definiowana jako szacunkowa wartość procentowa trafionych rzutów przez kolegów z drużyny, przy których asystował zawodnik, gdy był on na boisku. Nie może zatem ona być mniejsza niż 0, więc zastąpimy te wartości

In [256]:
player_stats.loc[player_stats['AST%']<0, 'AST%'] = 0.0

## Badanie typów danych i zmniejszenie zużycia pamięci

### Team stats

In [257]:
team_stats.dtypes

team        object
MP          object
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
FT         float64
FTA        float64
FT%        float64
ORB        float64
DRB        float64
TRB        float64
AST        float64
STL        float64
BLK        float64
TOV        float64
PF         float64
PTS        float64
TS%        float64
eFG%       float64
3PAr       float64
FTr        float64
ORB%       float64
DRB%       float64
TRB%       float64
AST%       float64
STL%       float64
BLK%       float64
TOV%       float64
ORtg       float64
DRtg       float64
Pace       float64
game_id      int64
win          int64
dtype: object

In [258]:
team_stats.memory_usage(deep=True)

Index         128
team       994920
MP         994920
FG         132656
FGA        132656
FG%        132656
3P         132656
3PA        132656
3P%        132656
FT         132656
FTA        132656
FT%        132656
ORB        132656
DRB        132656
TRB        132656
AST        132656
STL        132656
BLK        132656
TOV        132656
PF         132656
PTS        132656
TS%        132656
eFG%       132656
3PAr       132656
FTr        132656
ORB%       132656
DRB%       132656
TRB%       132656
AST%       132656
STL%       132656
BLK%       132656
TOV%       132656
ORtg       132656
DRtg       132656
Pace       132656
game_id    132656
win        132656
dtype: int64

In [259]:
team_stats.memory_usage(deep=True).sum()

6500272

In [260]:
(team_stats
.select_dtypes(int)
.describe
)

<bound method NDFrame.describe of             game_id  win
0      171810170001    0
1      171810170001    1
2      171810170002    1
3      171810170002    0
4      171810180003    0
...             ...  ...
16577  232404141229    1
16578  232404141230    0
16579  232404141230    1
16580  232404141231    0
16581  232404141231    1

[16582 rows x 2 columns]>

In [261]:
team_stats = team_stats.astype({'win': 'int8'})

### Player stats

In [262]:
player_stats.dtypes

game_id      int64
player      object
team        object
MP          object
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
FT         float64
FTA        float64
FT%        float64
ORB        float64
DRB        float64
TRB        float64
AST        float64
STL        float64
BLK        float64
TOV        float64
PF         float64
PTS        float64
GmSc        object
+/-        float64
TS%        float64
eFG%       float64
3PAr       float64
FTr        float64
ORB%       float64
DRB%       float64
TRB%       float64
AST%       float64
STL%       float64
BLK%       float64
TOV%       float64
USG%       float64
ORtg       float64
DRtg       float64
BPM        float64
PIE        float64
dtype: object

In [263]:
player_stats.memory_usage(deep=True)

Index           128
game_id     1698832
player     15256765
team       12741240
MP         13398284
FG          1698832
FGA         1698832
FG%         1698832
3P          1698832
3PA         1698832
3P%         1698832
FT          1698832
FTA         1698832
FT%         1698832
ORB         1698832
DRB         1698832
TRB         1698832
AST         1698832
STL         1698832
BLK         1698832
TOV         1698832
PF          1698832
PTS         1698832
GmSc       11513475
+/-         1698832
TS%         1698832
eFG%        1698832
3PAr        1698832
FTr         1698832
ORB%        1698832
DRB%        1698832
TRB%        1698832
AST%        1698832
STL%        1698832
BLK%        1698832
TOV%        1698832
USG%        1698832
ORtg        1698832
DRtg        1698832
BPM         1698832
PIE         1698832
dtype: int64

In [264]:
player_stats.memory_usage(deep=True).sum()

114067844

In [265]:
player_stats.columns

Index(['game_id', 'player', 'team', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%',
       'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg',
       'BPM', 'PIE'],
      dtype='object')

In [266]:
player_stats = player_stats.drop(
    columns=['FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'GmSc', '+/-', 'TS%', '3PAr', 'FTr', 'ORB%',
       'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%']
)

W kolumnie Minutes Played jest wyszczególniona informacja o przyczynie nieobecności zawodnika. Nie jest ona potrzebna w modelowaniu

In [267]:
mp_values = ['Did Not Play', 'Player Suspended', 'Not With Team', 'Did Not Dress']
player_stats.loc[player_stats['MP'].isin(mp_values), 'MP'] = np.nan

In [268]:
player_stats.memory_usage(deep=True)

Index           128
game_id     1698832
player     15256765
team       12741240
MP         12050411
eFG%        1698832
ORtg        1698832
DRtg        1698832
BPM         1698832
PIE         1698832
dtype: int64

In [269]:
player_stats.memory_usage(deep=True).sum()

50241536

### Dodanie daty

In [270]:
def extract_date(game_id):
    game_id_str=str(game_id)
    season_prefix = game_id_str[2:4]
    month = int(game_id_str[4:6])
    day = int(game_id_str[6:8])

    if 10 <= month <= 12:
        year = '20'+str(int(season_prefix) - 1)
    else:
        year = '20'+str(int(season_prefix))

    date_str = f"{year}-{month:02d}-{day:02d}"
    return date_str

In [271]:
team_stats['Date'] = team_stats['game_id'].apply(extract_date)
team_stats['Date']

0        2017-10-17
1        2017-10-17
2        2017-10-17
3        2017-10-17
4        2017-10-18
            ...    
16577    2024-04-14
16578    2024-04-14
16579    2024-04-14
16580    2024-04-14
16581    2024-04-14
Name: Date, Length: 16582, dtype: object

### Dodanie kolumn z liczbą zwycięstw w ostatnich 10 spotkaniach i aktualną serią porażek/zwycięstw

In [272]:
team_stats['Season'] = team_stats['game_id'].astype(str).str[:4]
# team_stats['Season']

In [273]:
team_stats['id'] = team_stats.index

In [274]:
def add_last_10(df):
    df['last10'] = (
        df.groupby(['team', 'Season'])['win']
        .rolling(window=10, min_periods=1)
        .sum()
        .shift(1)
        .reset_index(level=['team', 'Season'], drop=True)
    )

    first_game_mask = df.groupby(['team', 'Season']).cumcount() == 0
    df.loc[first_game_mask, 'last10'] = 0.0
    return df.sort_index()

In [275]:
def calculate_streak(win_series):
    streaks = []
    streak = 0
    last_win = None

    for i in range(1, len(win_series)):
        win = win_series.iloc[i - 1]  # bierzemy wynik meczu poprzedzającego aktualny
        if win == last_win:
            streak += 1
        else:
            streak = 1
        streaks.append(streak if win == 1 else -streak)
        last_win = win

    # Pierwszy mecz nie może mieć streak, więc dodajemy wartość 0 na początku
    streaks.insert(0, 0)

    return streaks


In [276]:
team_stats = add_last_10(team_stats)
team_stats

Unnamed: 0,team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace,game_id,win,Date,Season,id,last10
0,BOS,240,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,0.760,9.0,37.0,46.0,24.0,11.0,4.0,12.0,24.0,99.0,0.500,0.455,0.364,0.284,18.0,80.4,47.9,66.7,11.0,6.6,10.8,98.7,101.7,100.3,171810170001,0,2017-10-17,1718,0,0.0
1,CLE,240,38.0,83.0,0.458,5.0,22.0,0.227,21.0,25.0,0.840,9.0,41.0,50.0,19.0,3.0,4.0,17.0,25.0,102.0,0.543,0.488,0.265,0.301,19.6,82.0,52.1,50.0,3.0,7.1,15.3,101.7,98.7,100.3,171810170001,1,2017-10-17,1718,1,0.0
2,HOU,240,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,10.0,33.0,43.0,28.0,9.0,5.0,13.0,16.0,122.0,0.579,0.562,0.423,0.196,22.2,84.6,51.2,59.6,8.8,10.0,11.0,119.0,118.0,102.5,171810170002,1,2017-10-17,1718,2,0.0
3,GSW,240,43.0,80.0,0.538,16.0,30.0,0.533,19.0,21.0,0.905,6.0,35.0,41.0,34.0,5.0,9.0,17.0,25.0,121.0,0.678,0.638,0.375,0.263,15.4,77.8,48.8,79.1,4.9,16.1,16.0,118.0,119.0,102.5,171810170002,0,2017-10-17,1718,3,0.0
4,CHO,240,29.0,73.0,0.397,9.0,30.0,0.300,23.0,29.0,0.793,3.0,44.0,47.0,16.0,4.0,3.0,17.0,15.0,90.0,0.525,0.459,0.411,0.397,7.3,83.0,50.0,55.2,4.0,4.3,16.5,90.9,103.1,99.0,171810180003,0,2017-10-18,1718,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16577,OKC,240,54.0,97.0,0.557,14.0,34.0,0.412,13.0,16.0,0.813,11.0,47.0,58.0,38.0,6.0,6.0,15.0,17.0,135.0,0.649,0.629,0.351,0.165,27.5,79.7,58.6,70.4,5.7,10.3,12.6,128.3,81.7,105.2,232404141229,1,2024-04-14,2324,16577,6.0
16578,POR,240,31.0,98.0,0.316,7.0,45.0,0.156,13.0,17.0,0.765,23.0,31.0,54.0,18.0,11.0,2.0,18.0,19.0,82.0,0.389,0.352,0.459,0.173,37.7,70.5,51.4,58.1,11.4,3.8,14.6,84.7,125.0,96.8,232404141230,0,2024-04-14,2324,16578,2.0
16579,SAC,240,43.0,87.0,0.494,13.0,34.0,0.382,22.0,27.0,0.815,13.0,38.0,51.0,29.0,11.0,6.0,14.0,19.0,121.0,0.612,0.569,0.391,0.310,29.5,62.3,48.6,67.4,11.4,11.3,12.4,125.0,84.7,96.8,232404141230,1,2024-04-14,2324,16579,3.0
16580,DET,240,36.0,92.0,0.391,8.0,33.0,0.242,15.0,20.0,0.750,11.0,29.0,40.0,17.0,7.0,3.0,14.0,9.0,95.0,0.471,0.435,0.359,0.217,20.8,70.7,42.6,47.2,7.0,5.3,12.2,95.5,123.6,99.5,232404141231,0,2024-04-14,2324,16580,2.0


In [277]:
team_stats['streak'] = team_stats \
    .groupby(['team', 'Season'])['win'] \
    .transform(calculate_streak)
team_stats = team_stats.sort_index()
team_stats[team_stats['team']=='GSW']

Unnamed: 0,team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,...,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace,game_id,win,Date,Season,id,last10,streak
3,GSW,240,43.0,80.0,0.538,16.0,30.0,0.533,19.0,21.0,0.905,6.0,35.0,41.0,34.0,5.0,9.0,17.0,25.0,121.0,...,0.638,0.375,0.263,15.4,77.8,48.8,79.1,4.9,16.1,16.0,118.0,119.0,102.5,171810170002,0,2017-10-17,1718,3,0.0,0
48,GSW,240,47.0,92.0,0.511,18.0,41.0,0.439,16.0,20.0,0.800,10.0,39.0,49.0,29.0,6.0,8.0,18.0,23.0,128.0,...,0.609,0.446,0.217,22.2,70.9,49.0,61.7,5.5,12.9,15.2,118.2,110.8,108.3,171810200025,1,2017-10-20,1718,48,0.0,-1
60,GSW,240,33.0,84.0,0.393,12.0,38.0,0.316,23.0,27.0,0.852,12.0,34.0,46.0,20.0,10.0,7.0,17.0,28.0,101.0,...,0.464,0.452,0.321,26.1,85.0,53.5,60.6,10.2,14.0,15.1,102.7,112.8,98.4,171810210031,0,2017-10-21,1718,60,1.0,1
88,GSW,240,48.0,86.0,0.558,15.0,39.0,0.385,22.0,26.0,0.846,10.0,40.0,50.0,33.0,6.0,8.0,16.0,23.0,133.0,...,0.645,0.453,0.302,26.3,74.1,54.3,68.8,5.8,11.9,14.1,129.4,100.2,102.8,171810230045,1,2017-10-23,1718,88,1.0,-1
125,GSW,240,43.0,77.0,0.558,12.0,26.0,0.462,19.0,22.0,0.864,6.0,31.0,37.0,32.0,10.0,7.0,17.0,15.0,117.0,...,0.636,0.338,0.286,19.4,64.6,46.8,74.4,10.2,11.7,16.4,119.4,114.3,98.0,171810250063,1,2017-10-25,1718,125,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16465,GSW,240,49.0,92.0,0.533,14.0,35.0,0.400,6.0,6.0,1.000,11.0,36.0,47.0,32.0,9.0,8.0,16.0,19.0,118.0,...,0.609,0.380,0.065,32.4,85.7,61.8,65.3,9.4,16.7,14.5,123.4,115.1,95.6,232404071173,1,2024-04-07,2324,16465,7.0,-1
16490,GSW,240,47.0,80.0,0.588,26.0,41.0,0.634,14.0,21.0,0.667,4.0,38.0,42.0,37.0,3.0,12.0,15.0,17.0,134.0,...,0.750,0.513,0.263,11.8,79.2,51.2,78.7,3.0,19.7,14.4,133.2,119.3,100.6,232404091186,1,2024-04-09,2324,16490,7.0,1
16518,GSW,240,36.0,77.0,0.468,12.0,35.0,0.343,16.0,20.0,0.800,10.0,36.0,46.0,24.0,8.0,13.0,16.0,18.0,100.0,...,0.545,0.455,0.260,25.6,64.3,48.4,66.7,8.7,20.6,15.7,109.4,100.6,91.4,232404111200,1,2024-04-11,2324,16518,8.0,2
16545,GSW,240,40.0,86.0,0.465,13.0,34.0,0.382,16.0,21.0,0.762,11.0,42.0,53.0,27.0,6.0,5.0,16.0,15.0,109.0,...,0.541,0.395,0.244,25.0,89.4,58.2,67.5,6.1,9.6,14.4,110.1,115.1,99.0,232404121213,0,2024-04-12,2324,16545,9.0,3


### Stworzenie ramek danych w oknach czasowych

In [278]:
def calculate_rolling_stats(df, window, columns, cols_concat):
    df_copy = df.copy()
    results = pd.DataFrame()

    for col in columns:
        if window == 'all':
            # Calculate the mean for each team-season group up to the current game (shifted by one)
            rolling_mean = (
                df_copy.groupby(['Season', 'team'])[col]
                .expanding()
                .mean()
                .shift(1)
                .reset_index(level=['Season', 'team'], drop=True)
            )
        else:
            # Calculate rolling mean with a specified window
            rolling_mean = (
                df_copy.groupby(['Season', 'team'])[col]
                .rolling(window=window, min_periods=1)
                .mean()
                .shift(1)
                .reset_index(level=['Season', 'team'], drop=True)
            )

        results[col] = rolling_mean

    return pd.concat([df_copy[cols_concat].reset_index(drop=True), results], axis=1)

In [279]:
team_stats.columns

Index(['team', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'ORtg', 'DRtg', 'Pace', 'game_id', 'win', 'Date',
       'Season', 'id', 'last10', 'streak'],
      dtype='object')

In [280]:
cols_move = ['id', 'game_id', 'Date', 'Season', 'team', 'win',  'streak', 'last10'] # kolumny do przeniesienia do docelowego df
cols_to_transform = team_stats.columns[2:-7]

In [281]:
team_all_season = calculate_rolling_stats(team_stats, 'all', cols_to_transform, cols_move)
team_all_season[team_all_season['team']=='BOS']

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
0,0,171810170001,2017-10-17,1718,BOS,0,0,0.0,38.170732,85.548780,0.447720,11.182927,31.024390,0.363463,15.829268,20.170732,0.787098,9.060976,32.841463,41.902439,23.731707,7.780488,4.243902,15.548780,19.585366,103.353659,0.549134,0.513524,0.363634,0.238073,20.870732,76.362195,48.623171,62.274390,7.904878,7.626829,14.157317,105.213415,110.643902,98.231707
13,13,171810180007,2017-10-18,1718,BOS,0,-1,0.0,36.000000,88.000000,0.409000,8.000000,32.000000,0.250000,19.000000,25.000000,0.760000,9.000000,37.000000,46.000000,24.000000,11.000000,4.000000,12.000000,24.000000,99.000000,0.500000,0.455000,0.364000,0.284000,18.000000,80.400000,47.900000,66.700000,11.000000,6.600000,10.800000,98.700000,101.700000,100.300000
38,38,171810200020,2017-10-20,1718,BOS,1,-2,0.0,37.500000,89.500000,0.419000,9.500000,30.000000,0.321500,15.000000,23.000000,0.642000,10.000000,34.500000,44.500000,23.500000,11.500000,3.000000,13.500000,25.500000,99.500000,0.499500,0.472000,0.336000,0.257500,20.000000,82.300000,48.400000,62.850000,11.550000,5.100000,11.900000,99.650000,105.200000,99.850000
101,101,171810240051,2017-10-24,1718,BOS,1,1,1.0,36.666667,87.666667,0.418333,9.666667,29.666667,0.329333,17.333333,26.000000,0.657333,11.333333,36.666667,48.000000,21.000000,9.000000,3.666667,15.333333,25.000000,100.333333,0.506333,0.473333,0.339000,0.298667,23.266667,79.700000,50.233333,57.133333,9.033333,6.166667,13.333333,100.433333,100.800000,99.900000
132,132,171810260067,2017-10-26,1718,BOS,1,2,2.0,36.750000,84.500000,0.437000,10.750000,29.500000,0.367750,18.500000,26.250000,0.696750,9.750000,36.500000,46.250000,22.250000,9.500000,4.500000,15.000000,24.750000,102.750000,0.538000,0.501750,0.351000,0.314000,20.825000,78.925000,49.875000,60.425000,9.650000,7.025000,13.475000,104.125000,98.900000,98.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,16449,232404071165,2024-04-07,2324,BOS,1,4,8.0,43.766234,90.090909,0.486623,16.519481,42.558442,0.387182,16.727273,20.662338,0.809377,10.688312,35.857143,46.545455,26.727273,6.571429,6.467532,11.935065,16.324675,120.779221,0.610260,0.578494,0.472377,0.232818,24.768831,76.558442,51.859740,61.074026,6.694805,11.787013,10.712987,123.566234,111.663636,97.106494
16476,16476,232404091179,2024-04-09,2324,BOS,0,5,8.0,43.833333,90.051282,0.487603,16.512821,42.500000,0.387615,16.641026,20.564103,0.808859,10.615385,35.846154,46.461538,26.820513,6.628205,6.487179,11.948718,16.269231,120.820513,0.611013,0.579474,0.471923,0.231744,24.629487,76.364103,51.746154,61.180769,6.752564,11.796154,10.733333,123.602564,111.630769,97.119231
16515,16515,232404111198,2024-04-11,2324,BOS,0,-1,7.0,43.746835,90.088608,0.486468,16.518987,42.620253,0.386848,16.430380,20.303797,0.798620,10.632911,35.721519,46.354430,26.822785,6.683544,6.455696,11.949367,16.164557,120.443038,0.609468,0.578329,0.473025,0.228810,24.615190,76.311392,51.644304,61.330380,6.822785,11.759494,10.741772,123.321519,111.684810,97.025316
16527,16527,232404121204,2024-04-12,2324,BOS,1,-2,6.0,43.700000,90.037500,0.486200,16.512500,42.612500,0.386775,16.387500,20.250000,0.798800,10.625000,35.600000,46.225000,26.800000,6.687500,6.525000,11.950000,16.162500,120.300000,0.609175,0.578075,0.473212,0.228275,24.585000,76.113750,51.510000,61.345000,6.830000,11.850000,10.750000,123.211250,111.838750,97.002500


Należy poprawić manualnie pierwsze rekordy w sezonie - średnie kroczące jak i last10. Przez przesunięcie shift() pierwsze rekordy są wzięte z poprzedniej drużyny, a pierwsza drużyna wg alfabetu - ATL (Atlanta Hawks) posiada wartości NAN

W pierwszym sezonie w ramce danych każda drużyna będzie miała wpisane średnie dla całej ligi z całego sezonu, a w każdym następnym pierwszym rekordem będzie średnia dla drużyny z poprzedniego sezonu. Imputowany pierwszy rekord nie będzie mieć wpływu na dalsze obliczenia średnich kroczących

In [282]:
def impute_first_rows(rolling_avgs, team_stats_boxscore, columns):
    first_season = '1718'
    df_copy = team_stats_boxscore.copy()
    rolling_avgs = rolling_avgs.copy()

    # Dla każdego sezonu (oprócz pierwszego) używamy ostatnich wartości z poprzedniego sezonu
    for season in df_copy['Season'].unique()[1:]:  # pomijamy pierwszy sezon
        yr1, yr2 = int(season[:2]), int(season[2:])
        prev_season = str(yr1-1)+str(yr2-1)  # poprzedni sezon

        # Dla każdej drużyny w bieżącym sezonie
        for team in np.sort(df_copy[df_copy['Season'] == season]['team'].unique()):
            # Znajdujemy indeks pierwszego rekordu w bieżącym sezonie
            current_mask = (rolling_avgs['Season'] == season) & (rolling_avgs['team'] == team)

            # Znajdujemy ostatnią wartość z poprzedniego sezonu
            prev_mask = (rolling_avgs['Season'] == prev_season) & (rolling_avgs['team'] == team)

            if prev_mask.any():
                prev_values = rolling_avgs[prev_mask].iloc[-1][columns]
                if current_mask.any():
                    first_idx = rolling_avgs[current_mask].index[0]
                    rolling_avgs.loc[first_idx, columns] = prev_values

    # Dla pierwszego sezonu, używamy średniej ligowej
    league_avg = df_copy[df_copy['Season'] == first_season][columns].mean()

    for team in np.sort(df_copy[df_copy['Season'] == first_season]['team'].unique()):
        mask = (rolling_avgs['Season'] == first_season) & (rolling_avgs['team'] == team)
        if mask.any():
            first_idx = rolling_avgs[mask].index[0]
            rolling_avgs.loc[first_idx, columns] = league_avg

    return rolling_avgs

In [283]:
team_all_season = impute_first_rows(team_all_season, team_stats, cols_to_transform)
team_all_season[team_all_season['team']=='BOS']

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
0,0,171810170001,2017-10-17,1718,BOS,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
13,13,171810180007,2017-10-18,1718,BOS,0,-1,0.0,36.000000,88.000000,0.409000,8.000000,32.000000,0.250000,19.000000,25.000000,0.760000,9.000000,37.000000,46.000000,24.000000,11.000000,4.000000,12.000000,24.000000,99.000000,0.500000,0.455000,0.364000,0.284000,18.000000,80.400000,47.900000,66.700000,11.000000,6.600000,10.800000,98.700000,101.700000,100.300000
38,38,171810200020,2017-10-20,1718,BOS,1,-2,0.0,37.500000,89.500000,0.419000,9.500000,30.000000,0.321500,15.000000,23.000000,0.642000,10.000000,34.500000,44.500000,23.500000,11.500000,3.000000,13.500000,25.500000,99.500000,0.499500,0.472000,0.336000,0.257500,20.000000,82.300000,48.400000,62.850000,11.550000,5.100000,11.900000,99.650000,105.200000,99.850000
101,101,171810240051,2017-10-24,1718,BOS,1,1,1.0,36.666667,87.666667,0.418333,9.666667,29.666667,0.329333,17.333333,26.000000,0.657333,11.333333,36.666667,48.000000,21.000000,9.000000,3.666667,15.333333,25.000000,100.333333,0.506333,0.473333,0.339000,0.298667,23.266667,79.700000,50.233333,57.133333,9.033333,6.166667,13.333333,100.433333,100.800000,99.900000
132,132,171810260067,2017-10-26,1718,BOS,1,2,2.0,36.750000,84.500000,0.437000,10.750000,29.500000,0.367750,18.500000,26.250000,0.696750,9.750000,36.500000,46.250000,22.250000,9.500000,4.500000,15.000000,24.750000,102.750000,0.538000,0.501750,0.351000,0.314000,20.825000,78.925000,49.875000,60.425000,9.650000,7.025000,13.475000,104.125000,98.900000,98.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,16449,232404071165,2024-04-07,2324,BOS,1,4,8.0,43.766234,90.090909,0.486623,16.519481,42.558442,0.387182,16.727273,20.662338,0.809377,10.688312,35.857143,46.545455,26.727273,6.571429,6.467532,11.935065,16.324675,120.779221,0.610260,0.578494,0.472377,0.232818,24.768831,76.558442,51.859740,61.074026,6.694805,11.787013,10.712987,123.566234,111.663636,97.106494
16476,16476,232404091179,2024-04-09,2324,BOS,0,5,8.0,43.833333,90.051282,0.487603,16.512821,42.500000,0.387615,16.641026,20.564103,0.808859,10.615385,35.846154,46.461538,26.820513,6.628205,6.487179,11.948718,16.269231,120.820513,0.611013,0.579474,0.471923,0.231744,24.629487,76.364103,51.746154,61.180769,6.752564,11.796154,10.733333,123.602564,111.630769,97.119231
16515,16515,232404111198,2024-04-11,2324,BOS,0,-1,7.0,43.746835,90.088608,0.486468,16.518987,42.620253,0.386848,16.430380,20.303797,0.798620,10.632911,35.721519,46.354430,26.822785,6.683544,6.455696,11.949367,16.164557,120.443038,0.609468,0.578329,0.473025,0.228810,24.615190,76.311392,51.644304,61.330380,6.822785,11.759494,10.741772,123.321519,111.684810,97.025316
16527,16527,232404121204,2024-04-12,2324,BOS,1,-2,6.0,43.700000,90.037500,0.486200,16.512500,42.612500,0.386775,16.387500,20.250000,0.798800,10.625000,35.600000,46.225000,26.800000,6.687500,6.525000,11.950000,16.162500,120.300000,0.609175,0.578075,0.473212,0.228275,24.585000,76.113750,51.510000,61.345000,6.830000,11.850000,10.750000,123.211250,111.838750,97.002500


In [284]:
team_all_season[(team_all_season['team']=='HOU')]

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
2,2,171810170002,2017-10-17,1718,HOU,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
24,24,171810180013,2017-10-18,1718,HOU,1,1,1.0,47.000000,97.000000,0.485000,15.000000,41.000000,0.366000,13.000000,19.000000,0.684000,10.000000,33.000000,43.000000,28.000000,9.000000,5.000000,13.000000,16.000000,122.000000,0.579000,0.562000,0.423000,0.196000,22.200000,84.600000,51.200000,59.600000,8.800000,10.000000,11.000000,119.000000,118.000000,102.500000
59,59,171810210030,2017-10-21,1718,HOU,1,2,2.0,40.000000,90.000000,0.441500,13.500000,43.000000,0.316500,20.000000,24.000000,0.807500,11.000000,32.500000,43.500000,23.500000,8.000000,4.000000,14.500000,15.000000,113.500000,0.563500,0.516000,0.482500,0.272500,24.450000,81.300000,51.200000,58.600000,8.000000,7.300000,12.650000,113.200000,110.150000,100.150000
85,85,171810230043,2017-10-23,1718,HOU,0,3,3.0,39.666667,87.333333,0.453000,12.666667,44.333333,0.289000,19.333333,23.333333,0.811000,10.000000,37.000000,47.000000,21.333333,9.333333,4.000000,14.666667,18.000000,111.333333,0.570333,0.525000,0.512667,0.271000,22.800000,82.600000,53.066667,53.600000,9.433333,7.366667,13.133333,112.066667,104.566667,99.233333
112,112,171810250057,2017-10-25,1718,HOU,1,-1,3.0,37.750000,84.750000,0.443750,11.750000,42.750000,0.276000,18.750000,23.250000,0.793000,9.750000,35.250000,45.000000,22.250000,8.250000,3.500000,15.250000,21.250000,106.000000,0.557000,0.512250,0.508000,0.278000,22.225000,84.675000,52.450000,59.725000,8.425000,6.550000,13.925000,108.450000,105.000000,97.475000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16442,16442,232404071162,2024-04-07,2324,HOU,0,-4,6.0,41.558442,90.779221,0.458117,12.571429,35.896104,0.350740,18.181818,23.467532,0.772416,11.519481,34.090909,45.610390,24.636364,7.818182,4.454545,12.792208,20.727273,113.870130,0.564130,0.527870,0.396364,0.261260,25.063636,76.225974,50.436364,59.425974,7.845455,8.464935,11.197403,114.311688,113.368831,98.866234
16481,16481,232404091181,2024-04-09,2324,HOU,1,-5,5.0,41.615385,90.807692,0.458590,12.628205,35.961538,0.351564,18.294872,23.615385,0.772397,11.423077,34.012821,45.435897,24.717949,7.858974,4.474359,12.769231,20.897436,114.153846,0.564936,0.528615,0.396936,0.262731,24.856410,76.305128,50.308974,59.528205,7.869231,8.523077,11.171795,114.388462,113.582051,98.911538
16516,16516,232404111199,2024-04-11,2324,HOU,0,1,5.0,41.658228,90.746835,0.459405,12.658228,35.949367,0.352544,18.227848,23.544304,0.771759,11.354430,33.974684,45.329114,24.772152,7.860759,4.506329,12.759494,20.898734,114.202532,0.565734,0.529646,0.397063,0.262051,24.741772,76.296203,50.264557,59.589873,7.873418,8.558228,11.173418,114.463291,113.511392,98.901266
16546,16546,232404121214,2024-04-12,2324,HOU,1,-1,4.0,41.637500,90.800000,0.458925,12.762500,36.087500,0.353725,18.250000,23.525000,0.773475,11.412500,33.837500,45.250000,24.737500,7.862500,4.525000,12.725000,20.862500,114.287500,0.565887,0.529675,0.398288,0.261675,24.832500,76.098750,50.190000,59.532500,7.880000,8.607500,11.142500,114.620000,113.718750,98.856250


Teraz możemy stworzyć także ramki dla okien czasowych 20, 30 i 40 meczowych

In [285]:
team_last_20 = calculate_rolling_stats(team_stats, 20, cols_to_transform, cols_move)
team_last_20 = impute_first_rows(team_all_season, team_stats, cols_to_transform)
team_last_20

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
0,0,171810170001,2017-10-17,1718,BOS,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
1,1,171810170001,2017-10-17,1718,CLE,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
2,2,171810170002,2017-10-17,1718,HOU,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
3,3,171810170002,2017-10-17,1718,GSW,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
4,4,171810180003,2017-10-18,1718,CHO,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16577,16577,232404141229,2024-04-14,2324,OKC,1,4,6.0,44.432099,89.222222,0.499321,13.283951,34.209877,0.386901,17.753086,21.518519,0.829889,8.777778,33.061728,41.839506,26.975309,8.493827,6.567901,12.641975,18.864198,119.901235,0.608938,0.573914,0.382840,0.244926,20.800000,73.777778,48.287654,60.695062,8.430864,12.895062,11.353086,119.403704,112.658025,99.790123
16578,16578,232404141230,2024-04-14,2324,POR,0,-4,2.0,39.456790,89.604938,0.441198,11.506173,33.061728,0.348000,16.246914,20.530864,0.793605,12.506173,30.098765,42.604938,23.160494,7.604938,4.345679,15.197531,20.185185,106.666667,0.541370,0.505333,0.368395,0.230914,27.170370,74.114815,49.298765,58.658025,7.737037,8.101235,13.379012,108.688889,117.495062,97.197531
16579,16579,232404141230,2024-04-14,2324,SAC,1,-3,3.0,43.333333,90.962963,0.476506,14.382716,39.320988,0.365395,15.456790,20.777778,0.739383,10.802469,33.098765,43.901235,28.333333,7.580247,4.148148,13.135802,19.913580,116.506173,0.582370,0.555642,0.433617,0.230420,24.288889,78.893827,50.806173,65.586420,7.595062,7.811111,11.559259,116.820988,115.627160,98.802469
16580,16580,232404141231,2024-04-14,2324,DET,0,1,2.0,40.950617,88.197531,0.464877,11.086420,31.716049,0.350099,17.074074,21.728395,0.785432,10.493827,32.876543,43.370370,25.567901,6.469136,4.703704,15.234568,20.728395,110.061728,0.563778,0.528012,0.360988,0.249432,23.828395,77.483951,50.139506,62.533333,6.456790,8.334568,13.464198,109.935802,118.729630,99.758025


In [286]:
team_last_30 = calculate_rolling_stats(team_stats, 30, cols_to_transform, cols_move)
team_last_30 = impute_first_rows(team_all_season, team_stats, cols_to_transform)
team_last_30

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
0,0,171810170001,2017-10-17,1718,BOS,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
1,1,171810170001,2017-10-17,1718,CLE,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
2,2,171810170002,2017-10-17,1718,HOU,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
3,3,171810170002,2017-10-17,1718,GSW,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
4,4,171810180003,2017-10-18,1718,CHO,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16577,16577,232404141229,2024-04-14,2324,OKC,1,4,6.0,44.432099,89.222222,0.499321,13.283951,34.209877,0.386901,17.753086,21.518519,0.829889,8.777778,33.061728,41.839506,26.975309,8.493827,6.567901,12.641975,18.864198,119.901235,0.608938,0.573914,0.382840,0.244926,20.800000,73.777778,48.287654,60.695062,8.430864,12.895062,11.353086,119.403704,112.658025,99.790123
16578,16578,232404141230,2024-04-14,2324,POR,0,-4,2.0,39.456790,89.604938,0.441198,11.506173,33.061728,0.348000,16.246914,20.530864,0.793605,12.506173,30.098765,42.604938,23.160494,7.604938,4.345679,15.197531,20.185185,106.666667,0.541370,0.505333,0.368395,0.230914,27.170370,74.114815,49.298765,58.658025,7.737037,8.101235,13.379012,108.688889,117.495062,97.197531
16579,16579,232404141230,2024-04-14,2324,SAC,1,-3,3.0,43.333333,90.962963,0.476506,14.382716,39.320988,0.365395,15.456790,20.777778,0.739383,10.802469,33.098765,43.901235,28.333333,7.580247,4.148148,13.135802,19.913580,116.506173,0.582370,0.555642,0.433617,0.230420,24.288889,78.893827,50.806173,65.586420,7.595062,7.811111,11.559259,116.820988,115.627160,98.802469
16580,16580,232404141231,2024-04-14,2324,DET,0,1,2.0,40.950617,88.197531,0.464877,11.086420,31.716049,0.350099,17.074074,21.728395,0.785432,10.493827,32.876543,43.370370,25.567901,6.469136,4.703704,15.234568,20.728395,110.061728,0.563778,0.528012,0.360988,0.249432,23.828395,77.483951,50.139506,62.533333,6.456790,8.334568,13.464198,109.935802,118.729630,99.758025


In [287]:
team_last_40 = calculate_rolling_stats(team_stats, 40, cols_to_transform, cols_move)
team_last_40 = impute_first_rows(team_all_season, team_stats, cols_to_transform)
team_last_40

Unnamed: 0,id,game_id,Date,Season,team,win,streak,last10,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Pace
0,0,171810170001,2017-10-17,1718,BOS,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
1,1,171810170001,2017-10-17,1718,CLE,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
2,2,171810170002,2017-10-17,1718,HOU,1,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
3,3,171810170002,2017-10-17,1718,GSW,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
4,4,171810180003,2017-10-18,1718,CHO,0,0,0.0,39.607724,86.060569,0.461134,10.490650,29.000000,0.361003,16.627236,21.676829,0.768602,9.711789,33.805285,43.517073,23.236992,7.716667,4.815447,14.262602,19.852439,106.333333,0.557316,0.522378,0.337896,0.254894,22.140163,77.860894,50.000447,58.535935,7.873902,8.448008,12.982520,108.668699,108.668699,97.318537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16577,16577,232404141229,2024-04-14,2324,OKC,1,4,6.0,44.432099,89.222222,0.499321,13.283951,34.209877,0.386901,17.753086,21.518519,0.829889,8.777778,33.061728,41.839506,26.975309,8.493827,6.567901,12.641975,18.864198,119.901235,0.608938,0.573914,0.382840,0.244926,20.800000,73.777778,48.287654,60.695062,8.430864,12.895062,11.353086,119.403704,112.658025,99.790123
16578,16578,232404141230,2024-04-14,2324,POR,0,-4,2.0,39.456790,89.604938,0.441198,11.506173,33.061728,0.348000,16.246914,20.530864,0.793605,12.506173,30.098765,42.604938,23.160494,7.604938,4.345679,15.197531,20.185185,106.666667,0.541370,0.505333,0.368395,0.230914,27.170370,74.114815,49.298765,58.658025,7.737037,8.101235,13.379012,108.688889,117.495062,97.197531
16579,16579,232404141230,2024-04-14,2324,SAC,1,-3,3.0,43.333333,90.962963,0.476506,14.382716,39.320988,0.365395,15.456790,20.777778,0.739383,10.802469,33.098765,43.901235,28.333333,7.580247,4.148148,13.135802,19.913580,116.506173,0.582370,0.555642,0.433617,0.230420,24.288889,78.893827,50.806173,65.586420,7.595062,7.811111,11.559259,116.820988,115.627160,98.802469
16580,16580,232404141231,2024-04-14,2324,DET,0,1,2.0,40.950617,88.197531,0.464877,11.086420,31.716049,0.350099,17.074074,21.728395,0.785432,10.493827,32.876543,43.370370,25.567901,6.469136,4.703704,15.234568,20.728395,110.061728,0.563778,0.528012,0.360988,0.249432,23.828395,77.483951,50.139506,62.533333,6.456790,8.334568,13.464198,109.935802,118.729630,99.758025


In [297]:
any(team_last_40.isna().sum())

False

In [289]:
team_last_30.isna().sum()

id         0
game_id    0
Date       0
Season     0
team       0
win        0
streak     0
last10     0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
TS%        0
eFG%       0
3PAr       0
FTr        0
ORB%       0
DRB%       0
TRB%       0
AST%       0
STL%       0
BLK%       0
TOV%       0
ORtg       0
DRtg       0
Pace       0
dtype: int64

In [290]:
team_last_20.isna().sum()

id         0
game_id    0
Date       0
Season     0
team       0
win        0
streak     0
last10     0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
TS%        0
eFG%       0
3PAr       0
FTr        0
ORB%       0
DRB%       0
TRB%       0
AST%       0
STL%       0
BLK%       0
TOV%       0
ORtg       0
DRtg       0
Pace       0
dtype: int64

In [293]:
team_all_season.isna().sum()

id         0
game_id    0
Date       0
Season     0
team       0
win        0
streak     0
last10     0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
TS%        0
eFG%       0
3PAr       0
FTr        0
ORB%       0
DRB%       0
TRB%       0
AST%       0
STL%       0
BLK%       0
TOV%       0
ORtg       0
DRtg       0
Pace       0
dtype: int64

In [292]:
with sqlite3.connect('data/transformed/team_moving_avgs.sqlite') as con:
    team_last_20.to_sql('team_last_20', con, if_exists='replace', index=False)
    team_last_30.to_sql('team_last_30', con, if_exists='replace', index=False)
    team_last_40.to_sql('team_last_40', con, if_exists='replace', index=False)
    team_all_season.to_sql('team_all_season', con, if_exists='replace', index=False)