In [1]:
import numpy as np
import pandas as pd

In [13]:
# Load the match dataset; the path is relative to the notebook's working directory.
matches = pd.read_csv("matches_2020_2024.csv")

In [14]:
# Clean the raw frame in one idempotent chain so the cell can be re-run safely.
# NOTE(review): fillna(0) replaces missing values in *every* column with 0,
# including any text columns - confirm that is intended for all of them.
matches = (
    matches
    .fillna(0)
    # errors="ignore": the previous `del matches["comp"]` raised KeyError when
    # this cell was executed a second time, because the column was already gone.
    .drop(columns=["comp", "notes"], errors="ignore")
    .assign(
        # Parse the date strings into datetime64 values (kept in the same column).
        date=lambda frame: pd.to_datetime(frame["date"]),
        # Binary label: 1 when the result is "W", 0 for any other result.
        target=lambda frame: (frame["result"] == "W").astype("int"),
    )
)

In [20]:
# Derive the numeric predictor columns from the descriptive match columns.
matches = matches.assign(
    # Integer category codes for the venue and the opponent.
    venue_code=matches["venue"].astype("category").cat.codes,
    opp_code=matches["opponent"].astype("category").cat.codes,
    # Kick-off hour: strip everything from the first ":" onwards, then cast to int.
    hour=matches["time"].str.replace(":.+", "", regex=True).astype("int"),
    # Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
    day_code=matches["date"].dt.dayofweek,
)

In [22]:
# Compute per-team rolling-average statistics
def calculate_rolling_averages(df, team_col, date_col, cols, window=3):
    """
    Add per-team rolling means of previous rows for the given columns.

    For every column ``c`` in ``cols`` a new column ``f"{c}_rolling"`` is
    added. Within each team (rows sharing a ``team_col`` value) the rows are
    ordered by ``date_col`` and each row receives the mean of the ``window``
    rows that precede it; ``closed='left'`` keeps the row's own value out of
    its average. Rows with fewer than ``window`` earlier rows for their team
    get NaN, as do rows whose ``team_col`` value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified.
    team_col : str
        Column identifying the team each row belongs to.
    date_col : str
        Column used to order each team's rows chronologically.
    cols : iterable of str
        Columns to average.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in its original row order and with its original index,
        plus one ``*_rolling`` column per entry of ``cols``.
    """
    cols = list(cols)
    new_cols = [f"{c}_rolling" for c in cols]

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    # Use a temporary positional index while writing results back: label-based
    # assignment is ambiguous when the caller's index contains duplicates
    # (e.g. a frame assembled with pd.concat). The original index is restored
    # before returning.
    original_index = result.index
    result.index = range(len(result))

    # Start from NaN rather than 0.0 so that rows which never receive a value
    # are visibly missing instead of looking like a genuine average of zero.
    for new_col in new_cols:
        result[new_col] = float("nan")

    # A stable sort keeps the input order of rows that share the same date.
    ordered = result.sort_values(date_col, kind="mergesort")

    # groupby splits the frame once (instead of one boolean mask per team) and
    # keeps the chronological order inside every group; rows with a missing
    # team key are skipped.
    for _, team_rows in ordered.groupby(team_col, sort=False):
        for old_col, new_col in zip(cols, new_cols):
            # The assigned Series aligns on the (now unique) positional labels.
            result.loc[team_rows.index, new_col] = (
                team_rows[old_col].rolling(window, closed='left').mean()
            )

    result.index = original_index
    return result

# Match statistics that get a rolling-average feature
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

# Names of the derived columns, one per statistic
rolling_cols = [f"{c}_rolling" for c in cols]

# Build the rolling features, drop every row that has a NaN in any of the
# rolling columns, then renumber the remaining rows from 0
matches_rolling = (
    calculate_rolling_averages(
        df=matches,
        team_col="team",
        date_col="date",
        cols=cols,
        window=3,
    )
    .dropna(subset=rolling_cols)
    .reset_index(drop=True)
)

In [23]:
# Preview the frame with the rolling features (the display below is truncated by pandas).
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,2.6,...,12,5,3.000000,1.666667,25.000000,10.666667,15.733333,0.000000,0.0,0.0
1,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,2.5,...,16,6,2.333333,1.000000,22.666667,8.333333,15.566667,0.000000,0.0,0.0
2,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,1.8,...,16,6,2.000000,0.000000,24.333333,8.666667,14.400000,0.333333,0.0,0.0
3,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,0.9,...,15,5,2.333333,0.000000,16.000000,5.333333,13.933333,0.333333,0.0,0.0
4,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,2.0,...,15,5,1.666667,0.333333,14.666667,5.333333,13.966667,0.333333,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2417,2022-04-30,15:00,Matchweek 35,Sat,Away,L,0.0,2.0,Aston Villa,0.5,...,15,5,1.333333,2.000000,12.333333,4.000000,18.166667,0.666667,0.0,0.0
2418,2022-05-08,14:00,Matchweek 36,Sun,Home,L,0.0,4.0,West Ham,0.8,...,14,6,0.666667,2.666667,9.666667,3.000000,18.900000,0.666667,0.0,0.0
2419,2022-05-11,19:45,Matchweek 21,Wed,Away,L,0.0,3.0,Leicester City,1.1,...,19,2,0.000000,3.000000,7.333333,2.333333,19.666667,0.666667,0.0,0.0
2420,2022-05-15,14:00,Matchweek 37,Sun,Away,D,1.0,1.0,Wolves,1.1,...,14,6,0.000000,3.000000,8.666667,3.333333,20.266667,0.333333,0.0,0.0


In [24]:
# Write the preprocessed frame to CSV in the notebook's working directory.
# NOTE(review): index=False is not passed, so the 0..n-1 row index is written as
# an extra unnamed leading column - confirm downstream readers expect that.
matches_rolling.to_csv("matches_after_preprocessing.csv")