In [None]:
# 1. Import thư viện
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown as md
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [None]:
# 2. Load data
# Read the two raw CSV files used by the rest of the notebook: the match table
# (scores, team ids, dates) and the per-team attribute table.
# Paths are relative, so they resolve against the kernel's working directory.
match = pd.read_csv("../data/raw/match.csv")
team_attr = pd.read_csv("../data/raw/team_attributes.csv")


In [None]:
# 3. Basic feature engineering

# Continue with an independent copy holding only the id, team, date and score columns.
base_columns = [
    "id", "home_team_api_id", "away_team_api_id", "date",
    "home_team_goal", "away_team_goal",
]
match = match.loc[:, base_columns].copy()

# Outcome variable: the sign of the goal difference picks the label;
# anything that is neither positive nor negative falls through to "Draw".
match["goal_diff"] = match["home_team_goal"] - match["away_team_goal"]
match["match_result"] = np.select(
    [match["goal_diff"] > 0, match["goal_diff"] < 0],
    ["Home Win", "Away Win"],
    default="Draw",
)

# Calendar features taken from the parsed match date.
match["match_date"] = pd.to_datetime(match["date"])
parsed_date = match["match_date"].dt
match["match_month"] = parsed_date.month
match["match_year"] = parsed_date.year


In [None]:
# 4. Merge team tactical attributes
#
# Keep one attribute record per team: rows are ordered by "date" and the last
# row for each team_api_id wins.
# NOTE(review): assumes the "date" values sort chronologically as stored
# (e.g. ISO-formatted strings) — TODO confirm.
# NOTE(review): this single latest snapshot is attached to every match of a
# team, including matches dated before the snapshot. Confirm that this
# look-ahead is acceptable for the analysis.
latest_attr = team_attr.sort_values("date").drop_duplicates("team_api_id", keep="last")

home_cols = ["home_speed", "home_shooting", "home_pressure"]
away_cols = ["away_speed", "away_shooting", "away_pressure"]

home_attr = latest_attr.rename(columns={
    "team_api_id": "home_team_api_id",
    "buildUpPlaySpeed": "home_speed",
    "chanceCreationShooting": "home_shooting",
    "defencePressure": "home_pressure"
})[["home_team_api_id", *home_cols]]

away_attr = latest_attr.rename(columns={
    "team_api_id": "away_team_api_id",
    "buildUpPlaySpeed": "away_speed",
    "chanceCreationShooting": "away_shooting",
    "defencePressure": "away_pressure"
})[["away_team_api_id", *away_cols]]

# Make the cell safe to re-run: if the attribute columns are already present
# from an earlier execution, a second merge would create "_x"/"_y" suffixed
# duplicates and the column look-ups below would raise KeyError.
match = match.drop(columns=[*home_cols, *away_cols], errors="ignore")

# latest_attr holds one row per team, so both joins must be many-to-one.
# validate makes pandas raise MergeError instead of silently duplicating
# match rows if that ever stops being true.
match = match.merge(home_attr, on="home_team_api_id", how="left", validate="many_to_one")
match = match.merge(away_attr, on="away_team_api_id", how="left", validate="many_to_one")

# Difference features (home minus away) for each tactical attribute.
match["diff_speed"] = match["home_speed"] - match["away_speed"]
match["diff_shooting"] = match["home_shooting"] - match["away_shooting"]
match["diff_pressure"] = match["home_pressure"] - match["away_pressure"]


In [None]:
# 5. Prepare training data
#
# NOTE(review): match_result is computed from home_team_goal - away_team_goal,
# so using both goal columns as predictors leaks the target into the model and
# will dominate any importance ranking. They are kept here because later cells
# read them from `features`; exclude them before drawing conclusions about the
# remaining features.
features = match[[
    "home_team_goal", "away_team_goal", "match_month",
    "diff_speed", "diff_shooting", "diff_pressure"
]]
labels = match["match_result"]

# Drop rows with any missing feature and keep the labels aligned by index.
features = features.dropna()
labels = labels.loc[features.index]

# Encode the string classes as integers for the classifier.
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Stratify so the train and test sets keep the same class proportions;
# an unstratified split can skew the share of each result between the two.
X_train, X_test, y_train, y_test = train_test_split(
    features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [None]:
# 6. Quick EDA
# Class balance of the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=labels, ax=ax)
ax.set_title("Distribution of Match Results")
plt.show()

# Pairwise view of every feature, coloured by match result.
eda_frame = features.assign(Result=labels).dropna()
sns.pairplot(eda_frame, hue="Result")


In [None]:
# Distribution of match results
plt.figure(figsize=(6, 4))
ax = sns.countplot(x=labels)
ax.set_title("Distribution of Match Results")
ax.set_xlabel("Match Result")
ax.set_ylabel("Count")
plt.show()


In [None]:
# GROUP 1: goal and time related variables
group1_vars = ["home_team_goal", "away_team_goal", "match_month"]
group1_frame = features.assign(Result=labels)
sns.pairplot(data=group1_frame, vars=group1_vars, hue="Result", height=2.5)
# suptitle targets the current figure, i.e. the one pairplot just created;
# y=1.02 lifts the title above the top row of panels.
plt.suptitle("Group 1: Goals & Month", y=1.02)
plt.show()


In [None]:
# GROUP 2: tactical differences (home minus away)
group2_vars = ["diff_speed", "diff_shooting", "diff_pressure"]
group2_frame = features.assign(Result=labels)
sns.pairplot(data=group2_frame, vars=group2_vars, hue="Result", height=2.5)
# suptitle targets the current figure, i.e. the one pairplot just created;
# y=1.02 lifts the title above the top row of panels.
plt.suptitle("Group 2: Tactical Differences", y=1.02)
plt.show()


In [None]:
# Box plot of each feature grouped by match result
# The plotting frame is the same for every feature, so build it once.
boxplot_frame = features.assign(Result=labels)
for feature_name in features.columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=boxplot_frame, x="Result", y=feature_name)
    plt.title(f"{feature_name} vs Match Result")
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation heatmap
corr_matrix = features.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Between Features")
fig.tight_layout()
plt.show()


In [None]:
# 7. Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Pair the forest's built-in importances with the training column names and
# rank them from most to least important.
importance = model.feature_importances_
feat_df = (
    pd.DataFrame({"feature": X_train.columns, "importance": importance})
    .sort_values(by="importance", ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=feat_df, x="importance", y="feature", ax=ax)
ax.set_title("Feature Importance (Gốc)")
plt.show()


In [None]:
# 8. Permutation importance
# Measured on the held-out split, with 10 shuffles per feature.
perm_result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

perm_columns = {
    "feature": X_test.columns,
    "importance_mean": perm_result.importances_mean,
    "importance_std": perm_result.importances_std,
}
perm_df = pd.DataFrame(perm_columns).sort_values(by="importance_mean", ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(perm_df["feature"], perm_df["importance_mean"], xerr=perm_df["importance_std"])
ax.set_title("Permutation Importance")
ax.set_xlabel("Mean decrease in accuracy")
# barh draws the first row at the bottom; flip so the top-ranked feature is on top.
ax.invert_yaxis()
plt.show()


In [None]:
# 9. Conclusion
# Render a short Markdown summary of the ranking held in feat_df (sorted from
# most to least important). Both the strongest and the weakest features are
# read from feat_df, so the text always agrees with the ranking it follows
# instead of naming fixed features. `md` is the IPython Markdown display class
# imported with the other dependencies at the top of the notebook.
top_features = feat_df["feature"].head(3).tolist()
weak_features = feat_df["feature"].tail(2).tolist()
md(f"""
**Top 3 feature quan trọng nhất:**

1. `{top_features[0]}`
2. `{top_features[1]}`
3. `{top_features[2]}`

Feature như `{weak_features[0]}` hoặc `{weak_features[1]}` có thể ít đóng góp cho mô hình, cần xem xét loại bỏ hoặc biến đổi lại.
""")
