In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
dkhalidashik_2024_2025_barcelona_scores_and_fixtures_path = kagglehub.dataset_download('dkhalidashik/2024-2025-barcelona-scores-and-fixtures')
dkhalidashik_barcelona_performane_24_25_data_path = kagglehub.dataset_download('dkhalidashik/barcelona-performane-24-25-data')

print('Data source import complete.')


# Introduction

### ⚽ Barcelona 2024–2025 Performance Prediction Under Hansi Flick

![Alt text](https://image.sggp.org.vn/1200x630/Uploaded/2025/cplaaht/2024_09_28/hansi-flick-barcelona-929.jpg.webp)

### 📌 Project Overview

This project aims to analyze and predict FC Barcelona’s match outcomes for the 2024–2025 season under new head coach Hansi Flick. Leveraging historical and upcoming match data, the notebook builds a machine learning pipeline that forecasts results based on tactical, statistical, and contextual features.

The core objectives of this analysis are:
	•	🔍 Analyze match-related trends such as possession, formation impact, and captaincy influence.
	•	🧠 Train a classification model to predict whether Barcelona will win, lose, or draw upcoming fixtures.
	•	📈 Visualize prediction confidence for each upcoming match using probabilistic outputs.
	•	🧪 Evaluate model performance through accuracy, classification reports, and confusion matrices.

### 🎯 Key Questions
	•	Which formations or captains are associated with higher possession?
	•	Can we accurately predict future match results based on recent performance trends?
	•	How confident is the model about its predictions for upcoming fixtures?

This predictive modeling approach combines traditional football analytics with data science techniques to offer insights into Barcelona’s strategy and performance trajectory under new leadership.

# 📦 Importing Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 📂 Loading and Preprocessing the Original Dataset

In [None]:
# Load the dataset
df = pd.read_csv('/kaggle/input/barcelona-performane-24-25-data/Barcelona stats24-25.csv', delimiter=';')

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.columns

In [None]:
df.info()

# Data cleaning and preprocessing

In [None]:
# 1. Fix column names
df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('Oponent', 'Opponent')

# 2. Convert types
df['Date'] = pd.to_datetime(df['Date'])
df['Attendance'] = df['Attendance'].str.replace(',', '').astype(float)

# 3. Create features
df['GD'] = df['GF'] - df['GA']  # Goal Difference
# Numerical result
result_map = {'W': 3, 'D': 1, 'L': 0}
df['Points'] = df['Result'].map(result_map)

# 4. Split played vs upcoming
played_df = df[df['Result'].notna()].copy()
upcoming_df = df[df['Result'].isna()].copy()

# 5. Sort by date
played_df = played_df.sort_values('Date').reset_index(drop=True)
upcoming_df = upcoming_df.sort_values('Date').reset_index(drop=True)

In [None]:
# Check cleaned data
print(played_df.head(3))
print(upcoming_df.head(3))

# Analysis and Visualization

In [None]:
# Basic numbers
total_matches = played_df.shape[0]
total_wins = (played_df['Result'] == 'W').sum()
total_draws = (played_df['Result'] == 'D').sum()
total_losses = (played_df['Result'] == 'L').sum()

# Goals
total_goals_for = played_df['GF'].sum()
total_goals_against = played_df['GA'].sum()
total_goal_difference = total_goals_for - total_goals_against

# Points
total_points = played_df['Points'].sum()
points_per_match = total_points / total_matches

# Display
print(f"Total Matches Played: {total_matches}")
print(f"Wins: {total_wins}, Draws: {total_draws}, Losses: {total_losses}")
print(f"Goals Scored (GF): {total_goals_for}")
print(f"Goals Conceded (GA): {total_goals_against}")
print(f"Goal Difference (GD): {total_goal_difference}")
print(f"Total Points: {total_points}")
print(f"Points per Match: {points_per_match:.2f}")

In [None]:
# Sum of expected goals
total_xg = played_df['xG'].sum()
total_xga = played_df['xGA'].sum()

# Compare actual vs expected
overperformance_attack = total_goals_for - total_xg
overperformance_defense = total_xga - total_goals_against

# Display
print(f"Total xG (Expected Goals For): {total_xg:.2f}")
print(f"Total xGA (Expected Goals Against): {total_xga:.2f}")
print(f"Difference (Goals scored - xG): {overperformance_attack:.2f}")
print(f"Difference (xGA - Goals conceded): {overperformance_defense:.2f}")

In [None]:
# Home matches
home_df = played_df[played_df['venue'] == 'Home']
# Away matches
away_df = played_df[played_df['venue'] == 'Away']

# Calculate stats
def venue_stats(df):
    wins = (df['Result'] == 'W').sum()
    draws = (df['Result'] == 'D').sum()
    losses = (df['Result'] == 'L').sum()
    points = (wins * 3) + (draws)
    goals_for = df['GF'].sum()
    goals_against = df['GA'].sum()
    matches = df.shape[0]
    ppm = points / matches if matches else 0
    return wins, draws, losses, points, goals_for, goals_against, ppm

home_stats = venue_stats(home_df)
away_stats = venue_stats(away_df)

# Print
print("🏟️ Home Stats:")
print(f"Wins: {home_stats[0]}, Draws: {home_stats[1]}, Losses: {home_stats[2]}")
print(f"Goals For: {home_stats[4]}, Goals Against: {home_stats[5]}")
print(f"Points: {home_stats[3]}, Points per Match: {home_stats[6]:.2f}")

print("\n🛫 Away Stats:")
print(f"Wins: {away_stats[0]}, Draws: {away_stats[1]}, Losses: {away_stats[2]}")
print(f"Goals For: {away_stats[4]}, Goals Against: {away_stats[5]}")
print(f"Points: {away_stats[3]}, Points per Match: {away_stats[6]:.2f}")

In [None]:
import matplotlib.pyplot as plt

# Rolling Points per Match
df['Rolling_Points'] = df['Points'].rolling(window=5, min_periods=1).mean()
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Rolling_Points'], marker='o', label='Rolling Points per Match (5 Games)')
plt.axhline(2, color='red', linestyle='--', label='2 Points Threshold')
plt.title('Rolling Points per Match (Last 5 Games)')
plt.xlabel('Date')
plt.ylabel('Points per Match')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('rolling_points_per_match.png', dpi=300, bbox_inches='tight')  # <<< Save here
plt.show()

# Rolling GF vs GA
df['Rolling_GF'] = df['GF'].rolling(window=5, min_periods=1).mean()
df['Rolling_GA'] = df['GA'].rolling(window=5, min_periods=1).mean()
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Rolling_GF'], label='Rolling GF (Goals For)', color='green')
plt.plot(df['Date'], df['Rolling_GA'], label='Rolling GA (Goals Against)', color='red')
plt.title('Rolling Goals For and Against (Last 5 Games)')
plt.xlabel('Date')
plt.ylabel('Goals')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('rolling_goals_for_against.png', dpi=300, bbox_inches='tight')  # <<< Save here
plt.show()

# Rolling xG vs xGA
df['Rolling_xG'] = df['xG'].rolling(window=5, min_periods=1).mean()
df['Rolling_xGA'] = df['xGA'].rolling(window=5, min_periods=1).mean()
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Rolling_xG'], label='Rolling xG (Expected Goals For)', color='purple')
plt.plot(df['Date'], df['Rolling_xGA'], label='Rolling xGA (Expected Goals Against)', color='orange')
plt.title('Rolling xG and xGA (Last 5 Games)')
plt.xlabel('Date')
plt.ylabel('Expected Goals')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('rolling_xg_xga.png', dpi=300, bbox_inches='tight')  # <<< Save here
plt.show()

# Machine Learning Model

In [21]:
# Create lag features
df['Lag_GF'] = df['GF'].shift(1)
df['Lag_GA'] = df['GA'].shift(1)
df['Lag_xG'] = df['xG'].shift(1)
df['Lag_xGA'] = df['xGA'].shift(1)
df['Lag_Points'] = df['Points'].shift(1)
df['Lag_Result'] = df['Result'].shift(1)

# For moving averages over last 3 matches
df['MA3_GF'] = df['GF'].rolling(window=3).mean()
df['MA3_GA'] = df['GA'].rolling(window=3).mean()
df['MA3_xG'] = df['xG'].rolling(window=3).mean()
df['MA3_xGA'] = df['xGA'].rolling(window=3).mean()
df['MA3_Points'] = df['Points'].rolling(window=3).mean()

# View the new dataframe
df.tail(10)

NameError: name 'df' is not defined

In [None]:
# Rolling averages for last 5 matches
df['MA5_GF'] = df['GF'].rolling(window=5).mean()
df['MA5_GA'] = df['GA'].rolling(window=5).mean()
df['MA5_xG'] = df['xG'].rolling(window=5).mean()
df['MA5_xGA'] = df['xGA'].rolling(window=5).mean()
df['MA5_Points'] = df['Points'].rolling(window=5).mean()

# Goal difference in last match
df['Lag_GD'] = df['Lag_GF'] - df['Lag_GA']

# Home match encoding (1 = Home, 0 = Away)
df['Is_Home'] = df['venue'].apply(lambda x: 1 if x == 'Home' else 0)

def big_match_detector(row):
    if 'Champions' in row['Comp']:
        return 1
    if 'Copa del Rey' in row['Comp'] and 'Final' in row['Round']:
        return 1
    if 'Supercopa' in row['Comp'] and 'Final' in row['Round']:
        return 1
    return 0

# Apply the new function
df['Is_Big_Match'] = df.apply(big_match_detector, axis=1)

# Let's check the new columns
df.tail(10)

In [None]:
# Drop columns except Captain and Formation
drop_cols = ['Time', 'Day4', 'venue', 'GF', 'GA', 'Referee', 'Opp_Formation', 'Attendance', 'Points', 'Round']

df_cleaned = df.drop(columns=drop_cols)

# Show sample
print(df_cleaned.head())

In [None]:
# Step 1: Drop matches with missing moving average features
df_model = df_cleaned.dropna()

# Step 2: Map 'Result' to numeric labels
result_mapping = {'W': 2, 'D': 1, 'L': 0}
df_model['Result_Label'] = df_model['Result'].map(result_mapping)

# Quick check
print(df_model[['Date', 'Result', 'Result_Label']].head())

In [None]:
# Step 1: Decide split index
split_index = int(len(df_model) * 0.85)

# Step 2: Create train and test sets
train_data = df_model.iloc[:split_index]
test_data = df_model.iloc[split_index:]

# Step 3: Select feature columns
feature_cols = [
    'xG', 'xGA', 'Poss', 'GD',
    'MA3_GF', 'MA3_GA', 'MA3_xG', 'MA3_xGA', 'MA3_Points',
    'MA5_GF', 'MA5_GA', 'MA5_xG', 'MA5_xGA', 'MA5_Points',
    'Lag_GD', 'Is_Home', 'Is_Big_Match'
]

# Inputs (X) and targets (y)
X_train = train_data[feature_cols]
y_train = train_data['Result_Label']

X_test = test_data[feature_cols]
y_test = test_data['Result_Label']

# Check shapes
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 1: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Train the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
model.fit(X_train_scaled, y_train)

# Step 3: Predict
y_pred = model.predict(X_test_scaled)

# Step 4: Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detect the unique classes present
unique_classes = np.unique(np.concatenate((y_test, y_pred)))

# Mapping label numbers to names
label_mapping = {0: "Loss", 1: "Draw", 2: "Win"}
target_names = [label_mapping[i] for i in unique_classes]

# Print classification report and confusion matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_names))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

The model was evaluated on the test set and produced the following metrics:
	•	Accuracy: 0.75
This means the model correctly predicted 75% of the outcomes in the test set.
	•	Classification Report:
	•	Win Class:
	•	Precision: 1.00 → When the model predicted a “Win”, it was always correct.
	•	Recall: 0.75 → It correctly identified 75% of the actual “Win” matches.
	•	F1-score: 0.86 → A good balance between precision and recall.
	•	Loss Class:
	•	All metrics are 0.00 because there were no actual “Loss” instances in the test data (support = 0), so this class was not represented.

•	Confusion Matrix:

   •    First row (Actual “Loss”): 0 true positives, 0 false positives (not present in the test set).
	•	Second row (Actual “Win”): 3 true positives, 1 false negative — the model missed 1 “Win” case.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Make sure to define your `features` (i.e., X_train.columns)
# If you previously had X_train as a DataFrame, restore feature names
try:
    feature_names = X_train.columns
except:
    # If X_train is a NumPy array now, define manually or load again
    feature_names = [f'Feature_{i}' for i in range(X_train.shape[1])]

# Step 1: Extract coefficients
coefficients = model.coef_[0]  # Shape: (n_features,) for binary, or (n_classes, n_features) for multinomial

# Step 2: Map to feature names
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Step 3: Sort by absolute coefficient value
importance_df['Abs_Coefficient'] = importance_df['Coefficient'].abs()
importance_df = importance_df.sort_values(by='Abs_Coefficient', ascending=True)

# Step 4: Plot
plt.figure(figsize=(10, 8))
plt.barh(importance_df['Feature'], importance_df['Coefficient'], color='steelblue')
plt.axvline(0, color='black', linestyle='--')
plt.title('Feature Importance (Logistic Regression Coefficients)')
plt.xlabel('Coefficient Value')
plt.tight_layout()
#plt.savefig("feature_importance.png")  # ✅ Save the plot
plt.show()

In [None]:
# Step 0: Prepare X and y
X = df_model.drop(columns=['Result_Label'])

# Drop datetime columns
datetime_cols = X.select_dtypes(include=['datetime64']).columns.tolist()
X = X.drop(columns=datetime_cols)

# One-hot encode all categorical columns
X = pd.get_dummies(X)

# Target
y = df_model['Result_Label']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
model.fit(X_train_scaled, y_train)

# Permutation importance
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=42)
perm.fit(X_test_scaled, y_test)

# Show importance (in Jupyter notebook)
import eli5
eli5.show_weights(perm, feature_names=X.columns.tolist())

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Train a random forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances
importances = rf_model.feature_importances_
feat_imp_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False).head(20)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df, palette='viridis')
plt.title("Top 20 Feature Importances (Random Forest)")
plt.tight_layout()
#plt.savefig("feature_importance_rf.png")
plt.show()

In [None]:
# Create basic features

# Is_Home: 1 for Home, 0 for Away
played_df['Is_Home'] = played_df['venue'].map({'Home': 1, 'Away': 0})
upcoming_df['Is_Home'] = upcoming_df['venue'].map({'Home': 1, 'Away': 0})

# Is_Big_Match: 1 for matches against strong opponents, 0 otherwise
big_teams = ['Real Madrid', 'Atlético Madrid', 'Inter', 'Bayern', 'Monaco']
played_df['Is_Big_Match'] = played_df['Opponent'].isin(big_teams).astype(int)
upcoming_df['Is_Big_Match'] = upcoming_df['Opponent'].isin(big_teams).astype(int)

# Rolling_Points: 5-match moving average of Points for played_df
played_df['Rolling_Points'] = played_df['Points'].rolling(window=5, min_periods=1).mean()

# For upcoming_df, use the last Rolling_Points from played_df
upcoming_df['Rolling_Points'] = played_df['Rolling_Points'].iloc[-1]

# Fill any NaN values (e.g., early rows in Rolling_Points)
played_df.fillna(0, inplace=True)
upcoming_df.fillna(0, inplace=True)

# Verify new columns
print("played_df columns:", played_df.columns.tolist())
print("upcoming_df columns:", upcoming_df.columns.tolist())
print(played_df[['Date', 'Opponent', 'Is_Home', 'Is_Big_Match', 'Rolling_Points']].head())
print(upcoming_df[['Date', 'Opponent', 'Is_Home', 'Is_Big_Match', 'Rolling_Points']].head())

In [None]:
# Add rolling and lagged features

# 3-match rolling averages for played_df
played_df['MA3_GF'] = played_df['GF'].rolling(window=3, min_periods=1).mean()
played_df['MA3_Points'] = played_df['Points'].rolling(window=3, min_periods=1).mean()

# Lagged feature: previous match's GF
played_df['Lag_GF'] = played_df['GF'].shift(1)

# For upcoming_df, use the last values from played_df
last_played = played_df.iloc[-1]
upcoming_df['MA3_GF'] = last_played['MA3_GF']
upcoming_df['MA3_Points'] = last_played['MA3_Points']
upcoming_df['Lag_GF'] = last_played['GF']  # Use last match's GF as Lag_GF

# Fill NaN values (e.g., first row of Lag_GF in played_df)
played_df.fillna(0, inplace=True)
upcoming_df.fillna(0, inplace=True)

# Verify new columns
print("New played_df columns:", played_df.columns.tolist())
print("New upcoming_df columns:", upcoming_df.columns.tolist())
print(played_df[['Date', 'Opponent', 'MA3_GF', 'MA3_Points', 'Lag_GF']].head())
print(upcoming_df[['Date', 'Opponent', 'MA3_GF', 'MA3_Points', 'Lag_GF']].head())

In [None]:
# Add more rolling and lagged features

# 3-match rolling averages for played_df
played_df['MA3_xG'] = played_df['xG'].rolling(window=3, min_periods=1).mean()
played_df['MA3_xGA'] = played_df['xGA'].rolling(window=3, min_periods=1).mean()

# Lagged feature: previous match's xG
played_df['Lag_xG'] = played_df['xG'].shift(1)

# For upcoming_df, use the last values from played_df
last_played = played_df.iloc[-1]
upcoming_df['MA3_xG'] = last_played['MA3_xG']
upcoming_df['MA3_xGA'] = last_played['MA3_xGA']
upcoming_df['Lag_xG'] = last_played['xG']  # Use last match's xG as Lag_xG

# Fill NaN values (e.g., first row of Lag_xG or rows with missing xG/xGA)
played_df.fillna(0, inplace=True)
upcoming_df.fillna(0, inplace=True)

# Verify new columns
print("Updated played_df columns:", played_df.columns.tolist())
print("Updated upcoming_df columns:", upcoming_df.columns.tolist())
print(played_df[['Date', 'Opponent', 'MA3_xG', 'MA3_xGA', 'Lag_xG']].head())
print(upcoming_df[['Date', 'Opponent', 'MA3_xG', 'MA3_xGA', 'Lag_xG']].head())

In [None]:
# One-hot encode categorical variables: Opponent, Formation, Opp_Formation

# Get unique categories from both played_df and upcoming_df to ensure consistency
all_opponents = pd.concat([played_df['Opponent'], upcoming_df['Opponent']]).unique()
all_formations = pd.concat([played_df['Formation'], upcoming_df['Formation']]).unique()
all_opp_formations = pd.concat([played_df['Opp_Formation'], upcoming_df['Opp_Formation']]).unique()

# One-hot encode with consistent categories
played_df = pd.get_dummies(played_df, columns=['Opponent'], prefix='Opponent', dtype=int)
upcoming_df = pd.get_dummies(upcoming_df, columns=['Opponent'], prefix='Opponent', dtype=int)

played_df = pd.get_dummies(played_df, columns=['Formation'], prefix='Formation', dtype=int)
upcoming_df = pd.get_dummies(upcoming_df, columns=['Formation'], prefix='Formation', dtype=int)

played_df = pd.get_dummies(played_df, columns=['Opp_Formation'], prefix='Opp_Formation', dtype=int)
upcoming_df = pd.get_dummies(upcoming_df, columns=['Opp_Formation'], prefix='Opp_Formation', dtype=int)

# Align columns to ensure both DataFrames have the same dummy variables
all_opponent_cols = [f'Opponent_{opp}' for opp in all_opponents]
all_formation_cols = [f'Formation_{fmt}' for fmt in all_formations]
all_opp_formation_cols = [f'Opp_Formation_{fmt}' for fmt in all_opp_formations]

# Add missing columns with zeros
for col in all_opponent_cols + all_formation_cols + all_opp_formation_cols:
    if col not in played_df.columns:
        played_df[col] = 0
    if col not in upcoming_df.columns:
        upcoming_df[col] = 0

# Verify new columns
print("Updated played_df columns:", played_df.columns.tolist())
print("Updated upcoming_df columns:", upcoming_df.columns.tolist())
print(played_df[['Date', 'Opponent_Valencia', 'Opponent_Inter', 'Formation_4-2-3-1', 'Opp_Formation_4-4-2']].head())
print(upcoming_df[['Date', 'Opponent_Inter', 'Opponent_Real Madrid', 'Formation_4-2-3-1', 'Opp_Formation_4-4-2']].head())

In [None]:
# Add remaining rolling and lagged features

# 5-match rolling averages for played_df
played_df['MA5_GF'] = played_df['GF'].rolling(window=5, min_periods=1).mean()
played_df['MA5_GA'] = played_df['GA'].rolling(window=5, min_periods=1).mean()

# Lagged feature: previous match's GD
played_df['Lag_GD'] = played_df['GD'].shift(1)

# For upcoming_df, use the last values from played_df
last_played = played_df.iloc[-1]
upcoming_df['MA5_GF'] = last_played['MA5_GF']
upcoming_df['MA5_GA'] = last_played['MA5_GA']
upcoming_df['Lag_GD'] = last_played['GD']  # Use last match's GD as Lag_GD

# Fill NaN values (e.g., first row of Lag_GD)
played_df.fillna(0, inplace=True)
upcoming_df.fillna(0, inplace=True)

# Verify new columns
print("Final played_df columns:", played_df.columns.tolist())
print("Final upcoming_df columns:", upcoming_df.columns.tolist())
print(played_df[['Date', 'Opponent_Valencia', 'MA5_GF', 'MA5_GA', 'Lag_GD']].head())
print(upcoming_df[['Date', 'Opponent_Inter', 'MA5_GF', 'MA5_GA', 'Lag_GD']].head())

In [None]:
import matplotlib.pyplot as plt

# Plot MA3_GF, MA3_xG, and Points over time
plt.figure(figsize=(10, 6))
plt.plot(played_df['Date'], played_df['MA3_GF'], label='MA3_GF (3-match avg Goals For)', color='blue')
plt.plot(played_df['Date'], played_df['MA3_xG'], label='MA3_xG (3-match avg Expected Goals)', color='green')
plt.plot(played_df['Date'], played_df['Points'], label='Points', color='red', alpha=0.5)
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Barcelona Performance Trends (2024-2025 Season)')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig("Barcelona Performance Trend.png")
plt.show()

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Select features for modeling (exclude non-predictive and sparse columns)
features = ['Is_Home', 'Is_Big_Match', 'Rolling_Points', 'MA3_GF', 'MA3_Points', 'Lag_GF',
            'MA3_xG', 'MA3_xGA', 'Lag_xG', 'MA5_GF', 'MA5_GA', 'Lag_GD'] + \
           [col for col in played_df.columns if col.startswith('Opponent_')]

# Prepare data
X = played_df[features]
y = played_df['Points']
X_upcoming = upcoming_df[features]  # Features for upcoming matches

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test set and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on test set: {mae:.2f}")

# Predict Points for upcoming matches
upcoming_preds = model.predict(X_upcoming)
upcoming_df['Predicted_Points'] = upcoming_preds

# Display predictions
print(upcoming_df[['Date', 'Opponent_Inter', 'Opponent_Real Madrid', 'Predicted_Points']])

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt

# Assuming the model from the previous step is still in memory
# If not, re-run the model training (just the model part for brevity)
features = ['Is_Home', 'Is_Big_Match', 'Rolling_Points', 'MA3_GF', 'MA3_Points', 'Lag_GF',
            'MA3_xG', 'MA3_xGA', 'Lag_xG', 'MA5_GF', 'MA5_GA', 'Lag_GD'] + \
           [col for col in played_df.columns if col.startswith('Opponent_')]
X = played_df[features]
y = played_df['Result'].map({'W': 2, 'D': 1, 'L': 0})
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, n_estimators=100, random_state=42)
model.fit(X, y)

# Plot feature importance
plt.figure(figsize=(10, 6))
xgb.plot_importance(model, max_num_features=10, importance_type='gain', title='Top 10 Feature Importance')
plt.tight_layout()
plt.show()

# Print feature importance scores
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reload the original dataset to get Formation and Opp_Formation
df = pd.read_csv('/kaggle/input/2024-2025-barcelona-scores-and-fixtures/Barcelona stats24-25.csv', delimiter=';')
df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('Oponent', 'Opponent')

# Merge Formation and Opp_Formation back into played_df and upcoming_df
played_df['Formation'] = df[df['Result'].notna()]['Formation'].values
played_df['Opp_Formation'] = df[df['Result'].notna()]['Opp_Formation'].values
upcoming_df['Formation'] = df[df['Result'].isna()]['Formation'].values
upcoming_df['Opp_Formation'] = df[df['Result'].isna()]['Opp_Formation'].values

# Impute missing Formation and Opp_Formation in upcoming_df with most common values from played_df
most_common_formation = played_df['Formation'].mode()[0]
most_common_opp_formation = played_df['Opp_Formation'].mode()[0]
upcoming_df['Formation'] = upcoming_df['Formation'].fillna(most_common_formation)
upcoming_df['Opp_Formation'] = upcoming_df['Opp_Formation'].fillna(most_common_opp_formation)

# Drop existing encoded Formation_* and Opp_Formation_* columns
formation_cols = [col for col in played_df.columns if col.startswith('Formation_')]
opp_formation_cols = [col for col in played_df.columns if col.startswith('Opp_Formation_')]
played_df = played_df.drop(columns=formation_cols + opp_formation_cols)
upcoming_df = upcoming_df.drop(columns=formation_cols + opp_formation_cols)

# Re-encode Formation and Opp_Formation
all_formations = pd.concat([played_df['Formation'], upcoming_df['Formation']]).unique()
all_opp_formations = pd.concat([played_df['Opp_Formation'], upcoming_df['Opp_Formation']]).unique()
played_df = pd.get_dummies(played_df, columns=['Formation', 'Opp_Formation'], prefix=['Formation', 'Opp_Formation'], dtype=int)
upcoming_df = pd.get_dummies(upcoming_df, columns=['Formation', 'Opp_Formation'], prefix=['Formation', 'Opp_Formation'], dtype=int)

# Align columns
all_formation_cols = [f'Formation_{fmt}' for fmt in all_formations]
all_opp_formation_cols = [f'Opp_Formation_{fmt}' for fmt in all_opp_formations]
for col in all_formation_cols + all_opp_formation_cols:
    if col not in played_df.columns:
        played_df[col] = 0
    if col not in upcoming_df.columns:
        upcoming_df[col] = 0

# Select features (now including Formation_* and Opp_Formation_*)
features = ['Is_Home', 'Is_Big_Match', 'Rolling_Points', 'MA3_GF', 'MA3_Points', 'Lag_GF',
            'MA3_xG', 'MA3_xGA', 'Lag_xG', 'MA5_GF', 'MA5_GA', 'Lag_GD'] + \
           [col for col in played_df.columns if col.startswith('Opponent_')] + \
           [col for col in played_df.columns if col.startswith('Formation_')] + \
           [col for col in played_df.columns if col.startswith('Opp_Formation_')]

# Prepare data
X = played_df[features]
y = played_df['Result'].map({'W': 2, 'D': 1, 'L': 0})
X_upcoming = upcoming_df[features]

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.2f}")

# Predict Result for upcoming matches
reverse_map = {2: 'W', 1: 'D', 0: 'L'}
upcoming_preds = model.predict(X_upcoming)
upcoming_df['Predicted_Result'] = [reverse_map[pred] for pred in upcoming_preds]

# Display predictions
print(upcoming_df[['Date', 'Opponent_Inter', 'Opponent_Real Madrid', 'Predicted_Result']])

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Select features (same as before, including Formation_* and Opp_Formation_*)
features = ['Is_Home', 'Is_Big_Match', 'Rolling_Points', 'MA3_GF', 'MA3_Points', 'Lag_GF',
            'MA3_xG', 'MA3_xGA', 'Lag_xG', 'MA5_GF', 'MA5_GA', 'Lag_GD'] + \
           [col for col in played_df.columns if col.startswith('Opponent_')] + \
           [col for col in played_df.columns if col.startswith('Formation_')] + \
           [col for col in played_df.columns if col.startswith('Opp_Formation_')]

# Prepare data
X = played_df[features]
y = played_df['Result'].map({'W': 2, 'D': 1, 'L': 0})
X_upcoming = upcoming_df[features]

# Calculate class weights to handle imbalance (W: 40, D: 6, L: 7)
class_counts = played_df['Result'].value_counts()
scale_pos_weight = {2: 1.0, 1: class_counts['W'] / class_counts['D'], 0: class_counts['W'] / class_counts['L']}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train tuned XGBoost classifier
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=100,
    max_depth=3,  # Shallower trees to prevent overfitting
    learning_rate=0.05,  # Slower learning for better convergence
    scale_pos_weight=[scale_pos_weight[label] for label in y_train],  # Weight minority classes
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.2f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['L', 'D', 'W'], yticklabels=['L', 'D', 'W'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Predict Result for upcoming matches
upcoming_preds = model.predict(X_upcoming)
reverse_map = {2: 'W', 1: 'D', 0: 'L'}
upcoming_df['Predicted_Result'] = [reverse_map[pred] for pred in upcoming_preds]

# Display predictions
print(upcoming_df[['Date', 'Opponent_Inter', 'Opponent_Real Madrid', 'Predicted_Result']])

In [None]:
print(df.columns)  # Show columns in original dataset
print(df[df['Result'].isna()].shape)  # Shape of upcoming matches
print(upcoming_df.shape)  # Shape of upcoming_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Reload the original dataset
df = pd.read_csv('/kaggle/input/2024-2025-barcelona-scores-and-fixtures/Barcelona stats24-25.csv', delimiter=';')
df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('Oponent', 'Opponent')

# Create a copy of upcoming_df
upcoming_df_fixed = upcoming_df.copy()

# Assign Opponent with proper index alignment
upcoming_opponents = df[df['Result'].isna()]['Opponent'].reset_index(drop=True)
if len(upcoming_opponents) != len(upcoming_df_fixed):
    raise ValueError(f"Mismatch in row counts: {len(upcoming_opponents)} opponents vs {len(upcoming_df_fixed)} rows in upcoming_df")
upcoming_df_fixed['Opponent'] = upcoming_opponents

# Verify assignment
print("Upcoming opponents after assignment:")
print(upcoming_df_fixed[['Date', 'Opponent']])
if upcoming_df_fixed['Opponent'].isna().any():
    print("Warning: Some Opponent values are still NaN")
    print(upcoming_df_fixed[upcoming_df_fixed['Opponent'].isna()])

# Get prediction probabilities
features = ['Is_Home', 'Is_Big_Match', 'Rolling_Points', 'MA3_GF', 'MA3_Points', 'Lag_GF',
            'MA3_xG', 'MA3_xGA', 'Lag_xG', 'MA5_GF', 'MA5_GA', 'Lag_GD'] + \
           [col for col in played_df.columns if col.startswith('Opponent_')] + \
           [col for col in played_df.columns if col.startswith('Formation_')] + \
           [col for col in played_df.columns if col.startswith('Opp_Formation_')]
X_upcoming = upcoming_df_fixed[features]
probs = model.predict_proba(X_upcoming)

# Create probability DataFrame with aligned indices
prob_df = pd.DataFrame(probs, columns=['P(L)', 'P(D)', 'P(W)'])
prob_df['Date'] = upcoming_df_fixed['Date'].reset_index(drop=True)
prob_df['Opponent'] = upcoming_df_fixed['Opponent'].reset_index(drop=True)

# Apply custom thresholds
def custom_predict(probs, threshold_d=0.05, threshold_l=0.05):
    predictions = []
    for p in probs:
        if p[1] > threshold_d:  # P(D) > 0.05
            predictions.append('D')
        elif p[0] > threshold_l:  # P(L) > 0.05
            predictions.append('L')
        else:
            predictions.append('W')
    return predictions

prob_df['Predicted_Result'] = custom_predict(probs)

# Plot stacked bar chart
plt.figure(figsize=(10, 6))
prob_df[['P(L)', 'P(D)', 'P(W)']].plot(kind='bar', stacked=True, color=['red', 'yellow', 'green'], width=0.8)
plt.xlabel('Date (Opponent)')
plt.ylabel('Probability')
plt.title('Prediction Probabilities for Upcoming Matches')
plt.xticks(ticks=range(len(prob_df)), labels=[f"{date} ({opp})" for date, opp in zip(prob_df['Date'], prob_df['Opponent'])], rotation=45)
plt.legend(title='Result')
plt.tight_layout()
plt.show()

# Print results
print(prob_df[['Date', 'Opponent', 'P(W)', 'P(D)', 'P(L)', 'Predicted_Result']])

# Save to CSV
prob_df[['Date', 'Opponent', 'P(W)', 'P(D)', 'P(L)', 'Predicted_Result']].to_csv('upcoming_predictions.csv', index=False)
print("Saved predictions to 'upcoming_predictions.csv'")

In [None]:
import seaborn as sns

# Count each result type
result_counts = prob_df['Predicted_Result'].value_counts().sort_index()  # L, D, W

# Bar plot
plt.figure(figsize=(6, 4))
sns.barplot(x=result_counts.index, y=result_counts.values, palette={'W': 'green', 'D': 'gold', 'L': 'red'})
plt.title('Barça Upcoming Match Predictions')
plt.xlabel('Predicted Result')
plt.ylabel('Number of Matches')
plt.tight_layout()
plt.show()

In [None]:
# Get the list of all one-hot encoded formation columns
formation_cols = [col for col in played_df.columns if col.startswith('Formation_')]

# Reconstruct 'Formation' column from one-hot
played_df['Formation'] = played_df[formation_cols].idxmax(axis=1).str.replace('Formation_', '')

# Drop rows with missing 'Poss'
formation_poss = played_df.dropna(subset=['Formation', 'Poss'])

# Group and compute average possession
formation_avg_poss = formation_poss.groupby('Formation')['Poss'].mean().sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 5))
formation_avg_poss.plot(kind='bar', color='cornflowerblue')
plt.title('📊 Average Possession by Formation')
plt.ylabel('Average Possession (%)')
plt.xlabel('Formation')
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('average possession by formation')
plt.show()

In [None]:
# Count of each formation
formation_counts = df['Formation'].value_counts()

plt.figure(figsize=(10, 5))
formation_counts.plot(kind='bar', color='lightcoral')
plt.title('📋 Number of Matches per Barcelona Formation')
plt.ylabel('Match Count')
plt.xlabel('Formation')
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('barcelona_formation_frequency.png')
plt.show()

In [None]:
# Drop rows where 'Captain' or 'Poss' is missing
captain_poss = played_df.dropna(subset=['Captain', 'Poss'])

# Group by captain and calculate average possession
captain_avg_poss = captain_poss.groupby('Captain')['Poss'].mean().sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 5))
captain_avg_poss.plot(kind='bar', color='mediumseagreen')
plt.title('🎖️ Average Possession by Captain')
plt.ylabel('Average Possession (%)')
plt.xlabel('Captain')
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('average possesion by captain')
plt.show()

In [None]:
# Frequency of each captain
captain_counts = played_df['Captain'].value_counts()

# Plot frequency
plt.figure(figsize=(10, 5))
captain_counts.plot(kind='bar', color='cornflowerblue')
plt.title('📅 Captain Appearance Frequency')
plt.ylabel('Number of Matches')
plt.xlabel('Captain')
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('captain appearance frequency')
plt.show()

![Alt text](https://akm-img-a-in.tosshub.com/indiatoday/images/story/202410/uefa-champions-league-242353235-16x9_0.jpg?VersionId=JgyXMVxHn2m_nnehxAegJsoxTauW.fsn&size=690:388)

In [None]:
# Group by captain: mean possession & count
captain_stats = played_df.groupby('Captain')['Poss'].agg(['mean', 'count']).sort_values(by='count', ascending=False)

# Plot
fig, ax1 = plt.subplots(figsize=(12, 6))

# Bar plot (left y-axis): Frequency of matches captained
ax1.bar(captain_stats.index, captain_stats['count'], color='skyblue', label='Matches Captained')
ax1.set_ylabel('Number of Matches', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')
ax1.set_xticklabels(captain_stats.index, rotation=45)

# Line plot (right y-axis): Average possession
ax2 = ax1.twinx()
ax2.plot(captain_stats.index, captain_stats['mean'], color='darkblue', marker='o', label='Avg Possession')
ax2.set_ylabel('Average Possession (%)', color='darkblue')
ax2.tick_params(axis='y', labelcolor='darkblue')

# Titles and layout
plt.title("Captain Performance: Matches Led vs Average Possession")
fig.tight_layout()
#plt.savefig('captain performance:matches led vs avg poss')
plt.show()

# ✅ Conclusion

This project successfully demonstrates the application of data science to football analytics by forecasting match outcomes for FC Barcelona’s 2024–2025 season under Hansi Flick. Using match statistics, tactical formations, captaincy, and engineered features, we built and evaluated a machine learning model that:
	•	Achieved an accuracy of 75% on the test set (though on a small sample size).
	•	Identified the most likely results for upcoming matches with probability-based predictions.
	•	Highlighted tactical trends — for example, formations like 4-2-3-1 were linked to most frequent possession, and captains like Ronald Araüjo showed leadership during high-possession games.

The visualization of predicted outcomes using stacked bar plots offered intuitive insights into how confident the model is in each predicted result.

### ⚠️ Limitations:
	•	The dataset is relatively small, especially for upcoming matches.
	•	Some class imbalance exists (e.g., no losses in the test set).
	•	Real-world football includes unpredictable events (injuries, red cards, morale), which are not captured here.

### 🧭 Future Directions
	•	Incorporate more granular player-level stats (e.g., passes completed, duels won).
	•	Use ensemble models or time-series based predictors like LSTMs.
	•	Improve data balance and extend analysis to multi-season comparisons.

This project serves as a solid foundation for further exploration of how data-driven insights can inform football performance, tactics, and management decisions.

![Alt text](https://images2.minutemediacdn.com/image/upload/c_crop,w_6000,h_3375,x_0,y_342/c_fill,w_1200,ar_4:3,f_auto,q_auto,g_auto/images/voltaxMediaLibrary/mmsport/si/01jmzbywbdrvse831jx5.jpg)