# Exploratory Data Analysis (EDA) - Football Players

This notebook performs an in-depth analysis of the football players dataset.
We will explore player attributes, distributions, and correlations to understand the factors influencing performance.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot appearance: seaborn whitegrid theme, muted palette, 10x6 inch default figure
sns.set(style="whitegrid", palette="muted")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Figures are written under <project root>/Image/analyse; the project root is
# taken to be the parent of the current working directory.
PROJECT_ROOT = os.path.abspath("..")
IMAGE_DIR = os.path.join(PROJECT_ROOT, "Image", "analyse")
# exist_ok=True makes this cell safe to re-run once the folder exists
os.makedirs(IMAGE_DIR, exist_ok=True)
print(f"Images will be saved to: {IMAGE_DIR}")

## 1. Data Loading

In [None]:
# Location of the players CSV, resolved relative to the project root
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "fifa_players.csv")

print(f"Loading data from: {DATA_PATH}")

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}")
    # Stop here: without the file `df` is never defined, and letting the
    # notebook continue would only surface as a confusing NameError in the
    # next cell instead of pointing at the real cause.
    raise
else:
    print("Data loaded successfully.")

## 2. Data Overview

In [None]:
# Peek at the first rows using the notebook's rich table rendering
print("=== First 5 Rows ===")
first_rows = df.head()
display(first_rows)

# DataFrame.info() writes its summary straight to stdout, so no display() is needed
print("\n=== Dataset Info ===")
df.info()

## 3. Descriptive Statistics

In [None]:
# Summary statistics as returned by DataFrame.describe() with default arguments
print("=== Descriptive Statistics (Numerical) ===")
numeric_summary = df.describe()
display(numeric_summary)

## 4. Distribution Analysis

Let's visualize the distribution of key player attributes like Age, Overall Rating, and Potential.

In [None]:
# (column, panel title, bar colour) for each histogram, in left-to-right order
panels = [
    ('age', 'Age Distribution', 'skyblue'),
    ('overall_rating', 'Overall Rating Distribution', 'salmon'),
    ('potential', 'Potential Distribution', 'lightgreen'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One histogram with a KDE overlay per attribute, each on its own axis
for ax, (column, title, colour) in zip(axes, panels):
    sns.histplot(df[column], kde=True, ax=ax, color=colour)
    ax.set_title(title)

fig.tight_layout()
save_path = os.path.join(IMAGE_DIR, 'distributions.png')
fig.savefig(save_path)
print(f"Saved plot to {save_path}")
plt.show()

## 5. Correlation Analysis

Which attributes correlate most with a player's Overall Rating? We'll compute the correlation matrix for numerical features.

In [None]:
# Select numerical columns for correlation
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = df[numerical_cols].corr()

# Plot heatmap for top correlated features with Overall Rating
k = 10 # Number of variables for heatmap
cols = corr_matrix.nlargest(k, 'overall_rating')['overall_rating'].index
cm = np.corrcoef(df[cols].values.T)

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.0)
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, 
            yticklabels=cols.values, xticklabels=cols.values, cmap='coolwarm')
plt.title('Top 10 Features Correlated with Overall Rating')
save_path = os.path.join(IMAGE_DIR, 'correlation_heatmap.png')
plt.savefig(save_path)
print(f"Saved plot to {save_path}")
plt.show()

## 6. Age vs Performance

How does rating vary with age across players? Let's plot the average Overall Rating at each age. Note that this is a cross-sectional view of the dataset, not the trajectory of individual players over time.

In [None]:
# Average the rating per age explicitly rather than relying on lineplot's
# built-in aggregation with `ci=None`: the `ci` parameter is deprecated in
# recent seaborn releases, while this form behaves the same on old and new versions.
mean_rating_by_age = df.groupby('age')['overall_rating'].mean().dropna()

fig, ax = plt.subplots(figsize=(12, 6))
# estimator=None: the data is already aggregated, so draw the points as-is
sns.lineplot(x=mean_rating_by_age.index, y=mean_rating_by_age.values,
             estimator=None, color='purple', ax=ax)
ax.set_title('Average Overall Rating by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Average Overall Rating')
save_path = os.path.join(IMAGE_DIR, 'age_vs_rating.png')
fig.savefig(save_path)
print(f"Saved plot to {save_path}")
plt.show()

## 7. Value vs Rating

Is there a relationship between a player's market value and their rating?

In [None]:
# Scatter of market value against rating, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='overall_rating', y='value_euro',
                color='orange', alpha=0.5, ax=ax)
ax.set_title('Player Value vs Overall Rating')
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Value (Euro)')
# Market values span a large range, so a log axis keeps the low end readable
ax.set_yscale('log')
save_path = os.path.join(IMAGE_DIR, 'value_vs_rating.png')
fig.savefig(save_path)
print(f"Saved plot to {save_path}")
plt.show()

## 8. Top Nationalities

Which countries have the most players in the dataset?

In [None]:
# Player count per nationality; value_counts() sorts descending, so head(10)
# is the ten most represented countries.
top_nationalities = df['nationality'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_nationalities.values, y=top_nationalities.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Nationalities by Player Count')
ax.set_xlabel('Number of Players')
# Label the category axis explicitly so the figure stands on its own
ax.set_ylabel('Nationality')
save_path = os.path.join(IMAGE_DIR, 'top_nationalities.png')
# bbox_inches='tight' keeps the country-name tick labels inside the saved image
fig.savefig(save_path, bbox_inches='tight')
print(f"Saved plot to {save_path}")
plt.show()