In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw players CSV into a DataFrame. The path is relative to the
# notebook's working directory.
RAW_PLAYERS_CSV = '../data/raw/players_21.csv'
df = pd.read_csv(RAW_PLAYERS_CSV)

# Preview the top of the table; as the cell's last expression it is rendered.
df.head()


In [None]:
# Display the first five records of the loaded table.
df.head(5)



In [None]:
# Dimensions of the table as a (rows, columns) tuple.
table_shape = df.shape
print("Shape of dataset:", table_shape)

# Every column label as a plain Python list.
column_labels = df.columns.to_list()
print("\nColumn Names:\n", column_labels)

# Per-column dtype and non-null count. info() prints its own report,
# so it is called directly rather than wrapped in print().
print("\nData Types and Non-Null Counts:\n")
df.info()

# Number of null entries in each column.
null_counts = df.isna().sum()
print("\nMissing values in each column:\n")
print(null_counts)


In [None]:
# Heatmap of the boolean null mask: each cell of the frame is coloured by
# whether its value is missing.
null_mask = df.isnull()

fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


In [None]:
# Columns removed from the working frame before analysis.
COLUMNS_TO_DROP = [
    'player_traits',
    'player_tags',
    'nation_position',
    'nation_jersey_number',
    'loaned_from',
]

# errors='ignore' keeps this cell safely re-runnable: because `df` is
# reassigned here, a second execution would otherwise raise KeyError on
# columns that were already dropped by the first run.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

# Replace missing release-clause and team-jersey-number values with 0.
# NOTE(review): 0 is used as a "missing" placeholder here; confirm that it
# cannot be confused with a genuine value in later analysis.
df = df.fillna({'release_clause_eur': 0, 'team_jersey_number': 0})


In [None]:
# Ten players with the highest `overall` value, restricted to a handful of
# identifying / summary columns.
overall_summary_cols = [
    'short_name', 'long_name', 'age', 'club_name',
    'nationality', 'overall', 'potential', 'value_eur',
]
players_by_overall = df[overall_summary_cols].sort_values(by='overall', ascending=False)
top_overall = players_by_overall.head(10)
top_overall


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: one bar per player in `top_overall`, bar length is
# the player's overall rating.
# NOTE(review): recent seaborn releases emit a deprecation warning when
# `palette` is passed without `hue` -- confirm against the installed version.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_overall, x='overall', y='short_name', palette='viridis', ax=ax)
ax.set_title('Top 10 Players by Overall Rating')
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Player Name')
plt.show()



In [None]:
# Ten players with the highest `potential` value, restricted to a handful of
# identifying / summary columns.
potential_summary_cols = [
    'short_name', 'long_name', 'age', 'club_name',
    'nationality', 'overall', 'potential', 'value_eur',
]
players_by_potential = df[potential_summary_cols].sort_values(by='potential', ascending=False)
top_potential = players_by_potential.head(10)
top_potential


In [None]:
# Horizontal bar chart: one bar per player in `top_potential`, bar length is
# the player's potential rating.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_potential, x='potential', y='short_name', palette='coolwarm', ax=ax)
ax.set_title('Top 10 Players by Potential')
ax.set_xlabel('Potential')
ax.set_ylabel('Player Name')
plt.show()


In [None]:
# Scatter of every player's overall rating against age. Points are drawn
# semi-transparent so dense regions remain readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='age', y='overall', alpha=0.5, ax=ax)
ax.set_title('Overall Rating vs Age')
ax.set_xlabel('Age')
ax.set_ylabel('Overall Rating')
ax.grid(True)
plt.show()


In [None]:
# Keep only the float64 / int64 columns; the correlation is computed on
# these alone.
NUMERIC_DTYPES = ['float64', 'int64']
numeric_df = df.select_dtypes(include=NUMERIC_DTYPES)

# Pairwise correlation between every pair of the retained columns.
corr = numeric_df.corr()

# Render the matrix as a heatmap without per-cell annotations.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, cmap='coolwarm', annot=False, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Player Stats')
plt.show()


In [None]:
# Take the first comma-separated entry of `player_positions` as each
# player's primary position, then keep the ten most frequent ones.
primary_position = df['player_positions'].str.split(',').str[0]
position_counts = primary_position.value_counts().head(10)

# Pie chart of those ten counts; the slice percentages are therefore shares
# within these ten positions, not within the whole dataset.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(position_counts, labels=position_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Top 10 Player Primary Positions')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()



In [None]:
# Mean overall rating per club, then the fifteen clubs with the highest mean.
mean_overall_by_club = df.groupby('club_name')['overall'].mean()
club_overall = mean_overall_by_club.sort_values(ascending=False).head(15)

# Horizontal bar chart: one bar per club, bar length is the mean rating.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=club_overall.values, y=club_overall.index, palette='viridis', ax=ax)
ax.set_xlabel('Average Overall Rating')
ax.set_ylabel('Club')
ax.set_title('Top 15 Clubs by Average Player Rating')
plt.show()


In [None]:
# Scatter of age (x) against overall rating (y) for every player, with
# semi-transparent markers and a background grid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='age', y='overall', alpha=0.5, ax=ax)
ax.set_title('Age vs Overall Rating')
ax.set_xlabel('Age')
ax.set_ylabel('Overall Rating')
ax.grid(True)
plt.show()
