# Exploratory Data Analysis and Data Understanding

### Import Libraries

In [None]:
# Load libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Single seed value kept in one place so every randomized step can share it.
RANDOM_STATE = 2025  # Shared random seed for reproducibility across models.

### Connecting to the dataset

In [None]:
# Load the player stats / salary CSV into a DataFrame.
file_path = "NBA Player Stats and Salaries_2010-2025.csv"  # change path if needed
df = pd.read_csv(file_path)

# Quick peek: print the dimensions, then show the leading rows.
# The printed label is derived from preview_rows so it always matches the
# number of rows actually displayed.
preview_rows = 50
print("Shape (rows, columns):", df.shape)
print(f"\nFirst {preview_rows} rows:")
display(df.head(preview_rows))


## Exploratory Data Analysis

#### Distribution of NBA Salaries (2010 - 2025)

In [None]:
# Histogram of the Salary column (50 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 5))
sns.histplot(df["Salary"], bins=50, kde=True)
# The en dash is written as an escape so the title renders correctly even if
# the file is re-saved with a different text encoding.
plt.title("Distribution of NBA Salaries (2010\u20132025)")
plt.xlabel("Salary")
plt.ylabel("Count")
plt.show()


#### Average Salary by Season

In [None]:
# Mean of the Salary column for each distinct value of Year.
salary_year = df.groupby("Year")["Salary"].mean()

# Line chart of that mean, one point per Year.
plt.figure(figsize=(10, 5))
season_ax = sns.lineplot(x=salary_year.index, y=salary_year.values)
season_ax.set_title("Average Salary by Season")
season_ax.set_ylabel("Average Salary ($)")
plt.show()


#### Player Performance Distribution Analysis

In [None]:
# Columns to plot, one histogram per column.
stats = ["PTS", "AST", "TRB", "STL", "BLK", "MP"]

# 2 x 3 grid of axes; each column gets a 40-bin histogram with a KDE overlay.
stat_fig, stat_axes = plt.subplots(2, 3, figsize=(14, 10))
for stat_ax, stat_col in zip(stat_axes.flat, stats):
    sns.histplot(df[stat_col], bins=40, kde=True, ax=stat_ax)
    stat_ax.set_title(f"Distribution of {stat_col}")
stat_fig.tight_layout()
plt.show()

#### Relationship Between Player Performance and Salary

In [None]:
# Scatter of Salary against PTS, one marker per row, drawn at 40% opacity.
scatter_ax = sns.scatterplot(x="PTS", y="Salary", data=df, alpha=0.4)
scatter_ax.set_title("Salary vs Points Per Game")
plt.show()


#### Team-Level Salary Distribution

In [None]:
# Mean salary per team, ordered from highest to lowest.
team_salary = df.groupby("Team")["Salary"].mean()
team_salary = team_salary.sort_values(ascending=False)

# Horizontal bars: mean salary on the x axis, team on the y axis.
plt.figure(figsize=(12, 6))
team_ax = sns.barplot(x=team_salary.values, y=team_salary.index)
team_ax.set_title("Average Salary by Team")
plt.show()
