In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier

# Install a global "ignore" filter: every warning issued through the `warnings`
# module is suppressed for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at
# real problems — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Locations of the two raw CSV inputs (relative to the working directory).
players_file = "./datasets/players.csv"
team_file = "./datasets/team.csv"

# Read both tables, players first and teams second.
players_df, team_df = (pd.read_csv(csv_path) for csv_path in (players_file, team_file))

In [None]:
def clean_data(df, df_name):
    """Drop duplicate rows and sparse columns from ``df``, then impute the gaps.

    Progress information (duplicate count and per-column null counts) is
    printed along the way.

    Args:
        df: DataFrame to clean.
        df_name: Label used in the printed messages.

    Returns:
        The DataFrame returned by ``impute_missing_values`` after duplicates
        and mostly-empty columns have been removed.
    """
    print(f"Cleaning {df_name}...")

    # Count exact-duplicate rows and discard them only when there are any.
    duplicates = df.duplicated().sum()
    print(f"Duplicates in {df_name}: {duplicates}")
    deduped = df.drop_duplicates() if duplicates > 0 else df

    # Report only the columns that actually contain nulls.
    nulls = deduped.isnull().sum()
    print(f"Null values in {df_name}:\n{nulls[nulls > 0]}")

    # A column survives when at least half of its rows are non-null; any
    # column that is still entirely null afterwards is removed as well.
    min_non_null = 0.5 * len(deduped)
    trimmed = deduped.dropna(axis=1, thresh=min_non_null).dropna(axis=1, how='all')

    # Fill whatever gaps remain.
    return impute_missing_values(trimmed)

In [None]:
# Improved missing value handling
def impute_missing_values(df):
    """Fill missing values: KNN for numeric columns, mode for object columns.

    Args:
        df: DataFrame that may contain missing values. It is not modified.

    Returns:
        A new DataFrame with the same columns in which numeric columns have
        been passed through ``KNNImputer(n_neighbors=5)`` and object columns
        have had their nulls replaced by the column's most frequent value.
        Column groups that are absent (no numeric or no object columns) are
        skipped instead of raising.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    df = df.copy()

    # Numeric columns: KNN imputation. Skipped when there is nothing to fit
    # on (no numeric columns or no rows), which would otherwise raise.
    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0 and len(df) > 0:
        df[numeric_cols] = KNNImputer(n_neighbors=5).fit_transform(df[numeric_cols])

    # Categorical columns: most-frequent imputation. `mode()` returns no rows
    # when there are no object columns or none of them has a non-null value,
    # so guard before taking the first row.
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        modes = df[categorical_cols].mode()
        if not modes.empty:
            df[categorical_cols] = df[categorical_cols].fillna(modes.iloc[0])

    return df

# Run the cleaning pipeline on each raw table (players first, then teams).
players_cleaned, team_cleaned = (
    clean_data(raw_frame, frame_label)
    for raw_frame, frame_label in ((players_df, "players_df"), (team_df, "team_df"))
)

In [None]:
# Inner-join the cleaned player rows with the cleaned team rows on the shared
# 'Team' key, then keep only the columns that have no missing value at all.
merged_df = (
    players_cleaned
    .merge(team_cleaned, on='Team', how='inner')
    .dropna(axis=1)
)

# Summary statistics of the merged table.
desc_stats = merged_df.describe()
print("Descriptive Statistics:\n", desc_stats)

In [None]:
# Data Visualization: histogram (with KDE overlay) of the players' PTS column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(players_cleaned['PTS'], kde=True, bins=20, color='blue', ax=ax)
ax.set_title("Distribution of Points Per Game (PTS)")
ax.set_xlabel("PTS")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Correlation heatmap over every numeric column of the merged table.
numeric_df = merged_df.select_dtypes(include=['number'])
correlation = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Preview the first rows of the merged table (shown as the cell's output).
merged_df.head()

In [None]:
# Create new features based on domain knowledge.
# A zero denominator yields +/-inf (or NaN for 0/0) in pandas; non-finite
# values make the scaling and model-fitting steps below fail, so those ratios
# are replaced by 0.0 as a neutral placeholder.
merged_df['Points_Per_Minute'] = (
    (merged_df['PTS_x'] / merged_df['Min_x'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)
merged_df['Assist_to_Turnover_Ratio'] = (
    (merged_df['AST_x'] / merged_df['TOV_x'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

# Scaling data with RobustScaler (alternative to StandardScaler).
# Every numeric column of merged_df is overwritten with its scaled values.
scaler = RobustScaler()
numerical_cols = merged_df.select_dtypes(include=['number']).columns
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])

# One-Hot Encoding for Categorical Data: every object column is encoded, with
# the first level of each column dropped. The result is a separate frame;
# merged_df keeps its original object columns.
encoder = OneHotEncoder(drop="first", sparse_output=False)
categorical_cols = merged_df.select_dtypes(include=['object']).columns
encoded_categories = pd.DataFrame(
    encoder.fit_transform(merged_df[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols)
)

In [None]:
# Concatenate encoded categorical variables with the rest of the dataset.
# Both frames get a fresh positional index so rows line up one-to-one.
processed_df = pd.concat([merged_df.drop(categorical_cols, axis=1).reset_index(drop=True),
                          encoded_categories.reset_index(drop=True)], axis=1)

if 'Efficiency' not in processed_df.columns:
    # Placeholder target when the data carries no 'Efficiency' column: uniform
    # random values in [0, 1). Drawn from a seeded generator so that
    # Restart & Run All reproduces the same numbers (matching random_state=42
    # used for the split and the models).
    # NOTE(review): a random target carries no signal — metrics computed on it
    # are only a smoke test of the pipeline.
    processed_df['Efficiency'] = np.random.default_rng(42).random(len(processed_df))

# Features are every column except the target.
X = processed_df.drop('Efficiency', axis=1)
y = processed_df['Efficiency']

# Splitting the dataset (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Linear Regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = regressor.predict(X_test)

In [None]:
# Regression evaluation on the held-out split (uses y_test / y_pred from the
# regression step, before they are rebound below).
print("Regression Metrics:")
for metric_label, metric_fn in (("Mean Squared Error", mean_squared_error),
                                ("R2 Score", r2_score)):
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.4f}")

# Derive a three-level class label from 'Efficiency' unless one already exists:
# (-inf, -0.5] -> Bad, (-0.5, 0.5] -> Average, (0.5, inf) -> Good.
if 'Player Class' not in processed_df.columns:
    bins = [-np.inf, -0.5, 0.5, np.inf]
    labels = ['Bad', 'Average', 'Good']
    processed_df['Player Class'] = pd.cut(processed_df['Efficiency'], bins=bins, labels=labels)

# Target is the class label; features exclude both the label and the
# continuous value it was derived from.
y = processed_df['Player Class']
X = processed_df.drop(columns=['Player Class', 'Efficiency'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree on the training split
classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict classes for the held-out split
y_pred = classifier.predict(X_test)

In [None]:
# Evaluation of the untuned decision tree on the held-out split
print("Classification Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
for section_title, report_fn in (("Classification Report:", classification_report),
                                 ("Confusion Matrix:", confusion_matrix)):
    print(section_title)
    print(report_fn(y_test, y_pred))

In [None]:
# Hyperparameter tuning for the decision tree: 5-fold cross-validated grid
# search over tree depth and minimum split size.
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
}
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best Hyperparameters for Decision Tree Classifier:", grid_search.best_params_)

# Take the tuned model exposed by the search (no separate retraining happens
# here) and predict on the held-out split.
best_classifier = grid_search.best_estimator_
y_pred_best = best_classifier.predict(X_test)

In [None]:
# Evaluation after hyperparameter tuning
print("Classification Metrics (after tuning):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print("Classification Report (after tuning):")
print(classification_report(y_test, y_pred_best))

# Label every row (train and test alike) with the tuned model's prediction.
# predict() returns a plain array, so the assignment is positional.
merged_df['Predicted Class'] = best_classifier.predict(X)
print(merged_df[['Player', 'Predicted Class']].head())

# Save models for future use. The target directory is created first so the
# dumps do not fail when ./models does not exist yet.
os.makedirs('./models', exist_ok=True)
joblib.dump(regressor, './models/regressor_model.pkl')
# Fixed path: the original './modelsclassifier_model.pkl' lacked the slash and
# wrote the classifier next to, not inside, the models directory.
joblib.dump(best_classifier, './models/classifier_model.pkl')
merged_df.to_csv('./datasets/merged.csv')