In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
#import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [None]:
class BasketballPredictiveModels:
    """Train, compare, tune and persist regressors that predict player efficiency.

    State is kept in three dicts keyed by model name:
      models              -- fitted estimator objects
      feature_importance  -- ``feature_importances_`` arrays (only for models exposing them)
      performance_metrics -- train/test metrics recorded by ``build_efficiency_models``
    """

    def __init__(self):
        self.models = {}
        self.feature_importance = {}
        self.performance_metrics = {}

    def prepare_efficiency_target(self, df, target_metric='ADV_EFF'):
        """Return a copy of ``df`` with a ``FUTURE_EFF`` target column added.

        Args:
            df: Input dataframe. It is never modified.
            target_metric: Name of the column the target is derived from.

        Returns:
            A new dataframe. If ``GAME_DATE`` is present the rows are sorted by
            whichever of ``PLAYER_ID`` / ``GAME_DATE`` exist. If ``PLAYER_ID`` is
            present, ``FUTURE_EFF`` is the next row's ``target_metric`` within
            each player; otherwise it is the row's own ``target_metric``.
            Rows whose ``FUTURE_EFF`` is missing are dropped.

        Raises:
            KeyError: If ``target_metric`` is not a column of ``df``.
        """
        if target_metric not in df.columns:
            raise KeyError(f"Target column '{target_metric}' not found in dataframe")

        has_player = 'PLAYER_ID' in df.columns

        if 'GAME_DATE' in df.columns:
            # Sort only by the keys that actually exist, so a frame without
            # PLAYER_ID does not fail with a KeyError.
            sort_keys = [key for key in ('PLAYER_ID', 'GAME_DATE') if key in df.columns]
            df_sorted = df.sort_values(sort_keys)
        else:
            # Work on a copy so the caller's dataframe is left untouched.
            df_sorted = df.copy()

        if has_player:
            # shift(-1) pulls the following row's value up, per player.
            df_sorted['FUTURE_EFF'] = df_sorted.groupby('PLAYER_ID')[target_metric].shift(-1)
        else:
            # NOTE(review): without PLAYER_ID the target equals the current
            # value of target_metric; keep that column out of the features.
            df_sorted['FUTURE_EFF'] = df_sorted[target_metric]

        # The last row of every player has no "next" value; drop it.
        return df_sorted.dropna(subset=['FUTURE_EFF'])

    def _candidate_models(self):
        """Return fresh, unfitted estimators keyed by model name."""
        candidates = {
            'random_forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'xgboost': xgb.XGBRegressor(n_estimators=100, random_state=42),
            'gradient_boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'elastic_net': ElasticNet(alpha=0.1, random_state=42),
        }

        # LightGBM is optional: it is only trained when the notebook has
        # imported it under the name `lgb`. Referencing `lgb` directly would
        # raise NameError when that import is disabled.
        lgb_module = globals().get('lgb')
        if lgb_module is not None:
            candidates['lightgbm'] = lgb_module.LGBMRegressor(
                n_estimators=100, random_state=42, verbose=-1
            )
        else:
            print("Skipping lightgbm (module 'lgb' is not imported)")

        return candidates

    def build_efficiency_models(self, X, y, test_size=0.2):
        """Fit every candidate model and record its hold-out performance.

        Args:
            X: Feature matrix.
            y: Target values aligned with ``X``.
            test_size: Fraction of the rows held out for evaluation.

        Returns:
            Tuple ``(X_test, y_test)`` with the held-out split.

        Side effects:
            Fills ``self.models``, ``self.performance_metrics`` and, for models
            that expose ``feature_importances_``, ``self.feature_importance``.
        """
        # NOTE(review): this is a random row split. If the target is the
        # player's next-game value, consider a time-aware split instead.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        for name, model in self._candidate_models().items():
            print(f"Training {name}...")

            model.fit(X_train, y_train)

            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            self.models[name] = model
            self.performance_metrics[name] = {
                'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
                'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
                'test_mae': mean_absolute_error(y_test, test_pred),
                'test_r2': r2_score(y_test, test_pred),
            }

            # Linear models such as ElasticNet have no feature_importances_.
            if hasattr(model, 'feature_importances_'):
                self.feature_importance[name] = model.feature_importances_

        return X_test, y_test

    def optimize_best_model(self, X, y, cv_folds=5):
        """Grid-search hyperparameters for the model with the lowest test RMSE.

        Args:
            X: Feature matrix used for the cross-validated search.
            y: Target values aligned with ``X``.
            cv_folds: Value passed to ``GridSearchCV`` as ``cv``.

        Returns:
            The best estimator found by the grid search. If the best model has
            no parameter grid defined here, the already-fitted model is
            returned unchanged.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.performance_metrics:
            raise ValueError(
                "No trained models to optimize; call build_efficiency_models() first"
            )

        best_model_name = min(
            self.performance_metrics,
            key=lambda name: self.performance_metrics[name]['test_rmse'],
        )

        print(f"Optimizing {best_model_name}...")

        # One table keeps each tunable model's estimator class next to its
        # grid so the two cannot drift apart.
        search_spaces = {
            'random_forest': (RandomForestRegressor, {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
            }),
            'xgboost': (xgb.XGBRegressor, {
                'n_estimators': [100, 200],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
            }),
            'gradient_boosting': (GradientBoostingRegressor, {
                'n_estimators': [100, 200],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1, 0.2],
            }),
        }

        if best_model_name not in search_spaces:
            # Nothing to tune for this model type; hand back the fitted model.
            return self.models[best_model_name]

        estimator_cls, param_grid = search_spaces[best_model_name]

        grid_search = GridSearchCV(
            estimator_cls(random_state=42),
            param_grid,
            cv=cv_folds,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X, y)

        # Stored under a separate key so the baseline model stays available.
        self.models[f'{best_model_name}_optimized'] = grid_search.best_estimator_

        print(f"Best parameters for {best_model_name}: {grid_search.best_params_}")

        return grid_search.best_estimator_

    def predict_player_efficiency(self, player_features, model_name='random_forest'):
        """Predict efficiency for new player data with a stored model.

        Args:
            player_features: Feature rows passed straight to ``model.predict``.
            model_name: Key of the model in ``self.models``.

        Returns:
            Whatever the model's ``predict`` returns.

        Raises:
            ValueError: If ``model_name`` is not a stored model.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found. Available models: {list(self.models.keys())}")

        return self.models[model_name].predict(player_features)

    def get_feature_importance(self, feature_names, model_name='random_forest', top_n=20):
        """Plot the top features of a model and return the full ranking.

        Args:
            feature_names: Names matching the stored importance array, in order.
            model_name: Key of the model in ``self.feature_importance``.
            top_n: Number of features shown in the bar plot.

        Returns:
            Dataframe with ``feature`` and ``importance`` columns sorted by
            descending importance, or ``None`` when no importances are stored
            for ``model_name``.

        Raises:
            ValueError: If ``feature_names`` and the stored importances differ
                in length.
        """
        if model_name not in self.feature_importance:
            print(f"Feature importance not available for {model_name}")
            return None

        importances = self.feature_importance[model_name]
        names = list(feature_names)
        if len(names) != len(importances):
            raise ValueError(
                f"Got {len(names)} feature names but {len(importances)} "
                f"importance values for {model_name}"
            )

        importance_df = pd.DataFrame({
            'feature': names,
            'importance': importances,
        }).sort_values('importance', ascending=False)

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.barplot(data=importance_df.head(top_n), y='feature', x='importance', ax=ax)
        ax.set_title(f'Top {top_n} Feature Importance - {model_name}')
        fig.tight_layout()
        plt.show()

        return importance_df

    def save_models(self, filepath_prefix='basketball_models'):
        """Write every stored model and the metrics table to disk.

        Args:
            filepath_prefix: Prefix for the output files. Each model goes to
                ``<prefix>_<name>.pkl`` and the metrics to
                ``<prefix>_performance.csv``.
        """
        # joblib files are pickles: only load them back from trusted sources.
        for name, model in self.models.items():
            joblib.dump(model, f"{filepath_prefix}_{name}.pkl")

        # Transpose so each model is one row and each metric one column.
        pd.DataFrame(self.performance_metrics).T.to_csv(f"{filepath_prefix}_performance.csv")

        print("Models saved successfully!")


In [None]:
# Load your processed data
df = pd.read_csv('processed_player_data.csv')

# Initialize predictor
predictor = BasketballPredictiveModels()

# Prepare target variable
df_with_target = predictor.prepare_efficiency_target(df)

# Prepare features: the target and identifier text columns are excluded by
# name; any other non-numeric column (e.g. a date string) is excluded by dtype
# because the estimators only accept numeric input.
excluded_cols = {'FUTURE_EFF', 'PLAYER_NAME', 'TEAM_ABBREVIATION'}
numeric_cols = df_with_target.select_dtypes(include=['number', 'bool']).columns
feature_cols = [col for col in numeric_cols if col not in excluded_cols]

skipped_cols = [col for col in df_with_target.columns
                if col not in feature_cols and col not in excluded_cols]
if skipped_cols:
    print(f"Skipping non-numeric columns: {skipped_cols}")

# NOTE(review): missing feature values are not handled here; some of the
# estimators may reject NaN -- confirm the CSV is fully populated.
X = df_with_target[feature_cols]
y = df_with_target['FUTURE_EFF']

# Build models
X_test, y_test = predictor.build_efficiency_models(X, y)

# Optimize best model
best_model = predictor.optimize_best_model(X, y)

# Show results
for model_name, metrics in predictor.performance_metrics.items():
    print(f"\n{model_name} Performance:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")