# In [None]:
import os
import pandas as pd
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report

class F1Model:
    """
    Train and evaluate random-forest models on a table of F1 race results.

    Two targets are modelled from the same feature set: the finish position
    (as a classification problem) and the points earned (as a regression
    problem). Predictions, metrics, feature importances, the fitted models
    and the preprocessing artefacts are written to ``output_dir``.
    """

    # Columns used as model inputs.
    FEATURE_COLUMNS = [
        'driver_id', 'grid_position', 'temperature',
        'humidity', 'track_temperature', 'pit_stops'
    ]

    # Columns that must be numeric in the input CSV. 'driver_id' is
    # intentionally absent: it may be a textual identifier and is encoded
    # in prepare_model_features() instead.
    NUMERIC_COLUMNS = [
        'grid_position', 'temperature', 'humidity',
        'track_temperature', 'pit_stops',
        'finish_position', 'points_earned'
    ]

    # Every column the input CSV has to provide (features + both targets).
    REQUIRED_COLUMNS = FEATURE_COLUMNS + ['finish_position', 'points_earned']

    def __init__(self, race_results_path: str, output_dir: str = 'model_results'):
        """
        F1 race predictions

        :param race_results_path: Path to race results CSV
        :param output_dir: Directory to save model results
        :raises FileNotFoundError: If the race results CSV does not exist
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir

        # Sorted list of original driver IDs when they had to be encoded to
        # integers; stays None when 'driver_id' is already numeric.
        self.driver_id_categories = None

        # Load race results
        try:
            self.race_results = pd.read_csv(race_results_path)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Race results file not found at {race_results_path}"
            ) from err

    def validate_data(self) -> None:
        """
        Validate input data columns and types

        :raises ValueError: If required columns are missing or have incorrect types
        """
        # Check for missing columns
        available = self.race_results.columns
        missing_columns = [col for col in self.REQUIRED_COLUMNS if col not in available]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check column types
        for col in self.NUMERIC_COLUMNS:
            if not pd.api.types.is_numeric_dtype(self.race_results[col]):
                raise ValueError(f"Column '{col}' must be numeric")

    def prepare_model_features(self) -> tuple:
        """
        Prepare features for model training

        A non-numeric ``driver_id`` column is replaced by integer category
        codes so that the feature matrix can be scaled and fitted; the
        original identifiers are kept in ``self.driver_id_categories`` (the
        position in that list is the code). Missing driver IDs get code -1.

        :return: Prepared features and target variables, as
            ``(X, y_finish_position, y_points)``
        :raises ValueError: If required columns are missing or have incorrect types
        """
        # Validate data before processing
        self.validate_data()

        # Work on a copy so encoding never mutates the loaded race results.
        X = self.race_results[self.FEATURE_COLUMNS].copy()

        # validate_data() does not require 'driver_id' to be numeric, but the
        # scaler and the forests do, so encode textual IDs here.
        self.driver_id_categories = None
        if not pd.api.types.is_numeric_dtype(X['driver_id']):
            encoded_ids = X['driver_id'].astype('category')
            self.driver_id_categories = list(encoded_ids.cat.categories)
            X['driver_id'] = encoded_ids.cat.codes

        # Two prediction targets
        y_finish_position = self.race_results['finish_position']
        y_points = self.race_results['points_earned']

        return X, y_finish_position, y_points

    def train_and_evaluate_models(self, test_size=0.2, random_state=42) -> dict:
        """
        Train multiple models and generate prediction results

        Files written to ``self.output_dir``: ``model_predictions.csv``,
        ``model_metrics.txt``, ``finish_feature_importance.csv``,
        ``points_feature_importance.csv``, ``finish_position_model.pkl``,
        ``points_prediction_model.pkl``, ``feature_scaler.pkl`` and, when
        driver IDs had to be encoded, ``driver_id_encoding.csv``.

        :param test_size: Proportion of data to use for testing
        :param random_state: Random seed for reproducibility
        :return: Dictionary of model results and evaluation metrics
        :raises ValueError: If required columns are missing or have incorrect types
        """
        # Prepare features
        X, y_finish_position, y_points = self.prepare_model_features()

        # Split data; a single call keeps the feature rows and both targets
        # aligned on the same train/test partition.
        X_train, X_test, y_finish_train, y_finish_test, y_points_train, y_points_test = train_test_split(
            X, y_finish_position, y_points, test_size=test_size, random_state=random_state
        )

        # Scale features (fit on the training split only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 1. Finish Position Prediction (Classification)
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
        rf_classifier.fit(X_train_scaled, y_finish_train)

        # Predictions
        finish_predictions = rf_classifier.predict(X_test_scaled)
        finish_proba = rf_classifier.predict_proba(X_test_scaled)

        # Classification metrics
        finish_metrics = {
            'mean_absolute_error': mean_absolute_error(y_finish_test, finish_predictions),
            'accuracy': sklearn.metrics.accuracy_score(y_finish_test, finish_predictions),
            'classification_report': classification_report(y_finish_test, finish_predictions, output_dict=True)
        }

        # 2. Points Prediction (Regression)
        rf_regressor = RandomForestRegressor(n_estimators=100, random_state=random_state)
        rf_regressor.fit(X_train_scaled, y_points_train)

        # Predictions
        points_predictions = rf_regressor.predict(X_test_scaled)

        # Regression metrics
        points_metrics = {
            'mean_squared_error': mean_squared_error(y_points_test, points_predictions),
            'mean_absolute_error': mean_absolute_error(y_points_test, points_predictions),
            'r2_score': r2_score(y_points_test, points_predictions)
        }

        # Feature Importance
        finish_feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf_classifier.feature_importances_
        }).sort_values('importance', ascending=False)

        points_feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf_regressor.feature_importances_
        }).sort_values('importance', ascending=False)

        # Prepare results DataFrame. 'race_id' holds the row index of each
        # test sample in the loaded race results, not a dedicated ID column.
        results_df = pd.DataFrame({
            'race_id': X_test.index,
            'actual_finish_position': y_finish_test,
            'predicted_finish_position': finish_predictions,
            # Probability of the predicted (most likely) class
            'finish_prediction_probability': finish_proba.max(axis=1),
            'actual_points': y_points_test,
            'predicted_points': points_predictions
        })

        # Save results
        results_df.to_csv(os.path.join(self.output_dir, 'model_predictions.csv'), index=False)

        # Save metrics
        with open(os.path.join(self.output_dir, 'model_metrics.txt'), 'w', encoding='utf-8') as f:
            f.write("Finish Position Prediction Metrics:\n")
            f.write(str(finish_metrics) + "\n\n")
            f.write("Points Prediction Metrics:\n")
            f.write(str(points_metrics) + "\n")

        # Save feature importance
        finish_feature_importance.to_csv(os.path.join(self.output_dir, 'finish_feature_importance.csv'), index=False)
        points_feature_importance.to_csv(os.path.join(self.output_dir, 'points_feature_importance.csv'), index=False)

        # Save models
        joblib.dump(rf_classifier, os.path.join(self.output_dir, 'finish_position_model.pkl'))
        joblib.dump(rf_regressor, os.path.join(self.output_dir, 'points_prediction_model.pkl'))

        # Save preprocessing artefacts: both models were fitted on scaled
        # (and possibly ID-encoded) features, so they cannot be reused for
        # inference without the exact same transformations.
        joblib.dump(scaler, os.path.join(self.output_dir, 'feature_scaler.pkl'))
        if self.driver_id_categories is not None:
            pd.DataFrame({
                'driver_id': self.driver_id_categories,
                'code': range(len(self.driver_id_categories))
            }).to_csv(os.path.join(self.output_dir, 'driver_id_encoding.csv'), index=False)

        return {
            'results_df': results_df,
            'finish_metrics': finish_metrics,
            'points_metrics': points_metrics,
            'finish_feature_importance': finish_feature_importance,
            'points_feature_importance': points_feature_importance
        }

def main(race_results_path: str = 'data/model_results.csv'):
    """
    Train the F1 models on a race results CSV and print a short summary.

    Prints an explanatory message and returns early when the CSV does not
    exist. Any exception raised while training is caught and reported on
    stdout, because this is the script's top-level boundary.

    :param race_results_path: Path to the race results CSV
    """
    # Check if race results exist
    if not os.path.exists(race_results_path):
        print(f"Error: Race results file not found at {race_results_path}")
        print("Please ensure you have a CSV file with the required columns:")
        print("driver_id, grid_position, temperature, humidity, track_temperature, pit_stops, finish_position, points_earned")
        return

    # Generate model results
    try:
        # F1Model is the class defined in this file; the previous name
        # (F1ModelResultsGenerator) did not exist and always raised NameError.
        model_results_generator = F1Model(race_results_path)
        model_results = model_results_generator.train_and_evaluate_models()

        # Print some results
        print("\nFinish Position Prediction Metrics:")
        print(model_results['finish_metrics'])

        print("\nPoints Prediction Metrics:")
        print(model_results['points_metrics'])

        print("\nTop 5 Important Features for Finish Position:")
        print(model_results['finish_feature_importance'].head())

    except Exception as e:
        print(f"An error occurred: {e}")

# Run the training pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()