In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import joblib

# Step 1: Load the data
data_path = r"C:\Users\Hiremath\OneDrive\Desktop\New folder\05.08.2023 fliprobo\baseball.csv"
data = pd.read_csv(data_path)

# Step 2: Exploratory Data Analysis (EDA)
# Perform data exploration to understand the features, target variable distribution, missing values, and any other patterns in the data.

# Step 3: Preprocessing and Feature Engineering
# Handle missing values, encode categorical variables, and perform feature engineering if needed.

# Step 4: Split the data into train and test sets
X = data.drop('W', axis=1)
y = data['W']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Building
models = [
    ('RandomForest', RandomForestRegressor()),
    ('LinearRegression', LinearRegression())
]

# Step 6: Model Evaluation
results = []
for name, model in models:
    pipeline = Pipeline([('scaler', StandardScaler()), (name, model)])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    results.append((name, -scores.mean(), scores.std()))

# Display the cross-validation results
for name, mean_mse, std_mse in results:
    print(f'{name}: Mean MSE: {mean_mse:.4f} (±{std_mse:.4f})')

# Step 7: Hyperparameter Tuning (for RandomForest)
param_grid = {
    'RandomForest__n_estimators': [100, 200, 300],
    'RandomForest__max_depth': [None, 10, 20],
    'RandomForest__min_samples_split': [2, 5, 10]
}

rf_pipeline = Pipeline([('scaler', StandardScaler()), ('RandomForest', RandomForestRegressor())])
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f'Best model parameters: {grid_search.best_params_}')
print(f'Best model cross-validation Mean MSE: {grid_search.best_score_:.4f}')

# Step 9: Final Model Selection
# The best model is already selected during hyperparameter tuning.

# Step 10: Save the Best Model for Production
joblib.dump(best_model, 'best_model.pkl')


RandomForest: Mean MSE: 55.1163 (±25.9441)
LinearRegression: Mean MSE: 620.7171 (±1029.9814)
Best model parameters: {'RandomForest__max_depth': 20, 'RandomForest__min_samples_split': 2, 'RandomForest__n_estimators': 200}
Best model cross-validation Mean MSE: -46.5613


['best_model.pkl']