> # **Best Model Selection**

## **Classification**

In [8]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Titanic dataset from Seaborn
titanic_data = sns.load_dataset('titanic')

# Select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # Create a parameter grid for the model
    if name == 'Random Forest':
        param_grid = {
            'model__n_estimators': [10, 50, 100, 200],
            'model__max_depth': [None, 5, 10, 20],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    elif name == 'Gradient Boosting':
        param_grid = {
            'model__n_estimators': [10, 50, 100],
            'model__learning_rate': [0.1, 0.01],
            'model__max_depth': [3, 5],
        }

    # Create a grid search object
    grid_search = GridSearchCV(pipeline, param_grid=param_grid)

    # Fit the grid search on the training data
    grid_search.fit(X_train, y_train)

    # Print the best parameters for the model
    print(f'{name} Best Parameters: {grid_search.best_params_}')

    # Make predictions on the test data using the best model
    y_pred = grid_search.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print the evaluation metrics for the model
    print(f'{name} Test Accuracy: {accuracy}')
    print(f'{name} Test Precision: {precision}')
    print(f'{name} Test Recall: {recall}')
    print(f'{name} Test F1 Score: {f1}')

    # Update the best model and accuracy if necessary
    if accuracy > best_accuracy:
        best_model = model
        best_accuracy = accuracy

print(f'Best Model: {best_model}')
print(f'Best Accuracy: {best_accuracy}')


Random Forest Best Parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 10}
Random Forest Test Accuracy: 0.8156424581005587
Random Forest Test Precision: 0.8253968253968254
Random Forest Test Recall: 0.7027027027027027
Random Forest Test F1 Score: 0.7591240875912408
Gradient Boosting Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}
Gradient Boosting Test Accuracy: 0.7988826815642458
Gradient Boosting Test Precision: 0.8166666666666667
Gradient Boosting Test Recall: 0.6621621621621622
Gradient Boosting Test F1 Score: 0.7313432835820896
Best Model: RandomForestClassifier(random_state=42)
Best Accuracy: 0.8156424581005587


## **Regression**

In [9]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the Titanic dataset from Seaborn
titanic_data = sns.load_dataset('titanic')

# Select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42))
]

best_model = None
best_r2_score = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ]) 
    # Create a parameter grid for the model
    if name == 'Random Forest':
        param_grid = {
            'model__n_estimators': [10, 50, 100, 200],
            'model__max_depth': [None, 5, 10, 20],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    elif name == 'Gradient Boosting':
        param_grid = {
            'model__n_estimators': [10, 50, 100],
            'model__learning_rate': [0.1, 0.01],
            'model__max_depth': [3, 5],
        } 
    # Create a grid search object
    grid_search = GridSearchCV(pipeline, param_grid=param_grid) 
    # Fit the grid search on the training data
    grid_search.fit(X_train, y_train) 
    # Print the best parameters for the model
    print(f'{name} Best Parameters: {grid_search.best_params_}') 
    # Make predictions on the test data using the best model
    y_pred = grid_search.predict(X_test) 
    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test,y_pred,squared=False)
    r2 = r2_score(y_test,y_pred) 
    # Print the evaluation metrics for the model
    print(f'{name} Test MAE: {mae}')
    print(f'{name} Test MSE: {mse}')
    print(f'{name} Test RMSE: {rmse}')
    print(f'{name} Test R2 Score: {r2}') 
    # Update the best model and r2 score if necessary
    if r2 > best_r2_score:
        best_model = model
        best_r2_score = r2 
print(f'Best Model: {best_model}')
print(f'Best R2 Score: {best_r2_score}')


Random Forest Best Parameters: {'model__max_depth': 5, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 100}
Random Forest Test MAE: 0.2701324653948732
Random Forest Test MSE: 0.14849249573022463
Random Forest Test RMSE: 0.38534724046011365
Random Forest Test R2 Score: 0.3876643428967661
Gradient Boosting Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
Gradient Boosting Test MAE: 0.26911976508719937
Gradient Boosting Test MSE: 0.13533876632460917
Gradient Boosting Test RMSE: 0.3678841751483871
Gradient Boosting Test R2 Score: 0.44190612460658907
Best Model: GradientBoostingRegressor(random_state=42)
Best R2 Score: 0.44190612460658907


In [2]:
# Make this plot after Best Model selection Chat gpt
'''
plt.figure(figsize=(12,4))
for i in ['mean_test_precision','mean_test_recall', 'mean_test_minimum_of_both']:
    plt.plot([j[1]for j in df_results['param_class_weight']],
    df_results[i],
    label=i)
plt.legend() 
'''

"\nplt.figure(figsize=(12,4))\nfor i in ['mean_test_precision','mean_test_recall', 'mean_test_minimum_of_both']:\n    plt.plot([j[1]for j in df_results['param_class_weight']],\n    df_results[i],\n    label=i)\nplt.legend() \n"