# **Best Model Selection**:

- **[21.1.1. Best Model Selection](21.1.1. Best Model Selection.ipynb)**

In this section, we will discuss the best model selection techniques for regression and classification problems. We will cover two main types of models: (i)
**regression models** and (ii) **classification models**.
In this section, we will discuss the following topics:

  

In [34]:
import pandas as pd
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score,  GridSearchCV
from sklearn.pipeline import Pipeline 
from sklearn. impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder ,LabelEncoder
from sklearn. ensemble import RandomForestClassifier , GradientBoostingClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# load the dataset 
titanic_data = sns.load_dataset('titanic')

# select features and Target variable 
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']


# split the data into train and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#create a list of models to evaluate 
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XgBoost', XGBClassifier(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the model and evaluate their performance
for name, model in models:
   
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    
    #calculate mean accuracy 
    mean_accuracy = scores.mean()

    #fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # make prediction on the test data
    y_pred = pipeline.predict(X_test)

    # calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

print("Model",name) 
print("Cross-Validation Accuracy:", mean_accuracy)
print("Test Accuracy:", accuracy)
print()

#check if the accuracy model has the best accuracy
if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = pipeline


# print the best model
print("Best Model:", best_model)



Model XgBoost
Cross-Validation Accuracy: 0.807613513247316
Test Accuracy: 0.7932960893854749

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=Non...one,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                    

# ADD more models in the same code 


In [32]:
import pandas as pd
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score,  GridSearchCV
from sklearn.pipeline import Pipeline 
from sklearn. impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder 
from sklearn. ensemble import RandomForestClassifier , GradientBoostingClassifier 
from sklearn .svm import SVC
from sklearn .linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

titanic_data = sns.load_dataset('titanic')

# select features and Target variable 
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']


# split the data into train and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#create a list of models to evaluate 
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42))

]

est_model = None
best_accuracy = 0.0

# Iterate over the model and evaluate their performance
for name, model in models:
   
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('models', model)
   
    ])

    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    
    #calculate mean accuracy 
    mean_accuracy = scores.mean()

    #fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # make prediction on the test data
    y_pred = pipeline.predict(X_test)

    # calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

print("model",name) 
print("Cross-Validation Accuracy:", mean_accuracy)
print("Test Accuracy:", accuracy)
print()

#check if the accuracy model has the best accuracy
if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = pipeline


# print the best model
print("Best models:", best_model)



model Logistic Regression
Cross-Validation Accuracy: 0.7977839062346105
Test Accuracy: 0.8100558659217877

Best models: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('models', LogisticRegression(random_state=42))])
