# **Selecting the best model with best hyperparameters**

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# train test split the data 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
df = sns.load_dataset('tips')

In [11]:
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Task

In [13]:
# select features and variables

X = df.drop('tip',axis=1)
y = df['tip']

# lable encode categorical variable 
le = LabelEncoder()
X['sex'] = le.fit_transform(X['sex'])
X['smoker'] = le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])

# split the data into traon and test data with 80% training dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,)

In [16]:
# create a dictionies of list of  model to evaluate performance 

models = {
    "LinearRegression" : LinearRegression(),
    "SVR": SVR(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor()
}

# train and predict each model with evaluation metrics as well making a for loop
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)

    # make prediction from each model 
    y_pred = model.predict(X_test)
    metric =r2_score (y_test , y_pred)
    model_scores.append((name, metric))

    # print the performing metric 
    # print(name,'MSE:', mean_squared_error)
    # print(name, 'R2:', r2_score(y_test, y_pred))
    # print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    # print('\n')

    # selecting the best model from all above with evalution metrics sorting
    sorted_models =sorted(model_scores,key=lambda x: x[1], reverse=True)
    for model in sorted_models:
        print('r2_score', f"{model[0]} is {model[1]: 2f}")

 
    
    
    

r2_score LinearRegression is  0.566635
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score DecisionTreeRegressor is  0.398516
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score RandomForestRegressor is  0.471696
r2_score DecisionTreeRegressor is  0.398516
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score RandomForestRegressor is  0.471696
r2_score KNeighborsRegressor is  0.434495
r2_score DecisionTreeRegressor is  0.398516
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score RandomForestRegressor is  0.471696
r2_score GradientBoostingRegressor is  0.443194
r2_score KNeighborsRegressor is  0.434495
r2_score DecisionTreeRegressor is  0.398516
r2_score LinearRegression is  0.566635
r2_score SVR is  0.541012
r2_score RandomForestRegressor is  0.471696
r2_score GradientBoostingRegressor is  0.443194
r2_score KNeighborsRegressor is  0.4344