In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import machine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

#import grid search for cv cross validation  
from sklearn.model_selection import GridSearchCV

#import preprocessing libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
#load tips data
df = sns.load_dataset('tips')
df.head()
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Task

In [3]:
#split data into X and y
X = df.drop('tip', axis=1)
y = df['tip']

#label encoding
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder using for loop
for col in X.columns:
    if X[col].dtypes == 'object' or X[col].dtypes == 'category':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
#create a disctionary of models
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'XGBoost': XGBRegressor()
}

#train and predict the models
models_scores = []
for model_name, model in models.items():
    #train the model
    model.fit(X_train, y_train)
    #predict the model
    y_pred = model.predict(X_test)
    
    metric=mean_absolute_error(y_test, y_pred)
    models_scores.append((model_name, metric))

    #sorted the models
    sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=False)
    for model in sorted_models:
    #  print the models output one time only
       print('Mean Absolute Error for', f"{model[0]} is {model[1]: .2f}")
   

Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for SVR is  0.57
Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for SVR is  0.57
Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for Random Forest is  0.79
Mean Absolute Error for SVR is  0.57
Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for Gradient Boosting is  0.72
Mean Absolute Error for Random Forest is  0.79
Mean Absolute Error for SVR is  0.57
Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for Gradient Boosting is  0.72
Mean Absolute Error for Random Forest is  0.79
Mean Absolute Error for Decision Tree is  0.88
Mean Absolute Error for SVR is  0.57
Mean Absolute Error for Linear Regression is  0.67
Mean Absolute Error for Gradient Boosting is  0.72
Mean Absolute Error for KNN is  0.73
Mean Absolute Error for Random Forest is  0.79
Mean Absolute Error for Decision Tree is  0.88
Mean Absolute Error for SVR is  0.57


# Mean Squared Error

In [5]:
#create a disctionary of models
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'XGBoost': XGBRegressor()
}

#train and predict the models
models_scores = []
for model_name, model in models.items():
    #train the model
    model.fit(X_train, y_train)
    #predict the model
    y_pred = model.predict(X_test)
    
    metric=mean_squared_error(y_test, y_pred)
    models_scores.append((model_name, metric))

    #sorted the models
    sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=False)
    for model in sorted_models:
       print('Mean Squared Error for', f"{model[0]} is {model[1]: .2f}")

   

Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for SVR is  0.54
Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for SVR is  0.54
Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for Random Forest is  0.94
Mean Squared Error for SVR is  0.54
Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for Gradient Boosting is  0.81
Mean Squared Error for Random Forest is  0.94
Mean Squared Error for SVR is  0.54
Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for Gradient Boosting is  0.81
Mean Squared Error for Random Forest is  0.94
Mean Squared Error for Decision Tree is  1.39
Mean Squared Error for SVR is  0.54
Mean Squared Error for Linear Regression is  0.69
Mean Squared Error for Gradient Boosting is  0.81
Mean Squared Error for KNN is  0.84
Mean Squared Error for Random Forest is  0.94
Mean Squared Error for Decision Tree is  1.39
Mean Squared Error for SVR is  0.54
Mean Squared Error for

# Root Mean Squared Error

In [6]:
#create a disctionary of models
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'XGBoost': XGBRegressor()
}

#train and predict the models
models_scores = []
for model_name, model in models.items():
    #train the model
    model.fit(X_train, y_train)
    #predict the model
    y_pred = model.predict(X_test)

     #root mean squared error
    metric=np.sqrt(mean_squared_error(y_test, y_pred))
    models_scores.append((model_name, metric))

    #sorted the models
    sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=False)
    for model in sorted_models:
       print('Root mean square Error for', f"{model[0]} is {model[1]: .2f}")

   

Root mean square Error for Linear Regression is  0.83
Root mean square Error for SVR is  0.73
Root mean square Error for Linear Regression is  0.83
Root mean square Error for SVR is  0.73
Root mean square Error for Linear Regression is  0.83
Root mean square Error for Random Forest is  0.98
Root mean square Error for SVR is  0.73
Root mean square Error for Linear Regression is  0.83
Root mean square Error for Gradient Boosting is  0.90
Root mean square Error for Random Forest is  0.98
Root mean square Error for SVR is  0.73
Root mean square Error for Linear Regression is  0.83
Root mean square Error for Gradient Boosting is  0.90
Root mean square Error for Random Forest is  0.98
Root mean square Error for Decision Tree is  1.17
Root mean square Error for SVR is  0.73
Root mean square Error for Linear Regression is  0.83
Root mean square Error for Gradient Boosting is  0.90
Root mean square Error for KNN is  0.92
Root mean square Error for Random Forest is  0.98
Root mean square Error f

# Hyperparameter Tuning

In [None]:
# f models to perform hyperparameter tuning
# Define the hyperparameters grid for each model
param_grid = {
    'Linear Regression': {'normalize': [True, False]},
    'SVR': {'kernel': ['linear', 'poly', 'rbf']},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]},
    'Gradient Boosting': {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 200, 300]},
    'Decision Tree': {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]},
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'XGBoost': {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 200, 300]}
}

# Perform hyperparameter tuning for each model
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f"Best parameters for {model_name}: {best_params}")

# Add the preprocesser inside the pipeline

In [7]:
#####################