In [8]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

#from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
# load dataset
# Load seaborn's bundled 'tips' example dataset and bind it to `Tips`.
Tips = sns.load_dataset('tips')
# Last expression of the cell: show each column's dtype. The rendered output
# lists sex, smoker, day and time as `category`, i.e. non-numeric columns.
Tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [13]:
# select features and variables
# X holds every column except the target; y is the target column 'tip'.
# drop() returns a new frame, so X is independent of Tips from here on.
X = Tips.drop('tip', axis=1)
y = Tips['tip']

# Label-encode every non-numeric feature column.
# The encoded values are written to X (the frame handed to the models), not to
# Tips: because X is a separate copy, encoding Tips after X was created leaves
# X with raw category values on a fresh kernel and model fitting fails. Leaving
# Tips untouched also makes this cell safe to re-run.
for col in X.columns:
    if X[col].dtypes == "object" or X[col].dtypes == 'category':
        X[col] = LabelEncoder().fit_transform(X[col])

In [14]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline regressors with default hyperparameters, keyed by display name.
# The tree-based estimators are randomised, so they get the same fixed seed as
# the split; without it their scores differ from one run to the next.
models={
        'LinearRegression' : LinearRegression(),
        'SVR' : SVR(),
        'DecisionTreeRegressor' : DecisionTreeRegressor(random_state=42),
        'RandomForestRegressor' : RandomForestRegressor(random_state=42),
        'KNeighborsRegressor' : KNeighborsRegressor(),
        'GradientBoostingRegressor' : GradientBoostingRegressor(random_state=42)
}


In [15]:
# Train each baseline model on the training split and collect its R^2 on the
# held-out test split as (name, score) pairs, in the order of the models dict.
model_scores = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    test_r2 = r2_score(y_test, estimator.predict(X_test))
    model_scores.append((model_name, test_r2))

print(model_scores)
    

[('LinearRegression', 0.4441368826121931), ('SVR', 0.5693326496439823), ('DecisionTreeRegressor', -0.14877572203904244), ('RandomForestRegressor', 0.27205574427495316), ('KNeighborsRegressor', 0.3294034029001649), ('GradientBoostingRegressor', 0.3589560317081125)]


In [16]:
# Rank the (name, R^2) pairs from highest to lowest score without touching
# the original model_scores list.
sorted_models = list(model_scores)
sorted_models.sort(key=lambda entry: entry[1], reverse=True)

# Report every model's score rounded to two decimals.
for model_name, r2 in sorted_models:
    print('R_squared Score', f"{model_name} is {r2: .2f}")


R_squared Score SVR is  0.57
R_squared Score LinearRegression is  0.44
R_squared Score GradientBoostingRegressor is  0.36
R_squared Score KNeighborsRegressor is  0.33
R_squared Score RandomForestRegressor is  0.27
R_squared Score DecisionTreeRegressor is -0.15


----

# Hyperparameter tuning

In [17]:
# Candidate estimators for hyperparameter tuning, keyed by display name.
# Each value is (estimator, param_grid); an empty grid means the estimator is
# only evaluated with its default settings.
# The tree-based estimators are randomised, so they are seeded to make the
# search results reproducible across runs.
models = {
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(random_state=42), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(random_state=42), {'n_estimators': [10, 100]}),
          # odd neighbour counts 3, 5, ..., 99
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(random_state=42), {'n_estimators': [10, 100]}),
          }

In [29]:
# Scratch expression: evaluates to positive infinity (rendered as `inf`).
# The value is not assigned to any name here. Presumably it was a check of the
# sentinel meant to seed `best_mse` in the tuning loop — TODO confirm, then
# remove this cell.
float('inf')

inf

In [28]:
from sklearn.metrics import accuracy_score  # not used anywhere in this cell

# Running record of the lowest test-set MSE. Both names must exist before the
# loop: the comparison below reads best_mse on the very first iteration, which
# raises NameError on a fresh kernel if they are not initialised here.
best_mse = float('inf')
best_model_name = None

for name, (model, params) in models.items():
    # Cross-validated grid search over this model's hyperparameter grid
    # (a GridSearchCV object, despite the variable name `pipeline`).
    pipeline = GridSearchCV(model, params, cv=5)

    # Run the search on the training split only.
    pipeline.fit(X_train, y_train)

    # Predict on the held-out test split with the fitted search object.
    y_pred = pipeline.predict(X_test)

    # Compute the MSE once and reuse it for printing and for model selection.
    mse = mean_squared_error(y_test, y_pred)

    # print the performing metric
    print(name, 'MSE: ', mse)
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

    # Print evaluation metrics
    print(f"{name} Mean Squared Error: {mse}")

    # Update best model if the current model has lower MSE.
    # NOTE: the winner is chosen on the test split, so the MSE reported for it
    # is an optimistic estimate of its performance on unseen data.
    if mse < best_mse:
        best_mse = mse
        best_model_name = name

# Select the best model
print(f"\nBest Model: {best_model_name} with Mean Squared Error: {best_mse}")

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


LinearRegression Mean Squared Error: 0.6948129686287711
SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


SVR Mean Squared Error: 1.460718141299992
DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor R2:  0.298051667053291
DecisionTreeRegressor MAE:  0.7189481629481629


DecisionTreeRegressor Mean Squared Error: 0.8774153020453993
RandomForestRegressor MSE:  0.9226772777551033
RandomForestRegressor R2:  0.2618412564059768
RandomForestRegressor MAE:  0.7504061224489799


RandomForestRegressor Mean Squared Error: 0.9226772777551033
KNeighborsRegressor MSE:  0.6640950568462677
KNeighborsRegressor R2:  0.4687117753876745
KNeighborsRegressor MAE:  0.6203721488595437


KNeighborsRegressor Mean Squared Error: 0.6640950568462677
GradientBoostingRegressor MSE:  0.8106801524004932
GradientBoostingRegressor R2:  0.351

----
----
---
