In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [7]:
df = sns.load_dataset('tips')

X = df.drop('tip', axis=1)
y = df['tip']

In [53]:
le = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'category':
        X[col] = le.fit_transform(X[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int32  
 3   smoker      244 non-null    int32  
 4   day         244 non-null    int32  
 5   time        244 non-null    int32  
 6   size        244 non-null    int64  
dtypes: float64(2), int32(4), int64(1)
memory usage: 9.7 KB


In [54]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int32  
 3   smoker      244 non-null    int32  
 4   day         244 non-null    int32  
 5   time        244 non-null    int32  
 6   size        244 non-null    int64  
dtypes: float64(2), int32(4), int64(1)
memory usage: 9.7 KB


In [48]:
df[df['sex'] == 'Male'].value_counts().sum()

0

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'KNN': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor()
}

model_scores = []

In [68]:
%%time

#print model dictionary

for name, model in models.items():

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    metric = mean_absolute_error(y_test, y_pred)

    model_scores.append((name, metric))




CPU times: total: 766 ms
Wall time: 196 ms


In [67]:
sorted_scores = sorted(model_scores, key=lambda x: x[1], reverse=False)
sorted_scores

[('SVR', 0.5707097371316318),
 ('Linear Regression', 0.6703807496461158),
 ('XGBoost', 0.6721697168934103),
 ('KNN', 0.7262448979591837),
 ('Gradient Boosting', 0.7281386742081417),
 ('Random Forest', 0.7869551020408166),
 ('Decision Tree', 0.8114285714285715)]

# Hyperparameter Tuning