In [4]:
import numpy as np 
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns 

In [5]:
df = pd.read_csv("tips.csv")

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
x = df.iloc[:,1:]
y = df['total_bill']

In [8]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)
#numric processing pipeline 
numeric_processor = Pipeline(
    steps = [("imputation_mean", SimpleImputer(missing_values=np.nan,strategy="mean")),
            ("scaler", StandardScaler())]
)

In [11]:
categotical_processor = Pipeline(
    steps = [("imputation_constant", SimpleImputer(fill_value="missing",strategy="constant")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

In [16]:
preprocessor  = ColumnTransformer(
    [("categorical",categotical_processor,["sex","smoker","day","time"]),
    ("numerical", numeric_processor,["tip","size"])]
)

In [17]:
pipe = Pipeline(
    steps= [("preprocessor",preprocessor),
           ("regressor",RandomForestRegressor())]
)

In [18]:
pipe

In [19]:
pipe.fit(X_train,y_train)

In [23]:
#"hyperparameter tuning"
param_grid = {
    "regressor__n_estimators":[200,500],
    "regressor__max_features": ["auto", "sqrt","log2"],
    "regressor__max_depth": [4,5,6,7,8]
}

In [24]:
grid_search = GridSearchCV(pipe, param_grid = param_grid, n_jobs=1)

In [27]:
import warnings
warnings.filterwarnings('ignore')


In [28]:
grid_search.fit(X_train,y_train)

In [29]:
grid_search.best_params_

{'regressor__max_depth': 6,
 'regressor__max_features': 'auto',
 'regressor__n_estimators': 200}

In [30]:
pipe = Pipeline(
    steps= [("preprocessor",preprocessor),
           ("regressor",RandomForestRegressor(max_depth = 6,
                                             max_features = 'auto',
                                             n_estimators = 200))]
)

In [32]:
pipe.fit(X_train, y_train)

In [33]:
pred = pipe.predict(X_test)

In [34]:
pred

array([13.43929783, 29.08447783, 28.81006786, 15.02972802, 20.99714389,
       31.1660136 , 13.45031125, 32.5642095 , 17.56349441, 18.15404485,
       11.97333647, 23.61965393, 24.72089337, 17.97537539, 30.49963669,
       15.18766323, 13.46483499, 26.79922563, 15.41083513, 13.86809884,
       13.14403474, 14.55191176, 28.18754334, 13.91003554, 11.54967148,
       11.57783148, 15.33927832, 18.83058357, 15.73042228, 18.97659707,
       14.55191176, 31.04084056, 12.05971608, 12.24879606, 15.54282231,
       17.52239485, 19.27479969, 25.11984637, 21.58307729, 19.64826112,
       31.60552168, 16.48103295, 20.08804048, 13.60843924, 20.48769068,
       29.82104954, 18.80570288, 23.49121372, 25.58118865])