<a href="https://colab.research.google.com/github/liza151/tips/blob/main/tips_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Fetch the "tips" dataset through seaborn's load_dataset helper and
# print its first five rows as a quick sanity check.
tips = sns.load_dataset("tips")
print(tips.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [5]:
# Turn the continuous 'tip' column into a binary label:
# 1 when the tip is strictly above the median tip, 0 otherwise.
# Note: the median is taken over the full dataset, before any train/test split.
tips['tip_class'] = tips['tip'].gt(tips['tip'].median()).astype(int)

# Features are every column except the raw tip and the label derived from it.
X = tips.drop(columns=['tip', 'tip_class'])
y = tips['tip_class']

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# Column groups handled by the two preprocessing recipes below.
numeric_features = ['total_bill', 'size']
categorical_features = ['sex', 'smoker', 'day', 'time']

# Numeric recipe: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical recipe: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [9]:
# Route each column group through its own recipe ('num' for numeric,
# 'cat' for categorical) and combine the results into one feature matrix.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [18]:
# Step 7: Define the candidate models.
# Every entry is a two-step pipeline: the shared `preprocessor` followed by a
# step named 'classifier' (the name the hyperparameter grids refer to).
# The tree-based models get a fixed random_state so results are repeatable.
models = {
    name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])
    for name, estimator in [
        ('RandomForest', RandomForestClassifier(random_state=123)),
        ('DecisionTree', DecisionTreeClassifier(random_state=123)),
        ('KNN', KNeighborsClassifier()),
    ]
}

In [27]:
# Hyperparameter search spaces, keyed by the same names as `models`.
# Each parameter is written as '<pipeline step>__<parameter>', so the
# 'classifier__' prefix targets the 'classifier' step of a pipeline.
param_grids = {
    # 3 forest sizes x 4 depth limits (None = unlimited depth)
    'RandomForest': dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20, 30],
    ),
    # 4 depth limits x 3 minimum split sizes
    'DecisionTree': dict(
        classifier__max_depth=[None, 10, 20, 30],
        classifier__min_samples_split=[2, 10, 20],
    ),
    # 3 neighbourhood sizes x 2 weighting schemes
    'KNN': dict(
        classifier__n_neighbors=[3, 5, 7],
        classifier__weights=['uniform', 'distance'],
    ),
}

In [28]:
# Step 8: Tune every candidate pipeline with GridSearchCV.
# Each search runs 5-fold cross-validation on the training split only and
# ranks parameter combinations by accuracy.
# GridSearchCV comes from sklearn.model_selection and is imported with the
# other dependencies at the top of the notebook (no mid-notebook import).
best_estimators = {}
for model_name, pipeline in models.items():
    clf = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    # Keep the winning pipeline (with GridSearchCV's default refit it has been
    # retrained on the full training split) for test-set evaluation.
    best_estimators[model_name] = clf.best_estimator_
    print(f'Best parameters for {model_name}: {clf.best_params_}')

Best parameters for RandomForest: {'classifier__max_depth': 10, 'classifier__n_estimators': 50}
Best parameters for DecisionTree: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10}
Best parameters for KNN: {'classifier__n_neighbors': 5, 'classifier__weights': 'distance'}


In [29]:
# Step 9: Evaluate the tuned models on the held-out test set.
# classification_report is provided by sklearn.metrics and has to be imported
# at the top of the notebook; without that import this cell raises NameError
# when the notebook is run on a fresh kernel.
for model_name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    # Per-class precision/recall/F1 plus overall accuracy, as formatted text.
    report = classification_report(y_test, y_pred)
    print(f'Classification report for {model_name}:\n{report}')

Classification report for RandomForest:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        23
           1       0.73      0.73      0.73        26

    accuracy                           0.71        49
   macro avg       0.71      0.71      0.71        49
weighted avg       0.71      0.71      0.71        49

Classification report for DecisionTree:
              precision    recall  f1-score   support

           0       0.73      0.70      0.71        23
           1       0.74      0.77      0.75        26

    accuracy                           0.73        49
   macro avg       0.73      0.73      0.73        49
weighted avg       0.73      0.73      0.73        49

Classification report for KNN:
              precision    recall  f1-score   support

           0       0.64      0.70      0.67        23
           1       0.71      0.65      0.68        26

    accuracy                           0.67        49
   macro avg     