In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif  # Feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
import numpy as np

import seaborn as sns

In [55]:
# Load dataset
data = sns.load_dataset("tips")
data
X = data.drop("tip", axis = 1)
y = data["tip"]

In [56]:
X

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


In [57]:
y

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [58]:
X.isna().sum()

total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [70]:
num_cols = ["total_bill", "size"]
cat_cols = ["sex", "smoker","day","time"]

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle= True)
print(X_train.shape, X_test.shape)

(195, 6) (49, 6)


In [66]:
def winsorization(X, percentile = 2):
    lower, upper  = np.percentile(X, percentile), np.percentile(X,100-percentile)
    return np.clip(X, lower, upper)

In [None]:
num_impute_outlier_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="median")),
    ("outlier", FunctionTransformer(winsorization))]
)
num_preprocessing = ColumnTransformer(
    [("impute_detectOutlier", num_impute_outlier_pipeline, num_cols),
    ("log_transform", FunctionTransformer(func = np.log1p), num_cols),
    ("square_transform", FunctionTransformer(func = np.square), num_cols),
    ("power_transform", PowerTransformer(method="yeo-johnson"), num_cols),
    ("scaler", StandardScaler(), num_cols)], 
    remainder="passthrough"
)

cat_impute_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, drop = "first"))
])

cat_preprocessing = ColumnTransformer(
    [("impute", cat_impute_pipeline, cat_cols),
    ("ohe", OneHotEncoder(sparse_output=False, drop = "first"), cat_cols),]
)




# Define numeric transformation pipeline with Feature Selection
# num_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
#     ("log", FunctionTransformer(func= np.log1p)),
#     ("square", FunctionTransformer(func= np.square)),
#     ('power', PowerTransformer(method='yeo-johnson')),  # Feature transformation
#     ('scaler', StandardScaler()),  # Standardization
#     # ('feature_selection', SelectKBest(score_func=f_classif, k=15)),  # Select top 3 features
#     # ('pca', PCA(n_components=5))  # Reduce dimensions further
# ])


# numerical_pipeline = Pipeline([
#     ("impute", SimpleImputer()),
#     # ("polynomial", PolynomialFeatures(degree=2, interaction_only=True)),
#     ("winsorization", FunctionTransformer(winsorization)),
#     ("log_transform", FunctionTransformer(func = np.log1p)),
#     ("square_transform", FunctionTransformer(func = np.square)),
#     ("power_transform", PowerTransformer(method="yeo-johnson")),
#     ("scaler", StandardScaler())
# ])

# Combine into a full preprocessing pipeline
# preprocessor = ColumnTransformer([
#     ("impute", SimpleImputer(), num_cols),
#     # ("winsorization", FunctionTransformer(winsorization),num_cols),
#     # ("log_transform", FunctionTransformer(func = np.log1p), num_cols),
#     # ("square_transform", FunctionTransformer(func = np.square), num_cols),
#     # ("power_transform", PowerTransformer(method="yeo-johnson"), num_cols),
#     # ("ohe_city_gender", OneHotEncoder(sparse_output=False, drop = "first"), cat_cols),
#     # ("ordinal_en", OrdinalEncoder(categories=[["A", "B"]]),cat_cols),








# ],remainder="passthrough")

# Step 1: Fit and transform the data **once**
X_train_transformed = num_preprocessing.fit_transform(X_train)
X_test_transformed = num_preprocessing.transform(X_test)

In [81]:
X_train_transformed.shape

(195, 14)

In [82]:
cat_preprocessing.fit_transform(X_train)

ValueError: too many values to unpack (expected 2)

In [None]:

# Step 2: Train and evaluate multiple models using the transformed dataset
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    scores = cross_val_score(model, X_transformed, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
    print(f"{name}: Accuracy = {scores.mean():.4f}")