In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load dataset
data = pd.read_csv("bike_buyers.csv", encoding="latin1")

# Display first rows
print(data.head())


   ï»¿ID Marital Status  Gender   Income  Children        Education  \
0  12496        Married  Female  40000.0       1.0        Bachelors   
1  24107        Married    Male  30000.0       3.0  Partial College   
2  14177        Married    Male  80000.0       5.0  Partial College   
3  24381         Single     NaN  70000.0       0.0        Bachelors   
4  25597         Single    Male  30000.0       0.0        Bachelors   

       Occupation Home Owner  Cars Commute Distance   Region   Age  \
0  Skilled Manual        Yes   0.0        0-1 Miles   Europe  42.0   
1        Clerical        Yes   1.0        0-1 Miles   Europe  43.0   
2    Professional         No   2.0        2-5 Miles   Europe  60.0   
3    Professional        Yes   1.0       5-10 Miles  Pacific  41.0   
4        Clerical         No   0.0        0-1 Miles   Europe  36.0   

  Purchased Bike  
0             No  
1             No  
2             No  
3            Yes  
4            Yes  


In [3]:
# 1. Data Cleaning (remove commas, convert types if needed)
# Here, Income might have commas -> clean it
if "Income" in data.columns:
    data["Income"] = data["Income"].replace(',', '', regex=True).astype(float)

# Define numeric & categorical features
num_features = ["Age", "Income", "Cars"]
cat_features = ["Gender", "Marital Status", "Education", "Occupation", "Home Owner", "Commute Distance", "Region"]

# Preprocessing for numeric features
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Preprocessing for categorical features
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Combine preprocessing for input features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)
preprocessor.set_output(transform="pandas")

# Apply preprocessing to input (X)
X = preprocessor.fit_transform(data)

# Preprocess output (y)
y = data[["Purchased Bike"]]


In [4]:
# 2. Feature Engineering example: Income per Car
if "Income" in data.columns and "Cars" in data.columns:
    X["Income_per_Car"] = data["Income"] / (data["Cars"].replace(0, np.nan))  # avoid divide by zero

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show processed data
print(X_train.head())
print(y_train.head())

     num__Age  num__Income  num__Cars  cat__Gender_Female  cat__Gender_Male  \
29  -1.076979    -1.171476   0.488207                 1.0               0.0   
535 -0.281277    -0.202449   1.384158                 0.0               1.0   
695 -0.016042     0.766578   0.488207                 0.0               1.0   
557 -1.165390    -0.525458  -1.303695                 1.0               0.0   
836 -1.430624    -0.525458   0.488207                 1.0               0.0   

     cat__Marital Status_Married  cat__Marital Status_Single  \
29                           0.0                         1.0   
535                          1.0                         0.0   
695                          1.0                         0.0   
557                          1.0                         0.0   
836                          1.0                         0.0   

     cat__Education_Bachelors  cat__Education_Graduate Degree  \
29                        0.0                             0.0   
535       