In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
import joblib
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer

In [23]:
df_mushrooms = pd.read_csv("artifacts/mushrooms_clean.csv")

In [24]:
missing_cols = df_mushrooms.columns[df_mushrooms.isnull().any()].tolist()
# Drop rows with missing values (if few)
if df_mushrooms.isnull().sum().max() < len(df_mushrooms) * 0.05:  # Less than 5%
    df_mushrooms = df_mushrooms.dropna()
    print(f"Dropped rows with missing values. New shape: {df_mushrooms.shape}")

# Or Impute with mode (most common category)
else:
    for col in missing_cols:
        mode_value = df_mushrooms[col].mode()[0]
        df_mushrooms[col].fillna(mode_value, inplace=True)
        print(f"Filled {col} missing values with mode: {mode_value}")

Dropped rows with missing values. New shape: (8124, 23)


In [25]:
print("\nMissing values after handling:")
print(df_mushrooms.isna().sum())


Missing values after handling:
class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


In [26]:
# SPLIT DATA FIRST (before encoding)
X = df_mushrooms.drop('class', axis=1)  # Assuming 'class' is target
y = df_mushrooms['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Keep class balance in both sets
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Training set: (6499, 22)
Test set: (1625, 22)


In [27]:
# ENCODE CATEGORICAL FEATURES (fit on train, transform on test)
print("\n=== Label Encoding ===")

# Create dictionary to store encoders for each column
label_encoders = {}
X_train_encoded = pd.DataFrame()
X_test_encoded = pd.DataFrame()

for column in X_train.columns:
    # Create and fit encoder on TRAINING data only
    le = LabelEncoder()
    X_train_encoded[column] = le.fit_transform(X_train[column])

    # Transform test data using the SAME encoder
    # Handle unseen categories gracefully, If test has unseen category ‚Üí use -1
    X_test_encoded[column] = X_test[column].map(
        lambda x: le.transform([x])[0] if x in le.classes_ else -1
    )
    # Store encoder for later use
    label_encoders[column] = le

    print(f"{column}: {len(le.classes_)} classes")


# 7. Encode target variable
y_encoder = LabelEncoder()
y_train_encoded = y_encoder.fit_transform(y_train)
y_test_encoded = y_encoder.transform(y_test)

print(f"Target classes: {y_encoder.classes_}")
print(f"y_train_encoded shape: {y_train_encoded.shape}")
print(f"y_test_encoded shape: {y_test_encoded.shape}")



=== Label Encoding ===
cap-shape: 6 classes
cap-surface: 4 classes
cap-color: 10 classes
bruises: 2 classes
odor: 9 classes
gill-attachment: 2 classes
gill-spacing: 2 classes
gill-size: 2 classes
gill-color: 12 classes
stalk-shape: 2 classes
stalk-root: 5 classes
stalk-surface-above-ring: 4 classes
stalk-surface-below-ring: 4 classes
stalk-color-above-ring: 9 classes
stalk-color-below-ring: 9 classes
veil-type: 1 classes
veil-color: 4 classes
ring-number: 3 classes
ring-type: 5 classes
spore-print-color: 9 classes
population: 6 classes
habitat: 7 classes
Target classes: ['e' 'p']
y_train_encoded shape: (6499,)
y_test_encoded shape: (1625,)


In [28]:
# Final check
print("\n=== Final Encoded Data ===")
print("Training features shape:", X_train_encoded.shape)
print("Test features shape:", X_test_encoded.shape)
print("\nSample of encoded training data:")
print(X_train_encoded.head())
print("\nNo missing values in encoded data:")
print(X_train_encoded.isnull().sum().sum() == 0)


=== Final Encoded Data ===
Training features shape: (6499, 22)
Test features shape: (1625, 22)

Sample of encoded training data:
   cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0          2            3          9        0     2                1   
1          5            2          5        1     5                1   
2          0            2          3        0     5                1   
3          2            2          4        0     7                1   
4          3            3          4        0     2                1   

   gill-spacing  gill-size  gill-color  stalk-shape  ...  \
0             0          0           2            0  ...   
1             0          0           1            0  ...   
2             1          0          10            0  ...   
3             0          1           0            1  ...   
4             0          1           0            1  ...   

   stalk-surface-below-ring  stalk-color-above-ring  stalk-color-below-ring 

## Feature Engineering

#### 1. Feature Transformation
Tree-based artifacts (Random Forest, XGBoost) don't need scaling, but linear artifacts (Logistic Regression, SVM) do. So we don't need scaling


#### 2. Feature Selection (Mutual Information (Information Gain))

In [29]:
print("\n=== Mutual Information Feature Selection ===")

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train_encoded, random_state=42)

# Create DataFrame
mi_scores_df = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print(mi_scores_df)

# Select top features (e.g., MI score > 0.1 or top 15)
top_mi_features = mi_scores_df[mi_scores_df['mi_score'] > 0.1]['feature'].tolist()
# OR: top_mi_features = mi_scores_df.head(15)['feature'].tolist()

print(f"\nSelected {len(top_mi_features)} features with high mutual information")

X_train_mi = X_train_encoded[top_mi_features]
X_test_mi = X_test_encoded[top_mi_features]


=== Mutual Information Feature Selection ===
                     feature  mi_score
4                       odor  0.628823
19         spore-print-color  0.340945
8                 gill-color  0.293498
18                 ring-type  0.211812
12  stalk-surface-below-ring  0.198337
11  stalk-surface-above-ring  0.197850
13    stalk-color-above-ring  0.178775
14    stalk-color-below-ring  0.164071
7                  gill-size  0.152748
20                population  0.144867
3                    bruises  0.133102
21                   habitat  0.111289
10                stalk-root  0.102563
6               gill-spacing  0.072009
0                  cap-shape  0.038777
17               ring-number  0.027028
2                  cap-color  0.024342
1                cap-surface  0.013465
16                veil-color  0.012598
9                stalk-shape  0.009762
5            gill-attachment  0.008248
15                 veil-type  0.001011

Selected 13 features with high mutual information


In [30]:
# ========================================
# FEATURE IMPORTANCE SELECTION
# ========================================
print("\n=== Step 2: Feature Importance Selection ===")

# Train RF to get importances
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_temp.fit(X_train_encoded, y_train_encoded)

# Get importances
feature_importance = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'importance': rf_temp.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance)

# Select top 15 features
top_important_features = feature_importance.head(15)['feature'].tolist()
print(f"\nSelected {len(top_important_features)} most important features")

X_train_important = X_train_encoded[top_important_features]
X_test_important = X_test_encoded[top_important_features]


=== Step 2: Feature Importance Selection ===
                     feature  importance
4                       odor    0.193744
8                 gill-color    0.114487
19         spore-print-color    0.106751
7                  gill-size    0.098617
18                 ring-type    0.070559
10                stalk-root    0.067075
20                population    0.056547
11  stalk-surface-above-ring    0.052588
3                    bruises    0.046456
12  stalk-surface-below-ring    0.028409
21                   habitat    0.027418
6               gill-spacing    0.025004
13    stalk-color-above-ring    0.024496
17               ring-number    0.019239
14    stalk-color-below-ring    0.017353
2                  cap-color    0.016126
9                stalk-shape    0.015122
1                cap-surface    0.009579
0                  cap-shape    0.003882
5            gill-attachment    0.003327
16                veil-color    0.003219
15                 veil-type    0.000000

Selected 1

### 3. Feature Creation (Interaction Features)

In [31]:
# ========================================
# INTERACTION FEATURES
# ========================================
print("\n===  Creating Interaction Features ===")

X_train_interactions = X_train_encoded.copy()
X_test_interactions = X_test_encoded.copy()

# Create key interactions
X_train_interactions['odor_x_gill-color'] = X_train_encoded['odor'] * X_train_encoded['gill-color']
X_test_interactions['odor_x_gill-color'] = X_test_encoded['odor'] * X_test_encoded['gill-color']

X_train_interactions['spore_x_gill'] = X_train_encoded['spore-print-color'] * X_train_encoded['gill-color']
X_test_interactions['spore_x_gill'] = X_test_encoded['spore-print-color'] * X_test_encoded['gill-color']

X_train_interactions['cap-shape_x_cap-color'] = X_train_encoded['cap-shape'] * X_train_encoded['cap-color']
X_test_interactions['cap-shape_x_cap-color'] = X_test_encoded['cap-shape'] * X_test_encoded['cap-color']

X_train_interactions['odor_x_stalk-root'] = X_train_encoded['odor'] * X_train_encoded['stalk-root']
X_test_interactions['odor_x_stalk-root'] = X_test_encoded['odor'] * X_test_encoded['stalk-root']

X_train_interactions['ring-number_x_ring-type'] = X_train_encoded['ring-number'] * X_train_encoded['ring-type']
X_test_interactions['ring-number_x_ring-type'] = X_test_encoded['ring-number'] * X_test_encoded['ring-type']

print(f"Added {X_train_interactions.shape[1] - X_train_encoded.shape[1]} interaction features")
print(f"Total features: {X_train_interactions.shape[1]}")


===  Creating Interaction Features ===
Added 5 interaction features
Total features: 27


In [32]:
# ========================================
# GROUPED FEATURES
# ========================================
print("\n=== Creating Grouped Features ===")

X_train_grouped = X_train_encoded.copy()
X_test_grouped = X_test_encoded.copy()

# Group cap features
cap_features = ['cap-shape', 'cap-surface', 'cap-color']
X_train_grouped['cap_combined'] = X_train_encoded[cap_features].sum(axis=1)
X_test_grouped['cap_combined'] = X_test_encoded[cap_features].sum(axis=1)

# Group gill features
gill_features = ['gill-attachment', 'gill-spacing', 'gill-size', 'gill-color']
X_train_grouped['gill_combined'] = X_train_encoded[gill_features].sum(axis=1)
X_test_grouped['gill_combined'] = X_test_encoded[gill_features].sum(axis=1)

# Group stalk features
stalk_features = [col for col in X_train_encoded.columns if 'stalk' in col]
X_train_grouped['stalk_combined'] = X_train_encoded[stalk_features].sum(axis=1)
X_test_grouped['stalk_combined'] = X_test_encoded[stalk_features].sum(axis=1)

# Group veil features
X_train_grouped['veil_combined'] = X_train_encoded['veil-type'] + X_train_encoded['veil-color']
X_test_grouped['veil_combined'] = X_test_encoded['veil-type'] + X_test_encoded['veil-color']

# Group ring features
X_train_grouped['ring_combined'] = X_train_encoded['ring-number'] + X_train_encoded['ring-type']
X_test_grouped['ring_combined'] = X_test_encoded['ring-number'] + X_test_encoded['ring-type']

print(f"Added {X_train_grouped.shape[1] - X_train_encoded.shape[1]} grouped features")
print(f"Total features: {X_train_grouped.shape[1]}")


=== Creating Grouped Features ===
Added 5 grouped features
Total features: 27


#### FEATURE EXTRACTION

In [33]:
# ========================================
# PCA FEATURE EXTRACTION
# ========================================
print("\n=== PCA Feature Extraction ===")

# Keep 95% of variance
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_encoded)
X_test_pca = pca.transform(X_test_encoded)

print(f"Original features: {X_train_encoded.shape[1]}")
print(f"PCA components: {X_train_pca.shape[1]}")
print(f"Variance explained: {pca.explained_variance_ratio_.sum():.4f}")

# Convert to DataFrame
pca_columns = [f'PC{i+1}' for i in range(X_train_pca.shape[1])]
X_train_pca = pd.DataFrame(X_train_pca, columns=pca_columns)
X_test_pca = pd.DataFrame(X_test_pca, columns=pca_columns)

print("\n‚úì All feature sets created!")


=== PCA Feature Extraction ===
Original features: 22
PCA components: 10
Variance explained: 0.9512

‚úì All feature sets created!


In [34]:
# ========================================
#  COMPARE ALL METHODS
# ========================================
print("\n" + "="*60)
print("COMPARING FEATURE ENGINEERING METHODS")
print("="*60)

results = []

def clean_features(X_train, X_test, y_train, var_threshold=0.01, k=15):
    vt = VarianceThreshold(threshold=var_threshold)
    X_train_var = vt.fit_transform(X_train)
    X_test_var  = vt.transform(X_test)
    kept_var = X_train.columns[vt.get_support()]
    X_train_var = pd.DataFrame(X_train_var, columns=kept_var)
    X_test_var  = pd.DataFrame(X_test_var,  columns=kept_var)

    selector = SelectKBest(score_func=mutual_info_classif, k=min(k, X_train_var.shape[1]))
    X_train_sel = selector.fit_transform(X_train_var, y_train)
    X_test_sel  = selector.transform(X_test_var)
    kept = kept_var[selector.get_support()]

    # Return DataFrames with reduced features
    return (
        pd.DataFrame(X_train_sel, columns=kept),
        pd.DataFrame(X_test_sel,  columns=kept)
    )


def evaluate_features(X_tr, X_te, y_tr, y_te, method_name):
    X_tr_clean, X_te_clean = clean_features(X_tr, X_te, y_tr, k=15)

    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    cv_scores = cross_val_score(model, X_tr_clean, y_tr, cv=5, scoring='accuracy')
    model.fit(X_tr_clean, y_tr)
    y_pred = model.predict(X_te_clean)

    test_acc = accuracy_score(y_te, y_pred)
    test_f1 = f1_score(y_te, y_pred, average='weighted')

    results.append({
        'Method': method_name,
        'Features Before': X_tr.shape[1],
        'Features After': X_tr_clean.shape[1],
        'CV Accuracy': cv_scores.mean(),
        'Test Accuracy': test_acc,
        'Test F1': test_f1
    })

    print(f"\n{method_name}:")
    print(f"  Features reduced from {X_tr.shape[1]} ‚Üí {X_tr_clean.shape[1]}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  Test F1: {test_f1:.4f}")
    return X_tr_clean, X_te_clean

# Evaluate all methods
evaluate_features(X_train_encoded, X_test_encoded,
                  y_train_encoded, y_test_encoded, "Original (All Features)")

evaluate_features(X_train_mi, X_test_mi,
                  y_train_encoded, y_test_encoded, "Mutual Information (Top 15)")

evaluate_features(X_train_important, X_test_important,
                  y_train_encoded, y_test_encoded, "Feature Importance (Top 15)")

evaluate_features(X_train_interactions, X_test_interactions,
                  y_train_encoded, y_test_encoded, "With Interaction Features")

evaluate_features(X_train_grouped, X_test_grouped,
                  y_train_encoded, y_test_encoded, "With Grouped Features")

evaluate_features(X_train_pca, X_test_pca,
                  y_train_encoded, y_test_encoded, "PCA Extraction")


COMPARING FEATURE ENGINEERING METHODS

Original (All Features):
  Features reduced from 22 ‚Üí 15
  CV Accuracy: 1.0000
  Test Accuracy: 1.0000
  Test F1: 1.0000

Mutual Information (Top 15):
  Features reduced from 13 ‚Üí 13
  CV Accuracy: 1.0000
  Test Accuracy: 1.0000
  Test F1: 1.0000

Feature Importance (Top 15):
  Features reduced from 15 ‚Üí 15
  CV Accuracy: 1.0000
  Test Accuracy: 1.0000
  Test F1: 1.0000

With Interaction Features:
  Features reduced from 27 ‚Üí 15
  CV Accuracy: 1.0000
  Test Accuracy: 1.0000
  Test F1: 1.0000

With Grouped Features:
  Features reduced from 27 ‚Üí 15
  CV Accuracy: 1.0000
  Test Accuracy: 1.0000
  Test F1: 1.0000

PCA Extraction:
  Features reduced from 10 ‚Üí 10
  CV Accuracy: 0.9974
  Test Accuracy: 0.9963
  Test F1: 0.9963


(           PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
 0     0.559194  6.874120 -2.051207  1.618437 -1.435714 -1.293377 -0.499180   
 1    -5.488872  1.379923 -0.083739  3.804188  1.958428  2.929394  0.839945   
 2     2.355083 -5.419380  2.216657  2.636469  0.647546 -3.093522 -0.318050   
 3    -6.783112 -0.766537  0.399875  0.332616 -2.642324 -1.394841 -0.557959   
 4    -6.117552  1.944988  0.116055  1.209141  2.826867 -0.861422  1.353859   
 ...        ...       ...       ...       ...       ...       ...       ...   
 6494 -6.629295 -0.727686 -0.311090 -0.049247 -2.888622  0.544776  1.619283   
 6495  1.343031 -3.765500  0.462709  3.530793  0.091051  2.420432  1.959492   
 6496  2.169395  0.991431  3.990295  0.764603 -1.960419  2.151598 -3.509326   
 6497  4.876425 -3.392834 -2.307205 -0.069146 -1.305018  0.692595  1.396193   
 6498  2.386398 -2.091909  0.514656 -0.904071 -1.105885 -1.492024 -0.879387   
 
            PC8       PC9      PC10  
 0    -1.029

In [35]:
# ========================================
print("\n" + "="*60)
print("SUMMARY")
print("="*60)

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Identify best method (highest Test Accuracy)
best_method = results_df.loc[results_df['Test Accuracy'].idxmax()]

print(f"\nüèÜ Best Method: {best_method['Method']}")
print(f"   Test Accuracy: {best_method['Test Accuracy']:.4f}")
print(f"   CV Accuracy: {best_method['CV Accuracy']:.4f}")
print(f"   Features Before: {int(best_method['Features Before'])}")
print(f"   Features After: {int(best_method['Features After'])}")


SUMMARY
                     Method  Features Before  Features After  CV Accuracy  Test Accuracy  Test F1
    Original (All Features)               22              15     1.000000       1.000000 1.000000
Mutual Information (Top 15)               13              13     1.000000       1.000000 1.000000
Feature Importance (Top 15)               15              15     1.000000       1.000000 1.000000
  With Interaction Features               27              15     1.000000       1.000000 1.000000
      With Grouped Features               27              15     1.000000       1.000000 1.000000
             PCA Extraction               10              10     0.997384       0.996308 0.996308

üèÜ Best Method: Original (All Features)
   Test Accuracy: 1.0000
   CV Accuracy: 1.0000
   Features Before: 22
   Features After: 15


In [36]:
# ========================================
#  SAVE BEST FEATURES FOR FINAL MODEL
# ========================================
print("\n=== Saving Best Feature Set ===")

best_method_name = best_method['Method']
print(f"Detected best method: {best_method_name}")

if best_method_name == "Original (All Features)":
    print("All methods performed equally well ‚Äì selecting Mutual Information (Top 15) for interpretability.")
    best_method_name = "Mutual Information (Top 15)"

if 'Mutual Information' in best_method_name:
    X_train_final, X_test_final = clean_features(X_train_encoded, X_test_encoded, y_train_encoded, k=15)
    final_features = X_train_final.columns.tolist()

elif 'Feature Importance' in best_method_name:
    X_train_final, X_test_final = clean_features(X_train_encoded, X_test_encoded, y_train_encoded, k=15)
    final_features = X_train_final.columns.tolist()

elif 'Interaction' in best_method_name:
    X_train_final = X_train_interactions
    X_test_final = X_test_interactions
    final_features = list(X_train_interactions.columns)

elif 'Grouped' in best_method_name:
    X_train_final = X_train_grouped
    X_test_final = X_test_grouped
    final_features = list(X_train_grouped.columns)

elif 'PCA' in best_method_name:
    X_train_final = X_train_pca
    X_test_final = X_test_pca
    final_features = pca_columns

else:
    X_train_final = X_train_encoded
    X_test_final = X_test_encoded
    final_features = list(X_train_encoded.columns)

print(f"Selected method: {best_method_name}")
print(f"Features kept: {len(final_features)}")
print(f"Final training shape: {X_train_final.shape}")
print(f"Final test shape: {X_test_final.shape}")


=== Saving Best Feature Set ===
Detected best method: Original (All Features)
All methods performed equally well ‚Äì selecting Mutual Information (Top 15) for interpretability.
Selected method: Mutual Information (Top 15)
Features kept: 15
Final training shape: (6499, 15)
Final test shape: (1625, 15)


###  Production Preprocessing and Pipeline

All previous feature engineering steps were based on LabelEncoder-encoded data for exploration.
To ensure the same preprocessing can be automatically applied during deployment (API inference),
we now recreate the equivalent preprocessing logic using an OrdinalEncoder inside a scikit-learn
Pipeline (encoding ‚Üí variance threshold ‚Üí SelectKBest).


In [37]:
print("Final training shape:", X_train_final.shape)
print("Final test shape:", X_test_final.shape)

Final training shape: (6499, 15)
Final test shape: (1625, 15)


In [38]:
categorical_features = X_train.columns.tolist()

encoder = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer([
    ('cat', encoder, categorical_features)
])

prep_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('variance', VarianceThreshold(threshold=0.01)),
    ('select_kbest', SelectKBest(score_func=mutual_info_classif, k=15))
])

# Fit pipeline with target y for feature selection (mutual_info_classif needs y)
prep_pipeline.fit(X_train, y_train_encoded)

X_train_final = prep_pipeline.transform(X_train)
X_test_final  = prep_pipeline.transform(X_test)

print(f"‚úì Preprocessing pipeline applied")
print(f"Final training shape: {X_train_final.shape}")
print(f"Final test shape: {X_test_final.shape}")

vt = prep_pipeline.named_steps['variance']
skb = prep_pipeline.named_steps['select_kbest']

mask_var = vt.get_support()                        # boolean mask over original columns
names_after_var = [n for n, keep in zip(categorical_features, mask_var) if keep]

mask_kbest = skb.get_support()                     # boolean mask over post-variance features
selected_feature_names = [names_after_var[i] for i, keep in enumerate(mask_kbest) if keep]

print("Selected feature names (k=15):", selected_feature_names)


prep_package = {
    "pipeline": prep_pipeline,
    "original_feature_names": categorical_features,        # hyphenated column names
    "selected_feature_names": selected_feature_names,      # after variance + kbest
    "target_encoder": y_encoder,                           # keep only y encoder
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

# Save pipeline
joblib.dump(prep_package, "artifacts/mushroom_preprocessing_pipeline.pkl")
joblib.dump(X_train_final, "artifacts/X_train_final.pkl")
joblib.dump(X_test_final,  "artifacts/X_test_final.pkl")
joblib.dump(y_train_encoded, "artifacts/y_train_encoded.pkl")
joblib.dump(y_test_encoded,  "artifacts/y_test_encoded.pkl")
print("‚úì Saved preprocessing artifacts to artifacts/")

‚úì Preprocessing pipeline applied
Final training shape: (6499, 15)
Final test shape: (1625, 15)
Selected feature names (k=15): ['cap-shape', 'bruises', 'odor', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'ring-type', 'spore-print-color', 'population', 'habitat']
‚úì Saved preprocessing artifacts to artifacts/
