In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:

df_original = pd.read_csv("mushroom-dataset/agaricus-lepiota.data", header= None)
column_names = [
    "class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape",
    "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color",
    "ring-number", "ring-type", "spore-print-color", "population", "habitat"
]
df_mushrooms = df_original.copy()

df_mushrooms.columns = column_names
df_mushrooms.head()
df_mushrooms.replace("?", np.nan, inplace = True)

In [4]:
# Drop rows with missing values (if few)
missing_cols = df_mushrooms.columns[df_mushrooms.isnull().any()].tolist()
if df_mushrooms.isnull().sum().max() < len(df_mushrooms) * 0.05:  # Less than 5%
    df_mushrooms = df_mushrooms.dropna()

else:
    for col in missing_cols:
        mode_value = df_mushrooms[col].mode()[0]
        df_mushrooms[col].fillna(mode_value, inplace=True)
        print(f"Filled {col} missing values with mode: {mode_value}")

Filled stalk-root missing values with mode: b


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mushrooms[col].fillna(mode_value, inplace=True)


In [6]:
print(df_mushrooms.isna().sum()) # no missing values now

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


In [5]:
# 5. SPLIT DATA FIRST (before encoding)
X = df_mushrooms.drop('class', axis=1)  # Assuming 'class' is target
y = df_mushrooms['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Keep class balance in both sets
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Training set: (6499, 22)
Test set: (1625, 22)


In [6]:
# 6. ENCODE CATEGORICAL FEATURES (fit on train, transform on test)
label_encoders = {}
X_train_encoded = pd.DataFrame()
X_test_encoded = pd.DataFrame()

for column in X_train.columns:
    le = LabelEncoder()
    X_train_encoded[column] = le.fit_transform(X_train[column])

    X_test_encoded[column] = X_test[column].map(
        lambda x: le.transform([x])[0] if x in le.classes_ else -1
    )

    label_encoders[column] = le

    print(f"{column}: {len(le.classes_)} classes")

# we wouldn't use Label encoding since our data doesn't assume order, but our model suggestions were Decision Tree Classifier, Random Forest Classifier, Categorical Naive Bayes - they don't take distance between the values as something meaningful so we can use that.
y_encoder = LabelEncoder()
y_train_encoded = y_encoder.fit_transform(y_train)
y_test_encoded = y_encoder.transform(y_test)

print(f"Target classes: {y_encoder.classes_}")
print(f"y_train_encoded shape: {y_train_encoded.shape}")
print(f"y_test_encoded shape: {y_test_encoded.shape}")


cap-shape: 6 classes
cap-surface: 4 classes
cap-color: 10 classes
bruises: 2 classes
odor: 9 classes
gill-attachment: 2 classes
gill-spacing: 2 classes
gill-size: 2 classes
gill-color: 12 classes
stalk-shape: 2 classes
stalk-root: 4 classes
stalk-surface-above-ring: 4 classes
stalk-surface-below-ring: 4 classes
stalk-color-above-ring: 9 classes
stalk-color-below-ring: 9 classes
veil-type: 1 classes
veil-color: 4 classes
ring-number: 3 classes
ring-type: 5 classes
spore-print-color: 9 classes
population: 6 classes
habitat: 7 classes
Target classes: ['e' 'p']
y_train_encoded shape: (6499,)
y_test_encoded shape: (1625,)


In [7]:
# 9. Final check
print("Training features shape:", X_train_encoded.shape)
print("Test features shape:", X_test_encoded.shape)
print("\nSample of encoded training data:")
print(X_train_encoded.head())
print("\nNo missing values in encoded data:")
print(X_train_encoded.isnull().sum().sum() == 0)

Training features shape: (6499, 22)
Test features shape: (1625, 22)

Sample of encoded training data:
   cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0          2            3          9        0     2                1   
1          5            2          5        1     5                1   
2          0            2          3        0     5                1   
3          2            2          4        0     7                1   
4          3            3          4        0     2                1   

   gill-spacing  gill-size  gill-color  stalk-shape  ...  \
0             0          0           2            0  ...   
1             0          0           1            0  ...   
2             1          0          10            0  ...   
3             0          1           0            1  ...   
4             0          1           0            1  ...   

   stalk-surface-below-ring  stalk-color-above-ring  stalk-color-below-ring  \
0                        

## Feature Engineering

#### 1. Feature Transformation


#### 2. Feature Selection (Mutual Information (Information Gain))

In [8]:
# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train_encoded, random_state=42)

# Create DataFrame
mi_scores_df = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print(mi_scores_df)

# Select top features (e.g., MI score > 0.1 or top 15)
top_mi_features = mi_scores_df[mi_scores_df['mi_score'] > 0.1]['feature'].tolist()

print(f"\nSelected {len(top_mi_features)} features with high mutual information")

X_train_mi = X_train_encoded[top_mi_features]
X_test_mi = X_test_encoded[top_mi_features]

                     feature  mi_score
4                       odor  0.628823
19         spore-print-color  0.340945
8                 gill-color  0.293498
18                 ring-type  0.211812
12  stalk-surface-below-ring  0.198337
11  stalk-surface-above-ring  0.197850
13    stalk-color-above-ring  0.178775
14    stalk-color-below-ring  0.164071
7                  gill-size  0.152748
20                population  0.144867
3                    bruises  0.133102
21                   habitat  0.111289
10                stalk-root  0.079536
6               gill-spacing  0.072009
0                  cap-shape  0.038777
17               ring-number  0.027028
2                  cap-color  0.024342
1                cap-surface  0.013465
16                veil-color  0.012598
9                stalk-shape  0.009762
5            gill-attachment  0.008248
15                 veil-type  0.001011

Selected 12 features with high mutual information


In [9]:
# Train RF to get importances
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_temp.fit(X_train_encoded, y_train_encoded)

# Get importances
feature_importance = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'importance': rf_temp.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance)

# Select top 12 features
top_important_features = feature_importance.head(12)['feature'].tolist()
print(f"\nSelected {len(top_important_features)} most important features")

X_train_important = X_train_encoded[top_important_features]
X_test_important = X_test_encoded[top_important_features]

                     feature  importance
4                       odor    0.179867
8                 gill-color    0.122870
7                  gill-size    0.113607
19         spore-print-color    0.102116
18                 ring-type    0.065316
20                population    0.061165
10                stalk-root    0.054003
11  stalk-surface-above-ring    0.050782
3                    bruises    0.044802
21                   habitat    0.033251
12  stalk-surface-below-ring    0.030241
6               gill-spacing    0.026149
13    stalk-color-above-ring    0.023031
17               ring-number    0.018300
9                stalk-shape    0.017362
14    stalk-color-below-ring    0.016437
2                  cap-color    0.015987
1                cap-surface    0.012648
0                  cap-shape    0.004196
5            gill-attachment    0.003947
16                veil-color    0.003923
15                 veil-type    0.000000

Selected 12 most important features


### 3. Feature Creation (Interaction Features)

In [10]:
print("\nClass balance in training:")
print(y_train.value_counts(normalize=True))


Class balance in training:
class
e    0.517926
p    0.482074
Name: proportion, dtype: float64


In [11]:
# Train model directly (NO outlier removal)
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

print("Starting Feature Engineering...")
print(f"Original data: {X_train_encoded.shape[1]} features")


Starting Feature Engineering...
Original data: 22 features


In [12]:
# Interaction features
X_train_interactions = X_train_encoded.copy()
X_test_interactions = X_test_encoded.copy()

# Create key interactions
X_train_interactions['odor_x_gill-color'] = X_train_encoded['odor'] * X_train_encoded['gill-color']
X_test_interactions['odor_x_gill-color'] = X_test_encoded['odor'] * X_test_encoded['gill-color']

X_train_interactions['spore_x_gill'] = X_train_encoded['spore-print-color'] * X_train_encoded['gill-color']
X_test_interactions['spore_x_gill'] = X_test_encoded['spore-print-color'] * X_test_encoded['gill-color']

X_train_interactions['cap-shape_x_cap-color'] = X_train_encoded['cap-shape'] * X_train_encoded['cap-color']
X_test_interactions['cap-shape_x_cap-color'] = X_test_encoded['cap-shape'] * X_test_encoded['cap-color']

X_train_interactions['odor_x_stalk-root'] = X_train_encoded['odor'] * X_train_encoded['stalk-root']
X_test_interactions['odor_x_stalk-root'] = X_test_encoded['odor'] * X_test_encoded['stalk-root']

X_train_interactions['ring-number_x_ring-type'] = X_train_encoded['ring-number'] * X_train_encoded['ring-type']
X_test_interactions['ring-number_x_ring-type'] = X_test_encoded['ring-number'] * X_test_encoded['ring-type']

print(f"Added {X_train_interactions.shape[1] - X_train_encoded.shape[1]} interaction features")
print(f"Total features: {X_train_interactions.shape[1]}")

Added 5 interaction features
Total features: 27


In [13]:
X_train_grouped = X_train_encoded.copy()
X_test_grouped = X_test_encoded.copy()

# Group cap features
cap_features = ['cap-shape', 'cap-surface', 'cap-color']
X_train_grouped['cap_combined'] = X_train_encoded[cap_features].sum(axis=1)
X_test_grouped['cap_combined'] = X_test_encoded[cap_features].sum(axis=1)

# Group gill features
gill_features = ['gill-attachment', 'gill-spacing', 'gill-size', 'gill-color']
X_train_grouped['gill_combined'] = X_train_encoded[gill_features].sum(axis=1)
X_test_grouped['gill_combined'] = X_test_encoded[gill_features].sum(axis=1)

# Group stalk features
stalk_features = [col for col in X_train_encoded.columns if 'stalk' in col]
X_train_grouped['stalk_combined'] = X_train_encoded[stalk_features].sum(axis=1)
X_test_grouped['stalk_combined'] = X_test_encoded[stalk_features].sum(axis=1)

# Group veil features
X_train_grouped['veil_combined'] = X_train_encoded['veil-type'] + X_train_encoded['veil-color']
X_test_grouped['veil_combined'] = X_test_encoded['veil-type'] + X_test_encoded['veil-color']

# Group ring features
X_train_grouped['ring_combined'] = X_train_encoded['ring-number'] + X_train_encoded['ring-type']
X_test_grouped['ring_combined'] = X_test_encoded['ring-number'] + X_test_encoded['ring-type']

print(f"Added {X_train_grouped.shape[1] - X_train_encoded.shape[1]} grouped features")
print(f"Total features: {X_train_grouped.shape[1]}")

Added 5 grouped features
Total features: 27


#### FEATURE EXTRACTION

In [14]:
# 95% of variance
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_encoded)
X_test_pca = pca.transform(X_test_encoded)

print(f"Original features: {X_train_encoded.shape[1]}")
print(f"PCA components: {X_train_pca.shape[1]}")
print(f"Variance explained: {pca.explained_variance_ratio_.sum():.4f}")

pca_columns = [f'PC{i+1}' for i in range(X_train_pca.shape[1])]
X_train_pca = pd.DataFrame(X_train_pca, columns=pca_columns)
X_test_pca = pd.DataFrame(X_test_pca, columns=pca_columns)

Original features: 22
PCA components: 10
Variance explained: 0.9529


In [15]:
# Comparing all methods
results = []

def evaluate_features(X_tr, X_te, y_tr, y_te, method_name):
    """Evaluate a feature set"""
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

    # Cross-validation
    cv_scores = cross_val_score(model, X_tr, y_tr, cv=5, scoring='accuracy')

    # Train and test
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # Metrics
    test_acc = accuracy_score(y_te, y_pred)
    test_f1 = f1_score(y_te, y_pred, average='weighted')

    results.append({
        'Method': method_name,
        'Features': X_tr.shape[1],
        'CV Accuracy': cv_scores.mean(),
        'CV Std': cv_scores.std(),
        'Test Accuracy': test_acc,
        'Test F1': test_f1
    })

    print(f"\n{method_name}:")
    print(f"  Features: {X_tr.shape[1]}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  Test F1: {test_f1:.4f}")

# Evaluate all methods
evaluate_features(X_train_encoded, X_test_encoded,
                  y_train_encoded, y_test_encoded, "Original (All Features)")

evaluate_features(X_train_mi, X_test_mi,
                  y_train_encoded, y_test_encoded, "Mutual Information (Top 15)")

evaluate_features(X_train_important, X_test_important,
                  y_train_encoded, y_test_encoded, "Feature Importance (Top 15)")

evaluate_features(X_train_interactions, X_test_interactions,
                  y_train_encoded, y_test_encoded, "With Interaction Features")

evaluate_features(X_train_grouped, X_test_grouped,
                  y_train_encoded, y_test_encoded, "With Grouped Features")

evaluate_features(X_train_pca, X_test_pca,
                  y_train_encoded, y_test_encoded, "PCA Extraction")


Original (All Features):
  Features: 22
  CV Accuracy: 1.0000 (+/- 0.0000)
  Test Accuracy: 1.0000
  Test F1: 1.0000

Mutual Information (Top 15):
  Features: 12
  CV Accuracy: 1.0000 (+/- 0.0000)
  Test Accuracy: 1.0000
  Test F1: 1.0000

Feature Importance (Top 15):
  Features: 12
  CV Accuracy: 1.0000 (+/- 0.0000)
  Test Accuracy: 1.0000
  Test F1: 1.0000

With Interaction Features:
  Features: 27
  CV Accuracy: 1.0000 (+/- 0.0000)
  Test Accuracy: 1.0000
  Test F1: 1.0000

With Grouped Features:
  Features: 27
  CV Accuracy: 1.0000 (+/- 0.0000)
  Test Accuracy: 1.0000
  Test F1: 1.0000

PCA Extraction:
  Features: 10
  CV Accuracy: 0.9969 (+/- 0.0015)
  Test Accuracy: 0.9963
  Test F1: 0.9963


In [16]:
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Find best method
best_method = results_df.loc[results_df['Test Accuracy'].idxmax()]
print(f"\n Best Method: {best_method['Method']}")
print(f"   Test Accuracy: {best_method['Test Accuracy']:.4f}")
print(f"   Features: {int(best_method['Features'])}")

                     Method  Features  CV Accuracy   CV Std  Test Accuracy  Test F1
    Original (All Features)        22     1.000000 0.000000       1.000000 1.000000
Mutual Information (Top 15)        12     1.000000 0.000000       1.000000 1.000000
Feature Importance (Top 15)        12     1.000000 0.000000       1.000000 1.000000
  With Interaction Features        27     1.000000 0.000000       1.000000 1.000000
      With Grouped Features        27     1.000000 0.000000       1.000000 1.000000
             PCA Extraction        10     0.996923 0.001538       0.996308 0.996308

 Best Method: Original (All Features)
   Test Accuracy: 1.0000
   Features: 22


In [17]:
# Identify which method performed best
best_method_name = best_method['Method']

if 'Mutual Information' in best_method_name:
    X_train_final = X_train_mi
    X_test_final = X_test_mi
    final_features = top_mi_features
elif 'Feature Importance' in best_method_name:
    X_train_final = X_train_important
    X_test_final = X_test_important
    final_features = top_important_features
elif 'Interaction' in best_method_name:
    X_train_final = X_train_interactions
    X_test_final = X_test_interactions
    final_features = X_train_interactions.columns.tolist()
elif 'Grouped' in best_method_name:
    X_train_final = X_train_grouped
    X_test_final = X_test_grouped
    final_features = X_train_grouped.columns.tolist()
elif 'PCA' in best_method_name:
    X_train_final = X_train_pca
    X_test_final = X_test_pca
    final_features = pca_columns
else:
    X_train_final = X_train_encoded
    X_test_final = X_test_encoded
    final_features = X_train_encoded.columns.tolist()

print(f"Selected: {best_method_name}")
print(f"Features: {len(final_features)}")
print(f"\nFeature names:")
print(final_features)


Selected: Original (All Features)
Features: 22

Feature names:
['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


### Implement a reproducible data pipeline

In [19]:
import joblib

pkg = joblib.load('mushroom_pipeline.pkl')
pipeline = pkg['pipeline']
target_encoder = pkg['target_encoder']
expected_features = pkg['feature_names']

# Save
joblib.dump(X_train_final[expected_features], 'X_train_final.pkl')
joblib.dump(X_test_encoded[expected_features], 'X_test_final.pkl')
joblib.dump(y_train_encoded, 'y_train_encoded.pkl')
joblib.dump(y_test_encoded, 'y_test_encoded.pkl')

print(f"✓ Pipeline loaded successfully")
print(f"  Trained: {pkg['timestamp']}")
print(f"  Expected features: {len(expected_features)}")
print(f"\nExpected features: {expected_features}")

✓ Pipeline loaded successfully
  Trained: 2025-10-04 17:01:52
  Expected features: 15

Expected features: ['odor', 'spore-print-color', 'gill-color', 'ring-type', 'stalk-surface-below-ring', 'stalk-surface-above-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'gill-size', 'population', 'bruises', 'habitat', 'stalk-root', 'gill-spacing', 'cap-shape']


In [20]:
print(f"✓ Test data available: {X_test_encoded.shape}")
print(f"✓ Pipeline expects: {len(expected_features)} features")

# Select ONLY the features the pipeline was trained on
X_new = X_test_encoded[expected_features].copy()

print(f"✓ Features selected: {X_new.shape}")
print(f"✓ Test labels: {y_test_encoded.shape}")

# Show sample
print(f"\nFirst 3 rows of selected features:")
print(X_new.head(3))
print(f"\nExpected features being used:")
print(expected_features)

✓ Test data available: (1625, 22)
✓ Pipeline expects: 15 features
✓ Features selected: (1625, 15)
✓ Test labels: (1625,)

First 3 rows of selected features:
      odor  spore-print-color  gill-color  ring-type  \
4632     2                  1           7          2   
3444     1                  3           9          4   
1209     5                  3           7          0   

      stalk-surface-below-ring  stalk-surface-above-ring  \
4632                         1                         1   
3444                         2                         2   
1209                         2                         2   

      stalk-color-above-ring  stalk-color-below-ring  gill-size  population  \
4632                       6                       4          0           5   
3444                       7                       7          1           3   
1209                       7                       7          0           3   

      bruises  habitat  stalk-root  gill-spacing  cap-shape 

In [21]:
def validate_input_data(X, expected_columns, pipeline_name="Pipeline"):
    """Validate input data before prediction"""
    import pandas as pd

    errors = []
    warnings = []

    # Check type
    if not isinstance(X, pd.DataFrame):
        errors.append("Input must be a pandas DataFrame")
        return errors, warnings

    # Check columns
    missing_cols = set(expected_columns) - set(X.columns)
    if missing_cols:
        errors.append(f"Missing columns: {missing_cols}")

    extra_cols = set(X.columns) - set(expected_columns)
    if extra_cols:
        warnings.append(f"Extra columns (will be ignored): {extra_cols}")

    # Check for empty data
    if len(X) == 0:
        errors.append("Input data is empty")

    # Check for all-null columns
    null_cols = X.columns[X.isnull().all()].tolist()
    if null_cols:
        warnings.append(f"Columns with all null values: {null_cols}")

    # Print results
    if errors:
        print(f" {pipeline_name} validation FAILED:")
        for err in errors:
            print(f"  - {err}")

    if warnings:
        print(f" {pipeline_name} validation warnings:")
        for warn in warnings:
            print(f"  - {warn}")

    if not errors and not warnings:
        print(f"✓ {pipeline_name} validation passed")

    return errors, warnings

In [22]:
print(f"✓ Using test set with {X_new.shape[1]} features (already selected)")

✓ Using test set with 15 features (already selected)
