<a href="https://colab.research.google.com/github/metalmancode/mushroom-classifier/blob/main/07_asymmetric_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

# --- STEP 1: LOAD THE LABELED TRAINING DATA ---
# The Drive file ID is the second-to-last path segment of the share link.
url_train = "https://drive.google.com/file/d/1Op1vQftBKN1lrPVGGLJU-UOlv_dScTup/view?usp=sharing"
path_train = f"https://drive.google.com/uc?export=download&id={url_train.split('/')[-2]}"
mushroom_data = pd.read_csv(path_train)
mushroom_data = mushroom_data.set_index('Id')  # keep 'Id' out of the feature matrix

# Target (y) is the 'poisonous' column; every remaining column is a feature (X).
y = mushroom_data['poisonous']
X = mushroom_data.drop('poisonous', axis=1)

# --- STEP 2: BUILD THE CATEGORICAL PIPELINE ---
# Stages run in order: fill gaps with each column's most frequent value,
# one-hot encode, then fit a seeded random forest.
imputer = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder(handle_unknown='ignore')  # categories unseen at fit time do not raise
forest = RandomForestClassifier(random_state=42)
full_pipeline = make_pipeline(imputer, encoder, forest)

# --- STEP 3: FIND THE "SAFE" THRESHOLD (ASYMMETRIC TUNING) ---
# Out-of-fold probabilities: every training row is scored by a model that was
# fitted on the other folds only.
cv_probs = cross_val_predict(full_pipeline, X, y, cv=5, method='predict_proba')
# Column 1 is the second class in sorted label order
# (assumed to be "poisonous", i.e. labels are 0/1 — TODO confirm).
pos_probs = cv_probs[:, 1]

# roc_curve returns thresholds in decreasing order, so TPR (recall) never
# shrinks as we walk along the arrays.
fpr, tpr, thresholds = roc_curve(y, pos_probs)
full_recall = tpr >= 1.0

# np.argmax on an all-False mask returns 0, which would silently pick
# thresholds[0] — a sentinel above every score — and label EVERY mushroom as
# edible. That is the dangerous direction, so fail loudly instead.
if not full_recall.any():
    raise ValueError(
        "No threshold reaches 100% recall; check that y contains poisonous samples."
    )

# The first True entry is the highest threshold that still catches every
# poisonous mushroom in the out-of-fold predictions.
safe_threshold = thresholds[np.argmax(full_recall)]

print(f"Safe Threshold found: {safe_threshold}")

# --- STEP 4: TRAIN ON FULL DATA ---
# cross_val_predict only fits clones, so the pipeline itself is still unfitted here.
full_pipeline.fit(X, y)

# --- STEP 5: LOAD AND ALIGN THE FORAGED DATA ---
# Same share-link -> direct-download conversion as for the training file.
url_foraged = "https://drive.google.com/file/d/1eWxV9FGj6D-YnMsv4mHMWRcGIKbjrXYL/view?usp=drive_link"
path_foraged = "https://drive.google.com/uc?export=download&id=" + url_foraged.split("/")[-2]
X_foraged = pd.read_csv(path_foraged).set_index('Id')

# Select the training columns in training order so the fitted pipeline sees
# the same feature layout. A column missing from the foraged file raises
# KeyError here rather than failing later inside predict_proba.
X_foraged = X_foraged[X.columns]

# --- STEP 6: PREDICT AND SAVE ---
# Column 1 of predict_proba is used as the "poisonous" score.
foraged_probs = full_pipeline.predict_proba(X_foraged)[:, 1]

# Apply the safe threshold: 1 if prob >= threshold, else 0
predictions = (foraged_probs >= safe_threshold).astype(int)

# Format for the competition app: one row per foraged mushroom Id.
submission = pd.DataFrame({
    'Id': X_foraged.index,
    'poisonous': predictions
})

# index=False keeps the pandas row index out of the file.
submission.to_csv('mush_submission.csv', index=False)
# The trailing emoji was stored as mojibake ("üçÑ"); restored to the intended mushroom.
print("File 'mush_submission.csv' is ready! 🍄")

Safe Threshold found: 0.01
File 'mush_submission.csv' is ready! 🍄


In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import recall_score, precision_recall_curve

# --- 1. DATA LOADING ---
def get_drive_url(url):
    """Convert a Google Drive share link into a direct-download URL.

    Args:
        url: A Drive share link, e.g.
            ``https://drive.google.com/file/d/<id>/view?usp=sharing``,
            ``https://drive.google.com/file/d/<id>`` or
            ``https://drive.google.com/open?id=<id>``.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<id>``.

    Raises:
        IndexError: If no ID pattern is found and the link has fewer than two
            ``/``-separated segments (same as the original positional rule).
    """
    import re  # local import keeps the helper self-contained within this cell

    # Find the ID by pattern instead of by position: taking the second-to-last
    # path segment returns "d" for links without a trailing "/view..." part and
    # the host name for "open?id=<id>" links.
    id_chars = r"([A-Za-z0-9_-]+)"
    match = re.search(r"/d/" + id_chars, url) or re.search(r"[?&]id=" + id_chars, url)

    # Unrecognised link shapes keep the original positional behaviour.
    file_id = match.group(1) if match else url.split("/")[-2]
    return "https://drive.google.com/uc?export=download&id=" + file_id

# Share links for the labelled training set and the unlabelled foraged set.
url_train = "https://drive.google.com/file/d/1Op1vQftBKN1lrPVGGLJU-UOlv_dScTup/view?usp=sharing"
url_foraged = "https://drive.google.com/file/d/1eWxV9FGj6D-YnMsv4mHMWRcGIKbjrXYL/view?usp=drive_link"

# Download both CSVs; 'Id' becomes the index so it is not used as a feature.
train_csv = pd.read_csv(get_drive_url(url_train))
df_train = train_csv.set_index('Id')
foraged_csv = pd.read_csv(get_drive_url(url_foraged))
X_foraged_raw = foraged_csv.set_index('Id')

# Target is the 'poisonous' column; all other columns are features.
y = df_train['poisonous']
X = df_train.drop('poisonous', axis=1)

# Keep only the training columns, in training order, for the foraged data.
X_foraged = X_foraged_raw.loc[:, X.columns]

# --- 2. PIPELINE & HYPERPARAMETER TUNING ---
# Preprocessing and model live in one pipeline so each CV fold refits them together.
pipeline_steps = [
    # Fills missing cells (NaN by default) with the literal category 'missing';
    # a literal '?' string is not NaN and passes through as its own category.
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
    RandomForestClassifier(random_state=42, n_jobs=-1),
]
pipe = make_pipeline(*pipeline_steps)

# Candidate forest settings; keys use make_pipeline's auto-generated step name.
param_grid = dict(
    randomforestclassifier__n_estimators=[200, 300],
    randomforestclassifier__max_depth=[10, 20, None],
    randomforestclassifier__min_samples_leaf=[1, 2],
)

print("Searching for best model parameters...")
# 5-fold search ranked by F1; fit() returns the fitted search object itself.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1).fit(X, y)
best_model = grid.best_estimator_
print("Best Params: {}".format(grid.best_params_))

# --- 3. OPTIMIZING THE ASYMMETRIC THRESHOLD ---
# Goal: the highest cutoff that still flags every poisonous training sample,
# so as few edible mushrooms as possible are discarded.
# Scores come from shuffled, stratified 10-fold out-of-fold predictions.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof_proba = cross_val_predict(best_model, X, y, cv=cv, method='predict_proba')
y_probs = oof_proba[:, 1]

# precision_recall_curve returns one more precision/recall entry than
# thresholds, hence the [:-1] when pairing recall values with thresholds.
precisions, recalls, thresholds = precision_recall_curve(y, y_probs)
has_full_recall = recalls[:-1] == 1.0
safe_thresholds = thresholds[has_full_recall]

# Highest full-recall threshold, or the ultra-conservative 0.01 fallback when none exists.
optimal_threshold = safe_thresholds.max() if safe_thresholds.size > 0 else 0.01

print("Optimal Safety Threshold: {:.4f}".format(optimal_threshold))

# --- 4. FINAL PREDICTION & DEPLOYMENT ---
# Fit the selected pipeline on every labelled row before scoring new data.
best_model.fit(X, y)

# Column 1 of predict_proba is used as the 'poisonous' score.
foraged_proba = best_model.predict_proba(X_foraged)
final_probs = foraged_proba[:, 1]

# Flag as poisonous (1) when the score reaches the safety threshold, else edible (0).
final_preds = np.greater_equal(final_probs, optimal_threshold).astype(int)

# One row per foraged mushroom: its Id and the 0/1 verdict.
submission_columns = {'Id': X_foraged.index, 'poisonous': final_preds}
submission = pd.DataFrame(submission_columns)

# index=False keeps the pandas row index out of the CSV.
submission.to_csv('optimized_mush_submission.csv', index=False)
print("Optimization Complete! 'optimized_mush_submission.csv' is ready.")

Searching for best model parameters...
Best Params: {'randomforestclassifier__max_depth': 20, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 200}
Optimal Safety Threshold: 0.0097
Optimization Complete! 'optimized_mush_submission.csv' is ready.


In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV

# --- 1. DATA ACQUISITION ---
# Build the direct-download URL for a Google Drive file from its file ID.
def get_url(drive_id):
    """Return the Drive direct-download link for ``drive_id``."""
    download_template = "https://drive.google.com/uc?export=download&id={}"
    return download_template.format(drive_id)

# Drive file IDs: labelled training set and unlabelled "foraged" (test) set.
train_id = "1Op1vQftBKN1lrPVGGLJU-UOlv_dScTup"
foraged_id = "1eWxV9FGj6D-YnMsv4mHMWRcGIKbjrXYL"

# Download each CSV, then move 'Id' into the index so it is not used as a feature.
train_csv = pd.read_csv(get_url(train_id))
df_train = train_csv.set_index('Id')
foraged_csv = pd.read_csv(get_url(foraged_id))
X_foraged_raw = foraged_csv.set_index('Id')

# Target (y) is the 'poisonous' column; the remaining columns are the features (X).
y = df_train['poisonous']
X = df_train.drop('poisonous', axis=1)

# Restrict the test set to the training columns, in the training order.
X_foraged = X_foraged_raw.loc[:, X.columns]

# --- 2. THE EXPERT PIPELINE ---
# Preprocessing and model are bundled so the same fitted transformations are
# applied to training and test data, and each CV fold refits them on its own split.

# Forest settings, gathered in one place:
forest_settings = dict(
    n_estimators=500,     # 500 trees
    max_depth=15,         # cap tree depth to curb overfitting
    min_samples_leaf=2,   # at least 2 samples per leaf
    random_state=42,      # fixed seed for reproducibility
    n_jobs=-1,            # use all available CPU cores
)

base_pipe = make_pipeline(
    # Fill missing cells with the explicit category 'unknown' instead of guessing a value,
    # so the absence of a value stays visible to the model.
    SimpleImputer(strategy='constant', fill_value='unknown'),
    # One binary column per category; categories first seen at predict time do not raise.
    OneHotEncoder(handle_unknown='ignore'),
    RandomForestClassifier(**forest_settings),
)

# --- 3. PROBABILITY CALIBRATION ---
# Wrap the pipeline in sigmoid calibration with 5 internal folds so its scores
# behave more like probabilities before a cutoff is tuned on them.
print("Calibrating model for peak reliability...")
calibrated_model = CalibratedClassifierCV(base_pipe, method='sigmoid', cv=5).fit(X, y)

# --- 4. THRESHOLD OPTIMIZATION (THE "SURVIVAL" LOGIC) ---
# Out-of-fold scores: each training row is scored by a calibrated model fitted
# on the other folds, approximating behaviour on unseen data.
oof_proba = cross_val_predict(calibrated_model, X, y, cv=5, method='predict_proba')
y_probs = oof_proba[:, 1]

# One precision/recall pair per candidate threshold (plus a final extra entry,
# which is why recalls is trimmed with [:-1] below).
precisions, recalls, thresholds = precision_recall_curve(y, y_probs)

# A missed poisonous mushroom is the unacceptable error, so only thresholds
# with recall exactly 1.0 qualify; among them the highest wastes the fewest
# edible mushrooms. If none qualifies, fall back to the lowest observed score.
safe_mask = recalls[:-1] == 1.0
optimal_threshold = thresholds[safe_mask].max() if safe_mask.any() else y_probs.min()

print("Targeting 100% Recall. Optimal Safety Threshold: {:.4f}".format(optimal_threshold))

# --- 5. GENERATE FINAL COMPETITION SUBMISSION ---
# Column 1 of predict_proba is used as the 'poisonous' score for each foraged mushroom.
foraged_proba = calibrated_model.predict_proba(X_foraged)
final_probs = foraged_proba[:, 1]

# Use the tuned safety threshold rather than 0.5:
# a score at or above it is flagged poisonous (1), anything below is edible (0).
final_predictions = np.greater_equal(final_probs, optimal_threshold).astype(int)

# Competition format: one row per mushroom Id with its 0/1 verdict.
submission_columns = {'Id': X_foraged.index, 'poisonous': final_predictions}
submission = pd.DataFrame(submission_columns)

# index=False leaves the pandas row index out of the CSV.
submission.to_csv('final_pro_submission.csv', index=False)

# Quick sanity summary of how the foraged mushrooms were split.
flagged = final_predictions.astype(bool)
edible_count = (~flagged).sum()
poison_count = flagged.sum()
print("--- RESULTS ---")
print("Mushrooms sorted as Edible: {}".format(edible_count))
print("Mushrooms flagged as Poisonous (Discarded): {}".format(poison_count))

Calibrating model for peak reliability...
Targeting 100% Recall. Optimal Safety Threshold: 0.0394
--- RESULTS ---
Mushrooms sorted as Edible: 770
Mushrooms flagged as Poisonous (Discarded): 855
Check your folder for 'final_pro_submission.csv' and upload it! 🍄
